1. 项目简介
在上图中可以看到深圳今天、7天、8-15天等的天气数据,这里爬取7天的天气预报数据。
2. HTML 代码分析
分析这段代码:
7天的天气预报实际上在一个 `<ul class="t clearfix">` 元素中,每天是一个 `<li>` 元素,7天的结构差不多是一样的(注意:今天可能只有一个温度,没有同时给出最高温度与最低温度)。
3. 爬取天气预报数据
from bs4 import BeautifulSoup
from bs4.dammit import UnicodeDammit  # ships with BeautifulSoup; guesses the document encoding
import urllib.request  # sends the request and reads the response

# Fetch the 7-day forecast page for Shenzhen (city code 101280601) and print
# one "date weather temperature" line per day.
url = "http://www.weather.com.cn/weather/101280601.shtml"
try:
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
                      "(KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36 Edg/109.0.1518.78"}
    req = urllib.request.Request(url, headers=headers)  # build the request object
    # The context manager closes the connection; the timeout keeps a stalled
    # server from hanging the script forever.
    with urllib.request.urlopen(req, timeout=30) as resp:
        data = resp.read()  # raw response body (bytes)
    dammit = UnicodeDammit(data, ["utf-8", "gbk"])
    data = dammit.unicode_markup  # decoded text
    soup = BeautifulSoup(data, "lxml")
    lis = soup.select("ul[class='t clearfix'] li")
    for li in lis:
        try:
            date = li.select('h1')[0].text
            weather = li.select('p[class="wea"]')[0].text
            low = li.select('p[class="tem"] i')[0].text
            # The high temperature lives in a <span> that may be absent (the
            # article notes this for "today"). Test for it instead of assuming
            # that only the first entry lacks it.
            spans = li.select('p[class="tem"] span')
            high = spans[0].text if spans else ""
            temp = high + "/" + low if high else low
            print(date, weather, temp)
            # Example output:
            # 22日(今天) 晴 14℃
            # 23日(明天) 晴 23℃/14℃
            # 24日(后天) 晴转多云 25℃/13℃
            # 25日(周六) 多云 21℃/13℃
            # 26日(周日) 多云转晴 22℃/12℃
            # 27日(周一) 晴 21℃/12℃
            # 28日(周二) 晴 24℃/14℃
        except Exception as err:
            # Best effort: a malformed entry is reported and skipped.
            print(err)
except Exception as err:
    # Top-level boundary of the script: report network/parse failures.
    print(err)
4. 爬取与存储天气预报数据
获取北京、上海、广州、深圳等城市的代码,爬取这些城市的天气预报数据,并存储到 SQLite 数据库 weathers.db 中,存储的数据表 weathers 的结构是:
-- One row per city and forecast date; (wCity, wDate) is the primary key.
create table weathers (
    wCity    varchar(16),
    wDate    varchar(16),
    wWeather varchar(64),
    wTemp    varchar(32),
    constraint pk_weather primary key (wCity, wDate)
);
编写程序依次爬取各个城市的天气预报数据存储在数据库中,程序如下:
from bs4 import BeautifulSoup
from bs4.dammit import UnicodeDammit
import urllib.request
import sqlite3


class WeatherDB:
    """Thin wrapper around the ``weathers`` table in the SQLite file ``weathers.db``."""

    def __init__(self):
        self.cursor = None
        self.con = None

    def openDB(self):
        """Open ``weathers.db``, make sure the table exists, and empty it."""
        self.con = sqlite3.connect("weathers.db")
        self.cursor = self.con.cursor()
        # "if not exists" replaces the old pattern of letting "create table"
        # fail (and print an error) on every run after the first.
        self.cursor.execute(
            "create table if not exists weathers (wCity varchar(16),"
            "wDate varchar(16),"
            "wWeather varchar(64),"
            "wTemp varchar(32),"
            "constraint pk_weather primary key (wCity,wDate))")
        # Each run starts from an empty table so reruns do not collide on the
        # (wCity, wDate) primary key.
        self.cursor.execute("delete from weathers")

    def closeDB(self):
        """Commit pending inserts and close the connection (no-op if never opened)."""
        if self.con is None:
            return
        self.con.commit()
        self.con.close()

    def insert(self, city, date, weather, temp):
        """Insert one forecast row; failures are printed, not raised."""
        try:
            self.cursor.execute("insert into weathers (wCity,wDate,wWeather,wTemp) values (?,?,?,?)",
                                (city, date, weather, temp))
        except Exception as err:
            # Best effort: e.g. a duplicate (city, date) must not abort the crawl.
            print(err)

    def show(self):
        """Print every stored row as a fixed-width table."""
        self.cursor.execute("select * from weathers")
        rows = self.cursor.fetchall()
        print("%-16s%-16s%-32s%-16s" % ("city", "date", "weather", "temp"))
        for row in rows:
            print("%-16s%-16s%-32s%-16s" % (row[0], row[1], row[2], row[3]))


class WeatherForecast:
    """Scrape 7-day forecasts from weather.com.cn and store them via WeatherDB."""

    def __init__(self):
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows; U; Windows NT 6.0 x64; en-US; rv:1.9pre) "
                          "Gecko/2008072421 Minefield/3.0.2pre"}
        # City name -> code used in the forecast page URL.
        self.cityCode = {"北京": "101010100", "上海": "101020100", "广州": "101280101", "深圳": "101280601"}
        # Set by process(); None means "print only, do not store".
        self.db = None

    @staticmethod
    def _temperature(li):
        """Return "high/low" for one day's <li>, or just the low when no high is present."""
        low = li.select('p[class="tem"] i')[0].text
        # The <span> holding the high temperature may be missing (the article
        # notes this for "today"); test for it rather than relying on position.
        spans = li.select('p[class="tem"] span')
        high = spans[0].text if spans else ""
        return high + "/" + low if high else low

    def forecastCity(self, city):
        """Fetch, print and (when a database is open) store the forecast for ``city``.

        Unknown cities are reported and skipped. Network and parse errors are
        printed rather than raised, so one bad city does not stop the crawl.
        """
        if city not in self.cityCode:
            print(city + " 找不到代码")
            return

        url = "http://www.weather.com.cn/weather/" + self.cityCode[city] + ".shtml"
        try:
            req = urllib.request.Request(url, headers=self.headers)
            # Close the response deterministically and bound the wait time.
            with urllib.request.urlopen(req, timeout=30) as resp:
                data = resp.read()
            dammit = UnicodeDammit(data, ["utf-8", "gbk"])
            soup = BeautifulSoup(dammit.unicode_markup, "lxml")
            lis = soup.select("ul[class='t clearfix'] li")
            for li in lis:
                try:
                    date = li.select('h1')[0].text
                    weather = li.select('p[class="wea"]')[0].text
                    temp = self._temperature(li)
                    print(city, date, weather, temp)
                    if self.db is not None:
                        self.db.insert(city, date, weather, temp)
                except Exception as err:
                    # Best effort: a malformed entry is reported and skipped.
                    print(err)
        except Exception as err:
            print(err)

    def process(self, cities):
        """Crawl every city in ``cities`` into a freshly emptied ``weathers`` table."""
        self.db = WeatherDB()
        self.db.openDB()
        try:
            for city in cities:
                self.forecastCity(city)
            # Call self.db.show() here to dump the stored rows.
        finally:
            # Commit and close even if a city unexpectedly raises.
            self.db.closeDB()


ws = WeatherForecast()
ws.process(["北京", "上海", "广州", "深圳"])
print("completed")
程序执行结果如下:
版权归原作者 即使再小的船也能远航 所有, 如有侵权,请联系我们删除。