Python实训day14pm【Python网络爬虫综合大作业-参考解析】_随笔

Python实训day14pm【Python网络爬虫综合大作业-参考解析】
Python实训-15天-博客汇总表
题目：天气数据的爬取和统计

大作业题目思路引导：定时爬取每个地级市的实时天气状况、存入excel中、为每个城市生成html展示、历史excel文件以每日为单位归档到文件夹中。

考察点：爬虫+文件读写+目录操作+pandas

网址：http://www.weather.com.cn/（首页）

每个地市（找数据接口）：1.从网站上寻找所有的地市列表-->地市编码；2.根据地市编号，爬取该地的实时天气；
'''
大作业题目思路引导：
题目1：定时爬取每个地级市的实时天气状况、存入excel中、为每个城市生成html展示、历史excel文件以每日为单位归档到文件夹中。
考察点：爬虫+文件读写+目录操作+pandas
网址：http://www.weather.com.cn/

每个地市：
1.从网站上寻找所有的地市列表-->地市编码（beautifulsoup解析页面）或（找数据接口）
2.根据地市编号，爬取该地的实时天气（beautifulsoup解析页面）或（找数据接口）

省份没什么用，我们只需要地级市的信息。
'''
from apscheduler.schedulers.background import BackgroundScheduler
from apscheduler.schedulers.blocking import BlockingScheduler
from datetime import datetime
import requests as req
import pandas as pd
import threading
import schedule
import logging
import json
import time
import os

hds = {  # module-level request headers: a browser User-Agent so requests look like they come from a browser
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.71 Safari/537.36'}


# demo1：获取所有地市信息（k:名、v：编码，放入dict中）
def getAllCityInformation(timeout=10):
    """Download the city list and return a mapping of place name to area code.

    The response body is expected to contain a ``=`` followed by a JSON
    object; everything after the first ``=`` is parsed.  The object is nested
    as province -> city -> area, and each area carries an ``AREAID`` field.

    For the four municipalities listed in ``municipalities`` every area under
    the city is kept.  For any other city only the area whose key equals the
    city name is kept.

    Args:
        timeout: Seconds to wait for the HTTP request before giving up, so a
            stalled connection cannot block the caller indefinitely.

    Returns:
        dict: place name -> ``AREAID`` value.

    Raises:
        requests.RequestException: On network failure or timeout.
        ValueError: If the body has no ``=`` or the payload is not valid JSON.
        KeyError: If a non-municipality city has no area named after itself.
    """
    resp = req.get('https://j.i8tq.com/weather2020/search/city.js',
                   headers=hds, timeout=timeout)
    text = resp.content.decode('utf-8')
    # Keep only what follows the first '=' and parse it as JSON.
    info = json.loads(text[text.index('=') + 1:])

    municipalities = {'北京', '天津', '上海', '重庆'}
    cityinfo = {}
    for cities in info.values():  # outer level: provinces / municipalities
        for cityname, areas in cities.items():
            if cityname in municipalities:
                # Municipality: record every area it contains.
                for area_name, area in areas.items():
                    cityinfo[area_name] = area['AREAID']
            else:
                # Ordinary city: record only the entry named after the city.
                cityinfo[cityname] = areas[cityname]['AREAID']
    return cityinfo


# demo2：根据城市编码爬取实时天气
# '绍兴': '101210507', '台州': '101210601', '温州': '101210701'
def loadWeather(cityname, code, timeout=10):
    """Fetch the current weather for one area code.

    Requests ``http://d1.weather.com.cn/sk_2d/<code>.html`` with a ``Referer``
    header set to the weather.com.cn home page, then parses everything after
    the first ``=`` in the response body as JSON.

    Args:
        cityname: Display name placed first in the returned list.
        code: Area code inserted into the request URL.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        list: ``[cityname, temp, SD, WD + WS, aqi]`` taken from the response,
        i.e. name, temperature, humidity, wind direction joined with wind
        scale, and AQI.

    Raises:
        requests.RequestException: On network failure or timeout.
        ValueError: If the body has no ``=`` or the payload is not valid JSON.
        KeyError: If an expected field is missing from the payload.
    """
    # Build per-request headers instead of mutating the shared module-level
    # dict, so the Referer does not leak into unrelated requests.
    headers = dict(hds, Referer='http://www.weather.com.cn/')
    resp = req.get('http://d1.weather.com.cn/sk_2d/{}.html'.format(code),
                   headers=headers, timeout=timeout)
    text = resp.content.decode('utf-8')
    data = json.loads(text[text.index('=') + 1:])
    return [cityname, data['temp'], data['SD'], data['WD'] + data['WS'], data['aqi']]


# demo3：利用以下demo，就可以生成所有城市的html天气展示。
def generateHTML(ls):
    """Render one city's weather into an HTML file from the shared template.

    Reads ``weatherTemplate.html`` from the weather folder, fills its
    positional ``str.format`` fields, and writes ``<city>.html`` into a
    sub-folder named after the current date and hour (``YYYY-MM-DD--HH点``).

    Args:
        ls: Sequence ``[city, temperature, humidity, wind, aqi]``.  The city
            name is passed to the template twice, followed by the other four
            values, so the template receives six positional arguments.

    Raises:
        OSError: If the template cannot be read or the output cannot be
            written.
    """
    # The path separators were missing in the original literal; restored here.
    root = r'C:\Users\lwx\Desktop\weather'
    with open(os.path.join(root, 'weatherTemplate.html'), 'r', encoding='utf-8') as f:
        template = f.read()

    # One output folder per clock hour, e.g. "2022-01-19--20点".
    now = datetime.now()
    folder_name = '{:%Y-%m-%d}--{:%H}点'.format(now, now)
    out_dir = os.path.join(root, '实时天气HTML页面', folder_name)
    # makedirs also creates missing parents and tolerates an existing folder.
    os.makedirs(out_dir, exist_ok=True)

    with open(os.path.join(out_dir, ls[0] + '.html'), 'w', encoding='utf-8') as f:
        f.write(template.format(ls[0], ls[0], ls[1], ls[2], ls[3], ls[4]))


# demo4：生成Excel文件。1.数据导入Excel；2.Excel文件的命名；3.文件夹的自动创建。
# demo4：以时间日期命名文件夹并将数据存入Excel
# 二维列表or字典 ——> Dataframe ——> Excel
def outputDataToExcel(bigList):
    """Write all collected weather rows to an Excel file archived by day.

    The file is stored under ``Excel_归档/<YYYY-MM-DD>/<HH时MM分>.xls`` inside
    the weather folder; the day folder is created when missing.

    Args:
        bigList: List of rows, each ``[city, temperature, humidity, wind,
            aqi]``, matching the five column headers used below.

    Raises:
        OSError: If the archive folder or file cannot be created.
    """
    # pandas spells the class "DataFrame"; "Dataframe" raises AttributeError.
    df = pd.DataFrame(bigList, columns=['城市名称', '实时温度', '相对湿度', '风力', '空气质量AQI'])

    # The path separators were missing in the original literal; restored here.
    root_excel = r'C:\Users\lwx\Desktop\weather\Excel_归档'

    # Take a single timestamp so the folder and file name cannot disagree
    # when the call happens around midnight.
    now = datetime.now()
    day = '{:%Y-%m-%d}'.format(now)               # e.g. 2022-01-19
    file_stem = '{:%H}时{:%M}分'.format(now, now)  # e.g. 17时18分

    dir_day = os.path.join(root_excel, day)
    # makedirs also creates missing parents and tolerates an existing folder.
    os.makedirs(dir_day, exist_ok=True)

    # NOTE(review): the '.xls' extension relies on pandas choosing a writer
    # engine for that format - confirm the installed pandas still supports it.
    time_file = os.path.join(dir_day, file_stem + '.xls')
    df.to_excel(time_file, index=False)  # index=False omits the row-index column
    print("Excel-Over！")


def main():
    """Run one full crawl: fetch every city, render its HTML, export to Excel.

    For each (name, code) pair returned by ``getAllCityInformation`` the
    current weather is loaded and an HTML page is generated; a copy of each
    weather row is collected and the whole batch is handed to
    ``outputDataToExcel`` at the end.
    """
    rows = []
    for name, code in getAllCityInformation().items():
        weather = loadWeather(name, code)
        generateHTML(weather)
        rows.append(list(weather))  # store an independent copy of the row
    outputDataToExcel(rows)


# main();  # call the main function once, directly

# Earlier scheduler variants, kept for reference:
# while True:
# sched = BlockingScheduler();
# sched = BackgroundScheduler(timezone='Asia/Shanghai')
# sched.add_job(main, 'interval', seconds=180);  # run every N units, e.g. minutes=60 or seconds=1
# sched.start(); # sched._logger = logging;

# Start-up: run main() on a fixed 180-second interval; start() is called on a
# BlockingScheduler.
# NOTE(review): timezone is 'MST' here while the commented-out variant above
# used 'Asia/Shanghai' - confirm which one is intended.
sched = BlockingScheduler(timezone='MST')
sched.add_job(main, 'interval', seconds=180)  # first run happens 180 seconds after start
sched.start()
欢迎分享，转载请注明来源：内存溢出
原文地址: http://outofmemory.cn/zaji/5712145.html
Python实训day14pm【Python网络爬虫综合大作业-参考解析】

发表评论

评论列表（0条）