#!/usr/bin/env python
"""Scrape the concept-board list from 10jqka (同花顺) and append it to MySQL.

Downloads http://q.10jqka.com.cn/gn/, extracts every concept-board link under
`.cate_group .cate_items a`, and writes (name, bord-slug) rows into the
`concept_bord` table via pandas + SQLAlchemy.
"""
import sys
import random
import time

import requests
from bs4 import BeautifulSoup
import pandas as pd
from sqlalchemy import create_engine

# NOTE(review): credentials are hard-coded — move to config/env in production.
connect_info = 'mysql+pymysql://{}:{}@{}:{}/{}?charset=utf8'.format(
    "root", "123456", "localhost", 3306, "flush")
engine = create_engine(connect_info)

from config import config


class Flush(object):
    def __init__(self):
        # Page-tracking state; only PAGE_TRACK is currently read (in a log line).
        self.PAGE_TRACK = 1
        self.MAX_PAGE = 5
        self.PROXY_POOL_API = "http://127.0.0.1:5555/random"
        self.PAGE_LIST = []
        self.proxy_con = 0
        self.MAX_PAGE_flag = True

    def downloader(self, url, num_retries=1):
        """Fetch `url`, parse the concept-board links, and append them to MySQL.

        Args:
            url: page to scrape (the 10jqka concept-board index).
            num_retries: how many times to re-attempt the download on failure.
        """
        headers = config.get_headers()
        bord_list = []
        try:
            time.sleep(random.random() * 1)  # random delay to be polite to the server
            respons = requests.get(url, headers=headers, timeout=4)
            # The site serves GBK-encoded HTML.
            html = str(respons.content, encoding="gbk")
            soup = BeautifulSoup(html, 'html.parser')
            print(soup.title)  # quick sanity check of the fetched page
            cate_group = soup.select(".cate_group .cate_items a")
            for group in cate_group:
                bord_dic = {
                    "name": group.string,
                    # href looks like ".../gn/<slug>/" — the slug is the board id.
                    "bord": group['href'].split("/")[-2],
                }
                bord_list.append(bord_dic)
            # BUG FIX: was `pd.Dataframe` (AttributeError swallowed by except).
            df1 = pd.DataFrame(bord_list)
            print(bord_list)
            df1.to_sql('concept_bord', engine, if_exists='append', index=False)
            print('PAGE is {} , URL is:{} to Mysql table successfully!'.format(
                self.PAGE_TRACK, url))
        except Exception as e:
            print("异常{}, 重新下载{}".format(e, url))
            # Honor num_retries: the message promises a re-download, so do one.
            if num_retries > 0:
                self.downloader(url, num_retries - 1)


def main():
    try:
        flush = Flush()
        flush.downloader("http://q.10jqka.com.cn/gn/")
    except Exception as err:
        print(err)


if __name__ == "__main__":
    try:
        main()
    finally:
        sys.exit()
欢迎分享,转载请注明来源:内存溢出
评论列表(0条)