# -*- coding=utf-8 -*-
"""Scraper for one JD.com category listing page.

Fetches the listing page, extracts each product's title, image URL, price
and promotion labels, and appends every record as one JSON line to
``Jd.json`` (JSON-lines format, one object per line).
"""
import json
import re

import requests
from bs4 import BeautifulSoup

import pymysql  # NOTE(review): imported but never used — presumably planned DB persistence; confirm before removing.

# Compiled once (used per product item): extracts the lazily-loaded
# product image path from the <li> markup's data attribute.
_IMG_RE = re.compile(r'-img="//(.*?\.jpg)"')


class SpiderJd:
    """Crawl a JD list page and persist product data as JSON lines."""

    def __init__(self):
        # Browser-like headers so the request is less likely to be rejected
        # as a bot. NOTE(review): 'Content-Type' and 'Content-Encoding' are
        # *response* headers and have no effect on a GET request; kept only
        # to preserve the original request fingerprint.
        self.header = {
            'Content-Type': 'text/html; charset=UTF-8',
            'Content-Encoding': 'gzip',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.107 Safari/537.36'
        }

    def response_handle(self, base_url):
        """GET *base_url* with the spoofed headers and return the Response.

        A timeout is set so a stalled connection cannot hang the crawl
        indefinitely (the original request had none).
        """
        return requests.get(base_url, headers=self.header, timeout=10)

    def parse(self, response):
        """Return the list of <li> product nodes from the listing <ul>.

        Returns an empty list when the expected container is absent
        (layout change or blocked request) instead of raising IndexError.
        """
        soup = BeautifulSoup(response.text, 'lxml')
        container = soup.find('ul', class_="gl-warp clearfix")
        if container is None:
            return []
        return container.find_all('li')

    def save_data(self, data):
        """Extract one product's fields from its <li> node and append to Jd.json.

        Raises IndexError if the node lacks the expected sub-elements —
        callers get a loud failure rather than silently corrupt records.
        """
        title = data.find_all('i', class_="promo-words")[0].text
        img_base = "https://" + _IMG_RE.findall(str(data))[0]
        price = "¥" + data.select('strong > i')[0].text
        # comment counts are loaded asynchronously by the page, so they
        # cannot be scraped from this static HTML
        privilege = [i.text for i in data.find_all('i', class_="goods-icons4 J-picon-tips")]  # promotion labels
        record = {'title': title, 'img': img_base, 'price': price, 'privilege': privilege}
        with open('Jd.json', 'a+', encoding='utf-8') as f:
            json.dump(record, f, ensure_ascii=False)
            # Fix: the original wrote the literal character 'n' here,
            # producing a single unparseable run-on line instead of
            # newline-delimited JSON records.
            f.write('\n')

    def main(self):
        """Fetch the hard-coded category listing and persist every product."""
        url = "https://list.jd.com/list.html?cat=16750,16755,16809"
        resp = self.response_handle(url)
        for item in self.parse(resp):
            self.save_data(item)


if __name__ == '__main__':
    crawl = SpiderJd()
    crawl.main()
欢迎分享,转载请注明来源:内存溢出
评论列表(0条)