Opening PyCharm today, I found the crawler notes I compiled last December from various references still sitting there, so I'm writing them up here.
Friendly reminder: the better your crawler runs, the longer your sentence runs. Be sure to use this only in lawful and reasonable ways.
A roundup of Chinese legal cases involving Python crawlers: https://blog.csdn.net/huang5333/article/details/114656888
Now to the main topic. (A screenshot of the final scraped data originally appeared here.) The overall workflow:
1. Enter a keyword and collect the URL of each product.
2. Store the collected URLs in a set A; each URL taken out for use shrinks the set to A-1.
3. Take one URL out and scrape its product data.
4. Store the data.
Part 1: Simulate typing a keyword into the search bar and collect the product URLs on the results page (the current code only collects 30 URLs).
For example: url = 'https://search.jd.com/Search?keyword=小米8'
import requests
from lxml import etree

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
                  'Chrome/92.0.4515.131 Safari/537.36'
}


class url_data:
    def get_url(self, url):
        urls = list()
        response = requests.get(url, headers=headers)
        response.encoding = 'utf-8'
        detail = etree.HTML(response.text)  # parse the response into an etree HTML document
        for i in range(30):
            # href of the (i+1)-th product in the result list; JD returns
            # protocol-relative URLs such as //item.jd.com/xxx.html
            [product_name] = detail.xpath(
                '//*[@id="J_goodsList"]/ul/li[' + str(i + 1) + ']/div/div[1]/a/@href')
            urls.append(str(product_name))
        print(urls)
        return urls


if __name__ == '__main__':
    url = 'https://search.jd.com/Search?keyword=小米8'
    u = url_data()
    u.get_url(url)
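The 30-URL cap comes from requesting a single results page. A rough sketch of paging through more results, assuming JD's search URL takes a page parameter that steps by 2 per visible page (an observation about the site's behavior at the time, not a documented API; note that get_url will raise if a page has fewer than 30 items):

# Hypothetical helper, not in the original code: walk several result pages
# and pool the product URLs, reusing url_data.get_url() from above.
def get_urls_paged(keyword, pages=3):
    u = url_data()
    all_urls = []
    for p in range(pages):
        # assumed pagination scheme: visible pages map to page=1, 3, 5, ...
        page_url = ('https://search.jd.com/Search?keyword={}&page={}'
                    .format(keyword, 2 * p + 1))
        all_urls.extend(u.get_url(page_url))
    return all_urls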
Part 2: Save the URLs into a set
class URLManager(object):
    def __init__(self):
        # sets of new (to-crawl) and old (already-crawled) URLs
        self.new_urls = set()
        self.old_urls = set()

    def save_new_url(self, url):
        # save a single new URL into the to-crawl set
        if url is not None:
            if url not in self.new_urls and url not in self.old_urls:
                print("Saving new URL: {}".format(url))
                self.new_urls.add(url)

    def save_new_urls(self, url_list):
        # save URLs in batch
        for url in url_list:
            self.save_new_url(url)

    def get_new_url(self):
        # pop one uncrawled URL and move it into the crawled set
        if self.get_new_url_num() > 0:
            url = self.new_urls.pop()
            self.old_urls.add(url)
            return url
        else:
            return None

    def get_new_url_num(self):
        # number of URLs not yet crawled
        return len(self.new_urls)

    def get_old_url_num(self):
        # number of URLs already crawled
        return len(self.old_urls)
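To make the bookkeeping concrete, a quick usage check (the URLs are illustrative):

manager = URLManager()
manager.save_new_urls(['//item.jd.com/1.html', '//item.jd.com/2.html'])
print(manager.get_new_url_num())  # 2
url = manager.get_new_url()       # popped from new_urls, recorded in old_urls
print(manager.get_old_url_num())  # 1
manager.save_new_url(url)         # silently ignored: already in old_urls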
Part 3: Scrape the data
import re
import requests
import json
from lxml import etree
import pandas as pd

# request headers
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
                  'Chrome/92.0.4515.131 Safari/537.36'
}


class JDdata:
    def get_data(self, url):
        # extract the SKU id from the product URL
        url_p = 'https://item.jd.com/(.*?).html'
        Goods_id = re.findall(url_p, url, re.S)
        response = requests.get(url, headers=headers)
        response.encoding = 'utf-8'
        detail = etree.HTML(response.text)  # parse into an etree HTML document
        """---------------name------------------------"""
        product_name = ''
        try:
            [product_name] = detail.xpath(
                '//*[@id="detail"]/div[2]/div[1]/div[1]/ul[2]/li[1]/@title')  # product name
        except ValueError:
            # the single-element unpack fails if the XPath matched nothing
            print('Could not scrape the product name; check the XPath expression')
        """---------------brand------------------------"""
        [product_brand] = detail.xpath('//*[@id="parameter-brand"]/li/a/text()')  # brand
        print("Brand: " + str(product_brand))
        """---------------title (disabled in the original)---------------"""
        # [title] = detail.xpath('/html/body/div[6]/div/div[2]/div[1]/text()')
        # print("Title: " + str(title).strip())
        """---------------price------------------------"""
        # request the price JSON
        p = requests.get('https://p.3.cn/prices/mgets?skuIds=J_' + Goods_id[0], headers=headers)
        [product_dict] = json.loads(p.text)  # the endpoint returns a one-element JSON list
        product_m_price = product_dict['m']   # highest price
        product_price = product_dict['p']     # current price
        product_o_price = product_dict['op']  # list (guide) price
        # request the comment-summary JSON
        c = requests.get('https://club.jd.com/comment/productCommentSummaries.action?referenceIds=' + Goods_id[0],
                         headers=headers)
        comment_dict = json.loads(c.text.split('[')[-1].split(']')[0])  # crop the dict out of the JSON wrapper
        total_comment_num = comment_dict['CommentCountStr']
        good_comment_num = comment_dict['GoodCountStr']
        good_percent_com = comment_dict['GoodRate']
        bad_comment_num = comment_dict['PoorCountStr']
        bad_percent_com = comment_dict['PoorRate']
        print('Negative-review rate: ' + str(bad_percent_com))
        data = pd.DataFrame([[str(product_name), str(product_brand).strip(), product_price, product_o_price,
                              str(total_comment_num), str(good_comment_num), str(good_percent_com),
                              str(bad_comment_num), str(bad_percent_com)]],
                            columns=['product_name', 'product_brand', 'product_price', 'product_o_price',
                                     'total_comment_num', 'good_comment_num', 'good_percent_com',
                                     'bad_comment_num', 'bad_percent_com'])
        return data


# test
if __name__ == '__main__':
    url = 'https://item.jd.com/100020693650.html'
    jd = JDdata()
    jd.get_data(url)
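A quick sanity check of the SKU-id extraction used above, run against the test URL:

import re

# the lazy (.*?) group stops right before the trailing '.html'
print(re.findall('https://item.jd.com/(.*?).html',
                 'https://item.jd.com/100020693650.html'))
# -> ['100020693650']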
Part 4: Store the data
import os
import pandas as pd


class DataSave:
    def __init__(self, path):
        # file path where the data will be saved
        self.path = path

    def save(self, data):
        # if the file does not exist yet, create it with a header row;
        # otherwise append rows without repeating the header
        if not os.path.exists(self.path):
            print("---> path '" + self.path + "' does not exist")
            print('---> it will be created automatically')
            data.to_csv(self.path, encoding='utf_8_sig', index=False, index_label=False, mode='a')
            print('---> data saved for the first time')
        else:
            data.to_csv(self.path, encoding='utf_8_sig', index=False, mode='a', header=False, index_label=False)
            print('---> data appended')


if __name__ == "__main__":
    data = pd.DataFrame([['李王三', '48', '男'], ['张一五', '43', '男']], columns=['name', 'age', 'sex'])
    save_path = r'E:\jd.csv'  # raw string so the backslash is not treated as an escape
    ds = DataSave(save_path)
    ds.save(data)
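To verify what has accumulated in the CSV (assuming the file was written by DataSave above):

import pandas as pd

df = pd.read_csv(r'E:\jd.csv', encoding='utf_8_sig')
print(df.head())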
Part 5: Task scheduling
from urlmanager import URLManager
from goods_data import JDdata
from url_data import url_data
from save_data import DataSave


class Scheduler:
    def __init__(self, path, root_url):
        # wire up the components
        self.url_manager = URLManager()
        self.url_data = url_data()
        self.jd_data = JDdata()
        self.save_data = DataSave(path)
        self.root_url = root_url

    def run_spider(self):
        # seed the to-crawl set with the root (search) URL
        self.url_manager.save_new_url(self.root_url)
        # take it back out ...
        url = self.url_manager.get_new_url()
        # ... and expand it into the product URLs found on the search page (currently 30)
        new_urls = self.url_data.get_url(url)
        # queue the product URLs for crawling
        self.url_manager.save_new_urls(new_urls)
        # get_new_url removes one URL per call, so looping while the set is
        # non-empty drains it
        while len(self.url_manager.new_urls) > 0:
            url = self.url_manager.get_new_url()
            url = 'https:' + url  # search-page hrefs are protocol-relative
            print(url)
            data = self.jd_data.get_data(url)
            self.save_data.save(data)


if __name__ == "__main__":
    root_url = "https://search.jd.com/Search?keyword=裤子"
    save_url = r"E:\jd.csv"
    Spider = Scheduler(save_url, root_url)
    Spider.run_spider()
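One practical hardening that is not in the original: throttle the requests and skip products whose page layout breaks an XPath, so a single bad item does not abort the whole run. A minimal sketch of what the drain loop inside run_spider could look like (the one-second interval is an arbitrary choice):

import time

# sketch of a replacement for the while-loop in run_spider
while len(self.url_manager.new_urls) > 0:
    url = 'https:' + self.url_manager.get_new_url()
    try:
        data = self.jd_data.get_data(url)
        self.save_data.save(data)
    except Exception as e:
        # e.g. an XPath that matched nothing on an unusual product page
        print('Skipping {}: {}'.format(url, e))
    time.sleep(1)  # be polite: at most one request per second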