Python + Requests + XPath (Parsing): Scraping Résumé Images from a Site (Data Analysis, Part 3)


1. Environment setup

pip install lxml

(The examples below also make HTTP requests with the requests library; if it is not installed yet: pip install requests.)

2. Parsing approach

The workflow has three steps (a minimal sketch follows the list):

- Fetch the page data with a general-purpose crawler.
- Instantiate an etree object and load the page source into it.
- Call the xpath() function with an XPath expression to locate tags and extract the target data.
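
A minimal sketch of these three steps (the target URL and the XPath expression here are illustrative placeholders, not taken from the examples below):

import requests
from lxml import etree

# 1. Fetch the page source with a general-purpose crawler
url = 'https://example.com/'  # placeholder target
headers = {'User-Agent': 'Mozilla/5.0'}
page_text = requests.get(url=url, headers=headers).text

# 2. Instantiate an etree object and load the page source into it
tree = etree.HTML(page_text)

# 3. Locate tags and extract data with an XPath expression
titles = tree.xpath('//title/text()')  # placeholder expression
print(titles)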

3. Hands-on examples

- Project goal: parse new-home listing data from Fang.com (房天下)

import requests
import os
import csv
from lxml import etree

if __name__ == '__main__':
    url = 'https://huizhou.newhouse.fang.com/house/s/'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.111 Safari/537.36'
    }
    if not os.path.exists('./fangtianxialibs'):
        os.makedirs('./fangtianxialibs')
    response = requests.get(url=url, headers=headers)
    # Set the response encoding manually
    response.encoding = 'utf-8'
    page_text = response.text
    tree = etree.HTML(page_text)
    li_list = tree.xpath('//div[@id="newhouse_loupai_list"]/ul/li')
    # Collect the scraped records in a list
    datas = []
    title = ''
    for li in li_list:
        try:
            # Parse the listing title link.
            # NOTE: the class values in this and the following XPath expressions were
            # stripped when this post was copied; the names used here are placeholders --
            # restore the real ones from the live page source.
            detail_urls = li.xpath('.//div[@class="nlcd_name"]/a/@href')
            if detail_urls:
                detail_url = 'https:' + detail_urls[0]
                detail_text = requests.get(url=detail_url, headers=headers).text
                # Swap the URL suffix to build the detail sub-page URL
                detail_url_new = detail_url.replace('.htm', '/housedetail.htm')
                tree_detail = etree.HTML(detail_text)
                # Parse the title and (average) price from the second-level page
                title = tree_detail.xpath('//div[@class="tit"]//div[@class="name"]/h1/strong/text()')[0]
                price = ''.join(tree_detail.xpath('//div[@class="price_box"]/div[@class="price"]/h3/text() | //div[@class="price_box"]/div[@class="price"]/span/text() | //div[@class="price_box"]/div[@class="price"]/text()')).strip('\n\t ')
                # Request the detail sub-page
                detail_text_new = requests.get(url=detail_url_new, headers=headers).text
                tree_new = etree.HTML(detail_text_new)
                # Parse the surrounding-facilities block of the detail page
                tree_list = tree_new.xpath('//div[@id="Configuration"]')
                for index in tree_list:
                    zhoubian = ''.join(index.xpath('./h3/text()')).strip('\n\t\r ')
                    jiaotong = ''.join(index.xpath('./ul[@class="traffic"]/li/span/text() | ./ul[@class="traffic"]/li/text()')).strip('\n\t\r ')
                    qita = ''.join(index.xpath('./ul[@class="other"]/li/span/text() | ./ul[@class="other"]/li/text()')).strip('\n\t\r ')
                    desc = zhoubian + ':' + jiaotong + ':' + qita + '\n'
                    dic = {
                        'title': title,
                        'desc': desc,
                        'price': price
                    }
                    datas.append(dic)
        except Exception as msg:
            # print('Error: {}'.format(msg))
            pass
    # As in the original, the file is named after the last title that was parsed
    filename = './fangtianxialibs/' + title + '.txt'
    print(datas)
    title_header = ['title', 'desc', 'price']
    with open(filename, 'a', encoding='utf-8', newline='') as fp:
        writer = csv.DictWriter(fp, title_header)
        writer.writeheader()
        writer.writerows(datas)
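
Since csv.DictWriter is used for persistence, a quick way to sanity-check the output is to read the records back with csv.DictReader (the glob pattern below assumes the folder and .txt extension used above):

import csv
import glob

# Read back whatever the script above wrote and print two fields per row
for path in glob.glob('./fangtianxialibs/*.txt'):
    with open(path, encoding='utf-8', newline='') as fp:
        for row in csv.DictReader(fp):
            print(row['title'], row['price'])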

- Project goal: parse the image data at http://pic.netbian.com/4kmeinv/

import requests
from lxml import etree

url = 'http://pic.netbian.com/4kmeinv/'
headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36'
}
response = requests.get(url=url, headers=headers)
# Print the encoding requests guessed for the page (ISO-8859-1, although the page is actually GBK)
print(response.encoding)
page_text = response.text
tree = etree.HTML(page_text)
# NOTE: the class value was stripped when this post was copied; "slist" is the
# usual class for this listing on the site, but verify against the live source
li_list = tree.xpath('//div[@class="slist"]/ul/li')
for li in li_list:
    img_url = 'http://pic.netbian.com' + li.xpath('./a/img/@src')[0]
    img_name = li.xpath('./a/img/@alt')[0]
    # Undo the wrong ISO-8859-1 decoding and re-decode as GBK to fix the mojibake
    img_name = img_name.encode('iso-8859-1').decode('gbk')
    print(img_url, img_name)
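
The loop above only prints each image URL and name. A small helper for the persistence step, sketched under the assumption of a ./netbianlibs output folder (the folder name is not from the original post):

import os
import requests

def save_image(img_url, img_name, headers):
    # Assumed output folder; any writable path works
    os.makedirs('./netbianlibs', exist_ok=True)
    img_data = requests.get(url=img_url, headers=headers).content
    with open('./netbianlibs/' + img_name + '.jpg', 'wb') as fp:
        fp.write(img_data)

# Usage inside the loop above:
# save_image(img_url, img_name, headers)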

- Project goal: parse out all city names from https://www.aqistudy.cn/historydata/

import requests
from lxml import etree

url = 'https://www.aqistudy.cn/historydata/'
headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36'
}
response = requests.get(url=url, headers=headers)
# Print the encoding requests guessed for the page
print(response.encoding)
page_text = response.text
tree = etree.HTML(page_text)
# NOTE: the class values were stripped when this post was copied; "bottom" is the
# usual class for both the hot-city and all-city lists -- verify against the live page
li_list = tree.xpath('//div[@class="bottom"]/ul/li | //div[@class="bottom"]/ul//li')
for li in li_list:
    city_name = li.xpath('./a/text()')[0]
    city_url = 'https://www.aqistudy.cn/historydata/' + li.xpath('./a/@href')[0]
    print(city_name, city_url)
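
To persist the city list the same way the Fang.com example does, the rows can be collected as dicts and written with csv.DictWriter (the output file name here is an assumption):

import csv

# Collect rows in the loop above, e.g.:
# cities.append({'city_name': city_name, 'city_url': city_url})
cities = []

with open('./citylibs.csv', 'w', encoding='utf-8', newline='') as fp:
    writer = csv.DictWriter(fp, ['city_name', 'city_url'])
    writer.writeheader()
    writer.writerows(cities)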

- Project goal: download the résumé template images from the site https://sc.chinaz.com/

import requests
from lxml import etree
import os

# Create the output folder
if not os.path.exists('./jianlilibs'):
    os.makedirs('./jianlilibs')

# Layer 1: enter the portal site that links to the résumé section
url = 'https://sc.chinaz.com/'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.111 Safari/537.36'
}
response_text = requests.get(url=url, headers=headers).text
# Parse the portal page
tree = etree.HTML(response_text)

# Resolve the résumé-section URL and walk its listing pages
def page_index(latest):
    # The original used range(1, latest); +1 so all requested pages are fetched
    for index in range(1, latest + 1):
        # NOTE: the class values in the XPath expressions below were stripped when
        # this post was copied; the names used here are placeholders -- restore the
        # real ones from the live page source
        if index == 1:
            muban_url = 'https://sc.chinaz.com' + tree.xpath('//div[@class="nav"]//li[@class="item"]/a/@href')[3]
        else:
            muban_url = 'https://sc.chinaz.com' + tree.xpath('//div[@class="nav"]//li[@class="item"]/a/@href')[3] + 'index_{}.html'.format(index)
        # Layer 2: fetch the listing page for this batch of résumé templates
        response = requests.get(muban_url, headers=headers)
        # Set the response encoding manually
        response.encoding = 'utf-8'
        muban_text = response.text
        # Parse out the URL of each résumé template
        jianli_tree = etree.HTML(muban_text)
        jianli_url_list = jianli_tree.xpath('//div[@class="list"]//a/@href')
        for jianli_url in jianli_url_list:
            jianli_url = 'https:' + jianli_url
            # Layer 3: fetch the detail page of one résumé template
            jianli_detail = requests.get(jianli_url, headers=headers).text
            detail_tree = etree.HTML(jianli_detail)
            img_src_list = detail_tree.xpath('//div[@class="detail"]//img/@src')
            for img_src in img_src_list:
                img_src = 'https:' + img_src
                img_src_content = requests.get(img_src, headers=headers).content
                # Build the image name from the second-to-last URL segment
                imgname = img_src.split('/')[-2]
                # Image path
                img_path = './jianlilibs/' + imgname + '.jpg'
                # Persist the image bytes
                with open(img_path, 'wb') as fp:
                    fp.write(img_src_content)
                print('Résumé ' + imgname + ' downloaded successfully!')

if __name__ == '__main__':
    while True:
        try:
            values = int(input('How many listing pages to scrape? '))
            page_index(values)
        except Exception as msg:
            print('Invalid input, error: {}'.format(msg))
        finally:
            break
