1. The page to scrape

The target is Bilibili's domestic-animation popularity ranking page, https://www.bilibili.com/v/popular/rank/guochan. Each entry in the ranking list carries the anime title, the latest episode, the overall score, and the play/comment/like counts, which is exactly what the script below extracts.
2. The code
Import the modules, fetch the page, pull out each field with XPath, and write the results to an Excel file:

```python
# coding=gbk
import io
import sys

import pandas as pd
import requests
from lxml import etree

url = "https://www.bilibili.com/v/popular/rank/guochan"
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) "
                  "Chrome/95.0.4638.69 Safari/537.36 Edg/95.0.1020.53"
}

# Wrap stdout so printing the scraped text does not raise gbk encoding errors
sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='gb18030')

response = requests.get(url, headers=headers)
# print(response.text)
tree = etree.HTML(response.text)

# Locate the <ul> that holds the ranking list
divs = tree.xpath("/html/body/div[3]/div/div[2]/div[2]/ul")
# print(divs)

for div in divs:
    # Anime titles
    div_name = div.xpath("./li/div[2]/div[2]/a/text()")
    # print(div_name)

    # Latest episode ("updated to episode N")
    div_number = div.xpath("./li/div[2]/div[2]/div[1]/text()")
    # print(div_number)

    # Play count: strip the surrounding newlines and spaces
    div_amount = div.xpath("./li/div[2]/div[2]/div[2]/span[1]/text()")
    # print(div_amount)
    div_amount_01 = [i.strip() for i in div_amount]
    # print(div_amount_01)

    # Comment count
    div_comment = div.xpath("./li/div[2]/div[2]/div[2]/span[2]/text()")
    # print(div_comment)
    div_comment_01 = [i.strip() for i in div_comment]

    # Like count
    div_give = div.xpath("./li/div[2]/div[2]/div[2]/span[3]/text()")
    # print(div_give)
    div_give_01 = [i.strip() for i in div_give]

    # Overall score
    div_heat = div.xpath("./li/div[2]/div[2]/div[3]/div/text()")
    # print(div_heat)

    # Collect the columns into a dict
    title = {
        "动漫名称": div_name,          # anime title
        "更新到多少集": div_number,    # latest episode
        "综合评分": div_heat,          # overall score
        "播放量": div_amount_01,       # play count
        "评论": div_comment_01,        # comment count
        "点赞": div_give_01,           # like count
    }
    # print(title)

    df = pd.DataFrame(title)      # convert to a DataFrame
    df.index = df.index + 1       # start the index at 1
    df.to_excel("D:/安装包/哔哩哔哩动画.xlsx")  # save to Excel
```
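After the script runs, the spreadsheet can be read back to spot-check the result. This is a minimal sketch, assuming openpyxl is installed and the file path matches the one used above:

```python
import pandas as pd

# Read the saved workbook back, using the first column as the index (the 1-based rank)
df = pd.read_excel("D:/安装包/哔哩哔哩动画.xlsx", index_col=0)

print(df.shape)    # (rows, 6): one row per anime, six columns
print(df.head())   # first few ranked titles with their stats
```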