A follow-up attempt after my last post (爬取twitter数据_Chloris_的博客-CSDN博客), where the code I found on GitHub was too hard to follow:
This version covers Selenium login, asynchronously loaded pages, and XPath extraction. Because Twitter's search only surfaces roughly the most recent week of posts, the data collected here is still incomplete and the code needs further adjustment.
from selenium import webdriver
import time
from lxml import etree


# Collect the text of every post on the page.
def get_posts(url):
    """
    url: the browse page that lists all the posts
    """
    wb = webdriver.Chrome()
    wb.get(url)
    time.sleep(3)  # wait for the page to finish loading

    # JavaScript snippet that returns the current page height
    js = 'return document.body.scrollHeight'
    height = wb.execute_script(js)
    wb.execute_script('window.scrollTo(0, document.body.scrollHeight)')
    time.sleep(5)

    t1 = int(time.time())
    status = True
    num = 0
    post_list = []
    while status:
        t2 = int(time.time())
        if t2 - t1 < 30:
            # Read the page source while scrolling: Twitter loads posts
            # asynchronously and the source after scrolling is incomplete,
            # so it is captured on every pass; the collected data
            # therefore needs deduplication afterwards.
            selector = etree.HTML(wb.page_source)  # parse the source into an element tree
            infos = selector.xpath("//*/div[@class='css-1dbjc4n r-18u37iz']/div[2]/div[2]/div[1]")
            for info in infos:
                post = info.xpath("string(.)").strip()
                post_list.append(post)
            new_height = wb.execute_script(js)
            if new_height > height:
                time.sleep(1)
                wb.execute_script('window.scrollTo(0, document.body.scrollHeight)')
                height = new_height
                t1 = int(time.time())
        elif num < 3:
            time.sleep(3)
            num = num + 1
        else:
            # Timed out even after retries: we have reached the bottom of the page
            status = False
    wb.quit()
    return post_list


url = 'https://twitter.com/search?q=nuclear%20waste%20water&src=typed_query'
post_list = get_posts(url)
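The loop comment points out that posts captured across successive scrolls overlap, yet get_posts() returns post_list with the duplicates still in it. A minimal post-processing sketch, assuming first-seen order should be kept; the output filename is just a placeholder:

import pandas as pd

# dict.fromkeys() preserves insertion order (Python 3.7+), so this
# drops repeated posts while keeping the order they were scraped in.
unique_posts = list(dict.fromkeys(post_list))

# Persist the result; 'tweets.csv' is a hypothetical output path.
pd.DataFrame({'post': unique_posts}).to_csv('tweets.csv', index=False, encoding='utf-8-sig')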
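As for the one-week limitation mentioned above, Twitter's web search also accepts the since: and until: date operators, so one possible workaround is to slice the query into date windows and crawl each window separately. A sketch under that assumption; build_search_urls() and the date range are illustrative, not part of the original code:

from datetime import date, timedelta

def build_search_urls(query, start, end, step_days=7):
    # Yield search URLs covering [start, end) in windows of step_days days.
    day = start
    while day < end:
        nxt = min(day + timedelta(days=step_days), end)
        # since%3A / until%3A are the URL-encoded since: / until: operators
        yield ('https://twitter.com/search?q={}%20since%3A{}%20until%3A{}&src=typed_query'
               .format(query, day.isoformat(), nxt.isoformat()))
        day = nxt

all_posts = []
for u in build_search_urls('nuclear%20waste%20water', date(2021, 4, 1), date(2021, 5, 1)):
    all_posts.extend(get_posts(u))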