Today I'm sharing a script that scrapes Sogou Wenwen (搜狗问问) Q&A results and combines them into articles; I hope it's useful.
Here is the code:
# coding: utf-8
# Author: 小章哥儿
# Date: 2021-08-03
import re
import time

import requests
from lxml import etree

HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 "
                  "(KHTML, like Gecko) Chrome/92.0.4515.107 Safari/537.36"}


class Sogou:

    def get_html(self, keyword):
        """Fetch the Sogou search-result page, restricted to wenwen.sogou.com."""
        url = f'https://www.sogou.com/sogou?query={keyword}&ie=utf8&insite=wenwen.sogou.com'
        html = requests.get(url, headers=HEADERS)
        return html.text

    def collect_urls(self, keyword):
        """Collect the top result titles and links for a keyword; returns a list of tuples."""
        html = self.get_html(keyword)
        selector = etree.HTML(html)
        questions = [i.xpath('string(.)').replace('搜狗问问', '').replace('搜狗', '').replace('-', '')
                     for i in selector.xpath('//div[@class="vrwrap"]/h3[@class="vrTitle"]/a')]
        links = ['https://www.sogou.com' + i
                 for i in selector.xpath('//div[@class="vrwrap"]/h3[@class="vrTitle"]/a/@href')][:5]
        collected = []
        try:
            for link in links:
                page = requests.get(link, headers=HEADERS).text
                match = re.search(r'https://wenwen\.sogou\.com/z/(.*?)\.htm', page)
                wenwen_url = f'https://wenwen.sogou.com/z/{match.group(1)}.htm'
                self.parser_answer(wenwen_url, keyword)
                collected.append(wenwen_url)  # pair the question title with the resolved wenwen link
        except AttributeError:
            print('Failed to resolve a wenwen.sogou.com link from the result page')
        return list(zip(questions, collected))[:5]

    def parser_answer(self, url, keyword):
        """Collect the answer text of one question page and append it to the keyword's txt file."""
        html = requests.get(url, headers=HEADERS)
        selector = etree.HTML(html.text)
        answers = []
        for node in selector.xpath('//pre'):
            text = node.xpath('string(.)')
            # Strip private-use and layout characters, then turn spaces into sentence breaks.
            for ch in ('\u2003', '\ue010', '\ufeff', '\u3000', '\u2022',
                       '\ue5e5', '\n', '\xa0', '\r', '\u339b'):
                text = text.replace(ch, '')
            text = text.replace(' ', '。')
            # Strip literal "\uXXXX" / "\xXX" escape text left in the answer
            # (the original passed this pattern to str.replace, which is a no-op).
            text = re.sub(r'^\\u[A-Za-z0-9]{1,5}|\\x[A-Za-z0-9]{1,4}$', '', text)
            answers.append(text)
        # Drop answers that are themselves questions or are too short.
        answers = [i for i in answers
                   if '?' not in i and '?' not in i and len(set(i)) > 2 and '为什么' not in i]
        # Sort the answers by length (shortest first) via a dict of answer -> length.
        answer_dict = {answer: len(answer) for answer in answers}
        answers = [i[0] for i in sorted(answer_dict.items(), key=lambda item: item[1])]
        path = rf'C:\Users\Administrator.PC-20190922SNXD\Desktop\搜狗问问\{keyword}.txt'
        with open(path, 'a+', encoding='utf-8') as f:
            try:
                f.write('00' + str(answers[0]) + '\n\n')
            except IndexError:
                print('No usable answer was collected for this page')


def star():
    so = Sogou()
    txtfile = r'C:\Users\Administrator.PC-20190922SNXD\Desktop\key.txt'
    try:
        with open(txtfile, 'r+') as f:
            for line in f:
                keyword = line.strip()
                time.sleep(3)
                so.collect_urls(keyword)
                print(f'{keyword}***article assembled')
    except UnicodeEncodeError:
        # Retry, skipping the first line (usually a BOM or header line).
        with open(txtfile, 'r+') as f:
            f.readline()
            for line in f:
                keyword = line.strip()
                time.sleep(3)
                so.collect_urls(keyword)
                print(f'{keyword}***article generated')


if __name__ == '__main__':
    """
    Two things have to be prepared before running:
    1. The desktop directory written to in parser_answer must exist; it is currently named "搜狗问问".
    2. key.txt on the desktop must exist, with one keyword per line.
    Note: if nothing can be collected, replace the "User-Agent" value in HEADERS.
    """
    star()
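For a quick spot check before running the whole keyword list, the class can also be called for a single keyword. A minimal sketch, assuming the script above is saved as sogou_wenwen.py (a hypothetical file name) and the hard-coded desktop paths have been adjusted to your own machine; the example keyword is made up:

# Quick single-keyword test of the scraper above.
# 'sogou_wenwen' is a hypothetical module name for the script; adjust it,
# and the hard-coded C:\Users\... paths, before running.
from sogou_wenwen import Sogou

so = Sogou()
pairs = so.collect_urls('如何提高睡眠质量')  # hypothetical example keyword
for title, link in pairs:
    print(title, link)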