#!/usr/bin/python3
# -*- coding: utf-8 -*-
# @Time : 2022/4/11 17:58
# @Author : shaofei
# @Email : shaochenshuo@126.com
# @File : 内核月报搜索.py
# @Software: PyCharm
"""
该程序通过指定(-k)关键字搜索淘宝内核月报的标题,正文,代码等部分,只要有其中某个部分命中就输出该月报标题及链接
"""
#pip3 install beautifulsoup4
#pip3 install lxml
#pip install requests
#import urllib2
import urllib.request
import requests
import threading
import argparse
import re
import sys
from bs4 import BeautifulSoup
# Archive page that lists every monthly issue.
url = 'http://mysql.taobao.org/monthly'

# Request headers that make the script look like a browser, to get past
# header-based anti-crawler checks (HTTP Error 418). Must be a dict.
USER_AGENT = {
    'user-agent': (
        'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/100.0.4896.75 Mobile Safari/537.36'
    ),
}

# Timeout value handed to requests.get() for every page fetch.
URL_TIMEOUT = 10
def parameter_parse():
    """Parse and validate the command-line arguments.

    Options:
        -k / --key       one to three search keywords (required).
        -l / --relation  'or' or 'and'; required when more than one keyword
                         is given, ignored for a single keyword.

    Returns:
        The argparse namespace, with ``key`` (list of str) and ``relation``
        (str or None).

    Raises:
        SystemExit: on invalid usage. Validation failures go through
            ``ArgumentParser.error`` so the process exits with status 2 and
            the message is written to stderr, instead of exiting with
            status 0 as a bare ``sys.exit()`` would.
    """
    parse = argparse.ArgumentParser()
    parse.add_argument(
        '-k', '--key', nargs='+', type=str, required=True,
        help='指定搜索的关键字,必填参数,支持指定多个关键字,多个关键字间以空格分隔')
    parse.add_argument(
        '-l', '--relation', type=str, choices=['or', 'and'],
        help='多个关键词之间的关系 or或者and')
    args = parse.parse_args()

    key_count = len(args.key)
    if key_count > 3:
        parse.error('-k 最多只支持指定三个关键字')

    if key_count == 1:
        print('本次过滤指定的关键字为:{}'.format(args.key[0]))
    elif args.relation:
        print('本次过滤指定的关键字列表为:{},关键字间关系为:{}'.format(args.key, args.relation))
    else:
        # Several keywords are ambiguous without knowing how to combine them.
        parse.error('当你使用 -k 指定多个关键字时,必须使用 -l 指定关键字间的关系(and 或者 or)')
    return args
def get_html(url, USER_AGENT, URL_TIMEOUT):
    """Fetch ``url`` and return the response body decoded as UTF-8.

    Args:
        url: address of the page to download.
        USER_AGENT: dict of request headers passed to ``requests.get``.
        URL_TIMEOUT: timeout value passed to ``requests.get``.

    Returns:
        The page content as a str.

    Raises:
        Whatever ``requests.get`` raises; no exception is handled here.
    """
    response = requests.get(url, headers=USER_AGENT, timeout=URL_TIMEOUT)
    # Decode the raw bytes directly. Assigning response.encoding from
    # apparent_encoding only affects response.text, which is not used, so
    # that charset detection was wasted work. errors='replace' keeps one
    # malformed byte from discarding the whole page.
    return response.content.decode('utf-8', errors='replace')
def call_gethtml(url, USER_AGENT, URL_TIMEOUT):
    """Fetch a page through ``get_html``, retrying once after a timeout.

    Args:
        url: address of the page to download.
        USER_AGENT: dict of request headers forwarded to ``get_html``.
        URL_TIMEOUT: timeout value forwarded to ``get_html``.

    Returns:
        The page HTML as a str, or an empty string when the page could not
        be fetched. Callers pass the result straight to BeautifulSoup, so an
        empty document is returned rather than leaving the result unbound.
    """
    max_attempts = 2
    for attempt in range(1, max_attempts + 1):
        try:
            return get_html(url, USER_AGENT, URL_TIMEOUT)
        except requests.exceptions.Timeout as e:
            # Match on the exception type instead of the message text.
            if attempt < max_attempts:
                print('访问网页超时,重新访问')
            else:
                print('访问网页报错,报错代码如下:{}'.format(e))
        except Exception as e:
            # Best effort: report any other failure and give up on this page.
            print('访问网页报错,报错代码如下:{}'.format(e))
            break
    return ''
def get_month_list():
    """Build the list of monthly index URLs from the archive page.

    Downloads the page at the module-level ``url`` and, for every ``<h3>``
    element, takes the ``href`` of its first ``<a>`` and prefixes it with
    ``http://mysql.taobao.org``.

    Returns:
        A list of absolute URL strings, in page order. Headings with no
        anchor or no ``href`` are skipped instead of raising.
    """
    html = call_gethtml(url, USER_AGENT, URL_TIMEOUT)
    # Parse with BeautifulSoup so the headings can be walked as tags.
    soup = BeautifulSoup(html, 'lxml')
    month_url = []
    for h3 in soup.find_all('h3'):
        # h3.a is the first <a> inside the heading, or None if there is none.
        anchor = h3.a
        if anchor is None or not anchor.get('href'):
            continue
        month_url.append('{}{}'.format('http://mysql.taobao.org', anchor['href']))
    return month_url
def _compile_keywords(key):
    """Compile each keyword into a case-insensitive regular expression."""
    patterns = []
    for word in key:
        try:
            patterns.append(re.compile(word, re.I))
        except re.error:
            # Not a valid pattern (e.g. "c++"): search for it literally.
            patterns.append(re.compile(re.escape(word), re.I))
    return patterns


def _section_matches(tags, patterns, combine):
    """Return True when the text of ``tags`` satisfies the keywords.

    Each keyword counts as found when it occurs in at least one tag;
    ``combine`` (``any`` or ``all``) decides how many keywords must be found.
    """
    # Compare against get_text() rather than filtering on the tag's .string:
    # .string is None for an element with more than one child, so text in
    # paragraphs that contain inline markup would otherwise be skipped.
    texts = [tag.get_text() for tag in tags]
    return combine(
        any(pattern.search(text) for text in texts) for pattern in patterns
    )


def _record_match(label, url_string, url_page):
    """Print one hit and store it in the module-level ``article_url`` dict."""
    article_url['{}:{}'.format(label, url_string)] = url_page
    print('{}上为:{},链接地址为:{}'.format(label, url_string, url_page))


def key_check(url_month, key: list, relation=0):
    """Search every article of one monthly issue for the keywords.

    For each ``<h3>`` entry on the month page, the title is checked first.
    Only if it does not match is the article page downloaded, and then its
    ``<p>`` elements are checked, and finally its ``<code>`` elements. The
    first section that matches wins. Every hit is printed and added to the
    module-level ``article_url`` dict, which must exist before this is called.

    With relation 'and', all keywords must be found within the same section
    (title, paragraphs or code); they may be in different paragraphs.

    :param url_month: month index page, e.g. http://mysql.taobao.org/monthly/2022/03/
    :param key: list of keywords, treated as case-insensitive regular
        expressions; any number of keywords is accepted
    :param relation: 'or' or 'and'; ignored for a single keyword. With
        several keywords and any other value nothing is searched.
    :return: None
    """
    if not key:
        return
    if len(key) == 1 or relation == 'or':
        combine = any
    elif relation == 'and':
        combine = all
    else:
        return

    site_root = 'http://mysql.taobao.org'
    patterns = _compile_keywords(key)

    # Month index page, e.g. http://mysql.taobao.org/monthly/2022/03/
    month_html = call_gethtml(url_month, USER_AGENT, URL_TIMEOUT)
    month_soup = BeautifulSoup(month_html, 'lxml')
    for h3_in in month_soup.find_all('h3'):
        anchor = h3_in.a
        if anchor is None or not anchor.get('href'):
            continue
        url_page = '{}{}'.format(site_root, anchor['href'])
        url_string = anchor.get_text(strip=True)

        # A title hit is enough; the article itself is not downloaded.
        if _section_matches(h3_in.find_all('a'), patterns, combine):
            _record_match('标题匹配', url_string, url_page)
            continue

        # Article page, e.g. http://mysql.taobao.org/monthly/2022/03/01/
        page_html = call_gethtml(url_page, USER_AGENT, URL_TIMEOUT)
        page_soup = BeautifulSoup(page_html, 'lxml')
        if _section_matches(page_soup.find_all('p'), patterns, combine):
            _record_match('正文匹配', url_string, url_page)
        elif _section_matches(page_soup.find_all('code'), patterns, combine):
            _record_match('代码匹配', url_string, url_page)
if __name__ == '__main__':
    # Script entry point: parse the arguments, collect the monthly index
    # pages, search them a few at a time in worker threads, then print
    # everything that matched.
    args = parameter_parse()
    key = args.key
    # The relation only matters when several keywords were given.
    key_relation = args.relation if len(key) > 1 else 0

    # Shared result dict filled in by key_check. The workers only insert
    # entries, so no lock is taken around it.
    article_url = {}
    i_parallel = 2

    month_list = get_month_list()
    list_count = len(month_list)
    if list_count == 0:
        print('未获取到任何月报链接,无法进行搜索')

    # Process the months in batches of i_parallel threads. A single month
    # takes the same path, so the keyword relation is always passed on.
    for start in range(0, list_count, i_parallel):
        batch = month_list[start:start + i_parallel]
        threads = [
            threading.Thread(target=key_check, args=(month, key, key_relation))
            for month in batch
        ]
        for worker in threads:
            worker.start()
        for worker in threads:
            worker.join()
        finished = start + len(batch)
        print('当前过滤分析进度为 ---------------------{}%---------------------'.format(round((finished / list_count) * 100, 2)))

    print('最终检查结果为:{}'.format(article_url))
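# Web page footer captured together with the listing (not part of the script):
# Sharing is welcome; when reposting, please credit the source: 内存溢出
# Comment list (0 comments)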