The complete script: it loops over the first six list pages of a CSDN blog, extracts every article link it finds, and requests each one, which is what drives the view count up (hence the title, "1,500 page views in 10 minutes"). The original was flattened into one line; it is restored below with the parser name (`html.parser`), the `User-Agent` header key, and the comments cleaned up.

```python
# author : sunzd
# date : 2019/9/01
# position : beijing

from fake_useragent import UserAgent
from bs4 import BeautifulSoup
from urllib import request
from urllib import error
import re
import time  # unused below; presumably intended for throttling requests


def HTML_request(url):
    if url is None:
        return
    print("download HTML is :{0}".format(url))
    # If the URL contains Chinese characters, it must be encoded first.
    # Fake a browser User-Agent so the request is not rejected as a bot.
    headers = {'User-Agent': str(UserAgent().random)}
    req = request.Request(url, headers=headers)
    try:
        HTML = request.urlopen(req).read().decode('utf-8')
    except error.URLError as e:
        if hasattr(e, "code"):
            print(e.code)
        if hasattr(e, "reason"):
            print(e.reason)
        return None
    return HTML


def HTML_parser(url, HTML):
    if url is None or HTML is None:
        return
    # pattern = '<main>(.+?)</main>'
    # <main> is immediately followed by '\n', so the re.S flag is needed
    # to let '.' match any character, including newlines:
    # articles = re.compile(pattern, re.S).findall(HTML)
    # articles = articles[0]
    pattern_art = '<div data(.+?)</div>'
    articles = re.compile(pattern_art, re.S).findall(HTML.replace('\n', ''))
    print(len(articles))
    for article in articles:
        soup = BeautifulSoup(article, 'html.parser')
        Title = soup.find('a', attrs={'target': '_blank'})
        # Strip spaces and CSDN's "原" (original) / "转" (repost) badges.
        print("Article title: {0}\nArticle type: {1}".format(
            Title.text.replace(' ', '').replace("原", "").replace("转", ""),
            Title.span.text))
        print("Article link: {0}".format(Title.attrs['href']))
        # Visiting each article link is what pushes the view count up.
        HTML_request(Title.attrs['href'])
        infors = soup.find('div', attrs={'class': 'info-Box d-flex align-content-center'})
        # for infor in infors.p.next_siblings:
        #     next_siblings excludes the node itself, so the first <p> is skipped.
        # for infor in infors.children:
        #     if infor == ' ':  # whitespace also counts as a child, so filter it out
        #         continue
        #     if infor.span:  # only the <span> nodes carry the wanted information
        #         print("{0}".format(infor.span.text))
    # Count bare '<li >' tags to judge whether this is the last page.
    pattern_next = '<li >'
    next_page = re.compile(pattern_next).findall(HTML)
    print("Is this the last page: {0}----{1}".format(len(next_page), next_page))
    # Note: the original returns 0 on both branches, so the caller cannot
    # actually tell the last page apart from any other page.
    if len(next_page) == 0:
        return 0
    else:
        return 0


if __name__ == '__main__':
    name = 'your-csdn-username'  # replace with the blog name to crawl
    page = 1
    url = "https://blog.csdn.net/" + name + "/article/List/" + str(page) + '?'
    while page < 7:
        HTML = HTML_request(url)
        next_page = HTML_parser(url, HTML)
        page += 1
        if page > 6:
            # Wrap around so the script keeps cycling through pages 1-6.
            page = 1
        url = "https://blog.csdn.net/" + name + "/article/List/" + str(page) + '?'
```
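Matching articles with the regex `'<div data(.+?)</div>'` is brittle: the non-greedy match stops at the first `</div>`, and it forces the `HTML.replace('\n', '')` workaround. Below is a minimal sketch of the same extraction done with BeautifulSoup instead, assuming (as the `'<div data'` pattern implies) that each article entry is a `<div>` carrying a `data-*` attribute; the function name is illustrative, not part of the original script.

```python
from bs4 import BeautifulSoup


def extract_articles(html):
    """Return the <div> article containers from a CSDN list page (sketch)."""
    soup = BeautifulSoup(html, 'html.parser')
    # Match any <div> with at least one attribute whose name starts with
    # 'data-'; this mirrors the '<div data' regex above, but lets the
    # parser handle nesting and newlines correctly.
    return soup.find_all(
        lambda tag: tag.name == 'div'
        and any(attr.startswith('data-') for attr in tag.attrs)
    )
```

Each returned element is already a parsed `Tag`, so the per-article loop can call `article.find(...)` directly instead of re-parsing each fragment with a fresh `BeautifulSoup(article, 'html.parser')`.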