This script is split into 5 parts:
spider_main: the main program
url_manager: the URL manager
html_downloader: the page downloader
html_parser: the page parser
html_outputer: the output writer
Source code of spider_main (the main program)
import html_downloader
import html_outputer
import html_parser
import url_manager


class SpiderMain(object):
    def __init__(self):
        # Wire up the four components of the spider
        self.urls = url_manager.UrlManager()
        self.downloader = html_downloader.HtmlDownloader()
        self.parser = html_parser.HtmlParser()
        self.outputer = html_outputer.HtmlOutputer()

    def craw(self, start, end):
        for i in range(start, end):
            url = 'http://bj.58.com/pbdn/0/pn{}/'.format(i)
            print('Crawling list page {}, URL: {}'.format(i, url))
            html_cont = self.downloader.download(url)
            # Extract the detail-page links from the list page
            new_urls = self.parser.parser_url(html_cont)
            # Queue the extracted URLs in the URL manager
            self.urls.add_new_urls(new_urls)
            # Drain the URL manager while it still has uncrawled URLs
            while self.urls.has_new_url():
                # Take the next URL to crawl
                new_url = self.urls.get_new_url()
                # print("URL to crawl:", new_url)
                # Download the detail page
                content = self.downloader.download(new_url)
                # Parse the detail page
                data = self.parser.parser_data(content)
                # print(data)
                # Hand the result to the outputer
                self.outputer.collect_data(data)
        # All pages crawled, write out the results
        self.outputer.output_text()
        print("Script finished")


if __name__ == '__main__':
    obj_spider = SpiderMain()
    obj_spider.craw(1, 4)
url_manager: the URL manager
class UrlManager(object):
    def __init__(self):
        self.new_urls = set()
        self.old_urls = set()

    def add_new_urls(self, urls):
        if urls is None or len(urls) == 0:
            return
        for url in urls:
            new_url = url.get('href')
            # Drop the query string so the same detail page is queued only once
            page_url = new_url.split('?')[0]
            # print("Extracted URL:", page_url)
            if page_url not in self.new_urls and page_url not in self.old_urls:
                self.new_urls.add(page_url)

    def has_new_url(self):
        return len(self.new_urls) != 0

    def get_new_url(self):
        url = self.new_urls.pop()
        self.old_urls.add(url)
        return url
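To see the de-duplication logic on its own, here is a small stand-alone sketch. It only assumes that whatever is passed to add_new_urls exposes .get('href'), which is true of the bs4 Tag objects the parser returns; plain dicts (and a made-up detail-page id) stand in for them here.

# Quick check of UrlManager: two links to the same detail page with
# different query strings should be queued only once.
manager = UrlManager()
manager.add_new_urls([
    {'href': 'http://zhuanzhuan.58.com/detail/123456.shtml?from=pc'},   # hypothetical id
    {'href': 'http://zhuanzhuan.58.com/detail/123456.shtml?from=app'},
])
print(manager.has_new_url())   # True
print(manager.get_new_url())   # http://zhuanzhuan.58.com/detail/123456.shtml
print(manager.has_new_url())   # False, the URL has moved to old_urls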
html_downloader: the page downloader
import requests

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 '
                  '(KHTML, like Gecko) Chrome/47.0.2526.73 Safari/537.36'
}


class HtmlDownloader(object):
    def download(self, url):
        if url is None:
            return
        # Send the browser-like User-Agent so 58.com is less likely to block us
        response = requests.get(url, headers=headers)
        if response.status_code != 200:
            return
        return response.text
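Note that the crawl stops with a traceback if requests itself raises (timeout, connection reset, DNS failure), since download only guards against a missing URL and a non-200 status. If you want the spider to skip bad pages instead of crashing, something like the following sketch works; the 10-second timeout and the broad except are my own choices, not part of the original script, and the headers dict from the block above is assumed to be in scope.

import requests

class RobustHtmlDownloader(object):
    def download(self, url):
        if url is None:
            return None
        try:
            # Reuses the module-level headers dict defined above;
            # the 10-second timeout is an arbitrary value for this sketch.
            response = requests.get(url, headers=headers, timeout=10)
        except requests.RequestException:
            # Any network-level failure simply skips this page.
            return None
        if response.status_code != 200:
            return None
        return response.text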
html_parser: the page parser
from bs4 import BeautifulSoup
import re


class HtmlParser(object):
    def parser_url(self, html_cont):
        soup = BeautifulSoup(html_cont, 'lxml')
        # Detail-page links on the list page look like
        # <a class="t" href="http://zhuanzhuan.58.com/detail/...">
        urls = soup.find_all('a', class_="t",
                             href=re.compile('http://zhuanzhuan.58.com/detail/'))
        return urls

    def parser_data(self, content):
        soup = BeautifulSoup(content, 'lxml')
        category = soup.select('span.crb_i > a')[0].text
        title = soup.select('div.box_left_top > h1')[0].text
        view = soup.select('p > span.look_time')[0].text
        price = soup.select('div.price_li > span > i')[0].text
        place = soup.select('div.palce_li > span > i')[0].text
        data = {
            'category': category,
            'title': title,
            'view': view,
            'price': price,
            'place': place
        }
        return data
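One thing to watch: parser_data takes [0] from every soup.select(...) result, so a detail page that has been taken down (or an anti-crawler placeholder page) raises IndexError and kills the run. A more tolerant version could go through a small helper that falls back to an empty string; the helper name and the default value below are additions of mine for illustration.

from bs4 import BeautifulSoup

def first_text(soup, selector, default=''):
    # Text of the first element matching the CSS selector, or the default.
    found = soup.select(selector)
    return found[0].text if found else default

def parse_data_safely(content):
    soup = BeautifulSoup(content, 'lxml')
    return {
        'category': first_text(soup, 'span.crb_i > a'),
        'title': first_text(soup, 'div.box_left_top > h1'),
        'view': first_text(soup, 'p > span.look_time'),
        'price': first_text(soup, 'div.price_li > span > i'),
        'place': first_text(soup, 'div.palce_li > span > i'),
    }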
html_outputer: the output writer
class HtmlOutputer(object):
    def __init__(self):
        self.datas = []

    def collect_data(self, data):
        if data is None:
            return
        self.datas.append(data)

    def output_text(self):
        with open(r'D:\python3_spider\info.txt', 'w', encoding='utf-8') as file:
            for data in self.datas:
                # One tab-separated record per line
                file.write('category: {}\ttitle: {}\tviews: {}\tprice: {}\tarea: {}\n'
                           .format(data['category'], data['title'], data['view'],
                                   data['price'], data['place']))
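The outputer writes tab-separated text. If you prefer something a spreadsheet opens directly, switching to the csv module is a small change; the CsvOutputer name and the results.csv file name below are placeholders of my own, not part of the original script.

import csv

class CsvOutputer(object):
    def __init__(self):
        self.datas = []

    def collect_data(self, data):
        if data is not None:
            self.datas.append(data)

    def output_csv(self, path='results.csv'):
        # One header row, then one row per collected record.
        fields = ['category', 'title', 'view', 'price', 'place']
        with open(path, 'w', newline='', encoding='utf-8') as f:
            writer = csv.DictWriter(f, fieldnames=fields)
            writer.writeheader()
            writer.writerows(self.datas)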