import requests
import pymongo
import time
import random
from bs4 import BeautifulSoup
from multiprocessing import Pool

client = pymongo.MongoClient('localhost', 27017)
ganji = client['ganji']
list_urls = ganji['list_urls']

# Fetch the category index page of Ganji Beijing (used once to discover the
# channel URLs listed below):
# url = 'http://bj.ganji.com/wu/'
# response = requests.get(url)
# soup = BeautifulSoup(response.text, 'lxml')
#
# urls = soup.select('div > dl > dt > a')
# for new_url in urls:
#     add_new_url = 'http://bj.ganji.com' + new_url.get('href')
#     print(add_new_url)

channel_list = '''
    http://bj.ganji.com/shouji/
    http://bj.ganji.com/shoujihaoma/
    http://bj.ganji.com/shoujipeijian/
    http://bj.ganji.com/bijibendiannao/
    http://bj.ganji.com/taishidiannaozhengji/
    http://bj.ganji.com/diannaoyingjian/
    http://bj.ganji.com/wangluoshebei/
    http://bj.ganji.com/shumaxiangji/
    http://bj.ganji.com/youxiji/
    http://bj.ganji.com/xuniwupin/
    http://bj.ganji.com/jiaju/
    http://bj.ganji.com/jiadian/
    http://bj.ganji.com/zixingchemaimai/
    http://bj.ganji.com/rirongbaihuo/
    http://bj.ganji.com/yingyouyunfu/
    http://bj.ganji.com/fushixiaobaxuemao/
    http://bj.ganji.com/meironghuazhuang/
    http://bj.ganji.com/yundongqicai/
    http://bj.ganji.com/yueqi/
    http://bj.ganji.com/tushu/
    http://bj.ganji.com/bangongjiaju/
    http://bj.ganji.com/wujingongju/
    http://bj.ganji.com/nongyongpin/
    http://bj.ganji.com/xianzhilipin/
    http://bj.ganji.com/shoucangpin/
    http://bj.ganji.com/baojianpin/
    http://bj.ganji.com/laonianyongpin/
    http://bj.ganji.com/gou/
    http://bj.ganji.com/qitaxiaochong/
    http://bj.ganji.com/xiaofeika/
    http://bj.ganji.com/menpiao/
    http://bj.ganji.com/jiaju/
    http://bj.ganji.com/rirongbaihuo/
    http://bj.ganji.com/shouji/
    http://bj.ganji.com/shoujihaoma/
    http://bj.ganji.com/bangong/
    http://bj.ganji.com/nongyongpin/
    http://bj.ganji.com/jiadian/
    http://bj.ganji.com/ershoubijibendiannao/
    http://bj.ganji.com/ruanjiantushu/
    http://bj.ganji.com/yingyouyunfu/
    http://bj.ganji.com/diannao/
    http://bj.ganji.com/xianzhilipin/
    http://bj.ganji.com/fushixiaobaxuemao/
    http://bj.ganji.com/meironghuazhuang/
    http://bj.ganji.com/shuma/
    http://bj.ganji.com/laonianyongpin/
    http://bj.ganji.com/xuniwupin/
    http://bj.ganji.com/qitawupin/
    http://bj.ganji.com/ershoufree/
    http://bj.ganji.com/wupinjiaohuan/
'''


def get_links_from(channel, page):
    """Scrape one list page of a channel and store each listing's URL and title."""
    url = '{}{}/'.format(channel, page)
    print(url)
    time.sleep(random.randint(1, 5))  # random delay to avoid hammering the site
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'lxml')
    # A pagination box only appears on valid list pages; skip empty/missing pages.
    if soup.find('div', class_='pageBox'):
        links = soup.select('dd.feature > div > ul > li > a')
        titles = soup.select('#wrapper > div.leftBox > div.layoutList > dl > dt > a')
        for link, title in zip(links, titles):
            data = {
                'url': link.get('href'),
                'title': title.get('title'),
            }
            # print(data)
            list_urls.insert_one(data)
        # print(url + ' finished')
    # print('channel finished')


def get_all_links_from(channel):
    # Crawl up to 100 list pages per channel.
    for i in range(1, 101):
        get_links_from(channel, i)


if __name__ == '__main__':
    pool = Pool()
    # The raw channel list contains repeated entries, so deduplicate first.
    pool.map(get_all_links_from, set(channel_list.split()))
The script above crawls the list pages of each second-hand goods channel on bj.ganji.com with a process pool and stores every listing's URL and title in MongoDB (database ganji, collection list_urls).
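After a run, you can sanity-check the results from a separate Python shell. This is a minimal sketch, assuming the same local MongoDB instance and the ganji.list_urls collection created above; all pymongo calls used here are standard:

import pymongo

# Connect to the same local MongoDB instance the crawler writes to.
client = pymongo.MongoClient('localhost', 27017)
list_urls = client['ganji']['list_urls']

# Optional: a unique index on 'url' prevents duplicate records if pages are
# re-crawled (note: creating it fails if duplicates are already present).
list_urls.create_index('url', unique=True)

# How many listings have been collected, and what do they look like?
print(list_urls.count_documents({}))
for doc in list_urls.find().limit(3):
    print(doc['title'], doc['url'])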