import requests
from bs4 import BeautifulSoup

# Listing pages for Beijing short-term rentals on xiaozhu.com, pages 1-14
urls = ['http://bj.xiaozhu.com/search-duanzufang-p{}-0/'.format(i) for i in range(1, 15)]

def spider_prepare(urls):
    """Collect up to 300 detail-page links from the listing pages."""
    spider_urls = []
    for url in urls:
        if len(spider_urls) < 300:
            print(len(spider_urls))
            response = requests.get(url)
            soup = BeautifulSoup(response.text, 'lxml')
            # NOTE: the attribute selector inside the brackets was lost in the original
            # source; replace 'a[]' with the selector that matches the detail-page links.
            links = soup.select('a[]')
            for link in links:
                new_link = link.get('href')
                spider_urls.append(new_link)
    return spider_urls

# Check that the detail-page URLs were collected successfully,
# and crawl the listing pages only once by reusing the result below
detail_urls = spider_prepare(urls)
print(detail_urls)

data = {}
for url in detail_urls:
    print(url)
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'lxml')
    title = soup.select('div.pho_info > h4')[0].text
    address = soup.select('div.con_l > div.pho_info > p')[0].get('title')
    price = soup.select('#pricePart > div.day_l > span')[0].text
    pic = soup.select('#curBigImage')[0].get('src')
    name = soup.select('a.lorder_name')[0].text
    name_pic = soup.select('div.member_pic > a > img')[0].get('src')
    name_gender = soup.select('div.w_240 > h6 > span')[0].get('class')

    # # Inspect the scraped fields
    # print(title)
    # print(address)
    # print(price)
    # print(pic)
    # print(name)
    # print(name_pic)
    # print(name_gender)

    def gender(name_gender):
        """Map the host's gender icon class to a readable label."""
        if name_gender[0] == "member_boy_ico":
            return 'boy'
        elif name_gender[0] == "member_girl_ico":
            return 'girl'
        else:
            return '未知'

    # # Verify the gender helper
    # print(gender(name_gender))

    data = {
        'title': title,
        'address': address,
        'price': price,
        'pic': pic,
        'name': name,
        'name_pic': name_pic,
        'name_gender': gender(name_gender)
    }
    # Check the assembled record
    print(data)

    with open(r'D:\python3\tripadvisor_spider\xiaozhu.txt', 'a+', encoding='utf-8') as file_text:
        # strip('\n') removes the newlines around the title before writing
        file_text.writelines('标题:{}\t地址:{}\t价格:{}\t照片:{}\t屋主姓名:{}\t屋主照片:{}\t屋主性别:{}\n'.format(
            data['title'].strip('\n'), data['address'], data['price'], data['pic'],
            data['name'], data['name_pic'], data['name_gender']))
The code above is a complete Python crawler that scrapes the detail pages of up to 300 Xiaozhu short-rent (小猪短租) listings in Beijing and appends each record (title, address, price, photo, host name, host photo, host gender) to a local text file.
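In practice, listing sites like this one often refuse or throttle bare requests, so it usually helps to send a browser-like User-Agent and pause briefly between detail pages. The snippet below is a minimal sketch of such a fetch helper; the header value, timeout, and delay are assumptions rather than part of the original script, and you would call it in place of the bare requests.get(url) calls above.

import time
import requests

# Assumed browser-like header; adjust as needed (not part of the original script)
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
}

def fetch(url, delay=1.0):
    """GET a page with a browser-like User-Agent, then pause briefly
    so the crawler does not hammer the site."""
    response = requests.get(url, headers=HEADERS, timeout=10)
    response.raise_for_status()  # fail loudly on 4xx/5xx instead of parsing an error page
    time.sleep(delay)
    return response

# Usage inside the loop above: soup = BeautifulSoup(fetch(url).text, 'lxml')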