from bs4 import BeautifulSoup
import os
import re
import urllib.request, urllib.error
import xlwt


# This script scrapes second-hand housing listings from Lianjia (链家),
# originally for one Beijing district; swap the district slug in the URL
# to scrape a different one.
def main(area, pageNum):
    # baseurl = "https://bj.lianjia.com/ershoufang/fengtai/pg"
    baseurl = "https://bj.lianjia.com/ershoufang/" + area + "/pg"
    savepath = "./data/链家-" + area + ".xls"
    os.makedirs("./data", exist_ok=True)  # make sure the output folder exists
    datalist = getData(pageNum, baseurl)
    saveData(pageNum, datalist, savepath)


# Regexes for the listing markup. The exact patterns depend on Lianjia's
# list-page HTML (title link, total price in a <span>, unit price in a
# data-price attribute, houseInfo text after the houseIcon span);
# adjust them if the page layout has changed.
findName = re.compile(r'target="_blank">(.*?)</a>')
findSumPrice = re.compile(r'<span>(.*?)</span>')
findSinglePrice = re.compile(r'data-price="(.*?)"')
findHouseInfo = re.compile(r'houseIcon"></span>(.*?)</div>')


def getData(pageNum, baseurl):
    datalist = []
    for i in range(0, pageNum):  # one request per page, 30 listings each
        print("Scraping page " + str(i + 1))
        url = baseurl + str(i + 1)
        html = askURL(url)
        # Parse each page as soon as it is fetched.
        soup = BeautifulSoup(html, "html.parser")
        ii = 1  # counter: only the first 30 "title" divs are listings
        for item in soup.find_all('div', class_="title"):
            # This loop collects the listing names.
            name_list = []
            item = str(item)
            ii = ii + 1
            if ii > 31:
                break
            name = re.findall(findName, item)[0]
            name_list.append(name)
            datalist.append(name_list)
        for item in soup.find_all('div', class_='priceInfo'):
            price_list = []
            item = str(item)
            sum_price = re.findall(findSumPrice, item)[0]
            price_list.append(sum_price)
            single_price = re.findall(findSinglePrice, item)[0]
            price_list.append(single_price)
            datalist.append(price_list)
        for item in soup.find_all('div', class_='houseInfo'):
            item = str(item)
            houseinfo = re.findall(findHouseInfo, item)[0]
            houseinfo = houseinfo.split('|')  # fields are "|"-separated
            # The number of houseInfo fields varies between listings;
            # pad or trim so every record lines up on the same 7 attributes.
            if len(houseinfo) == 4:
                houseinfo.insert(3, "N/A")
                houseinfo.insert(4, "N/A")
                houseinfo.insert(5, "N/A")
            if len(houseinfo) == 5:
                houseinfo.insert(4, "N/A")
                houseinfo.insert(5, "N/A")
            if len(houseinfo) == 6:
                houseinfo.insert(5, "N/A")
            if len(houseinfo) == 8:
                houseinfo.pop()
            datalist.append(houseinfo)
    return datalist


def askURL(url):
    # Send a browser-like User-Agent so the request is not rejected as a bot.
    head = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                      "AppleWebKit/537.36 (KHTML, like Gecko) "
                      "Chrome/94.0.4606.61 Safari/537.36"
    }
    request = urllib.request.Request(url, headers=head)
    html = ""
    try:
        response = urllib.request.urlopen(request)
        html = response.read().decode('utf-8')
    except urllib.error.URLError as e:
        if hasattr(e, "code"):
            print(e.code)
        if hasattr(e, "reason"):
            print(e.reason)
    return html


def saveData(pageNum, datalist, savepath):
    # getData appends, per page, 30 name lists, then 30 price lists, then
    # 30 houseinfo lists, so page i occupies datalist[90*i : 90*i + 90]
    # (hence the 30*i*3 stride below). Assumes every page yielded exactly
    # 30 listings.
    book = xlwt.Workbook(encoding="utf-8", style_compression=0)
    sheet = book.add_sheet('ershoufang', cell_overwrite_ok=True)
    col = ('Name', 'Total price (10k yuan)', 'Unit price (yuan)', 'Layout',
           'Area', 'Orientation', 'Decoration', 'Floor', 'Year built',
           'Slab or tower')  # the 10 columns written below
    print(datalist)
    print("Saving data......")
    for i in range(0, 10):
        sheet.write(0, i, col[i])
    for i in range(0, pageNum):  # page index
        for j in range(0, 30):  # listing index within the page
            row = i * 30 + j + 1
            base = j + 30 * i * 3
            sheet.write(row, 0, datalist[base][0])              # name
            sheet.write(row, 1, float(datalist[base + 30][0]))  # total price
            sheet.write(row, 2, int(datalist[base + 30][1]))    # unit price
            sheet.write(row, 3, datalist[base + 60][0])         # layout
            sheet.write(row, 4, datalist[base + 60][1])         # area
            sheet.write(row, 5, datalist[base + 60][2])         # orientation
            sheet.write(row, 6, datalist[base + 60][3])         # decoration
            sheet.write(row, 7, datalist[base + 60][4])         # floor
            sheet.write(row, 8, datalist[base + 60][5])         # year built
            sheet.write(row, 9, datalist[base + 60][6])         # slab or tower
    book.save(savepath)


if __name__ == "__main__":
    area = "huairou"
    pageNum = 36
    main(area, pageNum)
    print("Done scraping and saving")
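The script fires all pageNum requests back-to-back, which listing sites tend to throttle or block. A minimal sketch of pacing the crawl instead, assuming a flat one-second pause is acceptable (the helper name and interval are illustrative, not part of the original script):

import time

def fetch_all(baseurl, pageNum, delay=1.0):
    # Fetch every page, sleeping between requests so the crawl is
    # spaced out instead of hammering the server.
    pages = []
    for i in range(pageNum):
        pages.append(askURL(baseurl + str(i + 1)))
        time.sleep(delay)  # arbitrary pause; tune to taste
    return pages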
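For a quick sanity check of the saved workbook, a small reader sketch; it assumes xlrd 1.2.x is installed (xlrd 2.0 dropped legacy .xls support) and uses the script's default output path:

import xlrd  # xlrd <= 1.2.x can read legacy .xls files

def preview(savepath, rows=5):
    # Print the header row plus the first few data rows of sheet 0.
    book = xlrd.open_workbook(savepath)
    sheet = book.sheet_by_index(0)
    for r in range(min(rows + 1, sheet.nrows)):
        print([sheet.cell_value(r, c) for c in range(sheet.ncols)])

preview("./data/链家-huairou.xls")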