Scraping the data
```python
import requests
from lxml import etree
import time
import json
import pandas as pd

# Collect shop names and IDs.
# Note: only the (commented-out) API variant for filling the lists appears here;
# the detail-page loop below expects `result` to hold dicts with at least an 'id' key.
result = []
shop_list = []
for i in range(1, 51):
    print(i)
    url = r'http://www.dianping.com/haikou/ch10/p{page}'.format(page=i)
    headers = {
        "Cache-Control": "max-age=0",
        "Upgrade-Insecure-Requests": "1",
        "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.01",
        # Extra headers used by the API variant:
        # "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.45 Safari/537.36",
        # "X-Requested-With": "XMLHttpRequest",
        # "Referer": "http://www.dianping.com/shop/Ga3PMQe8ZoSdod7r",
        # "Accept-Language": "zh-CN,zh;q=0.9",
        # "cookie": "fspop=test; cy=23; cye=haikou; _lx_utm=utm_source%3DBaidu%26utm_medium%3Dorganic; _lxsdk_cuid=17d85390949c8-060fddc5b4ecfa-5d11371e-240000-17d8539094ac8; _lxsdk=17d85390949c8-060fddc5b4ecfa-5d11371e-240000-17d8539094ac8; _hc.v=6f6ee73a-db19-6427-de36-f4710afec8b8.1638617648; s_ViewType=10; ctu=88d747e0aa8639b8b6451d577574db552ced28f278254e1a47e8525faff77616; Hm_lvt_602b80cf8079ae6591966cc70a3940e7=1638617648,1638620196,1638787904,1638842852; dper=92b45344e065dccd73f49a5f35c3199c3d907477f6b3e9fadd830d19d2904701a3018335efdd719657fc51c0b35a95e16feb7a78856989abfd30e64925a98770bdc6d2a4853ea19f7a87d9d0c877b9efe52b4a17c9b91206d718e25a8be391b4; ll=7fd06e815b796be3df069dec7836c3df; ua=dpuser_8723703376; uamo=18508921021; dplet=87f469cd05e56b09fb2552b4f14df7f5; Hm_lpvt_602b80cf8079ae6591966cc70a3940e7=1638842889; _lxsdk_s=17d92a5b4f6-954-2fd-bcf%7C%7C66",
    }
    # API variant: odd exceptions pop up from time to time while scraping,
    # so the request is wrapped in exception handling.
    # try:
    #     response = json.loads(requests.get(url=url, headers=headers).text).get('msg').get('shopInfo')
    # except json.JSONDecodeError:
    #     print('失败')
    # else:
    #     if response:
    #         print('成功')
    #         info = {}
    #         info['shopname'] = response.get('shopName')
    #         info['address'] = str_replace(response.get('address'))
    #         info['phone'] = str_replace(response.get('phoneNo'))
    #         info['id'] = response.get('shopId')
    #         print(info)
    #         shop_list.append(info)
    #     else:
    #         print('失败')
    # time.sleep(5)

# Detail pages
for i in result:
    url = r'https://www.dianping.com/shop/{id}'.format(id=i.get('id'))
    headers = {
        "Cache-Control": "max-age=0",
        "Upgrade-Insecure-Requests": "1",
        "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
        "Accept-Language": "zh-CN,zh;q=0.9",
        "cookie": "fspop=test; cy=23; cye=haikou; _lx_utm=utm_source%3DBaidu%26utm_medium%3Dorganic; _lxsdk_cuid=17d85390949c8-060fddc5b4ecfa-5d11371e-240000-17d8539094ac8; _lxsdk=17d85390949c8-060fddc5b4ecfa-5d11371e-240000-17d8539094ac8; _hc.v=6f6ee73a-db19-6427-de36-f4710afec8b8.1638617648; s_ViewType=10; ctu=88d747e0aa8639b8b6451d577574db552ced28f278254e1a47e8525faff77616; dper=92b45344e065dccd73f49a5f35c3199c3d907477f6b3e9fadd830d19d2904701a3018335efdd719657fc51c0b35a95e16feb7a78856989abfd30e64925a98770bdc6d2a4853ea19f7a87d9d0c877b9efe52b4a17c9b91206d718e25a8be391b4; ua=dpuser_8723703376; uamo=18508921021; ll=7fd06e815b796be3df069dec7836c3df; dplet=c6b2f0bba2d39dfffe3dbbdded1c9693; Hm_lvt_602b80cf8079ae6591966cc70a3940e7=1638787904,1638842852,1638846334,1638855073; Hm_lpvt_602b80cf8079ae6591966cc70a3940e7=1638855073; _lxsdk_s=17d935fd8bf-a00-3db-efa%7C%7C20",
    }
    response = requests.get(url=url, headers=headers).text
    html = etree.HTML(response)
    i['address'] = html.xpath('//span[@id="address"]//text()')
    # The attribute inside [@] was lost when the post was formatted; restore it
    # from the page source (the class of the <p> element holding the phone number).
    i['phone'] = html.xpath('//p[@]//text()')
    print(i)
    shop_list.append(i)
    time.sleep(3)

file = pd.DataFrame(shop_list)
file.to_excel(r'C:\Users\Administrator\Desktop\店铺.xlsx', index=False)
```
- Two approaches are used here. The first, the API endpoint, returns cleaner, better-structured data, but its anti-scraping limits are severe: the crawler gets blocked shortly after it starts. I didn't feel like rotating User-Agents and IPs, and after some poking around I found the detail pages, which are essentially unrestricted.
- The data scraped through the API comes mixed with CSS markup. For cleaning I didn't extract the content with regular expressions; instead I defined a function that simply replaces everything that isn't needed (a rough sketch of such a helper is shown below).
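The cleanup function itself is not shown in the post (the commented-out API code above just calls `str_replace`). Below is a minimal sketch of what such a helper could look like, assuming the unwanted content is a fixed set of CSS/markup fragments; the fragment list is an assumption for illustration, not the original implementation.

```python
# Hypothetical sketch of the str_replace() helper used by the API variant.
# The fragments in `noise` are assumptions about what the API mixes into the
# text; adjust them to whatever the raw response actually contains.
def str_replace(text):
    if not text:
        return ''
    noise = [
        '<svgmtsi class="address">',   # assumed wrapper tags around obfuscated glyphs
        '<svgmtsi class="shopNum">',
        '</svgmtsi>',
        '&nbsp;',
    ]
    for fragment in noise:
        text = text.replace(fragment, ' ')
    return text.strip()
```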
Decoding the obfuscated text
```python
from fontTools.ttLib import TTFont
import pandas as pd

# Characters covered by the custom fonts, in glyph order
# (copied from another user's post -- far too long to type by hand)
word = '1234567890店中美家馆小车大市公酒行国品发电金心业商司超生装园场食有新限天面工服海华水房饰城乐汽香部利子老艺花专东肉菜学福饭人百餐茶务通味所山区门药银农龙停尚安广鑫一容动南具源兴鲜记时机烤文康信果阳理锅宝达地儿衣特产西批坊州牛佳化五米修爱北养卖建材三会鸡室红站德王光名丽油院堂烧江社合星货型村自科快便日民营和活童明器烟育宾精屋经居庄石顺林尔县手厅销用好客火雅盛体旅之鞋辣作粉包楼校鱼平彩上吧保永万物教吃设医正造丰健点汤网庆技斯洗料配汇木缘加麻联卫川泰色世方寓风幼羊烫来高厂兰阿贝皮全女拉成云维贸道术运都口博河瑞宏京际路祥青镇厨培力惠连马鸿钢训影甲助窗布富牌头四多妆吉苑沙恒隆春干饼氏里二管诚制售嘉长轩杂副清计黄讯太鸭号街交与叉附近层旁对巷栋环省桥湖段乡厦府铺内侧元购前幢滨处向座下臬凤港开关景泉塘放昌线湾政步宁解白田町溪十八古双胜本单同九迎第台玉锦底后七斜期武岭松角纪朝峰六振珠局岗洲横边济井办汉代临弄团外塔杨铁浦字年岛陵原梅进荣友虹央桂沿事津凯莲丁秀柳集紫旗张谷的是不了很还个也这我就在以可到错没去过感次要比觉看得说常真们但最喜哈么别位能较境非为欢然他挺着价那意种想出员两推做排实分间甜度起满给热完格荐喝等其再几只现朋候样直而买于般豆量选奶打每评少算又因情找些份置适什蛋师气你姐棒试总定啊足级整带虾如态且尝主话强当更板知己无酸让入啦式笑赞片酱差像提队走嫩才刚午接重串回晚微周值费性桌拍跟块调糕'

# Address font: map glyph name (minus the 'uni' prefix) -> real character
font = TTFont(r'C:\Users\Administrator\Desktop\地址.woff')
phid = font.getGlyphOrder()[2:]
address_dict = {}
for i in zip(phid, word):
    address_dict[i[0][3:]] = i[1]

# Number font: same idea
font1 = TTFont(r'C:\Users\Administrator\Desktop\数字.woff')
phid1 = font1.getGlyphOrder()[2:]
num_dict = {}
for i in zip(phid1, word):
    num_dict[i[0][3:]] = i[1]

# API variant (kept for reference)
# def func1(x):
#     address = x.split(' ')
#     info = []
#     for i in address:
#         if address_dict.get(i):
#             info.append(address_dict.get(i))
#         else:
#             if num_dict.get(i):
#                 info.append(num_dict.get(i))
#             else:
#                 info.append(i)
#     return ''.join(info).strip()
# def func2(x):
#     num = x.split(' ')
#     info = []
#     for i in num:
#         if num_dict.get(i):
#             info.append(num_dict.get(i))
#         else:
#             info.append(i)
#     return ''.join(info).strip()

# Detail-page variant
def func1(x):
    info = []
    for i in x:
        k = (r'' + repr(i))[3:7]   # e.g. '\uebc2' -> "'\\uebc2'" -> 'ebc2'
        if address_dict.get(k):
            info.append(address_dict.get(k))
        else:
            if num_dict.get(k):
                info.append(num_dict.get(k))
            else:
                info.append(i)
    return ''.join(info).strip()

def func2(x):
    info = []
    for i in x:
        # the extracted strings contain escaped characters (\uXXXX),
        # so repr() is needed to get at the hex part of the code point
        k = (r'' + repr(i))[3:7]
        if num_dict.get(k):
            info.append(num_dict.get(k))
        else:
            info.append(i)
    return ''.join(info).strip()

data = pd.read_excel(r'C:\Users\Administrator\Desktop\店铺.xlsx', engine='openpyxl')
data['address'] = data['address'].map(func1)
data['phone'] = data['phone'].map(func2)
data.to_excel(r'C:\Users\Administrator\Desktop\大众点评.xlsx', index=False)
```
- The key to decoding the text is to extract the GlyphOrder entries and pair them with the corresponding characters to form a mapping dictionary. The character string here was copied from another user's post; typing it out by hand would take far too long.
- One thing to note: the strings extracted from the detail pages contain escaped characters of the form \uebc2. Python can't do replacements on them directly as plain text, so the extra r''+repr(i) step is used to get at the hex part of the code point (see the small example after this list).
- The font files change from time to time, so the current ones have to be downloaded again before each crawl.
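To make the GlyphOrder mapping and the repr() trick concrete, here is a tiny self-contained sketch. The glyph name uniebc2, the code point \uebc2, and the mapped character are made up for illustration; the real values come from the downloaded .woff file and the scraped page.

```python
# Illustrative only: a made-up glyph order and a made-up private-use character.
demo_glyph_order = ['glyph00000', 'x', 'uniebc2']   # what font.getGlyphOrder() might return
demo_word = '店'                                     # character at the same position in `word`

# Same construction as in the script: drop the first two glyphs,
# strip the 'uni' prefix, and pair each hex key with its character.
demo_dict = {g[3:]: w for g, w in zip(demo_glyph_order[2:], demo_word)}
# demo_dict == {'ebc2': '店'}

ch = '\uebc2'                  # obfuscated character as scraped from a detail page
key = repr(ch)[3:7]            # repr(ch) is "'\\uebc2'"; the slice keeps 'ebc2'
print(demo_dict.get(key, ch))  # -> 店
```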