import csv

import requests
from lxml import etree
from fake_useragent import UserAgent


class SecondHouseSpider:
    """链家长沙二手房爬虫 (Lianjia Changsha second-hand housing spider)."""

    def __init__(self):
        self.base_url = "https://cs.lianjia.com/ershoufang/pg{}/"

    def get_html(self, num):
        # Fetch one listing page with a random Chrome User-Agent.
        url = self.base_url.format(num)
        headers = {"User-Agent": UserAgent().chrome}
        res = requests.get(url, headers=headers)
        return res.content.decode()

    def get_data(self, num):
        # Parse the page and zip the parallel field lists into one dict per listing.
        lst = []
        html_element = etree.HTML(self.get_html(num))
        prices = html_element.xpath("//*[@id='content']/div[1]/ul/li/div[1]/div[6]/div[1]/span/text()")
        names = html_element.xpath("//*[@id='content']/div[1]/ul/li/div[1]/div[2]/div/a[1]/text()")
        # Note: the source used the same XPath for positions as for names
        # (likely a copy-paste slip); kept as-is for fidelity to the lesson.
        positions = html_element.xpath("//*[@id='content']/div[1]/ul/li/div[1]/div[2]/div/a[1]/text()")
        infos = html_element.xpath("//*[@id='content']/div[1]/ul/li/div[1]/div[3]/div/text()")
        for pr, na, po, inf in zip(prices, names, positions, infos):
            data_dict = {
                "价格": pr + "万",
                "名称": na,
                "位置": po,
                "房源信息": inf,
            }
            lst.append(data_dict)
        return lst

    @staticmethod
    def save_data(lst):
        # DictWriter needs an ordered sequence of field names, not a set.
        headers = ["价格", "名称", "位置", "房源信息"]
        # newline='' prevents blank rows in the CSV on Windows.
        with open("链家长沙二手房信息.csv", mode='w', encoding='utf-8', newline='') as f:
            writer = csv.DictWriter(f, headers)
            writer.writeheader()
            writer.writerows(lst)


if __name__ == '__main__':
    spider = SecondHouseSpider()
    lst1 = []
    for i in range(1, 5):  # only crawls pages 1-4 (range(1, 5) excludes 5)
        lst1 += spider.get_data(i)
    spider.save_data(lst1)
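As a stage-review exercise, the fetch step above can be hardened a little. The sketch below is one possible variant, not part of the original lesson: it adds a request timeout, an HTTP status check, and a short delay between pages so the crawl stays polite. The helper name fetch_page and the one-second delay are assumptions for illustration.

import time

import requests
from fake_useragent import UserAgent


def fetch_page(num, delay=1.0, timeout=10):
    # Hypothetical helper, not from the original lesson: fetch one
    # Lianjia listing page with a timeout and a polite delay.
    url = "https://cs.lianjia.com/ershoufang/pg{}/".format(num)
    headers = {"User-Agent": UserAgent().chrome}
    res = requests.get(url, headers=headers, timeout=timeout)
    res.raise_for_status()  # fail fast on 4xx/5xx instead of parsing an error page
    time.sleep(delay)       # pause between page requests to avoid hammering the site
    return res.content.decode()

Calling raise_for_status() surfaces anti-crawling blocks (for example an HTTP 403) immediately, rather than silently feeding an error page into the XPath extraction.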