This pagination is not as simple as it looks, and solving it was an interesting challenge. A few important notes about the solution below:
- The idea is to handle the pagination page by page, passing the current page number around in the `Request.meta` dictionary.
- A regular `BaseSpider` is used, since there is some logic involved in the pagination.
- It is important to send `headers` that pretend to be a real browser.
- It is important to yield `FormRequest`s with `dont_filter=True`, since we are basically making POST requests to the same URL over and over, just with different parameters (see the sketch after this list).
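For context, here is an assumption-level sketch of why those parameters look the way they do; the `__doPostBack` mechanism is standard ASP.NET WebForms behavior and is not shown in the original question. The pager links on such pages call a JavaScript helper that copies its two arguments into hidden form fields and submits the form, and the spider reproduces that submission by hand:

```python
# Hypothetical illustration of a WebForms pager link such as
#
#   <a href="javascript:__doPostBack('ctl00$MainContent$agentList','Page$2')">2</a>
#
# __doPostBack() fills two hidden inputs and submits the surrounding form,
# so the equivalent hand-built POST parameters are:
postback_fields = {
    '__EVENTTARGET': 'ctl00$MainContent$agentList',  # control that "raised" the event
    '__EVENTARGUMENT': 'Page$2',                     # pager command: jump to page 2
}
```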
The code:
```python
import re

from scrapy.http import FormRequest
from scrapy.spider import BaseSpider

HEADERS = {
    'X-MicrosoftAjax': 'Delta=true',
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2272.76 Safari/537.36'
}
URL = 'http://exitrealty.com/agent_list.aspx?firstName=&lastName=&country=USA&state=NY'


class ExitRealtySpider(BaseSpider):
    name = "exit_realty"

    allowed_domains = ["exitrealty.com"]
    start_urls = [URL]

    def parse(self, response):
        # submit a form (first page): collect every input of the search form,
        # then override the fields that trigger the "list agents" postback
        self.data = {}
        for form_input in response.css('form#aspnetForm input'):
            name = form_input.xpath('@name').extract()[0]
            try:
                value = form_input.xpath('@value').extract()[0]
            except IndexError:
                value = ""
            self.data[name] = value

        self.data['ctl00$MainContent$scriptManager1'] = 'ctl00$MainContent$UpdatePanel1|ctl00$MainContent$agentList'
        self.data['__EVENTTARGET'] = 'ctl00$MainContent$List'
        self.data['__EVENTARGUMENT'] = 'Page'

        return FormRequest(url=URL,
                           method='POST',
                           callback=self.parse_page,
                           formdata=self.data,
                           meta={'page': 1},
                           dont_filter=True,
                           headers=HEADERS)

    def parse_page(self, response):
        current_page = response.meta['page'] + 1

        # parse agents (TODO: yield items instead of printing)
        # NOTE: the attribute filter inside a[@...] was lost in transcription;
        # narrow this XPath down to just the agent links on the page
        for agent in response.xpath('//a/text()'):
            print(agent.extract())
        print("------")

        # request the next page: pull the fresh __EVENTVALIDATION/__VIEWSTATE
        # tokens out of the pipe-delimited AJAX delta response
        data = {
            '__EVENTARGUMENT': 'Page$%d' % current_page,
            '__EVENTVALIDATION': re.search(r"__EVENTVALIDATION\|(.*?)\|", response.body, re.MULTILINE).group(1),
            '__VIEWSTATE': re.search(r"__VIEWSTATE\|(.*?)\|", response.body, re.MULTILINE).group(1),
            '__ASYNCPOST': 'true',
            '__EVENTTARGET': 'ctl00$MainContent$agentList',
            'ctl00$MainContent$scriptManager1': 'ctl00$MainContent$UpdatePanel1|ctl00$MainContent$agentList',
            '': ''
        }
        return FormRequest(url=URL,
                           method='POST',
                           formdata=data,
                           callback=self.parse_page,
                           meta={'page': current_page},
                           dont_filter=True,
                           headers=HEADERS)
```
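To see what the two `re.search()` calls in `parse_page()` are matching, here is a self-contained sketch with a made-up, shortened payload. When the `X-MicrosoftAjax: Delta=true` header is sent, an UpdatePanel typically answers with pipe-delimited `length|type|id|content|` records instead of a full HTML page, so the fresh tokens can be captured with a non-greedy match between the pipes:

```python
import re

# Fabricated, truncated example of an ASP.NET AJAX "delta" response body
delta = ("104|hiddenField|__VIEWSTATE|/wEPDwUKfake...|"
         "48|hiddenField|__EVENTVALIDATION|/wEWAgLfake...|")

viewstate = re.search(r"__VIEWSTATE\|(.*?)\|", delta).group(1)
validation = re.search(r"__EVENTVALIDATION\|(.*?)\|", delta).group(1)

print(viewstate)   # -> /wEPDwUKfake...
print(validation)  # -> /wEWAgLfake...
```

Note that the spider above targets the old Python 2-era API (`scrapy.spider.BaseSpider`, regexes over a `str` body); on a current Scrapy/Python 3 install you would subclass `scrapy.Spider` and run the regexes against `response.text` instead of `response.body`.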