CrawlSpider inherits from BaseSpider. It just adds rules for extracting and following links. If these rules are not flexible enough for you, use BaseSpider:
import urlparse

from scrapy import log
from scrapy.http import Request
from scrapy.selector import HtmlXPathSelector
from scrapy.spider import BaseSpider

# UItem and self.extractText are assumed to be defined elsewhere in the project.


class USpider(BaseSpider):
    """My spider."""

    start_urls = ['http://www.amazon.com/s/?url=search-alias%3Dapparel&sort=relevance-fs-browse-rank']
    allowed_domains = ['amazon.com']

    def parse(self, response):
        '''Parse main category search page and extract subcategory search links.'''
        self.log('Downloaded category search page.', log.DEBUG)
        if response.meta['depth'] > 5:
            self.log('Categories depth limit reached (recursive links?). Stopping further following.', log.WARNING)

        hxs = HtmlXPathSelector(response)
        # NOTE: the class attribute values in the selectors are omitted in this snippet.
        subcategories = hxs.select("//div[@id='refinements']/*[starts-with(.,'Department')]"
                                   "/following-sibling::ul[1]/li/a[span[@class]]/@href").extract()
        for subcategory in subcategories:
            subcategorySearchLink = urlparse.urljoin(response.url, subcategory)
            yield Request(subcategorySearchLink, callback=self.parseSubcategory)

    def parseSubcategory(self, response):
        '''Parse subcategory search page and extract item links.'''
        hxs = HtmlXPathSelector(response)

        for itemLink in hxs.select('//a[@class]/@href').extract():
            itemLink = urlparse.urljoin(response.url, itemLink)
            self.log('Requesting item page: ' + itemLink, log.DEBUG)
            yield Request(itemLink, callback=self.parseItem)

        try:
            nextPageLink = hxs.select("//a[@id='pagnNextLink']/@href").extract()[0]
            nextPageLink = urlparse.urljoin(response.url, nextPageLink)
            self.log('\nGoing to next search page: ' + nextPageLink + '\n', log.DEBUG)
            yield Request(nextPageLink, callback=self.parseSubcategory)
        except IndexError:
            # categoryPath is assumed to be set elsewhere in the full spider.
            self.log('Whole category parsed: ' + categoryPath, log.DEBUG)

    def parseItem(self, response):
        '''Parse item page and extract product info.'''
        hxs = HtmlXPathSelector(response)
        item = UItem()
        item['brand'] = self.extractText("//div[@class]/span[1]/a[1]", hxs)
        item['title'] = self.extractText("//span[@id='btAsinTitle']", hxs)
        ...
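For comparison, here is a minimal sketch of what the rule-based CrawlSpider mentioned above could look like for the same site; the URL patterns, class name and callback are illustrative assumptions, not part of the original spider:

from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor


class UCrawlSpider(CrawlSpider):
    '''Hypothetical rule-based variant: CrawlSpider extracts and follows links itself.'''
    start_urls = ['http://www.amazon.com/s/?url=search-alias%3Dapparel&sort=relevance-fs-browse-rank']
    allowed_domains = ['amazon.com']

    rules = (
        # Follow search/category pages without producing items (the /s/ pattern is an assumption).
        Rule(SgmlLinkExtractor(allow=(r'/s/',)), follow=True),
        # Hand product pages (assumed to contain /dp/) to parse_item.
        Rule(SgmlLinkExtractor(allow=(r'/dp/',)), callback='parse_item'),
    )

    def parse_item(self, response):
        '''Extract product info here, as parseItem does above.'''
        pass

The trade-off is the one described above: the rules decide which links get followed, so you give up the fine-grained per-request control that the BaseSpider callbacks provide.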
If even BaseSpider's start_urls is not flexible enough for you, override the start_requests method.
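A minimal sketch of that override, assuming it is added to the USpider class above (the search aliases are made-up example values):

    # Inside USpider, replacing the fixed start_urls list:
    def start_requests(self):
        # Build the initial requests programmatically; 'apparel' and 'shoes' are assumed examples.
        for alias in ['apparel', 'shoes']:
            url = ('http://www.amazon.com/s/?url=search-alias%%3D%s'
                   '&sort=relevance-fs-browse-rank' % alias)
            yield Request(url, callback=self.parse)

This is useful when the start URLs have to be generated at runtime, for example from a database or a query parameter.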