I set SCHEDULER_IDLE_BEFORE_CLOSE in settings.py, but it doesn't seem to work.
Here is my settings.py:
SCHEDULER = "scrapy_redis.scheduler.Scheduler"
DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"
SCHEDULER_IDLE_BEFORE_CLOSE = 10
REDIS_HOST = 'localhost'
DOWNLOADER_MIDDLEWARES = {
    'serp_crawl.middlewares.RandomUserAgentMiddleware': 200,
    'scrapy_crawlera.CrawleraMiddleware': 300
}
CRAWLERA_ENABLED = True
CRAWLERA_USER = ''
CRAWLERA_PASS = ''
# Activate Crawlera User Agent
DEFAULT_REQUEST_HEADERS = {
    "X-Crawlera-UA": "pass",
}
UPDATE
Here is my spider code:
from scrapy_redis.spiders import RedisSpider
from elasticsearch import Elasticsearch
from serp_crawl.settings import *
from datetime import datetime
from redis import Redis
import scrapy
import json


class SerpSpider(RedisSpider):
    name = "serpcrawler"
    redis_key = 'serp_crawler:request'

    def __init__(self, redis_host='localhost', redis_port='6379',
                 elasticsearch_host='localhost', elasticsearch_port='9200',
                 mysql_host='localhost', dev=False):
        super(SerpSpider, self).__init__()
        self.platform = None
        self.dev = bool(dev)
        self.q = Redis(redis_host, redis_port)
        self.es = Elasticsearch([{'host': elasticsearch_host, 'port': elasticsearch_port}])

    @classmethod
    def from_crawler(self, crawler, *args, **kwargs):
        crawler.settings.attributes['REDIS_HOST'].value = kwargs['redis_host']
        obj = super(RedisSpider, self).from_crawler(crawler, **kwargs)
        obj.setup_redis(crawler)
        return obj

    def make_requests_from_url(self, url):
        data = json.loads(url)
        self.logger.info('Got new url to parse: %s', data['url'])
        self.settings.attributes['DEFAULT_REQUEST_HEADERS'].value.attributes['X-Crawlera-UA'].value = data['platform']
        self.platform = data['platform']
        return scrapy.Request(url=data['url'], callback=self.parse,
                              meta={'keyword': data['keyword'],
                                    'id': data['id_keyword'],
                                    'country': data['country'],
                                    'platform': data['platform']},
                              dont_filter=True)

    def parse(self, response):
        doc = dict()
        try:
            doc['content'] = response.body.decode('cp1252')
        except:
            doc['content'] = response.body
        doc['date'] = datetime.now().strftime('%Y-%m-%d')
        doc['keyword'] = str(response.meta['keyword'])
        doc['type_platform'] = str(response.meta['platform'])
        doc['country'] = str(response.meta['country'])
        if not self.dev:
            id_index = self.es.index(index='serp_html', doc_type='page', body=doc)
            self.q.lpush('batching_serp',
                         {'id_index': str(id_index['_id']),
                          'type_batching': 'default',
                          'country': doc['country'],
                          'type_platform': doc['type_platform'],
                          'keyword': doc['keyword'],
                          'id_keyword': int(response.meta['id'])})
            self.logger.info('Indexed new page. id_es : [' + str(id_index['_id']) + ']')
Thanks for your help.
Solution

The scrapy-redis documentation says:

# Max idle time to prevent the spider from being closed when distributed crawling.
# This only works if queue class is SpiderQueue or SpiderStack,
# and may also block the same time when your spider start at the first time (because the queue is empty).
SCHEDULER_IDLE_BEFORE_CLOSE = 10
So you also need to set one of the following settings (SpiderQueue and SpiderStack are the older names for FifoQueue and LifoQueue):
SCHEDULER_QUEUE_CLASS = 'scrapy_redis.queue.FifoQueue'
# or
SCHEDULER_QUEUE_CLASS = 'scrapy_redis.queue.LifoQueue'
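Putting it together, a minimal settings.py sketch that combines the idle timeout with an explicit queue class (only the scrapy-redis settings from the question are repeated here; the Crawlera and middleware entries stay as they were, and the localhost Redis host is the value from the question):

# Minimal scrapy-redis scheduler setup (sketch).
SCHEDULER = "scrapy_redis.scheduler.Scheduler"
DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"

# FIFO (or LIFO) queue, so the idle timeout below actually applies.
SCHEDULER_QUEUE_CLASS = 'scrapy_redis.queue.FifoQueue'

# Wait up to 10 seconds on an empty Redis queue before letting the spider close.
SCHEDULER_IDLE_BEFORE_CLOSE = 10

REDIS_HOST = 'localhost'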