Getting Started with the Scrapy Framework 04: CrawlSpider


- CrawlSpider: a class, a subclass of Spider
    - Ways to crawl an entire site:
        - Based on Spider: issue the paging requests manually
        - Based on CrawlSpider
    - Using CrawlSpider:
        - Create a project
        - cd XXX
        - Create the spider file (CrawlSpider):
            - scrapy genspider -t crawl xxx www.xxxx.com
            - Link extractor (LinkExtractor):
                - Purpose: extract links that match a given rule (allow); see the shell sketch after this list
            - Rule parser (Rule):
                - Purpose: parse the pages behind the extracted links with the given rule (callback)
        - Requirement: crawl the sun site for each complaint's number, news title, and news content
            - Analysis: the data to crawl is not all on the same page.
            - 1. Use one link extractor to extract all the pagination links
            - 2. Use a second link extractor to extract all the news detail-page links
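
To see what a LinkExtractor will match before wiring it into a Rule, it can be exercised directly in scrapy shell. A minimal sketch (the allow pattern mirrors the one used in the spider below):

# run first: scrapy shell "https://wz.sun0769.com/political/index/politicsNewest?id=1&page=1"
from scrapy.linkextractors import LinkExtractor

# collect every pagination link present on the current response
page_le = LinkExtractor(allow=r'id=1&page=\d+')
for link in page_le.extract_links(response):
    print(link.url)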

For this case it is advisable to use proxy IPs; Zhima Proxy (芝麻代理) offers a number of free IPs per day after registration.

import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from sunPro.items import SunproItem,DetailItem

class SunSpider(CrawlSpider):
    name = 'sun'
    # allowed_domains = ['']
    start_urls = ['https://wz.sun0769.com/political/index/politicsNewest?id=1&page=']
    #Link extractor: extracts links matching the given rule (allow=regex), starting from the pages in start_urls
    link = LinkExtractor(allow=r'id=1&page=\d+')  # pagination links
    link_detail = LinkExtractor(allow=r'id=\d+')  # detail-page links
    rules = (
        #Rule parser: parses the pages behind the links harvested by a link extractor, using the given callback
        Rule(link, callback='parse_item', follow=True),
        # follow=True: keep applying the link extractor to the pages it discovers, e.g. the page-3 link is extracted from page 2, and so on
        Rule(link_detail, callback='parse_detail'),
    )
    #Note: these two callbacks CANNOT pass data to each other via request meta!
    #parse_item parses the complaint number and title
    #Since one item cannot hold the data parsed by both callbacks, the data goes into two separate items
    def parse_item(self, response):
        li_list = response.xpath('/html/body/div[2]/div[3]/ul[2]/li')
        for li in li_list:
            p_num = li.xpath('./span[1]/text()').extract_first()
            p_title = li.xpath('./span[3]/a/text()').extract_first()
            # print(p_num, p_title)
            item = SunproItem()
            item['p_title'] = p_title
            item['p_num'] = p_num
            yield item
    #parse_detail parses the complaint content and number
    def parse_detail(self, response):
        p_content = response.xpath('/html/body/div[3]/div[2]/div[2]/div[2]/pre/text()').extract_first()
        p_id = response.xpath('/html/body/div[3]/div[2]/div[2]/div[1]/span[4]/text()').extract_first()
        # print(p_id, p_content)
        if not p_id:
            print('Skipped an empty p_id value')
        else:
            p_id = p_id[3:]  # drop the leading text label, keeping only the number
            item = DetailItem()
            item['p_content'] = p_content
            item['p_id'] = p_id
            yield item
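
For contrast: with a plain Spider and manual requests, the two callbacks can share data through request meta, so number, title, and content could land in a single item. A minimal sketch, assuming the detail URL is the href of the title anchor in span[3]/a and that SunproItem were extended with a p_content field (both are assumptions, not code from this project):

import scrapy
from sunPro.items import SunproItem

class SunManualSpider(scrapy.Spider):
    name = 'sun_manual'
    start_urls = ['https://wz.sun0769.com/political/index/politicsNewest?id=1&page=1']

    def parse(self, response):
        for li in response.xpath('/html/body/div[2]/div[3]/ul[2]/li'):
            item = SunproItem()
            item['p_num'] = li.xpath('./span[1]/text()').extract_first()
            item['p_title'] = li.xpath('./span[3]/a/text()').extract_first()
            # assumption: the detail link is the href of the title anchor
            detail_url = response.urljoin(li.xpath('./span[3]/a/@href').extract_first())
            # hand the half-filled item to the next callback via meta
            yield scrapy.Request(detail_url, callback=self.parse_detail, meta={'item': item})

    def parse_detail(self, response):
        item = response.meta['item']
        item['p_content'] = response.xpath('/html/body/div[3]/div[2]/div[2]/div[2]/pre/text()').extract_first()
        yield item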
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html


# useful for handling different item types with a single interface
from itemadapter import ItemAdapter
import pymysql

class SunproPipeline:
    def process_item(self, item, spider):
        #Determine which item class this item belongs to
        if item.__class__.__name__ == 'SunproItem':
            print(item['p_num'],item['p_title'])
        else:
            print(item['p_id'],item['p_content'])

        return item
class MysqlPipeline:
    conn = None
    cursor = None
    def open_spider(self, spider):
        self.conn = pymysql.Connect(host='127.0.0.1', user='root', password='123456', db='rxkc', charset='utf8')
        self.cursor = self.conn.cursor()
    def process_item(self, item, spider):
        try:
            # parameterized queries: quotes inside the scraped text can no longer break the SQL
            if item.__class__.__name__ == 'SunproItem':
                self.cursor.execute('insert into sun values (%s, %s)', (item["p_num"], item["p_title"]))
            else:
                self.cursor.execute('insert into sunn values (%s, %s)', (item["p_id"], item["p_content"]))
            self.conn.commit()
        except Exception as e:
            print(e)
            self.conn.rollback()
        return item
    def close_spider(self, spider):
        self.cursor.close()
        self.conn.close()
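
Neither pipeline runs until it is registered in settings.py. A sketch of that registration, assuming the project is named sunPro as in the import at the top of the spider:

# settings.py
ITEM_PIPELINES = {
    'sunPro.pipelines.SunproPipeline': 300,  # lower value = earlier in the chain
    'sunPro.pipelines.MysqlPipeline': 301,
}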
# Define here the models for your spider middleware
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html
from scrapy import signals

# useful for handling different item types with a single interface
from itemadapter import is_item, ItemAdapter
import random
class SunproDownloaderMiddleware:
    user_agent_list = [
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 "
        "(KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
        "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 "
        "(KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 "
        "(KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 "
        "(KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
        "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 "
        "(KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 "
        "(KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
        "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 "
        "(KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 "
        "(KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 "
        "(KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 "
        "(KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 "
        "(KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 "
        "(KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 "
        "(KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 "
        "(KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 "
        "(KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 "
        "(KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 "
        "(KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
        "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 "
        "(KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24"
    ]
    PROXY_http = [
        '125.106.138.41:4234',
        '42.6.114.117:7018',
    ]
    PROXY_https = [
        '110.89.123.129:4213',
        '118.123.40.30:4231',
        '42.6.114.104:2687'
    ]

    # Intercept every request: give it a random User-Agent (and, if desired, a proxy)
    def process_request(self, request, spider):
        request.headers['User-Agent'] = random.choice(self.user_agent_list)
        # request.meta['proxy'] = 'https://42.6.114.99:9702'
        return None

    def process_response(self, request, response, spider):
        # Called with the response returned from the downloader.

        # Must either;
        # - return a Response object
        # - return a Request object
        # - or raise IgnoreRequest
        return response

    def process_exception(self, request, exception, spider):
        # Intercept requests that raised an exception and retry them through a proxy
        if request.url.split(':')[0] == 'http':
            request.meta['proxy'] = 'http://' + random.choice(self.PROXY_http)
        else:
            request.meta['proxy'] = 'https://' + random.choice(self.PROXY_https)

        return request  # re-send the corrected request
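
The downloader middleware likewise only takes effect once it is enabled in settings.py (same assumption about the project name):

# settings.py
DOWNLOADER_MIDDLEWARES = {
    'sunPro.middlewares.SunproDownloaderMiddleware': 543,
}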


import scrapy


class SunproItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    p_num = scrapy.Field()
    p_title = scrapy.Field()

class DetailItem(scrapy.Item):
    p_content = scrapy.Field()
    p_id = scrapy.Field()

Joining the two tables combines each complaint's number and title with its content:
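
One way to produce that joined view with pymysql, as a sketch. It assumes the tables were created as sun(p_num, p_title) and sunn(p_id, p_content) (the pipeline above only relies on column order, so these column names are assumptions) and that p_num and p_id hold the same complaint number:

import pymysql

conn = pymysql.Connect(host='127.0.0.1', user='root', password='123456', db='rxkc', charset='utf8')
cursor = conn.cursor()
# match each list-page row with its detail-page row on the complaint number
cursor.execute(
    'select sun.p_num, sun.p_title, sunn.p_content '
    'from sun join sunn on sun.p_num = sunn.p_id'
)
for row in cursor.fetchall():
    print(row)
cursor.close()
conn.close()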
