- CrawlSpider: a class, a subclass of Spider
- Ways to crawl a whole site:
    - Based on Spider: issue the pagination requests manually
    - Based on CrawlSpider
- Using CrawlSpider:
    - Create a project
    - cd XXX
    - Create the spider file (CrawlSpider):
        - scrapy genspider -t crawl xxx www.xxxx.com
- Link extractor (LinkExtractor):
    - Role: extract links that match the specified rule (allow)
- Rule parser (Rule):
    - Role: parse the pages behind the extracted links with the specified rule (callback)

# Requirement: crawl the complaint number, news title, and news content from the sun site (the number appears on both the list page and the detail page)
- Analysis: the data to crawl is not all on the same page.
    - 1. Use a link extractor to extract all the page-number links
    - 2. Use another link extractor to extract all the news detail-page links (a standalone LinkExtractor sketch follows right after these notes)
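Before the full spider, here is a minimal, self-contained sketch of what a link extractor does. The HTML snippet and the detail URL are made up for illustration; only the allow regex matches the project below.

from scrapy.http import HtmlResponse
from scrapy.linkextractors import LinkExtractor

# A fake list page with one pagination link and one detail link (illustrative only).
body = (b'<html><body>'
        b'<a href="/political/index/politicsNewest?id=1&page=2">next page</a>'
        b'<a href="/political/politics/index?id=12345">some complaint</a>'
        b'</body></html>')
response = HtmlResponse(url='https://wz.sun0769.com/political/index/politicsNewest?id=1&page=1',
                        body=body, encoding='utf-8')

# Same allow rule as the spider's page-number extractor: only links whose URL
# matches the regex are kept, and they come back as absolute URLs.
link = LinkExtractor(allow=r'id=1&page=\d+')
print([l.url for l in link.extract_links(response)])
# expected: only the ...politicsNewest?id=1&page=2 pagination link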
For this case it is recommended to use proxy IPs; registering with 芝麻代理 (Zhima Proxy) gives you a few free IPs every day.
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from sunPro.items import SunproItem, DetailItem


class SunSpider(CrawlSpider):
    name = 'sun'
    # allowed_domains = ['']
    start_urls = ['https://wz.sun0769.com/political/index/politicsNewest?id=1&page=']

    # Link extractors: extract links matching the allow regex from the crawled
    # responses, starting with the pages in start_urls.
    link = LinkExtractor(allow=r'id=1&page=\d+')    # page-number links
    link_detail = LinkExtractor(allow=r'id=\d+')    # detail-page links

    rules = (
        # Rule parser: parse the pages behind the extracted links with the given callback.
        Rule(link, callback='parse_item', follow=True),
        # follow=True: keep applying the link extractor to the pages it just discovered,
        # e.g. extract the page-3 link from page 2, and so on.
        Rule(link_detail, callback='parse_detail'),
    )

    # The two callbacks below cannot pass data to each other via request meta!
    # That is why the parsed data goes into two separate items instead of one.

    # Parse the complaint number and title from the list page.
    def parse_item(self, response):
        li_list = response.xpath('/html/body/div[2]/div[3]/ul[2]/li')
        for li in li_list:
            p_num = li.xpath('./span[1]/text()').extract_first()
            p_title = li.xpath('./span[3]/a/text()').extract_first()
            # print(p_num, p_title)
            item = SunproItem()
            item['p_title'] = p_title
            item['p_num'] = p_num
            yield item

    # Parse the complaint content and number from the detail page.
    def parse_detail(self, response):
        p_content = response.xpath('/html/body/div[3]/div[2]/div[2]/div[2]/pre/text()').extract_first()
        p_id = response.xpath('/html/body/div[3]/div[2]/div[2]/div[1]/span[4]/text()').extract_first()
        # print(p_id, p_content)
        if not p_id:
            print('Skipped an empty value!')
        else:
            p_id = p_id[3:]  # strip the label prefix, keeping only the number
            item = DetailItem()
            item['p_content'] = p_content
            item['p_id'] = p_id
            yield item
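The comment in the spider notes that Rule callbacks cannot pass data to each other via request meta, which is why the data ends up in two separate items. For contrast, here is a hedged sketch of the meta-based "request passing" pattern with a plain Spider; the class name, the detail-link XPath (@href on the same <a> as the title), and the simplified content XPath are assumptions for illustration only.

import scrapy


class SunManualSpider(scrapy.Spider):
    name = 'sun_manual'
    start_urls = ['https://wz.sun0769.com/political/index/politicsNewest?id=1&page=1']

    def parse(self, response):
        for li in response.xpath('/html/body/div[2]/div[3]/ul[2]/li'):
            item = {'p_num': li.xpath('./span[1]/text()').get(),
                    'p_title': li.xpath('./span[3]/a/text()').get()}
            detail_url = response.urljoin(li.xpath('./span[3]/a/@href').get())
            # Hand the partially filled item to the detail callback via meta.
            yield scrapy.Request(detail_url, callback=self.parse_detail, meta={'item': item})

    def parse_detail(self, response):
        item = response.meta['item']
        item['p_content'] = response.xpath('//pre/text()').get()  # simplified XPath, assumption
        yield item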
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html

# useful for handling different item types with a single interface
from itemadapter import ItemAdapter
import pymysql


class SunproPipeline:
    def process_item(self, item, spider):
        # Decide which item type this is by its class name.
        if item.__class__.__name__ == 'SunproItem':
            print(item['p_num'], item['p_title'])
        else:
            print(item['p_id'], item['p_content'])
        return item
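An alternative to comparing item.__class__.__name__ is to import the item classes and dispatch with isinstance. A small sketch, assuming the standard sunPro project layout used elsewhere in this post:

from sunPro.items import SunproItem, DetailItem


class SunproPipeline:
    def process_item(self, item, spider):
        # Dispatch on the item class itself rather than on its name string.
        if isinstance(item, SunproItem):
            print(item['p_num'], item['p_title'])
        elif isinstance(item, DetailItem):
            print(item['p_id'], item['p_content'])
        return item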
class mysqlPileLine:
    conn = None
    cursor = None

    def open_spider(self, spider):
        self.conn = pymysql.Connect(host='127.0.0.1', user='root', password='123456',
                                    db='rxkc', charset='utf8')

    def process_item(self, item, spider):
        self.cursor = self.conn.cursor()
        try:
            # Write the two item types into two separate tables.
            if item.__class__.__name__ == 'SunproItem':
                self.cursor.execute('insert into sun values (%s, %s)',
                                    (item['p_num'], item['p_title']))
            else:
                self.cursor.execute('insert into sunn values (%s, %s)',
                                    (item['p_id'], item['p_content']))
            self.conn.commit()
        except Exception as e:
            print(e)
            self.conn.rollback()
        return item

    def close_spider(self, spider):
        self.cursor.close()
        self.conn.close()
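The pipeline assumes two tables, sun and sunn, each with two columns. The original post does not show the schema, so the column names and types below are guesses that merely match the INSERT statements; a sketch of creating them with pymysql:

import pymysql

conn = pymysql.Connect(host='127.0.0.1', user='root', password='123456',
                       db='rxkc', charset='utf8')
with conn.cursor() as cursor:
    # Column names and types are assumptions; only the two-column shape is implied by the pipeline.
    cursor.execute('create table if not exists sun (p_num varchar(32), p_title varchar(255))')
    cursor.execute('create table if not exists sunn (p_id varchar(32), p_content text)')
conn.commit()
conn.close()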
# Define here the models for your spider middleware
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html
from scrapy import signals
# useful for handling different item types with a single interface
from itemadapter import is_item, ItemAdapter
import random
class SunproDownloaderMiddleware:
    user_agent_list = [
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
        "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
        "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
        "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
        "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
    ]
    PROXY_http = [
        '125.106.138.41:4234',
        '42.6.114.117:7018',
    ]
    PROXY_https = [
        '110.89.123.129:4213',
        '118.123.40.30:4231',
        '42.6.114.104:2687',
    ]
    # Intercept every request: set a random User-Agent (and, optionally, a proxy).
    def process_request(self, request, spider):
        request.headers['User-Agent'] = random.choice(self.user_agent_list)
        # request.meta['proxy'] = 'https://42.6.114.99:9702'
        return None

    def process_response(self, request, response, spider):
        # Called with the response returned from the downloader.
        # Must either:
        # - return a Response object
        # - return a Request object
        # - or raise IgnoreRequest
        return response

    # Intercept requests that raised an exception and retry them through a proxy.
    def process_exception(self, request, exception, spider):
        if request.url.split(':')[0] == 'http':
            request.meta['proxy'] = 'http://' + random.choice(self.PROXY_http)
        else:
            request.meta['proxy'] = 'https://' + random.choice(self.PROXY_https)
        return request  # re-schedule the corrected request
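None of these components run unless they are enabled in settings.py, which the original post omits. A hedged sketch: the module paths assume the default Scrapy project layout for sunPro, and the priority numbers and ROBOTSTXT_OBEY value are assumptions.

# sunPro/settings.py (sketch)
BOT_NAME = 'sunPro'
ROBOTSTXT_OBEY = False

ITEM_PIPELINES = {
    'sunPro.pipelines.SunproPipeline': 300,
    'sunPro.pipelines.mysqlPileLine': 301,
}
DOWNLOADER_MIDDLEWARES = {
    'sunPro.middlewares.SunproDownloaderMiddleware': 543,
}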
import scrapy


class SunproItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    p_num = scrapy.Field()
    p_title = scrapy.Field()


class DetailItem(scrapy.Item):
    p_content = scrapy.Field()
    p_id = scrapy.Field()
The result of joining the two tables:
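A hedged sketch of the join itself, assuming the complaint number stored in sun.p_num matches the number stored in sunn.p_id:

import pymysql

conn = pymysql.Connect(host='127.0.0.1', user='root', password='123456',
                       db='rxkc', charset='utf8')
with conn.cursor() as cursor:
    # Join the list-page rows with the detail-page rows on the shared complaint number.
    cursor.execute('select s.p_num, s.p_title, d.p_content '
                   'from sun s join sunn d on s.p_num = d.p_id')
    for p_num, p_title, p_content in cursor.fetchall():
        print(p_num, p_title, p_content)
conn.close()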