Prerequisites. Software needed: PyCharm, MySQL, and Navicat (optional).
Knowledge needed: how to install Scrapy (mind which interpreter it is installed into) and how to start MySQL.
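If Scrapy and the MySQL driver are not installed yet, the quickest route is pip in the PyCharm terminal (make sure it targets the project's interpreter; pymysql is needed later by the pipeline):

pip install scrapy pymysql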
First create the project. In the PyCharm terminal, run the following three commands, pressing Enter after each:
scrapy startproject work
cd work
scrapy genspider bei_bus beijing.8684.cn
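These commands generate the standard Scrapy scaffolding (middlewares.py is created too but is not modified in this tutorial):

work/
    scrapy.cfg
    work/
        __init__.py
        items.py
        middlewares.py
        pipelines.py
        settings.py
        spiders/
            __init__.py
            bei_bus.py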
Open the newly created files and fill them in as follows.
bei_bus.py code:
from urllib.parse import urljoin

import scrapy
from scrapy import FormRequest, Request

from work.items import WorkItem


class BeiBusSpider(scrapy.Spider):
    name = 'bei_bus'
    allowed_domains = ['beijing.8684.cn']
    # Used as a base-URL string here; since start_requests is overridden,
    # Scrapy never iterates it as the usual start_urls list.
    start_urls = 'http://beijing.8684.cn/'

    def start_requests(self):
        # Request the route-number index pages /list1 and /list2.
        for page in range(2):
            url = '{url}list{page}'.format(url=self.start_urls, page=page + 1)
            # Without formdata, FormRequest behaves like an ordinary GET.
            yield FormRequest(url, callback=self.parse_index)

    def parse_index(self, response):
        # Follow the link to every route's detail page.
        hrefs = response.xpath("//div[@class='cc-content service-area']"
                               "/div[@class='list clearfix']/a/@href").extract()
        for href in hrefs:
            url2 = urljoin(self.start_urls, href)
            yield Request(url2, callback=self.parse_detail)

    def parse_detail(self, response):
        # NOTE: the class predicates below were lost in the original post and
        # appear as bare [@]; fill them in from the page's HTML (see the
        # scrapy shell tip after this listing).
        name = response.xpath('//h1[@]/text()').extract_first()
        type = response.xpath('//a[@]/text()').extract_first()
        time = response.xpath('//ul[@]/li[1]/text()').extract_first()
        trip = response.xpath('//div[@]/text()').extract()
        luxian = response.xpath('//div[@]/ol/li/a/text()').extract()
        bus_item = WorkItem()
        # The local variable names above match the item's field names,
        # so eval(field) copies each one into the item.
        for field in bus_item.fields:
            bus_item[field] = eval(field)
        yield bus_item
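A note on the bare [@] predicates in parse_detail: the original post lost the class names inside them, so they must be filled in by hand. One way to recover them is Scrapy's built-in shell (the /list1 URL comes from start_requests above); load a page and experiment until the XPath matches:

scrapy shell http://beijing.8684.cn/list1
>>> response.xpath("//div[@class='cc-content service-area']//a/@href").extract()[:5]

Once the detail-page selectors return the right text, copy the full predicates back into parse_detail.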
settings.py code:
BOT_NAME = 'work'

SPIDER_MODULES = ['work.spiders']
NEWSPIDER_MODULE = 'work.spiders'

ROBOTSTXT_OBEY = False

DEFAULT_REQUEST_HEADERS = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.81 Safari/537.36 Edg/94.0.992.47',
}

ITEM_PIPELINES = {
    'work.pipelines.WorkPipeline': 300,
}

# Custom keys read by the pipeline in pipelines.py.
DB_HOST = 'localhost'
DB_USER = 'root'        # your MySQL user
DB_PWD = '123456'       # your MySQL password
DB_CHARSET = 'utf8'     # pymysql expects lowercase charset names
DB = 'busdb'
pipelines.py code:
import pymysql

from work import settings


class WorkPipeline:
    def __init__(self):
        # Pull the connection parameters defined in settings.py.
        self.host = settings.DB_HOST
        self.user = settings.DB_USER
        self.pwd = settings.DB_PWD
        self.db = settings.DB
        self.charset = settings.DB_CHARSET
        self.connect()

    def connect(self):
        self.conn = pymysql.Connect(host=self.host, user=self.user,
                                    password=self.pwd, db=self.db,
                                    charset=self.charset)
        self.cursor = self.conn.cursor()

    def close_spider(self, spider):
        # Close the cursor first, then the connection.
        self.cursor.close()
        self.conn.close()

    def process_item(self, item, spider):
        # Parameterized query so quotes in the scraped text cannot break the SQL.
        # trip and luxian are lists, so store their string form.
        sql = ('insert into businfo2(name, type, time, trip, luxian) '
               'values (%s, %s, %s, %s, %s)')
        self.cursor.execute(sql, (item['name'], item['type'], item['time'],
                                  str(item['trip']), str(item['luxian'])))
        self.conn.commit()
        return item
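Reading settings by importing the work.settings module works, but Scrapy's own convention is to receive them through a from_crawler factory method. A minimal sketch of that alternative (same DB_* keys as above, logic otherwise unchanged):

import pymysql


class WorkPipeline:
    def __init__(self, host, user, pwd, db, charset):
        # Connect once when the pipeline is instantiated.
        self.conn = pymysql.Connect(host=host, user=user, password=pwd,
                                    db=db, charset=charset)
        self.cursor = self.conn.cursor()

    @classmethod
    def from_crawler(cls, crawler):
        # Scrapy passes in the crawler, whose .settings holds settings.py values.
        s = crawler.settings
        return cls(s.get('DB_HOST'), s.get('DB_USER'), s.get('DB_PWD'),
                   s.get('DB'), s.get('DB_CHARSET'))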
items.py code:
import scrapy


class WorkItem(scrapy.Item):
    # One field per value scraped in parse_detail; the names must match the
    # local variables there because the spider fills the item via eval().
    name = scrapy.Field()
    type = scrapy.Field()
    time = scrapy.Field()
    trip = scrapy.Field()
    luxian = scrapy.Field()
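Because these field names deliberately match the local variable names in parse_detail, the eval() loop in the spider is just shorthand; written out explicitly it would be:

bus_item = WorkItem()
bus_item['name'] = name
bus_item['type'] = type
bus_item['time'] = time
bus_item['trip'] = trip
bus_item['luxian'] = luxian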
Finally, create the table in MySQL.
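The post does not include the DDL itself, so the statements below are only a minimal sketch of a businfo2 table matching the pipeline's INSERT; the column types and lengths are assumptions, adjust them to your data:

CREATE DATABASE IF NOT EXISTS busdb DEFAULT CHARSET utf8;
USE busdb;
CREATE TABLE businfo2 (
    name   VARCHAR(100),  -- route name
    type   VARCHAR(100),  -- route category
    time   VARCHAR(100),  -- operating hours
    trip   TEXT,          -- saved as the Python list's string form
    luxian TEXT           -- stop list, saved as the list's string form
);

With the table in place, run the spider from the project root:

scrapy crawl bei_bus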
View the result in Navicat. Final screenshot:
Note: some of the settings.py entries are existing commented-out lines that were uncommented and edited; take the User-Agent string from your own browser.
Official site: Beijing bus query / Beijing bus route lookup / Beijing bus map - 北京公交网 (8684.cn)