运用scrapy加上selenium爬取校园网课表_随笔

运用scrapy加上selenium爬取校园网课表

我们在大学里肯定有过这样的经历：周末不想待在寝室，或者吃完饭后下午第四大节有课而又不想回到遥远的寝室。这时候你就需要一间空教室，但一间一间地去找又太费时间，所以就有了我们这次的项目。我们的项目只要输入星期几的第几大节，就可以查询到那个时间的空教室；在此基础上，我们还添加了查询班级课表的功能，可以查询到其他班级的课表。主要运用到的技术就是scrapy加上selenium。

首先这是我们学校的课表

这是对于scrapy的爬虫部件

import scrapy
from selenium import webdriver
from Class.items import ClassItem
import re
import time


class ClassplanSpider(scrapy.Spider):
    """Spider that scrapes one classroom's weekly timetable per response.

    Each response is expected to contain the rendered timetable report for a
    single room. Every timetable cell is emitted as a ``ClassItem`` and the
    same URL is then requested again so the next room can be fetched.
    """

    name = 'ClassPlan'
    # allowed_domains = ['www.xxx.com']
    # Address of the campus-network timetable query page.
    start_urls = ['http://m.dean.nsu.edu.cn/ZNPK/KBFB_RoomSel.aspx']

    def parse(self, response):
        """Parse one timetable page and yield an item per timetable cell.

        Args:
            response: The page whose ``#pageRpt`` element holds the report
                tables (table 2 is the header line, table 3 the grid).

        Yields:
            ``ClassItem`` objects with ``classInfo``, ``classTime``,
            ``classJXL`` and ``classRoom`` set, followed by one
            ``scrapy.Request`` for the same URL.
        """
        # Rows 2..7 of the grid table: one row per class period of the day.
        period_rows = response.xpath(
            '//*[@id="pageRpt"]/table[3]/tbody/tr[position()>1 and position()<8]'
        )
        header_texts = response.xpath(
            '//*[@id="pageRpt"]/table[2]/tbody/tr/td/text()'
        ).extract()

        # Default to empty strings so a page without the header line no longer
        # raises UnboundLocalError when the items are built below.
        JXL = ''
        Room = ''
        if header_texts:
            header = header_texts[0]
            # The building name is sliced out between the "房：" marker and the
            # "教" marker; the room name is everything 3 characters after "教".
            # NOTE(review): the offsets assume a fixed header layout — confirm
            # against the live page.
            index1 = header.find("房：")
            index2 = header.find("教")
            JXL = header[index1 + 2:index2 - 2]
            Room = header[index2 + 3:]

        # Periods and weekdays are both numbered starting from 1.
        for classNumber, row in enumerate(period_rows, start=1):
            day_cells = row.xpath('./td[@ valign="top"]')
            for ClassWeek, cell in enumerate(day_cells, start=1):
                item = ClassItem()
                item['classInfo'] = cell.xpath('./text()').extract()
                item['classTime'] = '星期' + str(ClassWeek) + '第' + str(classNumber) + '节课'
                item['classJXL'] = JXL
                item['classRoom'] = Room
                yield item

        # Request the same page again; dont_filter=True keeps Scrapy's
        # duplicate filter from dropping the repeated URL.
        yield scrapy.Request(url='http://m.dean.nsu.edu.cn/ZNPK/KBFB_RoomSel.aspx', dont_filter=True)

这是定义的item文件

import scrapy


class ClassItem(scrapy.Item):
    """Container for one scraped timetable cell."""

    # Raw text extracted from the timetable cell.
    classInfo = scrapy.Field()
    # Label describing which weekday and period the cell belongs to.
    classTime = scrapy.Field()
    # Teaching-building name taken from the report header.
    classJXL = scrapy.Field()
    # Classroom name taken from the report header.
    classRoom = scrapy.Field()

这是对于中间件的一些重定义

class SeleniumMiddleware(object):
    def __init__(self):
        """Create the captcha client, crawl counters and a headless Firefox session."""
        # Client for the captcha-recognition service.
        # NOTE(review): the three arguments are presumably account, password
        # and software id — confirm against Chaojiying_Client.
        self.chaojiying = Chaojiying_Client('hhhhio', 'sabercon110', '922661')
        # Seconds used for both the page-load timeout and the explicit wait.
        self.timeout = 50
        # Option indices later passed to select_by_index() for the
        # "Sel_JXL" (building) and "Sel_ROOM" (room) drop-downs.
        self.JxlCount = 12
        self.RoomCount = 1

        # Firefox configuration: no visible window, notification and push
        # pop-ups disabled.
        firefox_opts = webdriver.FirefoxOptions()
        firefox_opts.add_argument('--headless')
        for preference in ('dom.webnotifications.enabled', 'dom.push.enabled'):
            firefox_opts.set_preference(preference, False)

        # Launch the browser, then fix its window size and page-load timeout.
        driver = webdriver.Firefox(firefox_options=firefox_opts)
        driver.set_window_size(1400, 700)
        driver.set_page_load_timeout(self.timeout)
        self.browser = driver
        self.wait = WebDriverWait(driver, self.timeout)

    def process_request(self, request, spider):
        # 当请求的页面不是当前页面时
        if self.browser.current_url != request.url:
            # 获取页面
            self.browser.get(request.url)
            time.sleep(5)
        else:
            pass
        self.browser.switch_to_default_content()
        //对下拉选框，进行选择
        s1 = Select(self.browser.find_element_by_name("Sel_XQ"))
        s1.select_by_visible_text("本校区")
        s2 = Select(self.browser.find_element_by_name("Sel_JXL"))
        s2.select_by_index(self.JxlCount)
        s3 = Select(self.browser.find_element_by_name("Sel_ROOM"))
        roomNum = len(s3.options)
        print(roomNum)
        if(roomNum==1):
            s2.select_by_index(self.JxlCount+1)
            s3 = Select(self.browser.find_element_by_name("Sel_ROOM"))
            s3.select_by_index(self.RoomCount)
        else:
            s3.select_by_index(self.RoomCount)

        input=self.browser.find_element_by_id('txt_yzm')
        text=self.browser.find_element_by_id('txt_yzm').get_attribute('value')

        while(text==''):
            image_element = self.browser.find_element_by_id('imgCode')
            image_element.screenshot("image_element.png")
            with open("image_element.png", 'rb') as f:
                content = f.read()
                # 测试识别验证码
                resp = requests.post("http://127.0.0.1:7788", data=content)
                code = resp.json()["code"]
                print(code)
            input.send_keys(code)
            text = self.browser.find_element_by_id('txt_yzm').get_attribute('value')
            time.sleep(5)
        search = self.browser.find_element_by_name("btnSearch")
        time.sleep(5)
        search.click()
        //切换iframe
        self.browser.switch_to_frame('frmRpt')
        if(self.RoomCount 
 
重定义了init方法，在开始的时候运用selenium打开浏览器；在爬取前利用selenium选取校区、教学楼、教室等数据，将验证码图片交给第三方识别服务进行识别；当验证码输入框为空时，填入识别出的验证码，然后模拟点击检索。
 
注意，获取到的数据存在另一个iframe里面，要返回html需要selenium先切换到这个iframe，在返回结果后由爬虫文件中定义的parse2进行解析。 
对于数据存储方面的代码 
class mysqlPileLine(object):
    """Item pipeline that stores every scraped timetable cell in MySQL.

    One row is inserted into the ``class`` table per item, using the item's
    ``classJXL``, ``classRoom``, ``classInfo`` and ``classTime`` fields.
    """

    conn = None
    cursor = None
    # Number of items seen so far; reset in open_spider().
    index = 0

    # Placeholders are filled in by the driver, so quotes inside scraped text
    # can neither break the statement nor inject SQL.
    _INSERT_SQL = 'insert into class(JXL,Room,Info,Time) values(%s,%s,%s,%s)'

    def open_spider(self, spider):
        """Open the database connection when the spider starts."""
        self.conn = pymysql.connect(host='127.0.0.1', port=3306, user='root', password='123456', db='class', charset='utf8')
        self.index = 0

    def process_item(self, item, spider):
        """Insert one item and hand it on to the next pipeline.

        Args:
            item: Mapping with ``classJXL``, ``classRoom``, ``classInfo`` and
                ``classTime`` keys.
            spider: The spider that produced the item (unused).

        Returns:
            The unchanged ``item``, so later pipelines still receive it.
        """
        self.index = self.index + 1
        # str() keeps the stored text identical to what the former "%s"
        # string formatting produced (classInfo is a list of strings).
        params = (
            str(item["classJXL"]),
            str(item["classRoom"]),
            str(item["classInfo"]),
            str(item["classTime"]),
        )
        print(self._INSERT_SQL, params)
        self.cursor = self.conn.cursor()
        try:
            self.cursor.execute(self._INSERT_SQL, params)
            self.conn.commit()
        except Exception as e:
            # Best effort: report the failure, undo the partial write and
            # keep the crawl running.
            print(e)
            self.conn.rollback()
        finally:
            # Release the per-item cursor instead of leaking it.
            self.cursor.close()
            self.cursor = None
        return item

    def close_spider(self, spider=None):
        """Close any open cursor and the connection when the spider finishes.

        Args:
            spider: The spider being closed; Scrapy passes it, and it is
                optional so existing no-argument calls keep working.
        """
        if self.cursor is not None:
            self.cursor.close()
            self.cursor = None
        if self.conn is not None:
            self.conn.close()

 
结果展示 
 
  
  
  
 
 
 
 					
										


					
						欢迎分享，转载请注明来源：内存溢出
原文地址: http://outofmemory.cn/zaji/5679640.html

运用scrapy加上selenium爬取校园网课表

发表评论

评论列表（0条）