Selenium 的爬虫案例

selenium的爬虫案例,第1张

import random
from time import sleep

import pymysql
from lxml import etree
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException, WebDriverException
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager


class Xkpm:
    """Scrape subject rankings from shanghairanking.cn and store them in MySQL.

    Creating an instance launches a Chrome browser and opens a MySQL
    connection; ``parse_xk`` drives the whole crawl and releases both
    resources when it finishes (successfully or not).

    Author's note carried over from the original docstring: Firefox is
    suggested as an alternative for crawling, since it reportedly needs no
    driver update per browser version and less anti-detection configuration
    than Chrome.

    NOTE(review): several XPath expressions below contain an empty attribute
    predicate (``[@]``), which is not valid XPath. The attribute tests appear
    to have been lost before this file was saved and must be restored before
    the crawler can run.
    """

    # Parameterized INSERT: values are passed separately to ``execute`` so
    # that quotes in scraped text cannot break (or inject into) the statement.
    _INSERT_SQL = (
        "INSERT INTO xkpm(xkmudm,xkmumc,xkmcdm,xkmc,xxmp1,xxmp2,qbcc,xxmc,zf,hrefxxmc) "
        "VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)"
    )

    def __init__(self):
        """Start Chrome with anti-detection options and connect to MySQL.

        Raises:
            Exception: whatever ``pymysql.connect`` raises; the browser is
                shut down first so it is not left running.
        """
        # Let webdriver_manager fetch a matching driver so a Chrome upgrade
        # does not require a manual driver download.
        _path = ChromeDriverManager(url="https://npm.taobao.org/mirrors/chromedriver").install()
        chrome_options = Options()
        # Anti-detection settings.
        # chrome_options.add_argument("--headless")
        chrome_options.add_experimental_option('excludeSwitches', ['enable-automation'])
        chrome_options.add_argument("--disable-blink-features=AutomationControlled")
        self.driver = webdriver.Chrome(service=Service(_path), options=chrome_options)
        self.driver.maximize_window()
        # Hide navigator.webdriver on every new document. The property name
        # must be 'webdriver'; the previous 'webself.driver' overrode a
        # property no page ever reads.
        self.driver.execute_cdp_cmd("Page.addScriptToEvaluateOnNewDocument", {
            "source": """
                        Object.defineProperty(navigator, 'webdriver', {
                          get: () => undefined
                        })
                      """
        })
        self.driver.implicitly_wait(10)
        self.driver.get('https://www.shanghairanking.cn/')
        self.xk_list = []
        # Open the database connection. If that fails, quit the browser that
        # was already launched instead of leaking it.
        # NOTE(review): credentials are hard-coded here; consider moving them
        # to configuration.
        try:
            self.db = pymysql.connect(host='localhost', user='root', password='957312a.', database='mymysql')
        except Exception:
            self.driver.quit()
            raise

        # Cursor used for all INSERT statements.
        self.cursor = self.db.cursor()

    # Random wait
    def random_time(self):
        """Sleep for a random duration between 3 and 8 seconds."""
        sleep(random.uniform(3, 8))

    # Iterate over subjects
    def parse_xk(self):
        """Collect every subject link, scrape each subject, then clean up.

        Subject metadata and links are gathered into ``self.xk_list`` before
        any navigation happens, then each link is visited and handed to
        ``parse_xx``. The browser and database are always closed at the end,
        even if scraping raises.
        """
        try:
            self.driver.get('https://www.shanghairanking.cn/rankings/bcsr/2021')
            self.random_time()
            # Walk the first-level categories.
            # NOTE(review): the three XPath predicates below are empty
            # (``[@]``) and need their attribute tests restored.
            for left in self.driver.find_elements(By.XPATH, '//*[@]'):
                category = left.find_element(By.XPATH, './/*[@]').text.split("\n")
                for link in left.find_elements(By.XPATH, './/*[@]'):
                    subj_link = link.text.split(" ")
                    self.xk_list.append({
                        'xkmudm': category[0],   # subject category code
                        'xkmumc': category[1],   # subject category name
                        'xkmcdm': subj_link[0],  # subject code
                        'xkmc': subj_link[1],    # subject name
                        'href': link.get_attribute('href'),
                    })
            for xk in self.xk_list:
                self.driver.get(xk['href'])
                self.parse_xx(xk)
        finally:
            self.end_pachong()

    # Ranked schools
    def parse_xx(self, xk):
        """Scrape every result page of one subject and store the rows.

        Args:
            xk: subject dict built by ``parse_xk`` with the keys ``xkmudm``,
                ``xkmumc``, ``xkmcdm``, ``xkmc`` and ``href``.
        """
        # Iterative pagination: one loop pass per result page.
        while True:
            self.random_time()
            self._save_page_rows(xk)
            self.random_time()
            if not self._goto_next_page():
                break
            self.random_time()
        self.random_time()

    def _save_page_rows(self, xk):
        """Parse the ranking table on the current page and insert each row."""
        tree = etree.HTML(self.driver.page_source)
        tr_list = tree.xpath('//*[@id="content-box"]/div[2]/table/tbody/tr')
        for tr in tr_list:
            # NOTE(review): the predicates in the xxmp1 and xxmc expressions
            # are empty (``[@]``) and need their attribute tests restored.
            xxmp1 = self.knrcl(tr.xpath('.//*[@]/text()')).strip()  # rank 1
            xxmp2 = self.knrcl(tr.xpath('./td/span/text()')).strip()  # rank 2
            qbcc = self.knrcl(tr.xpath('./td[3]/text()')).strip()  # overall tier
            xxmc = self.knrcl(tr.xpath('.//a[@]/text()')).strip()  # school name
            zf = self.knrcl(tr.xpath('./td[5]/text()')).strip()  # total score
            print(xk['xkmudm'], xk['xkmumc'], xk['xkmcdm'], xk['xkmc'], xxmp1, xxmp2, qbcc, xxmc, zf, xk['href'])
            params = (
                xk['xkmudm'], xk['xkmumc'], xk['xkmcdm'], xk['xkmc'],
                xxmp1, xxmp2, qbcc, xxmc, zf, xk['href'] + ':' + xxmc,
            )
            # Best effort per row: a failed insert is reported and rolled
            # back, and scraping continues with the next row.
            try:
                self.cursor.execute(self._INSERT_SQL, params)
                self.db.commit()
            except Exception as e:
                print(e)
                self.db.rollback()
            sleep(.1)

    def _goto_next_page(self):
        """Click the "next page" button; return True if a click was made."""
        try:
            pa_xy = self.driver.find_element(By.XPATH, '//li[@title="下一页"]')
            # Only a button with tabindex "0" is clicked; presumably the site
            # uses another value when the button is disabled on the last page.
            if pa_xy.get_attribute('tabindex') != '0':
                return False
            pa_xy.click()
        except NoSuchElementException:
            # No pager on this page: nothing more to scrape.
            return False
        except WebDriverException as e:
            # Any other browser-side failure ends pagination for this subject
            # rather than aborting the whole crawl.
            print(e)
            return False
        return True

    # Empty-content handling
    def knrcl(self, cl):
        """Return the first item of ``cl``, or ``''`` if it has none.

        Args:
            cl: usually the list returned by an lxml ``xpath`` call; ``None``
                and other non-indexable values also yield ``''``.
        """
        try:
            return cl[0]
        except (IndexError, KeyError, TypeError):
            return ''

    # Close the browser and the database
    def end_pachong(self):
        """Quit the browser and close the database connection."""
        try:
            self.driver.quit()
        finally:
            # Close the connection even if shutting down the browser fails.
            self.db.close()

if __name__ == "__main__":
    # Run the crawl only when executed as a script, so importing this module
    # does not launch a browser or open a database connection.
    Xkpm().parse_xk()

 

 

 

欢迎分享,转载请注明来源:内存溢出

原文地址: http://outofmemory.cn/langs/735319.html

(0)
打赏 微信扫一扫 微信扫一扫 支付宝扫一扫 支付宝扫一扫
上一篇 2022-04-27
下一篇 2022-04-27

发表评论

登录后才能评论

评论列表(0条)

保存