from time import sleep
import random

import pymysql
from lxml import etree
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException, WebDriverException
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager


class Xkpm:
    """Scrape the 2021 subject rankings from shanghairanking.cn into MySQL.

    ``parse_xk`` collects every subject link from the ranking index page,
    then ``parse_xx`` walks each subject's (paginated) ranking table and
    inserts one row per school into the ``xkpm`` table.
    """

    def __init__(self):
        """Start Chrome, open the site's home page and connect to MySQL.

        Original author's note: Firefox is the recommended browser for
        scraping. It does not need a driver update for every browser
        release and needs far less anti-anti-bot configuration; on some
        sites Chrome keeps failing while loading (because of the site, the
        anti-scraping measures, or both) where Firefox works unconfigured.

        Raises:
            pymysql.MySQLError: if the database connection cannot be opened
                (the browser is closed again before the error propagates).
        """
        # Let webdriver_manager fetch a matching driver so a Chrome upgrade
        # does not require a manual driver download.
        _path = ChromeDriverManager(url="https://npm.taobao.org/mirrors/chromedriver").install()

        # Options that make the automated browser less detectable.
        chrome_options = Options()
        # To run without a visible window, also add the "--headless" argument.
        chrome_options.add_experimental_option('excludeSwitches', ['enable-automation'])
        chrome_options.add_argument("--disable-blink-features=AutomationControlled")

        self.driver = webdriver.Chrome(service=Service(_path), options=chrome_options)
        self.driver.maximize_window()
        # Hide the automation flag from page scripts. The property to
        # override is navigator.webdriver; the previous text
        # ('webself.driver') defined an unrelated property and had no effect.
        self.driver.execute_cdp_cmd("Page.addScriptToEvaluateOnNewDocument", {
            "source": """
                Object.defineProperty(navigator, 'webdriver', {
                  get: () => undefined
                })
              """
        })
        self.driver.implicitly_wait(10)
        self.driver.get('https://www.shanghairanking.cn/')

        # One dict per subject, filled in by parse_xk().
        self.xk_list = []

        # Open the database connection.
        # NOTE(review): credentials are hard-coded; consider loading them
        # from the environment or a config file.
        try:
            self.db = pymysql.connect(host='localhost', user='root', password='957312a.', database='mymysql')
        except Exception:
            # Do not leave an orphaned browser behind when the DB is down.
            self.driver.quit()
            raise
        # Cursor used for every INSERT.
        self.cursor = self.db.cursor()

    def random_time(self):
        """Sleep for a random 3-8 seconds to pace the requests."""
        sleep(random.uniform(3, 8))

    @staticmethod
    def _code_and_name(text, sep):
        """Split ``text`` on ``sep`` into (code, name); name is '' if absent."""
        parts = text.split(sep)
        return parts[0], (parts[1] if len(parts) > 1 else '')

    def parse_xk(self):
        """Collect every subject link, scrape each ranking, then clean up.

        The browser and the database connection are always closed at the
        end, even when scraping fails part-way through.
        """
        try:
            self.driver.get('https://www.shanghairanking.cn/rankings/bcsr/2021')
            self.random_time()

            # NOTE(review): the three XPath expressions below contain an
            # empty predicate "[@]". The attribute tests appear to have been
            # lost when this file was extracted, and "[@]" is not valid
            # XPath -- restore the real selectors before running.
            # Walk the top-level categories.
            for left in self.driver.find_elements(By.XPATH, '//*[@]'):
                category_text = left.find_element(By.XPATH, './/*[@]').text
                category_code, category_name = self._code_and_name(category_text, "\n")
                for link in left.find_elements(By.XPATH, './/*[@]'):
                    href = link.get_attribute('href')
                    if not href:
                        # Nothing to navigate to; skip rather than fail later.
                        continue
                    subject_code, subject_name = self._code_and_name(link.text, " ")
                    self.xk_list.append({
                        'xkmudm': category_code,  # subject category code
                        'xkmumc': category_name,  # subject category name
                        'xkmcdm': subject_code,   # subject code
                        'xkmc': subject_name,     # subject name
                        'href': href,
                    })

            for xk in self.xk_list:
                self.driver.get(xk['href'])
                self.parse_xx(xk)
        finally:
            self.end_pachong()

    def parse_xx(self, xk):
        """Scrape every page of the ranking currently open in the browser.

        Args:
            xk: subject dict built by ``parse_xk`` (keys ``xkmudm``,
                ``xkmumc``, ``xkmcdm``, ``xkmc``, ``href``).
        """
        # Iterate instead of recursing once per page: no growing call stack
        # and no pile-up of trailing sleeps on long rankings.
        while True:
            self.random_time()  # give the table time to render
            self._save_rows(xk)
            self.random_time()
            if not self._goto_next_page():
                break
        self.random_time()

    def _save_rows(self, xk):
        """Parse the ranking table of the current page and insert its rows."""
        tree = etree.HTML(self.driver.page_source)
        if tree is None:
            # lxml returns None when there is nothing to parse.
            return

        # Parameterized statement: scraped text may contain quotes, which
        # broke (and could inject into) the previous string-formatted SQL.
        sql = (
            "INSERT INTO xkpm"
            "(xkmudm,xkmumc,xkmcdm,xkmc,xxmp1,xxmp2,qbcc,xxmc,zf,hrefxxmc) "
            "VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)"
        )

        for tr in tree.xpath('//*[@id="content-box"]/div[2]/table/tbody/tr'):
            # NOTE(review): './/*[@]' and './/a[@]' have lost their attribute
            # predicates (see parse_xk); restore them before running.
            xxmp1 = self.knrcl(tr.xpath('.//*[@]/text()')).strip()    # rank 1
            xxmp2 = self.knrcl(tr.xpath('./td/span/text()')).strip()  # rank 2
            qbcc = self.knrcl(tr.xpath('./td[3]/text()')).strip()     # tier ("all levels" column)
            xxmc = self.knrcl(tr.xpath('.//a[@]/text()')).strip()     # school name
            zf = self.knrcl(tr.xpath('./td[5]/text()')).strip()       # total score
            print(xk['xkmudm'], xk['xkmumc'], xk['xkmcdm'], xk['xkmc'],
                  xxmp1, xxmp2, qbcc, xxmc, zf, xk['href'])
            params = (
                xk['xkmudm'], xk['xkmumc'], xk['xkmcdm'], xk['xkmc'],
                xxmp1, xxmp2, qbcc, xxmc, zf, xk['href'] + ':' + xxmc,
            )
            try:
                self.cursor.execute(sql, params)
                self.db.commit()
            except Exception as e:
                # Best effort: report the failed row and carry on.
                print(e)
                self.db.rollback()
            sleep(.1)

    def _goto_next_page(self):
        """Click the "next page" button; return True if a click was made."""
        try:
            next_button = self.driver.find_element(By.XPATH, '//li[@title="下一页"]')
            # Presumably tabindex "0" marks the button as enabled, i.e. this
            # is not the last page -- TODO confirm against the site markup.
            if next_button.get_attribute('tabindex') != '0':
                return False
            next_button.click()
        except NoSuchElementException:
            # No pager on this page: nothing more to fetch.
            return False
        except WebDriverException as e:
            # Stale/intercepted click etc.: stop paging but say why.
            print(e)
            return False
        return True

    def knrcl(self, cl):
        """Return the first item of ``cl``, or '' when there is none."""
        try:
            return cl[0]
        except (IndexError, TypeError, KeyError):
            return ''

    def end_pachong(self):
        """Close the browser and the database connection."""
        try:
            self.driver.quit()
        finally:
            # Close the DB even if the browser refuses to quit cleanly.
            self.db.close()


if __name__ == "__main__":
    Xkpm().parse_xk()
# Feel free to share; when reposting, please credit the source: 内存溢出
# Comments (0)