For a particular task I needed to scrape some data, so I reworked my earlier script. Sharing it here for discussion.
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
import time
import random
import pandas as pd

# Configure the Chrome driver: skip image loading to speed things up
options = webdriver.ChromeOptions()
options.add_experimental_option("prefs", {"profile.managed_default_content_settings.images": 2})

# Create the Chrome driver with the options above
browser = webdriver.Chrome(options=options)


def start_spider(aurname):
    """Scrape paper metadata for one author from CNKI (知网)."""
    # Request the search page
    url = 'https://kns.cnki.net/kns/brief/result.aspx?dbprefix=SCDB'
    browser.get(url)

    # Explicit wait until the result iframe is present
    WebDriverWait(browser, 1000).until(
        EC.presence_of_all_elements_located((By.ID, 'iframeResult'))
    )

    # Type the author name and submit the search
    browser.find_element(By.ID, 'txt_1_value1').send_keys(aurname)
    browser.find_element(By.ID, 'btnSearch').click()

    # Give the result iframe time to load, then switch into it
    time.sleep(random.randint(2, 4))
    print("start!")
    browser.switch_to.default_content()
    browser.switch_to.frame('iframeResult')

    # Show 50 records per page and read the total page count from "x/y"
    browser.find_element(By.LINK_TEXT, '50').click()
    pagecount = browser.find_elements(By.CSS_SELECTOR, 'span.countPageMark')
    ptc = int(str(pagecount[0].text).split('/')[1])
    print("共计有:", ptc)

    # Switch to the abstract view so each entry carries its abstract text
    browser.find_element(By.LINK_TEXT, '切换到摘要').click()

    df = []
    page = 1
    while page <= ptc:
        # If a CAPTCHA box appears, pause for manual input and retry the page
        yzm = browser.find_elements(By.ID, 'CheckCode')
        if len(yzm) != 0:
            yzcode = input("请输入验证码:")
            browser.find_element(By.ID, 'CheckCode').send_keys(yzcode)
            continue

        time.sleep(random.randint(3, 5))
        browser.execute_script('window.scrollTo(0,document.body.scrollHeight)')

        # NOTE: the attribute selectors inside [@] were lost when this post was
        # copied; fill them in from the live page (e.g. the class of the title
        # links) before running.
        paper_name = browser.find_elements(By.XPATH, '//h3[@]/a')
        source = browser.find_elements(By.XPATH, '//span[@]')
        datetime = browser.find_elements(By.XPATH, '//div[@]/label')
        database = browser.find_elements(By.CSS_SELECTOR, 'span.database')
        abstext = browser.find_elements(By.CSS_SELECTOR, 'p.abstract_c')

        # Collect one row per paper on this page
        for k in range(len(paper_name)):
            df.append([paper_name[k].text, source[k].text, datetime[k].text,
                       database[k].text, abstext[k].text])

        print('第{}页爬取完毕'.format(page))

        # Scroll the "next page" link into view and click it, except on the last page
        if page < ptc:
            target = browser.find_element(By.LINK_TEXT, '下一页')
            browser.execute_script("arguments[0].scrollIntoView();", target)
            target.click()
            time.sleep(random.randint(4, 7))
        page += 1

    # Append the scraped rows to the CSV created in main()
    inf = pd.DataFrame(df)
    inf.to_csv('paper_info.csv', mode='a', header=False, index=False,
               encoding='utf-8-sig')
    print("保存完毕!")


def main():
    # Write the CSV header once; start_spider() appends rows below it
    inf = pd.DataFrame(columns=['论文名', '来源', '发表日期', '数据库', '摘要'])
    inf.to_csv('paper_info.csv', index=False, encoding='utf-8-sig')
    aurname = '冯雪峰'
    start_spider(aurname)
    browser.quit()


if __name__ == '__main__':
    main()
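The script above scrapes a single hard-coded author. The original imports included xlrd, which suggests the author list was meant to come from a spreadsheet. Below is a minimal batch-driver sketch along those lines; the file name authors.xlsx and the column name 姓名 are my assumptions, not part of the original.

# A minimal batch-driver sketch, assuming an Excel file 'authors.xlsx'
# with author names in a column named '姓名' (both names are hypothetical).
import pandas as pd

def batch_main():
    # Write the CSV header once, exactly as main() does
    header = pd.DataFrame(columns=['论文名', '来源', '发表日期', '数据库', '摘要'])
    header.to_csv('paper_info.csv', index=False, encoding='utf-8-sig')

    # Read the author list (pandas uses xlrd/openpyxl as its Excel engine)
    names = pd.read_excel('authors.xlsx')['姓名'].dropna().astype(str)
    for aurname in names:
        start_spider(aurname)  # each call appends its rows to paper_info.csv
    browser.quit()

Writing the header once and appending per author keeps the output in a single CSV, so a run interrupted by a CAPTCHA or a crash can be resumed from the next name without losing earlier results.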