- 程序
- 总结
仅仅提供学习使用
程序
仅仅提供学习使用
"""Scrape 51job search results for "java 成都" with headless Chrome and export them to Excel.

The script opens the 51job search page, submits the keyword, reads the JSON
payload embedded in one of the page's <script> tags on every result page,
and writes one row per job posting to an .xlsx file.

For study purposes only.
"""
import json
from time import sleep

import pandas as pd
import requests
from bs4 import BeautifulSoup
from lxml import etree
from selenium.common.exceptions import WebDriverException
from selenium.webdriver import Chrome
from selenium.webdriver import ChromeOptions  # reduces the risk of being detected as automation
from selenium.webdriver.chrome.options import Options  # run Chrome without a visible window
from selenium.webdriver.common.by import By

# NOTE(review): the path separators were missing from both paths in the
# original text (they read 'E:浏览器下载chromedriver' and
# 'C:Users24132Desktopdata_spyder.xlsx'); the backslashes below are a
# reconstruction -- confirm them against the real locations on your machine.
CHROMEDRIVER_PATH = r'E:\浏览器下载\chromedriver'
OUTPUT_PATH = r'C:\Users\24132\Desktop\data_spyder.xlsx'

# The original query string contained '°reefrom', i.e. '&degreefrom' after a
# bad HTML-entity decode; the parameter name is restored here.
url = (
    'https://search.51job.com/list/000000,000000,0000,32,9,99,+,2,1.html'
    '?lang=c&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99'
    '&companysize=99&ord_field=0&dibiaoid=0&line=&welfare='
)

# Position of the <script type="text/javascript"> tag that holds the results.
RESULT_SCRIPT_INDEX = 3
# Number of leading characters skipped before the JSON document starts.
# presumably this is the JavaScript assignment prefix -- verify if the site changes.
RESULT_JSON_OFFSET = 28
# NOTE(review): the attribute test was lost in the original ('//li[@]' is not
# valid XPath); class="next" is an assumption -- confirm against the live page.
NEXT_PAGE_XPATH = '//li[@class="next"]'

COLUMNS = ['招聘岗位', '招聘城市', '招聘年月', '工作经验', '学历要求', '招聘薪资', '招聘数量']


def load_search_result(page_source):
    """Return the search-result JSON embedded in *page_source* as a dict.

    Raises IndexError when the expected <script> tag is missing and
    json.JSONDecodeError when its content is not valid JSON after the offset.
    """
    tree = etree.HTML(page_source)
    script = tree.xpath('//script[@type="text/javascript"]')[RESULT_SCRIPT_INDEX]
    script_text = script.xpath('./text()')[0]
    return json.loads(script_text[RESULT_JSON_OFFSET:])


def job_to_row(job):
    """Convert one entry of 'engine_jds' into a row matching COLUMNS.

    Entries of 'attribute_text' are read by position (1, 2 and last); a
    missing position yields '' instead of aborting the whole run.
    """
    attributes = list(job['attribute_text'])

    def attribute(index):
        if -len(attributes) <= index < len(attributes):
            return attributes[index]
        return ''

    return [
        job['job_name'],
        job['workarea_text'],
        job['updatedate'],
        attribute(1),
        attribute(2),
        job['providesalary_text'],
        attribute(-1),
    ]


chrome_options = Options()
chrome_options.add_argument('--headless')
chrome_options.add_argument('--disable-gpu')
# Avoid detection: this must be set on the options object that is actually
# passed to Chrome, otherwise it has no effect.
chrome_options.add_experimental_option('excludeSwitches', ['enable-automation'])

# NOTE(review): executable_path is the Selenium 3 style the script was written
# for; newer Selenium releases expect a Service object instead.
chrome = Chrome(executable_path=CHROMEDRIVER_PATH, options=chrome_options)

data = []
try:
    chrome.get(url)
    keyword_input = chrome.find_element(By.ID, 'keywordInput')
    keyword_input.send_keys('java 成都')
    chrome.find_element(By.ID, 'search_btn').click()

    total_pages = int(load_search_result(chrome.page_source)['total_page'])

    for page_number in range(1, total_pages + 1):
        jobs = load_search_result(chrome.page_source)['engine_jds']
        data.extend(job_to_row(job) for job in jobs)

        # The last page has been collected; there is nothing left to click.
        if page_number == total_pages:
            break

        sleep(1)
        next_buttons = chrome.find_elements(By.XPATH, NEXT_PAGE_XPATH)
        if not next_buttons:
            break
        try:
            next_buttons[0].click()
            chrome.refresh()
        except WebDriverException:
            # Stop paging but keep what has been collected so far.
            break
finally:
    # Always shut the browser down so no headless Chrome process is left behind.
    chrome.quit()

data = pd.DataFrame(data, columns=COLUMNS)

# Write the Excel file; 'page_1' is the name of the sheet written to.
# The context manager saves and closes the workbook.
with pd.ExcelWriter(OUTPUT_PATH) as writer:
    data.to_excel(writer, sheet_name='page_1', float_format='%.10f')

# Summary: take every step steadily and trust that success is close at hand;
# be willing to give everything for your ideals.
欢迎分享,转载请注明来源:内存溢出
评论列表(0条)