Scraping the national company ranking of a job-listing site (jobui.com): traverse the first-level ranking page, follow each link into the second-level company detail page, and grab its contents. I keep getting 503 responses; the code itself should be fine. Kept here purely as a note to myself.
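Since the site keeps throwing 503s at me, wrapping every GET in a retry with exponential backoff would probably help more than the fixed time.sleep calls in the script below. A minimal sketch, assuming the 503s are just rate limiting; the function name and the retry/backoff numbers are my own choices, not part of the original script:

import random
import time

import requests

def get_with_retry(url, headers, retries=4, base_delay=15):
    """GET a URL, backing off exponentially whenever the server answers 503."""
    for attempt in range(retries):
        resp = requests.get(url, headers=headers, timeout=10)
        if resp.status_code != 503:
            resp.raise_for_status()  # surface other HTTP errors instead of parsing garbage
            return resp
        # Wait longer after each 503: 15s, 30s, 60s, ... plus a little jitter
        time.sleep(base_delay * (2 ** attempt) + random.uniform(0, 5))
    raise RuntimeError(f'Still got 503 after {retries} attempts: {url}')

Every requests.get in the script could be swapped for this. The full script follows.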
import random
import time
from urllib import parse

import pandas as pd
import requests
from bs4 import BeautifulSoup
from sqlalchemy import create_engine
import MySQLdb  # MySQL driver used by SQLAlchemy's mysql:// dialect


class Company_craw:

    def get_ua(self):
        # Pick a random user-agent from a pool so the requests look less uniform
        ua_list = [
            # ... many UA strings here ...
        ]
        return random.choice(ua_list)

    # Export the list of companies to scrape
    def com_to_excel(self):
        company_Name_list = []
        url1 = 'https://www.jobui.com/rank/company/view/quanguo/'
        resp = requests.get(url1, headers={'user-agent': self.get_ua()})
        context = resp.text
        soup = BeautifulSoup(context, 'html.parser')
        time.sleep(15)  # crude pacing so the next request doesn't follow too quickly
        company_PHB = soup.find_all('div', class_='c-company-list')
        for item in company_PHB:
            try:
                company_Name = item.find('div', class_='company-segmetation').find('h3').text
            except AttributeError:
                company_Name = '该公司不想被你看到'  # placeholder when the name node is missing
            company_Name_list.append(company_Name)  # originally only appended inside the try, silently dropping failures
        company_Name_dict = {'company_Name': company_Name_list}
        print(company_Name_dict)
        df2 = pd.DataFrame(company_Name_dict)  # was pd.Dataframe, which raises AttributeError
        df2.to_excel('企业信息.xlsx', sheet_name='公司名称')

    # Read the company names back from the Excel file
    def get_company(self):
        file_path = '企业信息.xlsx'
        df = pd.read_excel(io=file_path, sheet_name='公司名称', usecols=[1])
        return list(df['company_Name'])

    # Build a search URL for each company
    def city_url(self):
        companies = self.get_company()
        city_list = []
        for item in companies:
            keyword = parse.quote(item)
            url = f'https://www.jobui.com/cmp?area=%E5%85%A8%E5%9B%BD&keyword={keyword}'
            city_list.append(url)
        return city_list

    # Scrape the data
    def company_info_craw(self):
        city_list = self.city_url()
        company_name_list = []
        company_viewNum_list = []
        company_fol_list = []
        company_info_list = []
        company_nature_list = []
        company_scale_list = []
        company_create_time_list = []
        company_industry_list = []
        company_info2_list = []
        for item in city_list:
            resp = requests.get(item, headers={'user-agent': self.get_ua()})
            context = resp.text
            soup = BeautifulSoup(context, 'html.parser')
            time.sleep(15)
            company_find = soup.find('div', class_='company-content')
            # Company name
            company_name = company_find.find('div', class_='company-segmetation').find('h3').text
            # View count / follower count, e.g. "1234/56"
            company_view = (company_find.find('span', class_='company-desc')
                            .text.replace('\n', '').strip().replace('\t', '').split('/'))
            if len(company_view) == 2:
                company_viewNum = company_view[0]  # view count
                company_fol = company_view[1]      # follower count
            else:
                company_viewNum = company_view[0]
                company_fol = 0
            try:
                # Short company profile
                company_info = company_find.find('div', class_='company-short-content company-segmetation').text
            except AttributeError:
                company_info = '公司无简介'  # no profile available
            # Link to the company's detail page
            company_url = company_find.find('div', class_='company-segmetation').find('a').get('href')
            print(company_name)
            print(company_viewNum)
            print(company_fol)
            print(company_info)
            print(company_url)

            # Second-level page: the company detail page
            company_url = 'https://www.jobui.com' + company_url
            resp = requests.get(company_url, headers={'user-agent': self.get_ua()})
            context = resp.text
            soup = BeautifulSoup(context, 'html.parser')
            time.sleep(10)
            try:
                # Detail block
                company_det_info = soup.find('div', class_='cfix fs16')
                # Ownership type
                company_nature = company_det_info.find('div', class_='company-nature').get('title').split(':')[1]
                # Company size
                company_scale = company_det_info.find('div', class_='company-worker').get('title').split(':')[1]
                # Founding date
                company_create_time = company_det_info.find('span', class_='fs18 fwb').text
                # Industry
                company_industry = company_det_info.find('span', class_='comInd').text
                # Full company profile
                company_info2 = company_det_info.find('p', class_='mb10 cmp-txtshow').text
            except AttributeError:
                company_nature = '无'
                company_scale = '无'
                company_create_time = '无'
                company_industry = '无'
                company_info2 = '无'
            print(company_nature)
            print(company_scale)
            print(company_create_time)
            print(company_industry)
            print(company_info2)
            company_name_list.append(company_name)
            company_viewNum_list.append(company_viewNum)
            company_fol_list.append(company_fol)
            company_info_list.append(company_info)
            company_nature_list.append(company_nature)
            company_scale_list.append(company_scale)
            company_create_time_list.append(company_create_time)
            company_industry_list.append(company_industry)
            company_info2_list.append(company_info2)
        company_list = [company_name_list, company_viewNum_list, company_fol_list,
                        company_info_list, company_nature_list, company_scale_list,
                        company_create_time_list, company_industry_list, company_info2_list]
        return company_list

    # Store the data
    def data_to_excel(self):
        company_list = self.company_info_craw()
        company_dict = {
            'company_name': company_list[0],
            'company_viewNum': company_list[1],
            'company_fol': company_list[2],
            'company_info': company_list[3],
            'company_nature': company_list[4],
            'company_scale': company_list[5],
            'company_create_time': company_list[6],  # key originally had a stray leading space
            'company_industry': company_list[7],
            'company_info2': company_list[8],
        }
        df = pd.DataFrame(company_dict)  # was pd.Dataframe
        df.to_excel('company.xlsx', sheet_name='公司信息')
        return df


class Mysql:
    def dataToMysql(self, df):
        # Fill in your own password, host, and database name
        engine = create_engine('mysql://root:<password>@<host>/<database>?charset=utf8')
        df.to_sql('公司', con=engine, if_exists='append', index=False)


if __name__ == '__main__':
    company_craw = Company_craw()
    df1 = company_craw.data_to_excel()  # originally called twice, which scraped everything twice
    mysql = Mysql()
    mysql.dataToMysql(df1)
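One thing to note for my future self: __main__ never calls com_to_excel, so 企业信息.xlsx has to exist already before data_to_excel can run. On a fresh machine the first run would look roughly like this (my own sketch, not part of the original script):

if __name__ == '__main__':
    company_craw = Company_craw()
    company_craw.com_to_excel()         # first run only: builds 企业信息.xlsx
    df1 = company_craw.data_to_excel()  # scrape the detail pages and write company.xlsx
    Mysql().dataToMysql(df1)            # then append everything to MySQL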