- Introduction
- Results
- Scraped pages
- Parameters
- Environment
- Project analysis
- Simple scraping
- Multithreaded scraping
- Data storage
- csv
- json
- excel
- mysql
- Visualizing the data
- Running
- Notes
Introduction

This project is my final assignment for the course and a showcase of three months of learning Python: it scrapes job listings and compares salary levels across cities.
Results

Visualization:

Excel output:

JSON output:

MySQL:
The source code is on Gitee:
https://gitee.com/ren-wenqing/PythonLearning.git
Scraped pages

https://search.51job.com/
Parameters

Example URL: https://search.51job.com/list/090000,000000,0000,00,9,99,java,2,79.html

- 090000: city code
- java: search keyword
- 79: page number

You can work out the values for any search yourself: enter a query in the search box at https://search.51job.com/ and watch how the address in the browser changes. A sketch of how such a URL can be assembled follows below.
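Here is a minimal sketch of assembling such a URL; `build_url` is my own hypothetical helper, and the fixed middle fields are copied verbatim from the example above without knowing their exact meaning:

```python
def build_url(city_code, keyword, page):
    # Hypothetical helper: the fields "000000,0000,00,9,99" and ",2," are
    # copied from the example URL; their meaning is not documented here.
    return ('https://search.51job.com/list/'
            '{},000000,0000,00,9,99,{},2,{}.html').format(city_code, keyword, page)

print(build_url('090000', 'java', 1))
# https://search.51job.com/list/090000,000000,0000,00,9,99,java,2,1.html
```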
Environment

- Python: 3.9.6
- MySQL: 8.0.18

Required libraries:
```python
import collections
import csv
import json
import os
import queue
import re
import threading

import matplotlib.pyplot as plt
import pymysql
import requests
import xlsxwriter
from faker import Faker
from openpyxl import load_workbook

from utils import get_header  # project-local helper that builds request headers
```

Project analysis

Simple scraping
Sending a plain GET request to the page is enough to fetch its source:

```python
req = requests.get(url=url, headers=self.header)
```
The result:

The loading box on the page stays stuck in its loading state and never shows the data we need, even though the same URL displays fine in a browser:

This is because requests cannot execute JavaScript; the job list is actually embedded in the raw page source instead.

Looking at the page source, there is one extremely long line of JS.

Pasted into a formatting tool:

It is exactly the data we need.

The quickest way to pull it out is with regular expressions:
```python
def Spider(self):
    while not self.pagequeue.empty():
        url = self.pagequeue.get()
        print('正在爬取:{}'.format(url))  # progress: "scraping: <url>"
        req = requests.get(url, headers=get_header())
        req.encoding = 'gbk'
        response = req.content.decode('gbk')
        # each result page carries up to a dozen listings
        for i in range(1, 12):
            try:
                title = re.findall(r'"job_title":"(.*?)"', response)
                if not title:  # nothing matched: not a result page
                    break
                company_name = re.findall(r'"company_name":"(.*?)"', response)
                job_name = re.findall(r'"job_title":"(.*?)"', response)
                salary = re.findall(r'"providesalary_text":"(.*?)"', response)
                work_area = re.findall(r'"workarea_text":"(.*?)"', response)
                time = re.findall(r'"issuedate":"(.*?)"', response)
                company_type = re.findall(r'"companytype_text":"(.*?)"', response)
                company_hangye = re.findall(r'"companyind_text":"(.*?)"', response)
                detail = re.findall(r'"job_href":"(.*?)"', response)
                if len(salary[i]) != 0:
                    # strip the JSON-escaped slashes and normalize the salary
                    salary = salary[i].replace('\\/', '/')
                    salary = formatyue(salary)
                    salary = formatqian(salary)
                    salary = fomatpjun(salary)
                    detail = detail[i].replace('\\/', '/')
                    company_hangye = company_hangye[i].replace('\\/', '/')
                    data = {
                        "职位名称": job_name[i],
                        "薪资(万/月)": salary,
                        "公司名称": company_name[i],
                        "工作地点": work_area[i],
                        "发布时间": time[i],
                        "公司链接": detail,
                        "公司类型": company_type[i],
                        "公司行业": company_hangye,
                    }
                    self.jobqueue.put(data)
            except:
                continue
```
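As an aside: since the listings sit in one big JS object, an alternative to nine separate findall passes would be to cut the object out once and parse it with json.loads. This is only a sketch; it assumes the page assigns the listings to a variable named `window.__SEARCH_RESULT__` holding valid JSON with the jobs under an `engine_search_result` key, which matches what 51job served around the time of writing but may have changed:

```python
import json
import re

def parse_listings(html):
    # Assumption: the page contains `window.__SEARCH_RESULT__ = {...}`
    # and the object is valid JSON.
    m = re.search(r'window\.__SEARCH_RESULT__\s*=\s*(\{.*?\})\s*</script>', html, re.S)
    if m is None:
        return []
    data = json.loads(m.group(1))
    # Assumption: the job list lives under this key.
    return data.get('engine_search_result', [])
```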
This is the data as it showed up while debugging; it is full of escaped \/ sequences that have to be stripped out.

I defined three functions to normalize the salary into a single format, because some salaries are quoted in 千/月 (thousand CNY per month), some in 万/月 (ten thousand CNY per month), and some in 万/年 (ten thousand CNY per year). A usage example follows the definitions.
```python
def formatyue(str):
    # "X-Y万/年" (per year) -> "X-Y万/月" (per month)
    sa = re.findall(r'(.*?)万/年', str)
    if len(sa) == 0:
        return str
    else:
        sa1 = re.findall(r'(.*?)-(.*)', sa[0])
        saf = float(sa1[0][0]) / 12
        sal = float(sa1[0][1]) / 12
        str = "{}-{}万/月".format(round(saf, 1), round(sal, 1))
        return str


def formatqian(str):
    # "X-Y千/月" (thousand per month) -> "X-Y万/月"
    sa = re.findall(r'(.*?)千/月', str)
    if len(sa) == 0:
        return str
    else:
        sa1 = re.findall(r'(.*?)-(.*)', sa[0])
        saf = float(sa1[0][0]) / 10
        sal = float(sa1[0][1]) / 10
        str = "{}-{}万/月".format(round(saf, 1), round(sal, 1))
        return str


def fomatpjun(str):
    # collapse a "X-Y万/月" range to its midpoint
    sa = re.findall(r'(.*?)万/月', str)
    sa1 = re.findall(r'(.*?)-(.*)', sa[0])
    saf = float(sa1[0][0]) + float(sa1[0][1])
    str = "{}".format(round(saf / 2, 1))
    return str
```
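For example, chaining the three helpers collapses any of the quoted formats into a single monthly midpoint (in 万/月):

```python
s = formatyue("12-18万/年")   # -> "1.0-1.5万/月" (yearly converted to monthly)
s = formatqian(s)             # no "千/月" present, returned unchanged
s = fomatpjun(s)              # -> "1.2" (midpoint of 1.0 and 1.5)
```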
Multithreaded scraping

To speed up the crawl, several spider threads run in parallel:
```python
def run(self):
    thread_list = []
    for i in range(self.thread):
        t = threading.Thread(target=self.Spider)
        thread_list.append(t)
    for t in thread_list:
        t.daemon = True  # daemon threads die with the main thread (setDaemon() is the older spelling)
        t.start()
    for t in thread_list:
        t.join()
```
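The snippets never show how pagequeue gets filled; presumably the constructor does something along these lines before run() starts the threads, using the city-code lookup described just below (a sketch, with self.maxpage as a hypothetical field):

```python
# Sketch (assumption): enqueue one search-result URL per page;
# Spider() then drains this queue from several threads.
self.pagequeue = queue.Queue()
city_code = self._get_city_code()
for page in range(1, self.maxpage + 1):  # self.maxpage is hypothetical
    self.pagequeue.put(
        'https://search.51job.com/list/{},000000,0000,00,9,99,{},2,{}.html'
        .format(city_code, self.keyword, page))
```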
The city field is entered as text, but the URL needs the city's numeric code, so we look it up in the mapping file that 51job serves as JS.

Link: https://js.51jobcdn.com/in/js/2016/layer/area_array_c.js
```python
def _get_city_code(self):
    url = 'https://js.51jobcdn.com/in/js/2016/layer/area_array_c.js'
    req = requests.get(url, headers=self.header).text
    a = req.find(self.city)
    # the file stores "090200":"西安"-style pairs, so the six-digit code
    # starts nine characters before the city name
    return req[a - 9:a - 3]
```
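The slice [a - 9:a - 3] works because of that "code":"name" layout, but it silently returns garbage if the city is not found. A slightly more defensive variant, under the same layout assumption:

```python
import re
import requests

def get_city_code(city, header):
    # Sketch: match the six-digit code directly in front of the city name,
    # assuming the JS file stores pairs like "090200":"西安".
    url = 'https://js.51jobcdn.com/in/js/2016/layer/area_array_c.js'
    text = requests.get(url, headers=header).text
    m = re.search(r'"(\d{6})":"' + re.escape(city) + '"', text)
    return m.group(1) if m else None
```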
Data storage

csv

After checking the output path, the records the Spider method collected are drained from the queue and written to a CSV file:
```python
if os.path.exists(self.path):
    data_list = []
    self.path = os.path.join(self.path, 'save_list')
    while not self.jobqueue.empty():
        data_list.append(self.jobqueue.get())
    with open(os.path.join(self.path,
                           'info——job——{}——zone{}.csv'.format(self.keyword, self.city)),
              'w', newline='', encoding='utf-8') as f:
        f_csv = csv.DictWriter(f, self.csv_header)
        f_csv.writeheader()
        f_csv.writerows(data_list)
```

json
The JSON file comes from reading the database back, sorted by salary:
```python
import json
import pymysql

def check():
    con = pymysql.connect(host='localhost', port=3306, user='root',
                          password='0', database='qcwy', charset='utf8')
    cursor = con.cursor()
    sql = "SELECT * FROM `jobinfo` order by salary desc;"
    cursor.execute(sql)
    con.commit()
    results = cursor.fetchall()
    # the columns after `id`, in table order (see the CREATE TABLE below)
    col_names = ['job_name', 'salary', 'company_name', 'work_area',
                 'time', 'detail', 'company_type', 'company_hangye']
    strs = {}
    for row in results:
        strs[row[0]] = dict(zip(col_names, row[1:]))  # key each record by id
    cursor.close()
    con.close()
    result = json.dumps(strs)
    filename = 'gongsi.json'
    with open(filename, 'w') as file_obj:
        file_obj.write(result)
```
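One thing to watch: json.dumps escapes all the Chinese text as \uXXXX sequences by default. If you want gongsi.json to stay human-readable, a small tweak to the last few lines does it:

```python
# keep Chinese characters readable instead of \uXXXX escapes
result = json.dumps(strs, ensure_ascii=False)
with open('gongsi.json', 'w', encoding='utf-8') as file_obj:
    file_obj.write(result)
```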
excel

```python
import xlsxwriter

workbook = xlsxwriter.Workbook(r'C:\Users\SixStart\PycharmProjects\pythonProject1\期末python.xlsx')
worksheet = workbook.add_worksheet()
worksheet.write("A1", "职位名称")
worksheet.write("B1", "薪资(万/月)")
worksheet.write("C1", "公司名称")
worksheet.write("D1", "工作地点")
worksheet.write("E1", "发布时间")
worksheet.write("F1", "公司链接")
worksheet.write("G1", "公司类型")
worksheet.write("H1", "公司行业")
worksheet.write("L1", "工资")
worksheet.write("M1", "次数")
salarylist = []
for i in range(1, len(data_list)):
    job_name = data_list[i]['职位名称']
    salary = data_list[i]['薪资(万/月)']
    company_name = data_list[i]['公司名称']
    work_area = data_list[i]['工作地点']
    time = data_list[i]['发布时间']
    detail = data_list[i]['公司链接']
    company_type = data_list[i]['公司类型']
    company_hangye = data_list[i]['公司行业']
    salarylist.append(salary)
    try:
        worksheet.write("A{}".format(i + 1), job_name)
        worksheet.write("B{}".format(i + 1), salary)
        worksheet.write("C{}".format(i + 1), company_name)
        worksheet.write("D{}".format(i + 1), work_area)
        worksheet.write("E{}".format(i + 1), time)
        worksheet.write("F{}".format(i + 1), detail)
        worksheet.write("G{}".format(i + 1), company_type)
        worksheet.write("H{}".format(i + 1), company_hangye)
    except:
        pass
workbook.close()
```
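Columns L (工资, salary) and M (次数, count) get headers above, but the code that fills them isn't shown. Since salarylist is collected and collections is already imported, something like this sketch, placed before workbook.close(), would produce the sorted salary/count pairs the chart later reads; the sorting key assumes the salaries are plain numeric strings like "1.2":

```python
import collections

# Sketch (assumption): count postings per salary, sort salaries low to high,
# and write the pairs into columns L and M (must run before workbook.close()).
counter = collections.Counter(salarylist)
pairs = sorted(counter.items(), key=lambda kv: float(kv[0]))
for row, (money, count) in enumerate(pairs, start=2):
    worksheet.write("L{}".format(row), money)
    worksheet.write("M{}".format(row), count)
```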
mysql

```sql
CREATE TABLE `jobinfo` (
  `id` int(11) NOT NULL AUTO_INCREMENT COMMENT '信息编号',
  `job_name` varchar(50) CHARACTER SET utf8 COLLATE utf8_general_ci NOT NULL COMMENT '职位名称',
  `salary` varchar(50) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT '面议' COMMENT '薪资',
  `company_name` varchar(50) CHARACTER SET utf8 COLLATE utf8_general_ci NOT NULL COMMENT '公司名称',
  `work_area` varchar(50) CHARACTER SET utf8 COLLATE utf8_general_ci NOT NULL COMMENT '工作地点',
  `time` varchar(50) CHARACTER SET utf8 COLLATE utf8_general_ci NOT NULL COMMENT '发布时间',
  `detail` varchar(255) CHARACTER SET utf8 COLLATE utf8_general_ci NOT NULL COMMENT '公司链接',
  `company_type` varchar(50) CHARACTER SET utf8 COLLATE utf8_general_ci NOT NULL COMMENT '公司类型',
  `company_hangye` varchar(50) CHARACTER SET utf8 COLLATE utf8_general_ci NOT NULL COMMENT '公司行业',
  PRIMARY KEY (`id`) USING BTREE
) ENGINE = InnoDB AUTO_INCREMENT = 48561 CHARACTER SET = utf8 COLLATE = utf8_general_ci ROW_FORMAT = Dynamic;

SET FOREIGN_KEY_CHECKS = 1;
```
```python
import pymysql

con = pymysql.connect(host='localhost', port=3306, user='root',
                      password='0', database='qcwy', charset='utf8')
cursor = con.cursor()
sql = ("INSERT INTO `qcwy`.`jobinfo`(`id`, `job_name`, `salary`, `company_name`, "
       "`work_area`, `time`, `detail`, `company_type`, `company_hangye`) VALUES")
sqlv = "(null,'{}','{}','{}','{}','{}','{}','{}','{}');".format(
    job_name, salary, company_name, work_area, time, detail, company_type, company_hangye)
sql2 = sql + sqlv
try:
    cursor.execute(sql2)
    con.commit()
except:
    con.rollback()  # undo the insert if it failed
cursor.close()
# close the connection
con.close()
```
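Building the statement with string formatting works until a job title contains a quote character, and it leaves the door open to SQL injection. pymysql supports parameterized queries, which avoid both problems; a sketch of the same insert:

```python
# Parameterized version of the same INSERT: pymysql fills in the %s
# placeholders and handles all quoting itself.
sql = ("INSERT INTO `qcwy`.`jobinfo` (`id`, `job_name`, `salary`, `company_name`, "
       "`work_area`, `time`, `detail`, `company_type`, `company_hangye`) "
       "VALUES (null, %s, %s, %s, %s, %s, %s, %s, %s)")
cursor.execute(sql, (job_name, salary, company_name, work_area,
                     time, detail, company_type, company_hangye))
con.commit()
```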
Visualizing the data

This step runs separately from the scraping, so it re-opens the Excel file to read the data back (after all, I can't re-scrape a few hundred pages on every run; that would finish me off).
```python
import matplotlib.pyplot as plt
from openpyxl import load_workbook

def showpicuure():
    salarymoneylist = []
    salarycountlist = []
    # open the workbook
    wb = load_workbook('python.xlsx')
    ws = wb.active
    # column L holds the salaries, sorted low to high
    for col in ws['L']:
        if isinstance(col.value, str):
            salarymoneylist.append(col.value)
    salarymoneylist.remove("工资")  # drop the header cell
    # column M holds the number of postings at each salary
    for col in ws['M']:
        if isinstance(col.value, int):
            salarycountlist.append(col.value)
    # plot
    plt.style.use('ggplot')
    customers_index = range(len(salarymoneylist))
    # figure size and resolution
    fig = plt.figure(figsize=(15, 3), dpi=100)
    ax1 = fig.add_subplot(1, 1, 1)
    # one bar per salary value, bar height = number of postings
    ax1.bar(customers_index, salarycountlist, align='center', color='darkblue')
    # ticks on the bottom and left
    ax1.xaxis.set_ticks_position('bottom')
    ax1.yaxis.set_ticks_position('left')
    # label the x axis with the salary values
    plt.xticks(customers_index, salarymoneylist, rotation=0, fontsize='small')
    plt.xlabel('salary')
    plt.ylabel('nums')
    plt.title('Salary—Statistics—{}')
    plt.savefig('salary.png', bbox_inches='tight')
    plt.show()
```

Running
The full source is linked at the top of this post; the snippets on the blog obviously won't run on their own, they are only here to walk through the code.
```python
if __name__ == '__main__':
    key1 = input("请输入关键词")  # prompt: enter the keyword
    city1 = input("请输入城市")   # prompt: enter the city
    # scrape
    zhaopin(keyword=key1, city=city1).run()
    # dump the database, sorted, to JSON
    check()
    # read the Excel file and draw the chart
    showpicuure()
```
Notes

It doesn't succeed on every single run, but most runs are fine. Failures seem tied to network quality: if you hit gbk encoding errors, it is usually the connection, though the faker library I pulled in might also be involved.
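If the gbk errors really are network-related, one way to harden the scraper is a small retry wrapper around the request; this is a sketch, not part of the original code:

```python
import time
import requests

def get_with_retry(url, headers, retries=3):
    # Sketch: retry a flaky request a few times with backoff before giving up.
    for attempt in range(retries):
        try:
            req = requests.get(url, headers=headers, timeout=10)
            return req.content.decode('gbk')
        except (requests.RequestException, UnicodeDecodeError):
            time.sleep(2 ** attempt)  # back off: 1s, 2s, 4s
    return None
```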