Table of Contents
1. Open the site's URL, right-click, choose "Inspect", and analyze the captured requests
2. Build the Python request in two steps: a trick you can finish with copy and paste
3. Extract the JSON data and save it to Excel for easier analysis
4. Visualize the data with charts
5. Full source code (not much code; it covers the complete scraping, processing, and visualization steps):
Results preview:
Detailed development steps
1. Open the site's URL, right-click, choose "Inspect", and analyze the captured requests
2. Build the Python request in two steps: a trick you can finish with copy and paste
2.1 Step one: open DevTools with F12, capture the request, then find it and copy it
2.2 Open Postman and paste it in
2.3 Send the request successfully from Postman, then copy the generated request code:
2.4 Paste it into PyCharm and run it:
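Before walking through the remaining steps, here is a minimal, self-contained sketch of what steps 2.4 and 3 boil down to once the Postman-generated snippet is pasted into PyCharm. It is condensed from the full source in section 5; the URL is the same placeholder used there, the Cookie must come from your own capture, and the output filename here is only an example.

import json
import requests
import pandas as pd

url = "xxxxxx"  # placeholder: paste the request URL you captured in DevTools / Postman
headers = {
    'Accept': 'application/json, text/javascript, */*; q=0.01',
    'X-Requested-With': 'XMLHttpRequest',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.99 Safari/537.36',
    'Cookie': '',  # placeholder: copy the Cookie header from your own session
}

response = requests.request("GET", url, headers=headers)
json_obj = json.loads(response.text)  # step 3: the response body is JSON
engine_jds = json_obj['engine_jds']   # the list of job postings on this page

# keep a few fields per posting and dump them to Excel for later analysis
rows = [{'job_name': v['job_name'],
         'providesalary_text': v['providesalary_text'],
         'workarea_text': v['workarea_text'],
         'company_name': v['company_name']} for v in engine_jds]
df = pd.DataFrame(rows)
df.to_excel('招聘信息.xlsx', sheet_name='招聘信息', index=False)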
3. Extract the JSON data and save it to Excel for easier analysis
4. Visualize the data with charts; code excerpts follow:
"""
5.对招聘公司规模大小进行_可视化分析
"""
companysize_count = df.groupby(by=['companysize_text'])['companysize_text'].count()
labels = list(companysize_count.index)
sizes = list(companysize_count.values)
# 设置分离的距离,0表示不分离
# explode = (0, 0, 0.1)
plt.title("对招聘公司规模大小进行_可视化分析")
plt.pie(sizes, labels=labels, autopct='%1.1f%%',
shadow=True, startangle=90)
plt.axis('equal')
plt.show()
plt.rcParams['font.sans-serif'] = ['SimHei'] # 显示中文标签,不加会乱码。。
plt.rcParams['axes.unicode_minus'] = False # 解决负号“-”显示为方块的问题
"""
0.对学历要求进行_可视化分析
"""
# 提取地址出来
df['workarea_text'] = df['workarea_text'] + '-'
df['address'] = df['workarea_text'].str.extract(r'(.*?-)') # 非贪婪模式匹配
df['address'] = df['address'].str[:-1] # 替换掉 '-'
df['attribute_address'] = df['attribute_text'] + ',' + df['address']
attribute_address = df['attribute_address'].str.split(',', expand=True) # 参数expand,这个参数取True时,会把切割出来的内容当做一列。 如果不需要pandas为你分好列,expand=False就可以了。
attribute_address.columns = ["详细地址", "工作经验", "学历要求", "工作地址"]
attribute_address['标记'] = 1
pivotTableEducation = attribute_address.pivot_table('标记', index=['工作地址'], columns=['学历要求'], aggfunc='count',
margins=True, fill_value=0)
pivotTableEducation.sort_values(by='All', ascending=False, inplace=True) # 降序排列
aatop10 = pivotTableEducation.head(10)
x_data = list(aatop10.index)
y_data = list(aatop10['大专'])
y2_data = list(aatop10['本科'])
plt.plot(x_data, y_data, color="pink", linestyle='-', label="大专")
plt.plot(x_data, y2_data, color="skyblue", linestyle='-', label="本科")
# bar chart
plt.bar(x_data, y_data, lw=0.5, fc="r", width=0.3, label="大专", alpha=0.5)
plt.bar(x_data, y2_data, lw=0.5, fc="b", width=0.3, label="本科", alpha=0.5, bottom=y_data)
for i, j in zip(x_data, y_data):
plt.text(i, j + 0.05, "%d" % j, ha="center", va="bottom")
for i2, j2 in zip(x_data, y2_data):
plt.text(i2, j2 + 180, "%d" % j2, ha="center", va="bottom")
# add the legend
plt.legend(loc='upper right', frameon=True)
plt.title("对学历要求进行_可视化分析")
plt.xlabel("地区")
plt.ylabel("数量")
plt.show()
time.sleep(2)
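The pivot_table call above is the core of this chart: every row gets a 标记 of 1, counting those markers per (工作地址, 学历要求) pair gives the frequencies, and margins=True adds an 'All' total column that the sort uses. Here is a tiny sketch with made-up values (the cities and counts below are purely illustrative):

import pandas as pd

toy = pd.DataFrame({'工作地址': ['上海', '上海', '深圳'],
                    '学历要求': ['本科', '大专', '本科'],
                    '标记': [1, 1, 1]})  # hypothetical rows, only to show the pivot's shape
pivot = toy.pivot_table('标记', index=['工作地址'], columns=['学历要求'],
                        aggfunc='count', margins=True, fill_value=0)
print(pivot)
# 学历要求  大专  本科  All
# 上海        1    1    2
# 深圳        0    1    1
# All         1    2    3
pivot.sort_values(by='All', ascending=False, inplace=True)  # largest totals first

Note that the 'All' margin row itself has the largest total, so after the descending sort it lands in the top 10 and shows up as the first "region" in the bar chart above.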
The remaining charts are not shown one by one here; see the full source below for the rest.
5. Full source code (not much code; it covers the complete scraping, processing, and visualization steps):
import time
import requests
import json
import pandas as pd
import matplotlib.pyplot as plt
def start():
global job_href_list
global job_name_list
global company_href_list
global company_name_list
global providesalary_text_list
global workarea_text_list
global issuedate_list
global companytype_text_list
global attribute_text_list
global companysize_text_list
global companyind_text_list
url = "xxxxxx"
payload = {}
headers = {
'Connection': 'keep-alive',
'sec-ch-ua': '" Not;A Brand";v="99", "Google Chrome";v="97", "Chromium";v="97"',
'Accept': 'application/json, text/javascript, */*; q=0.01',
'X-Requested-With': 'XMLHttpRequest',
'sec-ch-ua-mobile': '?0',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.99 Safari/537.36',
'sec-ch-ua-platform': '"Windows"',
'Sec-Fetch-Site': 'same-origin',
'Sec-Fetch-Mode': 'cors',
'Sec-Fetch-Dest': 'empty',
'Accept-Language': 'zh-CN,zh;q=0.9',
'Cookie': ''
}
response = requests.request("GET", url, headers=headers, data=payload)
# print(response.text)
json_obj = json.loads(response.text)
engine_jds = json_obj['engine_jds']
curpage = 1
for k, v in enumerate(engine_jds):
job_href = v['job_href']
job_name = v['job_name']
company_href = v['company_href']
company_name = v['company_name']
providesalary_text = v['providesalary_text']
workarea_text = v['workarea_text']
issuedate = v['issuedate']
companytype_text = v['companytype_text']
attribute_text = str(v['attribute_text']).replace('"', '').replace('[', '').replace(']', '').replace(' ', '').replace("'", '')
companysize_text = v['companysize_text']
companyind_text = v['companyind_text']
print('页码:' + str(curpage) + ' ' + str(k+1) + ' 招聘岗位:' + job_name + ' 薪资:' + providesalary_text + ' 地点:' + workarea_text + ' 要求:' + attribute_text+ ' 公司:' + company_name + ' 更新日期:' + issuedate)
job_href_list.append(job_href)
job_name_list.append(job_name)
company_href_list.append(company_href)
company_name_list.append(company_name)
providesalary_text_list.append(providesalary_text)
workarea_text_list.append(workarea_text)
issuedate_list.append(issuedate)
companytype_text_list.append(companytype_text)
attribute_text_list.append(attribute_text)
companysize_text_list.append(companysize_text)
companyind_text_list.append(companyind_text)
total_page = int(json_obj['total_page'])
if total_page > 1:
while curpage <= total_page:
curpage = curpage + 1
url = "xxxxxxxxx"
response = requests.request("GET", url, headers=headers, data=payload)
# print(response.text)
json_obj = json.loads(response.text)
engine_jds = json_obj['engine_jds']
for k, v in enumerate(engine_jds):
job_href = v['job_href']
job_name = v['job_name']
company_href = v['company_href']
company_name = v['company_name']
providesalary_text = v['providesalary_text']
workarea_text = v['workarea_text']
issuedate = v['issuedate']
companytype_text = v['companytype_text']
attribute_text = str(v['attribute_text']).replace('"', '').replace('[', '').replace(']', '').replace(' ', '').replace("'", '')
companysize_text = v['companysize_text']
companyind_text = v['companyind_text']
print('页码:' + str(curpage) + ' ' + str(k+1) + ' 招聘岗位:' + job_name + ' 薪资:' + providesalary_text + ' 地点:' + workarea_text + ' 要求:' + attribute_text + ' 公司:' + company_name + ' 更新日期:' + issuedate)
job_href_list.append(job_href)
job_name_list.append(job_name)
company_href_list.append(company_href)
company_name_list.append(company_name)
providesalary_text_list.append(providesalary_text)
workarea_text_list.append(workarea_text)
issuedate_list.append(issuedate)
companytype_text_list.append(companytype_text)
attribute_text_list.append(attribute_text)
companysize_text_list.append(companysize_text)
companyind_text_list.append(companyind_text)
def visualization(df):
plt.rcParams['font.sans-serif'] = ['SimHei']  # show Chinese labels; without this they render as boxes
plt.rcParams['axes.unicode_minus'] = False  # keep the minus sign '-' from being drawn as a box
"""
0. Visual analysis of education requirements
"""
# pull the city out of the work-area field
df['workarea_text'] = df['workarea_text'] + '-'
df['address'] = df['workarea_text'].str.extract(r'(.*?-)')  # non-greedy match up to the first '-'
df['address'] = df['address'].str[:-1]  # strip the trailing '-'
df['attribute_address'] = df['attribute_text'] + ',' + df['address']
attribute_address = df['attribute_address'].str.split(',', expand=True)  # expand=True puts each split part in its own column; expand=False keeps a single column
attribute_address.columns = ["详细地址", "工作经验", "学历要求", "工作地址"]
attribute_address['标记'] = 1
pivotTableEducation = attribute_address.pivot_table('标记', index=['工作地址'], columns=['学历要求'], aggfunc='count',
margins=True, fill_value=0)
pivotTableEducation.sort_values(by='All', ascending=False, inplace=True)  # sort descending by the 'All' total
aatop10 = pivotTableEducation.head(10)
x_data = list(aatop10.index)
y_data = list(aatop10['大专'])
y2_data = list(aatop10['本科'])
plt.plot(x_data, y_data, color="pink", linestyle='-', label="大专")
plt.plot(x_data, y2_data, color="skyblue", linestyle='-', label="本科")
# bar chart
plt.bar(x_data, y_data, lw=0.5, fc="r", width=0.3, label="大专", alpha=0.5)
plt.bar(x_data, y2_data, lw=0.5, fc="b", width=0.3, label="本科", alpha=0.5, bottom=y_data)
for i, j in zip(x_data, y_data):
plt.text(i, j + 0.05, "%d" % j, ha="center", va="bottom")
for i2, j2 in zip(x_data, y2_data):
plt.text(i2, j2 + 180, "%d" % j2, ha="center", va="bottom")
# add the legend
# https://matplotlib.org/api/pyplot_api.html#matplotlib.pyplot.legend
plt.legend(loc='upper right', frameon=True)
plt.title("对学历要求进行_可视化分析")
plt.xlabel("地区")
plt.ylabel("数量")
plt.show()
time.sleep(2)
"""
1.对公司属性类型进行_可视化分析
"""
companytype_count = df.groupby(by=['companytype_text'])['companytype_text'].count()
labels = list(companytype_count.index)
sizes = list(companytype_count.values)
plt.title("对公司属性类型进行_可视化分析")
# explode distances for the pie slices; 0 means a slice is not pulled out
# explode = (0, 0, 0.1)
plt.pie(sizes, labels=labels, autopct='%1.1f%%',
shadow=True, startangle=90)
plt.axis('equal')
plt.show()
time.sleep(2)
"""
2.对每天发布招聘信息数量进行_可视化分析
"""
days_df = pd.DataFrame({'Joined date': pd.to_datetime(list(df['issuedate']))})
days_df['date'] = days_df['Joined date'].dt.strftime('%Y-%m-%d')
date_count = days_df.groupby(by=['date'])['date'].count()
x_axis_data = list(date_count.index) # x
y_axis_data = list(date_count.values) # y
plt.title("对每天发布招聘信息数量进行_可视化分析")
plt.plot(x_axis_data, y_axis_data, 'b*--', alpha=0.5, linewidth=1, label='数量')  # 'b*--' means a blue dashed line with star markers at the data points
# the plot() arguments are: x values, y values, format string ('s' square, 'o' filled circle, '*' star, ...), transparency, line width, and label
plt.legend()  # show the labels defined above
plt.xlabel('日期')  # x axis: date
plt.ylabel('发布招聘信息数')  # y axis: number of postings published
# plt.ylim(-1, 1)  # set only the y-axis range
plt.show()
time.sleep(2)
"""
3.对热招top10办公地点进行_可视化分析
"""
workarea_dd = df['workarea_text'].str.split('-', expand=True) #参数expand,这个参数取True时,会把切割出来的内容当做一列。 如果不需要pandas为你分好列,expand=False就可以了。
workarea_dd1 = workarea_dd[0]
workarea_dd1 = pd.DataFrame({'办公地点': workarea_dd1.values})
workarea_count = workarea_dd1.groupby(by=['办公地点'])['办公地点'].count()
workarea_count_df = pd.DataFrame({'办公地点': workarea_count.index, '办公地点招聘数': workarea_count.values})
workarea_count_df.sort_values(by='办公地点招聘数', ascending=False, inplace=True)  # sort descending
top10 = workarea_count_df.head(10)
plt.rcParams["font.sans-serif"] = ['SimHei']
plt.rcParams["axes.unicode_minus"] = False
x_data = list(top10['办公地点'])
y_data = list(top10['办公地点招聘数'])
plt.bar(x_data, y_data)
plt.title("对热招top10办公地点进行_可视化分析")
plt.xlabel("地点")
plt.ylabel("招聘数")
plt.show()
time.sleep(2)
"""
5.对招聘公司规模大小进行_可视化分析
"""
companysize_count = df.groupby(by=['companysize_text'])['companysize_text'].count()
labels = list(companysize_count.index)
sizes = list(companysize_count.values)
# explode distances for the pie slices; 0 means a slice is not pulled out
# explode = (0, 0, 0.1)
plt.title("对招聘公司规模大小进行_可视化分析")
plt.pie(sizes, labels=labels, autopct='%1.1f%%',
shadow=True, startangle=90)
plt.axis('equal')
plt.show()
if __name__ == '__main__':
"""
《源码只供学习参考》
"""
job_href_list = []
job_name_list = []
company_href_list = []
company_name_list = []
providesalary_text_list = []
workarea_text_list = []
issuedate_list = []
companytype_text_list = []
attribute_text_list = []
companysize_text_list = []
companyind_text_list = []
start()
df = pd.DataFrame({'job_href': job_href_list, 'job_name': job_name_list, 'company_href': company_href_list, 'company_name': company_name_list,
'providesalary_text': providesalary_text_list, 'workarea_text': workarea_text_list, 'issuedate': issuedate_list, 'companytype_text': companytype_text_list,
'attribute_text': attribute_text_list, 'companysize_text': companysize_text_list, 'companyind_text': companyind_text_list})
writer = pd.ExcelWriter(r'C:\Users653\Desktop\招聘信息.xlsx')
df.to_excel(excel_writer=writer, sheet_name='招聘信息', index=False)
writer.save()
writer.close()
visualization(df)
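One optional convenience, not part of the original script: once the Excel file exists, the charts can be re-drawn later without scraping again by reading the file back and calling visualization(), assuming the same output path used above.

# sketch: reuse the exported Excel instead of re-crawling
df = pd.read_excel(r'C:\Users653\Desktop\招聘信息.xlsx', sheet_name='招聘信息')
visualization(df)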