用Python采集招聘网岗位信息,用Pandas处理做数据可视化(含源码)

用Python采集招聘网岗位信息,用Pandas处理做数据可视化(含源码),第1张

目录

1.打开网站的网址右击检查进行抓包分析

2.两步构造python请求,分享一个复制粘贴就可以完成的小技巧

3.提取JSON数据,并保存到Excel,方便分析

4.数据图表展示

5.源码展示(代码不多,包含数据爬取、数据处理、数据可视化完整步骤):


效果展示:

 

开发详细步骤

1.打开网站的网址右击检查进行抓包分析

2.两步构造python请求,分享一个复制粘贴就可以完成的小技巧

2.1第一步打开F12抓取请求,找到并且复制

2.2打开Postman粘贴

 2.3使用Postman成功发送请求,复制请求代码:

 2.4复制到pycharm成功运行:

3.提取JSON数据,并保存到Excel,方便分析

4.数据图表展示

代码如下: 

    """
        5.对招聘公司规模大小进行_可视化分析
    """
    companysize_count = df.groupby(by=['companysize_text'])['companysize_text'].count()
 
    labels = list(companysize_count.index)
    sizes = list(companysize_count.values)
    # 设置分离的距离,0表示不分离
    # explode = (0, 0, 0.1)
    plt.title("对招聘公司规模大小进行_可视化分析")
    plt.pie(sizes, labels=labels, autopct='%1.1f%%',
            shadow=True, startangle=90)
    plt.axis('equal')
    plt.show()

 

    plt.rcParams['font.sans-serif'] = ['SimHei']  # 显示中文标签,不加会乱码。。
    plt.rcParams['axes.unicode_minus'] = False  # 解决负号“-”显示为方块的问题
    """
        0.对学历要求进行_可视化分析
    """
    # 提取地址出来
    df['workarea_text'] = df['workarea_text'] + '-'
    df['address'] = df['workarea_text'].str.extract(r'(.*?-)')  # 非贪婪模式匹配
    df['address'] = df['address'].str[:-1]  # 替换掉 '-'
    df['attribute_address'] = df['attribute_text'] + ',' + df['address']
    attribute_address = df['attribute_address'].str.split(',', expand=True)  # 参数expand,这个参数取True时,会把切割出来的内容当做一列。 如果不需要pandas为你分好列,expand=False就可以了。
    attribute_address.columns = ["详细地址", "工作经验", "学历要求", "工作地址"]
    attribute_address['标记'] = 1
    pivotTableEducation = attribute_address.pivot_table('标记', index=['工作地址'], columns=['学历要求'], aggfunc='count',
                                                  margins=True, fill_value=0)
    pivotTableEducation.sort_values(by='All', ascending=False, inplace=True)  # 降序排列
    aatop10 = pivotTableEducation.head(10)
    x_data = list(aatop10.index)
    y_data = list(aatop10['大专'])
    y2_data = list(aatop10['本科'])
 
    plt.plot(x_data, y_data, color="pink", line, label="大专")
    plt.plot(x_data, y2_data, color="skyblue", line, label="本科")
 
    # 柱状图
    plt.bar(x_data, y_data, lw=0.5, fc="r", width=0.3, label="大专", alpha=0.5)
    plt.bar(x_data, y2_data, lw=0.5, fc="b", width=0.3, label="本科", alpha=0.5, bottom=y_data)
 
    for i, j in zip(x_data, y_data):
        plt.text(i, j + 0.05, "%d" % j, ha="center", va="bottom")
 
    for i2, j2 in zip(x_data, y2_data):
        plt.text(i2, j2 + 180, "%d" % j2, ha="center", va="bottom")
    # 添加图例
    # 
    plt.legend(loc='upper right', frameon=True)
    plt.title("对学历要求进行_可视化分析")
    plt.xlabel("地区")
    plt.ylabel("数量")
    plt.show()
    time.sleep(2)

就不一一展示了.......

5.源码展示(代码不多,包含数据爬取、数据处理、数据可视化完整步骤):
import time
import requests
import json
import pandas as pd
import matplotlib.pyplot as plt
 
 
def start():
    global job_href_list
    global job_name_list
    global company_href_list
    global company_name_list
    global providesalary_text_list
    global workarea_text_list
    global issuedate_list
    global companytype_text_list
    global attribute_text_list
    global companysize_text_list
    global companyind_text_list
    url = "xxxxxx"
    payload = {}
    headers = {
        'Connection': 'keep-alive',
        'sec-ch-ua': '" Not;A Brand";v="99", "Google Chrome";v="97", "Chromium";v="97"',
        'Accept': 'application/json, text/javascript, */*; q=0.01',
        'X-Requested-With': 'XMLHttpRequest',
        'sec-ch-ua-mobile': '?0',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.99 Safari/537.36',
        'sec-ch-ua-platform': '"Windows"',
        'Sec-Fetch-Site': 'same-origin',
        'Sec-Fetch-Mode': 'cors',
        'Sec-Fetch-Dest': 'empty',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Cookie': ''
        }
 
    response = requests.request("GET", url, headers=headers, data=payload)
    # print(response.text)
    json_obj = json.loads(response.text)
    engine_jds = json_obj['engine_jds']
    curpage = 1
    for k, v in enumerate(engine_jds):
        job_href = v['job_href']
        job_name = v['job_name']
        company_href = v['company_href']
        company_name = v['company_name']
        providesalary_text = v['providesalary_text']
        workarea_text = v['workarea_text']
        issuedate = v['issuedate']
        companytype_text = v['companytype_text']
        attribute_text = str(v['attribute_text']).replace('"', '').replace('[', '').replace(']', '').replace(' ', '').replace("'", '')
        companysize_text = v['companysize_text']
        companyind_text = v['companyind_text']
        print('页码:' + str(curpage) + ' ' + str(k+1) + ' 招聘岗位:' + job_name + ' 薪资:' + providesalary_text + ' 地点:' + workarea_text + ' 要求:' + attribute_text+ ' 公司:' + company_name + ' 更新日期:' + issuedate)
        job_href_list.append(job_href)
        job_name_list.append(job_name)
        company_href_list.append(company_href)
        company_name_list.append(company_name)
        providesalary_text_list.append(providesalary_text)
        workarea_text_list.append(workarea_text)
        issuedate_list.append(issuedate)
        companytype_text_list.append(companytype_text)
        attribute_text_list.append(attribute_text)
        companysize_text_list.append(companysize_text)
        companyind_text_list.append(companyind_text)
 
    total_page = int(json_obj['total_page'])
    if total_page > 1:
        while curpage <= total_page:
            curpage = curpage + 1
            url = "xxxxxxxxx"
            response = requests.request("GET", url, headers=headers, data=payload)
            # print(response.text)
            json_obj = json.loads(response.text)
            engine_jds = json_obj['engine_jds']
            for k, v in enumerate(engine_jds):
                job_href = v['job_href']
                job_name = v['job_name']
                company_href = v['company_href']
                company_name = v['company_name']
                providesalary_text = v['providesalary_text']
                workarea_text = v['workarea_text']
                issuedate = v['issuedate']
                companytype_text = v['companytype_text']
                attribute_text = str(v['attribute_text']).replace('"', '').replace('[', '').replace(']', '').replace(' ', '').replace("'", '')
                companysize_text = v['companysize_text']
                companyind_text = v['companyind_text']
                print('页码:' + str(curpage) + ' ' + str(k+1) + ' 招聘岗位:' + job_name + ' 薪资:' + providesalary_text + ' 地点:' + workarea_text + ' 要求:' + attribute_text + ' 公司:' + company_name + ' 更新日期:' + issuedate)
                job_href_list.append(job_href)
                job_name_list.append(job_name)
                company_href_list.append(company_href)
                company_name_list.append(company_name)
                providesalary_text_list.append(providesalary_text)
                workarea_text_list.append(workarea_text)
                issuedate_list.append(issuedate)
                companytype_text_list.append(companytype_text)
                attribute_text_list.append(attribute_text)
                companysize_text_list.append(companysize_text)
                companyind_text_list.append(companyind_text)
 
 
def visualization(df):
    plt.rcParams['font.sans-serif'] = ['SimHei']  # 显示中文标签,不加会乱码。。
    plt.rcParams['axes.unicode_minus'] = False  # 解决负号“-”显示为方块的问题
    """
        0.对学历要求进行_可视化分析
    """
    # 提取地址出来
    df['workarea_text'] = df['workarea_text'] + '-'
    df['address'] = df['workarea_text'].str.extract(r'(.*?-)')  # 非贪婪模式匹配
    df['address'] = df['address'].str[:-1]  # 替换掉 '-'
    df['attribute_address'] = df['attribute_text'] + ',' + df['address']
    attribute_address = df['attribute_address'].str.split(',', expand=True)  # 参数expand,这个参数取True时,会把切割出来的内容当做一列。 如果不需要pandas为你分好列,expand=False就可以了。
    attribute_address.columns = ["详细地址", "工作经验", "学历要求", "工作地址"]
    attribute_address['标记'] = 1
    pivotTableEducation = attribute_address.pivot_table('标记', index=['工作地址'], columns=['学历要求'], aggfunc='count',
                                                  margins=True, fill_value=0)
    pivotTableEducation.sort_values(by='All', ascending=False, inplace=True)  # 降序排列
    aatop10 = pivotTableEducation.head(10)
    x_data = list(aatop10.index)
    y_data = list(aatop10['大专'])
    y2_data = list(aatop10['本科'])
 
    plt.plot(x_data, y_data, color="pink", line, label="大专")
    plt.plot(x_data, y2_data, color="skyblue", line, label="本科")
 
    # 柱状图
    plt.bar(x_data, y_data, lw=0.5, fc="r", width=0.3, label="大专", alpha=0.5)
    plt.bar(x_data, y2_data, lw=0.5, fc="b", width=0.3, label="本科", alpha=0.5, bottom=y_data)
 
    for i, j in zip(x_data, y_data):
        plt.text(i, j + 0.05, "%d" % j, ha="center", va="bottom")
 
    for i2, j2 in zip(x_data, y2_data):
        plt.text(i2, j2 + 180, "%d" % j2, ha="center", va="bottom")
    # 添加图例
    # https://matplotlib.org/api/pyplot_api.html#matplotlib.pyplot.legend
    plt.legend(loc='upper right', frameon=True)
    plt.title("对学历要求进行_可视化分析")
    plt.xlabel("地区")
    plt.ylabel("数量")
    plt.show()
    time.sleep(2)
 
    """
        1.对公司属性类型进行_可视化分析
    """
 
    companytype_count = df.groupby(by=['companytype_text'])['companytype_text'].count()
 
    labels = list(companytype_count.index)
    sizes = list(companytype_count.values)
    plt.title("对公司属性类型进行_可视化分析")
    # 设置分离的距离,0表示不分离
    # explode = (0, 0, 0.1)
    plt.pie(sizes, labels=labels, autopct='%1.1f%%',
            shadow=True, startangle=90)
    plt.axis('equal')
    plt.show()
    time.sleep(2)
 
    """
        2.对每天发布招聘信息数量进行_可视化分析
    """
    days_df = pd.DataFrame({'Joined date': pd.to_datetime(list(df['issuedate']))})
    days_df['date'] = days_df['Joined date'].dt.strftime('%Y-%m-%d')
    date_count = days_df.groupby(by=['date'])['date'].count()
 
    x_axis_data = list(date_count.index)  # x
    y_axis_data = list(date_count.values)  # y
    plt.title("对每天发布招聘信息数量进行_可视化分析")
    plt.plot(x_axis_data, y_axis_data, 'b*--', alpha=0.5, linewidth=1, label='数量')  # 'bo-'表示蓝色实线,数据点实心原点标注
    ## plot中参数的含义分别是横轴值,纵轴值,线的形状('s'方块,'o'实心圆点,'*'五角星   ...,颜色,透明度,线的宽度和标签 ,
 
    plt.legend()  # 显示上面的label
    plt.xlabel('发布招聘信息数')  # x_label
    plt.ylabel('日期')  # y_label
 
    # plt.ylim(-1,1)#仅设置y轴坐标范围
    plt.show()
    time.sleep(2)
 
    """
        3.对热招top10办公地点进行_可视化分析
    """
    workarea_dd = df['workarea_text'].str.split('-', expand=True) #参数expand,这个参数取True时,会把切割出来的内容当做一列。 如果不需要pandas为你分好列,expand=False就可以了。
    workarea_dd1 = workarea_dd[0]
    workarea_dd1 = pd.DataFrame({'办公地点': workarea_dd1.values})
    workarea_count = workarea_dd1.groupby(by=['办公地点'])['办公地点'].count()
    workarea_count_df = pd.DataFrame({'办公地点': workarea_count.index, '办公地点招聘数': workarea_count.values})
    workarea_count_df.sort_values(by='办公地点招聘数', ascending=False, inplace=True)  # 降序排列
    top10 = workarea_count_df.head(10)
 
    plt.rcParams["font.sans-serif"] = ['SimHei']
    plt.rcParams["axes.unicode_minus"] = False
 
    x_data = list(top10['办公地点'])
    y_data = list(top10['办公地点招聘数'])
 
    plt.bar(x_data, y_data)
    plt.title("对热招top10办公地点进行_可视化分析")
    plt.xlabel("地点")
    plt.ylabel("招聘数")
    plt.show()
    time.sleep(2)
 
 
    """
        5.对招聘公司规模大小进行_可视化分析
    """
    companysize_count = df.groupby(by=['companysize_text'])['companysize_text'].count()
 
    labels = list(companysize_count.index)
    sizes = list(companysize_count.values)
    # 设置分离的距离,0表示不分离
    # explode = (0, 0, 0.1)
    plt.title("对招聘公司规模大小进行_可视化分析")
    plt.pie(sizes, labels=labels, autopct='%1.1f%%',
            shadow=True, startangle=90)
    plt.axis('equal')
    plt.show()
 
 
 
if __name__ == '__main__':
    """
        《源码只供学习参考》
    """
    job_href_list = []
    job_name_list = []
    company_href_list = []
    company_name_list = []
    providesalary_text_list = []
    workarea_text_list = []
    issuedate_list = []
    companytype_text_list = []
    attribute_text_list = []
    companysize_text_list = []
    companyind_text_list = []
    start()
    df = pd.DataFrame({'job_href': job_href_list, 'job_name': job_name_list, 'company_href': company_href_list, 'company_name': company_name_list,
                  'providesalary_text': providesalary_text_list, 'workarea_text': workarea_text_list, 'issuedate': issuedate_list, 'companytype_text': companytype_text_list,
                  'attribute_text': attribute_text_list, 'companysize_text': companysize_text_list, 'companyind_text': companyind_text_list})
    writer = pd.ExcelWriter(r'C:\Users653\Desktop\招聘信息.xlsx')
    df.to_excel(excel_writer=writer, sheet_name='招聘信息', index=False)
    writer.save()
    writer.close()
    visualization(df)

  

欢迎分享,转载请注明来源:内存溢出

原文地址: http://outofmemory.cn/langs/876077.html

(0)
打赏 微信扫一扫 微信扫一扫 支付宝扫一扫 支付宝扫一扫
上一篇 2022-05-13
下一篇 2022-05-13

发表评论

登录后才能评论

评论列表(0条)

保存