python爬取51job的示例

python爬取51job的示例,第1张

下面的代码演示如何从51job的搜索结果页面中提取岗位名称和薪资信息(示例从本地保存的 result.html 读取页面内容),可参考以下代码

import json
import re
import sqlite3
import urllib.error
import urllib.request
from urllib import parse

# Path of the SQLite database file; the only visible use is the disabled
# `init_db(dbpath)` call further down.
dbpath = './51job.db'
# Disabled interactive search: reads a job keyword from the user (the prompt
# text means "Enter the job keyword to search for:") and percent-encodes it
# twice before it is spliced into the search URL.
#kw = input("请输入你要搜索的岗位关键字:")
#keyword = parse.quote(parse.quote(kw))
# Search-result page number; not referenced by any active code visible here.
pageNum = 10

# Main flow
def _extract_jobs(html):
    """Return the job records embedded in a 51job search page.

    The page carries its results as JSON; the array of interest sits between
    the ``"engine_jds":`` key and the following ``,"jobid_count"`` key.

    Args:
        html: Full text of the search-result page.

    Returns:
        The decoded JSON value of ``engine_jds`` (a list of dicts), or an
        empty list when the payload is not present in *html*.

    Raises:
        json.JSONDecodeError: If the captured text is not valid JSON.
    """
    # DOTALL lets the payload span several physical lines of the saved page.
    match = re.search(r"\"engine_jds\":(.+?),\"jobid_count\"", html, re.S)
    if match is None:
        return []
    return json.loads(match.group(1))


def main():
    """Print the job name and salary text of every job in ``result.html``.

    ``result.html`` is read from the current directory as GBK text. It is
    expected to be a saved copy of a 51job search-result page, for example:
    https://search.51job.com/list/000000,000000,0000,00,9,99,%25E6%259E%25B6%25E6%259E%2584%25E5%25B8%2588,2,1.html?lang=c&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&ord_field=0&dibiaoid=0&line=&welfare=

    Raises:
        OSError: If ``result.html`` cannot be opened.
        KeyError: If a record lacks ``job_name`` or ``providesalary_text``.
    """
    # Read the raw text (not the repr of a list of lines): the repr escapes
    # quotes and backslashes, which corrupts the embedded JSON.
    with open('result.html', 'r', encoding="gbk") as result:
        html = result.read()

    jobs = _extract_jobs(html)
    if not jobs:
        print("No job data found in result.html")
        return

    for item in jobs:
        print(item['job_name'] + item['providesalary_text'])



def init_db(dbpath):
    """Create the ``job`` table in the SQLite database at *dbpath*.

    The database file is created by SQLite if it does not exist yet. The
    call is idempotent: when the table already exists it is left untouched,
    so the script can be re-run against the same database.

    Args:
        dbpath: Filesystem path of the SQLite database file.

    Raises:
        sqlite3.Error: If the database cannot be opened or the statement
            fails.
    """
    sql = '''
    create table if not exists job
    (
    id integer primary key autoincrement,
    job_link text,
    jname text,
    cname varchar,
    area varchar ,
    salary text,
    educate text,
    info text
    )
    '''

    conn = sqlite3.connect(dbpath)
    try:
        conn.execute(sql)
        conn.commit()
    finally:
        # Always release the file handle, even when the statement fails.
        conn.close()


# init_db(dbpath)

def askURL(url, timeout=10):
    """Fetch *url* and return the response body decoded as GBK text.

    The request is sent with a desktop-browser User-Agent header. Failures
    are reported on stdout and result in an empty string rather than an
    exception.

    Args:
        url: Address to fetch.
        timeout: Seconds to wait on blocking network operations before
            giving up.

    Returns:
        The decoded page text, or ``""`` if the request failed. Bytes that
        are not valid GBK are replaced with U+FFFD instead of aborting.
    """
    head = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.60 Safari/537.36"
    }
    request = urllib.request.Request(url, headers=head)
    try:
        # The context manager closes the response even if read() fails.
        with urllib.request.urlopen(request, timeout=timeout) as response:
            raw = response.read()
    except urllib.error.URLError as e:
        if hasattr(e, "code"):
            print(e.code)
        if hasattr(e, "reason"):
            print(e.reason)
        return ""
    except OSError as e:
        # Read timeouts and connection resets are not wrapped in URLError.
        print(e)
        return ""
    return raw.decode("gbk", errors="replace")


# Run the main flow only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == '__main__':
    main()

爬出来的结果如下图所示

如需同时输出公司名称,可将 main() 中解析并打印结果的部分改为:

# Variant of the printing loop that also shows the company name.
# `data` is the list of regex matches for the "engine_jds" JSON payload.
jsonObj = json.loads(data[0])
for item in jsonObj:
    print(item['job_name'] + ':' + item['company_name'] +' '+ item['providesalary_text'])

欢迎分享,转载请注明来源:内存溢出

原文地址: https://outofmemory.cn/langs/715998.html

(0)
打赏 微信扫一扫 微信扫一扫 支付宝扫一扫 支付宝扫一扫
上一篇 2022-04-25
下一篇 2022-04-25

发表评论

登录后才能评论

评论列表(0条)

保存