如何爬取 51job 的岗位和薪资信息?可参考以下代码:
import json
import re
import sqlite3
import urllib.error
import urllib.request
from urllib import parse
# Path of the SQLite database file (the argument intended for init_db).
dbpath: str = "./51job.db"

# Interactive keyword search is currently disabled. When enabled, the keyword
# is percent-encoded twice before being placed in the search URL:
# kw = input("Enter the job keyword to search for: ")
# keyword = parse.quote(parse.quote(kw))

# Search-result page number for building a paged search URL.
pageNum: int = 10
def _extract_jobs(html):
    """Return the job records embedded in a 51job search-result page.

    The records are looked up as the JSON value stored under the
    ``"engine_jds"`` key, which is immediately followed by the
    ``"jobid_count"`` key in the page text.

    Args:
        html: Full text of the search-result page.

    Returns:
        The decoded JSON value (iterated by the caller as a sequence of
        dict-like records), or an empty list when the page does not contain
        the expected payload.

    Raises:
        json.JSONDecodeError: If the captured text is not valid JSON.
    """
    # re.S lets the payload span several lines of the page.
    match = re.search(r"\"engine_jds\":(.+?),\"jobid_count\"", html, re.S)
    if match is None:
        return []
    return json.loads(match.group(1))


# Main flow
def main():
    """Print the job title and salary of every posting in ``result.html``.

    Reads the GBK-encoded file ``result.html`` from the current working
    directory, extracts the embedded job records and prints one line per
    record: the job name directly followed by the salary text.

    Raises:
        FileNotFoundError: If ``result.html`` does not exist.
    """
    # result.html is presumably a saved copy of the search page below (the
    # keyword "architect", percent-encoded twice). To refresh it, fetch the
    # URL with askURL() and save the returned text as result.html:
    # https://search.51job.com/list/000000,000000,0000,00,9,99,%25E6%259E%25B6%25E6%259E%2584%25E5%25B8%2588,2,1.html?lang=c&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&ord_field=0&dibiaoid=0&line=&welfare=

    # Read the raw text; parsing str(readlines()) would parse the list's
    # repr, which doubles backslashes and inserts "', '" between lines.
    with open('result.html', 'r', encoding="gbk") as result:
        html = result.read()

    jobs = _extract_jobs(html)
    if not jobs:
        print("No job data found in result.html")
        return

    for item in jobs:
        print(item['job_name'] + item['providesalary_text'])
def init_db(dbpath):
    """Create the ``job`` table in the SQLite database at *dbpath*.

    The call is idempotent: if a ``job`` table already exists it is left
    untouched instead of raising ``sqlite3.OperationalError``.

    Args:
        dbpath: Path of the SQLite database file to open.

    Raises:
        sqlite3.Error: If the database cannot be opened or written.
    """
    sql = '''
        create table if not exists job
        (
            id integer primary key autoincrement,
            job_link text,
            jname text,
            cname varchar,
            area varchar,
            salary text,
            educate text,
            info text
        )
    '''
    conn = sqlite3.connect(dbpath)
    try:
        conn.execute(sql)
        conn.commit()
    finally:
        # Close the connection even when the statement fails.
        conn.close()
# init_db(dbpath)
def askURL(url):
    """Fetch *url* and return the response body decoded as GBK text.

    Args:
        url: Address of the page to download.

    Returns:
        The decoded page text, or an empty string when the request fails
        with ``urllib.error.URLError``. In that case the error's status
        code and/or reason are printed, whichever the exception carries.

    Raises:
        UnicodeDecodeError: If the response body is not valid GBK.
    """
    # Send a desktop-browser User-Agent instead of urllib's default one.
    head = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.60 Safari/537.36"
    }
    request = urllib.request.Request(url, headers=head)
    html = ""
    try:
        # The context manager closes the response even if reading or
        # decoding fails, so the underlying connection is not leaked.
        with urllib.request.urlopen(request) as response:
            html = response.read().decode("gbk")
    except urllib.error.URLError as e:
        if hasattr(e, "code"):
            print(e.code)
        if hasattr(e, "reason"):
            print(e.reason)
    return html
# Run the main flow only when this file is executed as a script.
if __name__ == "__main__":
    main()
爬出来的结果如下图所示
如需在输出中加入公司名称,可将解析和打印部分改为:
# Variant of the output loop that also prints the company name:
# "<job name>:<company name> <salary text>" on one line per record.
jsonObj = json.loads(data[0])
for posting in jsonObj:
    print(
        posting['job_name']
        + ':'
        + posting['company_name']
        + ' '
        + posting['providesalary_text']
    )
欢迎分享,转载请注明来源:内存溢出
评论列表(0条)