Scraping Eastmoney (东方财富) financial statement data with a Python crawler




import requests
import re
import json
import csv
import os
import time

# Folder where the CSV files are saved (adjust the path for your own machine)
file_path = r'C:\Users\admir\Desktop\银行竞争\报表数据'
if not os.path.exists(file_path):
    os.mkdir(file_path)
os.chdir(file_path)


# 1 Choose the reporting period
def set_table():
    year = int(float(input('Enter the year to query (four digits, 2007-2020):\n')))
    # float() first: the raw input is a str, and int() on something like
    # '2018.0' would raise, while float() handles it
    quarter = int(float(input('Enter the quarter (1: Q1 report, 2: half-year report, 3: Q3 report, 4: annual report):\n')))
    while quarter < 1 or quarter > 4:
        quarter = int(float(input('Invalid quarter, please re-enter:\n')))
    quarter = '{:02d}'.format(quarter * 3)
    # The quarter-end month ends on either the 30th or the 31st
    if quarter in ('06', '09'):
        day = 30
    else:
        day = 31
    date = '{}-{}-{}'.format(year, quarter, day)

    # 2 Choose the report type
    tables = int(input(
        'Enter the number of the report type (1: 业绩报表; 2: 业绩快报表; '
        '3: 业绩预告表; 4: 预约披露时间表; 5: 资产负债表; 6: 利润表; 7: 现金流量表):\n'))
    # Report names are kept in Chinese because they double as output filenames
    # (1 performance report, 2 performance express report, 3 performance forecast,
    #  4 disclosure schedule, 5 balance sheet, 6 income statement, 7 cash flow statement)
    dict_tables = {1: '业绩报表', 2: '业绩快报表', 3: '业绩预告表',
                   4: '预约披露时间表', 5: '资产负债表', 6: '利润表', 7: '现金流量表'}
    dict_codes = {1: 'YJBB', 2: 'YJKB', 3: 'YJYG',
                  4: 'YYPL', 5: 'ZCFZB', 6: 'LRB', 7: 'XJLLB'}
    category = dict_codes[tables]
    # The 'type' field of the js request: tables 1-4 use the 'YJBB20_' prefix,
    # the last three use 'CWBB_'. Also set the st, sr and filter parameters.
    if tables == 1:
        category_type = 'YJBB20_'
        st = 'latestnoticedate'
        sr = -1
        filter = "(securitytypecode in ('058001001','058001002'))(reportdate=^%s^)" % date
    elif tables == 2:
        category_type = 'YJBB20_'
        st = 'ldate'
        sr = -1
        filter = "(securitytypecode in ('058001001','058001002'))(rdate=^%s^)" % date
    elif tables == 3:
        category_type = 'YJBB20_'
        st = 'ndate'
        sr = -1
        # The original hardcoded enddate=^2018-06-30^ here; use the chosen date instead
        filter = "(IsLatest='T')(enddate=^%s^)" % date
    elif tables == 4:
        category_type = 'YJBB20_'
        st = 'frdate'
        sr = 1
        filter = "(securitytypecode ='058001001')(reportdate=^%s^)" % date
    else:
        category_type = 'CWBB_'
        st = 'noticedate'
        sr = -1
        filter = '(reportdate=^%s^)' % date
    category_type = category_type + category
    yield {
        'date': date,
        'category': dict_tables[tables],
        'category_type': category_type,
        'st': st,
        'sr': sr,
        'filter': filter
    }


# 2 Choose the page range to scrape
def page_choose(page_all):
    start_page = int(input('Enter the start page:\n'))
    nums = input('Enter how many pages to download (press Enter for all):\n')
    print('*' * 80)
    # A number means that many pages; bare Enter means all remaining pages
    if nums.isdigit():
        end_page = start_page + int(nums)
    elif nums == '':
        end_page = int(page_all.group(1))
    else:
        raise SystemExit('Invalid page count')
    # Yield the page range for the main program
    yield {
        'start_page': start_page,
        'end_page': end_page
    }


# 3 Fetch one page of the table
def get_table(date, category_type, st, sr, filter, page):
    params = {
        'type': category_type,  # report type, e.g. 'CWBB_LRB'
        'token': '70f12f2f4f091e459a279469fe49eca5',
        'st': st,    # sort field
        'sr': sr,    # sort direction
        'p': page,   # page number
        'ps': 50,    # rows per page
        'js': 'var LFtlXDqn={pages:(tp),data: (x)}',
        'filter': filter,
        # 'rt': 51294261  # optional
    }
    url = 'http://dcfm.eastmoney.com/em_mutisvcexpandinterface/api/js/get?'
    response = requests.get(url, params=params).text
    # Total number of pages
    pat = re.compile(r'var.*?{pages:(\d+),data:.*?')
    page_all = re.search(pat, response)
    print(page_all.group(1))
    # Extract the bare list rather than the whole {...}: json.loads fails on the
    # wrapper object (unquoted keys) but parses the inner list fine
    pattern = re.compile(r'var.*?data: (.*)}', re.S)
    items = re.search(pattern, response)
    data = json.loads(items.group(1))
    return page_all, data, page


# Write the header row with the csv module
def write_header(data, category):
    with open('{}.csv'.format(category), 'a', encoding='utf_8_sig', newline='') as f:
        headers = list(data[0].keys())
        writer = csv.writer(f)
        writer.writerow(headers)


def write_table(data, page, category):
    print('\nDownloading page %s' % page)
    for d in data:
        with open('{}.csv'.format(category), 'a', encoding='utf_8_sig', newline='') as f:
            w = csv.writer(f)
            w.writerow(d.values())


def main(date, category_type, st, sr, filter, page):
    func = get_table(date, category_type, st, sr, filter, page)
    data = func[1]
    page = func[2]
    write_table(data, page, category)  # category is a global set in __main__


if __name__ == '__main__':
    # Collect the query parameters
    for i in set_table():
        date = i.get('date')
        category = i.get('category')
        category_type = i.get('category_type')
        st = i.get('st')
        sr = i.get('sr')
        filter = i.get('filter')
    # Fetch page 1 once to learn the total page count
    constant = get_table(date, category_type, st, sr, filter, 1)
    page_all = constant[0]
    for i in page_choose(page_all):
        start_page = i.get('start_page')
        end_page = i.get('end_page')
    # Write the header row
    write_header(constant[1], category)
    start_time = time.time()
    # Main scraping loop
    for page in range(start_page, end_page):
        main(date, category_type, st, sr, filter, page)
    end_time = time.time() - start_time
    print('Download complete')
    print('Elapsed: {:.1f} s'.format(end_time))
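The trickiest part of get_table() is that the server does not return plain JSON: the js parameter asks it to wrap the payload in a JavaScript assignment of the form var LFtlXDqn={pages:(tp),data: (x)}. Below is a minimal, self-contained sketch of the two-regex extraction, run against a fabricated response string (the field names scode/sname are illustrative, not guaranteed API output):

import json
import re

# Fabricated response in the shape produced by the 'js' template parameter
sample = 'var LFtlXDqn={pages:3,data: [{"scode":"600000","sname":"浦发银行"}]}'

# First regex pulls out the total page count
page_all = re.search(r'var.*?{pages:(\d+),data:.*?', sample)
print(page_all.group(1))  # -> 3

# Second regex grabs the JSON list; the wrapper object is not valid JSON
# (unquoted keys), but the inner list is
items = re.search(re.compile(r'var.*?data: (.*)}', re.S), sample)
rows = json.loads(items.group(1))
print(rows[0]['sname'])   # -> 浦发银行

The greedy (.*) backtracks to the last closing brace, so the wrapper's final } is excluded and what remains is a well-formed JSON array.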

Full code: https://github.com/makcyun/eastmoney_spider
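The original source also imported pandas without using it, presumably for inspecting results. A short sketch of loading a downloaded file back for a sanity check; it assumes report type 1 was chosen, so the output file is 业绩报表.csv:

import pandas as pd

# Filename assumes report type 1 (业绩报表) was selected above
df = pd.read_csv('业绩报表.csv', encoding='utf_8_sig')
print(df.shape)   # (rows, columns) written by the scraper
print(df.head())  # first few records as a sanity check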

Published: 2018-10-13. Original article: https://kuaibao.qq.com/s/20181013G1EQ5V00?refer=cp_1026
