"""Cross-check area codes from a local JSON tree against the MCA web table.

The script flattens a three-level JSON area tree, scrapes the table published
at ``_MCA_URL``, joins both on (first_name, name) and prints the rows whose
codes disagree.
"""
import json
import sys

import numpy as np
import pandas as pd

# NOTE: ``requests`` and ``lxml`` are imported inside the helpers that need
# them, so the JSON and comparison helpers stay importable on machines that
# do not have the scraping dependencies installed.

_MCA_URL = 'http://www.mca.gov.cn/article/sj/xzqh/2020/20201201.html'

# Browser-like request headers.  The conditional-request headers
# (If-Modified-Since / If-None-Match) were deliberately dropped: they are
# stale browser cache validators and can make the server answer
# "304 Not Modified" with an empty body, leaving nothing to parse.
_MCA_HEADERS = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'zh-CN,zh;q=0.9',
    'Cache-Control': 'max-age=0',
    'Connection': 'keep-alive',
    'Host': 'www.mca.gov.cn',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.69 Safari/537.36',
}

# Seconds to wait for the MCA server before giving up.
_MCA_TIMEOUT = 30

# NOTE(review): the original literal was "C:UsersPCDesktoparea.json"; the
# backslashes look like they were lost when the script was copied -- confirm.
_DEFAULT_AREA_JSON = r"C:\Users\PC\Desktop\area.json"


def resolveJson(path):
    """Flatten the three-level area tree stored in the JSON file at *path*.

    The file must hold a list of top-level nodes, each with a ``name`` and a
    ``children`` list; every child may itself have a ``children`` list whose
    entries carry ``name`` and ``code``.  Only the third-level entries are
    emitted.  Nodes without a ``children`` key are treated as having none.

    Args:
        path: Location of the JSON file.

    Returns:
        A DataFrame (dtype ``object``) with the columns ``first_name`` (name
        of the top-level node), ``name`` and ``pcode1`` (third-level name and
        code).  The frame is also printed.
    """
    # A context manager guarantees the handle is closed even if parsing fails.
    with open(path, "rb") as file:
        tree = json.load(file)

    data_list = []
    for top in tree:
        first_name = top['name']
        for middle in top.get('children', []):
            for leaf in middle.get('children', []):
                data_list.append((first_name, leaf['name'], leaf['code']))

    end_data = pd.DataFrame(
        data_list, columns=["first_name", "name", "pcode1"], dtype=object
    )
    print(end_data)
    return end_data


def _first_text(node, xpath):
    """Return the first stripped string matched by *xpath*, or None if none."""
    matches = node.xpath(xpath)
    return matches[0].strip() if matches else None


def _fetch_mca_html():
    """Download the MCA page and return its text decoded as UTF-8."""
    import requests

    response = requests.get(_MCA_URL, headers=_MCA_HEADERS, timeout=_MCA_TIMEOUT)
    # Fail loudly on HTTP errors instead of parsing an error page.
    response.raise_for_status()
    response.encoding = 'utf-8'
    return response.text


def _parse_mca_rows(page_html):
    """Extract ``(name, code)`` tuples from the ``<tr height="19">`` rows."""
    from lxml import etree

    document = etree.HTML(page_html)
    if document is None:  # nothing parseable in the response
        return []

    rows = []
    for row in document.xpath('//tr[@height="19"]'):
        code = _first_text(row, './td[2]/text()')
        # Codes are padded with six zeros; rows without a code keep the
        # placeholder 0, exactly as before.
        pcode = code + '000000' if code is not None else 0

        # The name sits either directly in the third cell or inside a <span>.
        name = _first_text(row, './td[3]/text()')
        if name is None:
            name = _first_text(row, './td[3]/span/text()')
        if name is None:
            # No name in either place: skip the row rather than abort the
            # whole scrape with an IndexError.
            continue
        rows.append((name, pcode))
    return rows


def _label_with_group_head(rows):
    """Tag every row with the name of the lowest-coded row of its group.

    Rows are grouped by the first two characters of their code; within each
    group the row with the smallest code supplies ``first_name`` (presumably
    the province-level entry -- confirm against the page layout).
    """
    end_data = pd.DataFrame(rows, columns=["name", "pcode2"], dtype=object)
    end_data['pcode2'] = end_data['pcode2'].astype(str)
    end_data['pcode3'] = end_data['pcode2'].str[:2]

    labelled = []
    for _, group in end_data.groupby("pcode3"):
        group_data = group.sort_values(
            by='pcode2', ascending=True, na_position='first'
        )
        first_name = group_data.iloc[0]['name']
        labelled.extend(
            (first_name, name, pcode2)
            for name, pcode2 in zip(group_data['name'], group_data['pcode2'])
        )
    return pd.DataFrame(
        labelled, columns=["first_name", "name", "pcode2"], dtype=object
    )


def mca_data():
    """Scrape the MCA table and return it labelled with group-head names.

    Returns:
        A DataFrame (dtype ``object``) with the columns ``first_name``,
        ``name`` and ``pcode2`` (the scraped code followed by ``000000``, as
        a string).  The frame is also printed.

    Raises:
        requests.RequestException: If the download fails, times out or the
            server answers with an HTTP error status.
    """
    df1 = _label_with_group_head(_parse_mca_rows(_fetch_mca_html()))
    print(df1)
    return df1


def function(a, b):
    """Return 1 when *a* equals *b*, otherwise 0 (NaN never equals NaN)."""
    return 1 if a == b else 0


def main(path=_DEFAULT_AREA_JSON):
    """Compare the JSON codes with the MCA codes and print the mismatches.

    Args:
        path: Location of the area JSON file.
    """
    jsondata = resolveJson(path)
    mcadata = mca_data()

    # Left join: every JSON entry is kept; entries missing from the MCA table
    # get NaN in pcode2 and therefore show up as mismatches.
    df = jsondata.merge(mcadata, on=["name", "first_name"], how="left")
    df['pcode1'] = df['pcode1'].astype(float)
    df['pcode2'] = df['pcode2'].astype(float)
    # A plain list (instead of DataFrame.apply) also works on an empty frame.
    df['check'] = [function(a, b) for a, b in zip(df['pcode1'], df['pcode2'])]
    print(df)

    result = df[df['check'] == 0]
    print(result)


if __name__ == '__main__':
    # An optional first command-line argument overrides the default path.
    main(sys.argv[1] if len(sys.argv) > 1 else _DEFAULT_AREA_JSON)
# 欢迎分享,转载请注明来源:内存溢出
# 评论列表(0条)