python连接Mysql共享汽车行业案例分析

python连接Mysql共享汽车行业案例分析,第1张

概述学习记录——共享汽车分析前言本文仅记录个人学习过程写的代码供自己复盘使用,如果对你有帮助和启发那就更好了,新人作品,欢迎讨论和斧正,大神轻喷。纯代码实现,无结论。一些相似的维度举一反三就行。需要数据集练习的可以留言目标python,mysql,matplotlib代码练习常用数据 学习记录——共享汽车分析前言本文仅记录个人学习过程写的代码供自己复盘使用,如果对你有帮助和启发那就更好了,新人作品,欢迎讨论和斧正,大神轻喷。纯代码实现,无结论。一些相似的维度举一反三就行。需要数据集练习的可以留言目标python, MysqL, matplotlib 代码练习常用数据指标的实现数据导入

python 连接数据库 MysqL

#获取数据方法import pyMysqL import pandas as pd from matplotlib import pyplot as pltfrom matplotlib import Font_managerdef get_MysqL_data(DB, sql):    conn = pyMysqL.connect(host=DB['host'], port=DB['port'], user=DB['user'], password=DB['password'], database=DB['dbname'])     # 创建游标    cursor = conn.cursor()    # 执行SQL语句    cursor.execute(sql)    # 调出数据    data = cursor.fetchall()    # cols为字段信息     cols = cursor.description    # 将数据转换为DataFrame    col = []    for i in cols:        col.append(i[0])    # data转成List形式     data = List(map(List, data))    data = pd.DataFrame(data,columns=col)    # 关闭游标以及连接    cursor.close()    conn.close()    return data
日期时间处理——获取年月日时维度
def time_data(data):            data['下单年'] = data["下单时间"].astype("str").apply(lambda x: x.split('-')[0])     data['下单年月'] = data["下单时间"].astype("str").apply(lambda x: x.split('-')[0]+"/"+x.split('-')[1])    data['下单月'] = data["下单时间"].astype("str").apply(lambda x: x.split('-')[1])    data['下单日'] = data["下单时间"].astype("str").apply(lambda x: x.split('-')[2])     data['下单时'] = data["下单时间"].astype("str").apply(lambda x: x.split('-')[0])     data['付款年'] = data["付款日期"].astype("str").apply(lambda x: x.split('-')[0])     data['付款年月'] = data["付款日期"].astype("str").apply(lambda x: x.split('-')[0]+"/"+x.split('-')[1])    data['付款月'] =data["付款日期"].astype("str").apply(lambda x: x.split('-')[1])    data['付款日'] = data["付款日期"].astype("str").apply(lambda x: x.split('-')[2])     data['付款时'] = data["付款日期"].astype("str").apply(lambda x: x.split('-')[0])     return dataDB = { 'host':"127.0.0.1",'port':3306, 'user':'root','password':'password','dbname':'datebase'}
留存分析
# 留存分析sql='''select *,concat(ROUND(100*次日留存用户数/日活跃用户数),"%") 次日留存率,concat(ROUND(100*三日留存用户数/日活跃用户数),"%") 三日留存率,concat(ROUND(100*七日留存用户数/日活跃用户数),"%") 七日留存率from(select date(a.`付款时间`) 日期 ,DATE_FORMAT(a.`付款时间`,"%H") 时间,COUNT(disTINCT a.`用户ID`) 日活跃用户数,COUNT(disTINCT b.`用户ID`) 次日留存用户数,COUNT(disTINCT c.`用户ID`) 三日留存用户数,COUNT(disTINCT d.`用户ID`) 七日留存用户数from paper_data a left join paper_data b on  a.`用户ID`=b.`用户ID`and date(b.`付款时间`)=date(a.`付款时间`)+1left join paper_data con  a.`用户ID`=c.`用户ID`and date(c.`付款时间`)=date(a.`付款时间`)+3left join paper_data don  a.`用户ID`=d.`用户ID`and date(d.`付款时间`)=date(a.`付款时间`)+7where a.`付款时间` between "2020-01-01" and "2020-01-31"GROUP BY date(a.`付款时间`))f;'''liuchun = get_MysqL_data(DB, sql)liuchunx=liuchun['日期']y1=liuchun['次日留存率']y2=liuchun['三日留存率']y3=liuchun['七日留存率']plt.figure(figsize=(25,10),dpi=80)my_Font = Font_manager.FontPropertIEs(fname="C:/Users/wty_pc/Anaconda3/lib/site-packages/matplotlib/mpl-data/Fonts/ttf/STSONG.TTF",size = 18)rects1=plt.plot(x,y1,color="red",Alpha=0.5,line,linewidth=3,label="次日留存率")rects2=plt.plot(x,y2,color="g",Alpha=0.5,line,linewidth=3,label="三日留存率")rects3=plt.plot(x,y3,color="b",Alpha=0.5,line,linewidth=3,label="七日留存率")rects4=plt.legend(prop=my_Font,loc="best")for i in range(len(x)):        plt.text(x[i],y1[i],y1[i],Fontsize=15,ha="center")for i in range(len(x)):        plt.text(x[i],y2[i],y2[i],Fontsize=15,ha="center")for i in range(len(x)):        plt.text(x[i],y3[i],y3[i],Fontsize=15,ha="center")        plt.show()

订单分析

订单分析(可以把订单换成其他的,比如用户数,销售量等数据)

# 1.根据下单的时间来统计订单量# (1)年维度sql='''select date_format(付款时间,"%Y")下单年 ,count(订单ID) as 订单量 from paper_data group by date_format(付款时间,"%Y")'''order_num_year=get_MysqL_data(DB, sql)my_Font = Font_manager.FontPropertIEs(fname="C:/Users/wty_pc/Anaconda3/lib/site-packages/matplotlib/mpl-data/Fonts/ttf/STSONG.TTF",size = 18)x=order_num_year["下单年"]y=order_num_year["订单量"]plt.figure(figsize=(20,8),dpi=80)rects = plt.bar(x,y,wIDth=0.3,color="b",label="2019")# 标注for rect in rects:    height=rect.get_height()    plt.text(rect.get_x()+rect.get_wIDth()/2,height+0.3,str(height),ha="center")plt.Title("年度订单量",FontPropertIEs=my_Font)plt.xlabel("年份",FontPropertIEs=my_Font)plt.ylabel("订单量",FontPropertIEs=my_Font)plt.legend(prop=my_Font,loc="best")plt.show()

# 订单分析# 1.根据下单的时间来统计订单量# (2)年月维度sql='''select date_format(付款时间,"%Y/%m")下单年月 ,count(订单ID) as 订单量 from paper_data group by date_format(付款时间,"%Y/%m")'''order_num_month=get_MysqL_data(DB, sql)my_Font = Font_manager.FontPropertIEs(fname="C:/Users/wty_pc/Anaconda3/lib/site-packages/matplotlib/mpl-data/Fonts/ttf/STSONG.TTF",size = 18)x=order_num_month["下单年月"]y=order_num_month["订单量"]plt.figure(figsize=(20,8),dpi=80)rects = plt.bar(x,y,wIDth=0.3,color=["r","g","b"])# 标注for rect in rects:    height=rect.get_height()    plt.text(rect.get_x()+rect.get_wIDth()/2,height+0.3,str(height),ha="center")plt.show()

# (3)年月、城市维度(折线图)sql='''select date_format(下单时间,"%Y/%m")下单年月 ,城市,count(订单ID) as 订单量 from paper_datagroup by date_format(下单时间,"%Y/%m"),城市order by date_format(下单时间,"%Y/%m");'''order_num_month1=get_MysqL_data(DB, sql)#把每个城市的数据取出来order_month_city=order_num_month1.groupby(["城市"])order_shanghai=order_month_city.get_group("上海").reset_index(drop="true")order_beijing=order_month_city.get_group("北京").reset_index(drop="true")order_hangzhou=order_month_city.get_group("杭州").reset_index(drop="true")order_xian=order_month_city.get_group("西安").reset_index(drop="true")# matplotlib画图部分my_Font = Font_manager.FontPropertIEs(fname="C:/Users/wty_pc/Anaconda3/lib/site-packages/matplotlib/mpl-data/Fonts/ttf/STSONG.TTF",size = 18)plt.figure(figsize=(25,10),dpi=80)plt.plot(order_shanghai["下单年月"],order_shanghai["订单量"],marker='o',color="r",label="上海订单量")plt.plot(order_beijing["下单年月"],order_beijing["订单量"],marker='v',color="g",label="北京订单量")plt.plot(order_hangzhou["下单年月"],order_hangzhou["订单量"],marker='^',color="b",label="杭州订单量")plt.plot(order_xian["下单年月"],order_xian["订单量"],marker='.',color="y",label="西安订单量")plt.xlabel("时间",FontPropertIEs=my_Font)plt.ylabel("订单量",FontPropertIEs=my_Font)plt.Title("各城市每月订单量",FontPropertIEs=my_Font)plt.legend(prop=my_Font,loc="best")#  df.IDxmax()默认值是0,求列最大值的行索引,1就是反过来def city_max(df):    return df.IDxmax()# 最大值标记plt.text(order_shanghai["下单年月"].iloc[city_max(order_shanghai["订单量"])],order_shanghai.max()["订单量"]+5,"上海最大订单:{}".format(order_shanghai.max()["订单量"]),Fontsize=12,ha="center",color="r",FontPropertIEs=my_Font)plt.text(order_beijing["下单年月"].iloc[city_max(order_beijing["订单量"])],order_beijing.max()["订单量"]+5,"北京最大订单:{}".format(order_beijing.max()["订单量"]),Fontsize=12,ha="center",color="g",FontPropertIEs=my_Font)plt.text(order_hangzhou["下单年月"].iloc[city_max(order_hangzhou["订单量"])],order_hangzhou.max()["订单量"]+5,"杭州最大订单:{}".format(order_hangzhou.max()["订单量"]),Fontsize=12,ha="center",color="b",FontPropertIEs=my_Font)plt.text(order_xian["下单年月"].iloc[city_max(order_xian["订单量"])],order_xian.max()["订单量"]+5,"西安最大订单:{}".format(order_xian.max()["订单量"]),Fontsize=12,ha="center",color="y",FontPropertIEs=my_Font)plt.show()# -----------------------------# 每年、每月的一样的做法,改下日期字段就好

# (3)年城市维度(柱状图)sql='''select date_format(付款时间,"%Y")下单年,城市,count(订单ID) as 订单量 from paper_datagroup by date_format(付款时间,"%Y"),城市order by date_format(付款时间,"%Y");'''order_num_year=get_MysqL_data(DB, sql)# order_num_month.describe()# 按城市拆分数据order_year_city=order_num_year.groupby(["城市"])order_shanghai=order_year_city.get_group("上海").reset_index(drop="true")order_beijing=order_year_city.get_group("北京").reset_index(drop="true")order_hangzhou=order_year_city.get_group("杭州").reset_index(drop="true")order_xian=order_year_city.get_group("西安").reset_index(drop="true")# -----matplotlib画图部分------my_Font = Font_manager.FontPropertIEs(fname="C:/Users/wty_pc/Anaconda3/lib/site-packages/matplotlib/mpl-data/Fonts/ttf/STSONG.TTF",size = 18)plt.figure(figsize=(25,10),dpi=80)# 设置柱子位置,通过偏移量a来实现,# 由于时间是datetime类型或者object(str)类型的不能直接加偏移量所以这里用长度长度实现柱子的位置,然后把刻度值重新设置 a=0.2 x_01=List(range(len(order_shanghai["下单年"])))x_02=[i+a for i in x_01]x_03=[i-2*a for i in x_01]x_04=[i-a for i in x_01]# 画图rects1=plt.bar(x_01,order_shanghai["订单量"],a,color="r",align='edge',label="上海订单量")rects2=plt.bar(x_02,order_beijing["订单量"],a,color="g",align='edge',label="北京订单量")rects3=plt.bar(x_03,order_hangzhou["订单量"],a,color="b",align='edge',label="杭州订单量")rects4=plt.bar(x_04,order_xian["订单量"],a,color="y",align='edge',label="西安订单量")# x轴刻度plt.xticks(x_01,labels=order_shanghai["下单年"])# 轴名称plt.xlabel("年份",FontPropertIEs=my_Font)plt.ylabel("订单量",FontPropertIEs=my_Font)# 标题plt.Title("各城市每月订单量",FontPropertIEs=my_Font)# 图例plt.legend(prop=my_Font,loc="best")# 标记def mark_bar(rects):    for rect in rects:        height=rect.get_height()        plt.text(rect.get_x()+rect.get_wIDth()/2,height+0.3,str(height),Fontsize=15,ha="center")mark_bar(rects1)mark_bar(rects2)mark_bar(rects3)mark_bar(rects4)plt.show()

# 3、小时订单量分布情况sql= '''SELECT date_format(下单时间,"%H")  下单时, count(`订单ID`) 订单量from paper_datawhere  date_format(下单时间,"%Y-%m")="2020-01"GROUP BY date_format(下单时间,"%H") ORDER BY date_format(下单时间,"%H") ;'''h_dingdan=get_MysqL_data(DB, sql)my_Font = Font_manager.FontPropertIEs(fname="C:/Users/wty_pc/Anaconda3/lib/site-packages/matplotlib/mpl-data/Fonts/ttf/STSONG.TTF",size = 18)plt.figure(figsize=(25,10),dpi=80)x=h_dingdan["下单时"]y=h_dingdan["订单量"]plt.plot(x,y,color="r",label="每小时订单量")# 设置标记for a,b  in zip(x,y):    #a x坐标, b坐标, (a,b) 一个点     # ha 水平方向  va 垂直方向  Fontsize 大小     plt.text(a,b,b ,ha='center',va='bottom', Fontsize=12)#设置刻度xtick_labels = ['{}:00'.format(i) for i in x]plt.xticks(x,xtick_labels)#设置坐标名称plt.xlabel("时间",FontPropertIEs=my_Font)plt.ylabel("订单量",FontPropertIEs=my_Font)#设置表名plt.Title("1月每小时订单量",FontPropertIEs=my_Font)#设置图例plt.legend(prop=my_Font,loc="best")plt.show()

RFM模型
#RMF分层sql='''SELECT 用户ID,COUNT(订单ID) AS F,round(sum(实收金额),2) AS M,date_format(max(付款时间),"%Y/%m/%d") as R             FROM paper_data GROUP BY 用户ID ORDER BY 用户ID            ;'''User_layer=get_MysqL_data(DB, sql)# RMF评分公式# User_layer["R1"]=((pd.to_datetime("2020/03-01")-pd.to_datetime(User_layer["R"])) 后面的处理方式是为了把timedelta类型转换成Int类型User_layer["R1"]=((pd.to_datetime("2020/03-01")-pd.to_datetime(User_layer["R"]))/pd.timedelta(1,"D")).fillna(0).astype(int)User_layer["RR"]=(pd.to_datetime(User_layer["R"])-pd.to_datetime(min(User_layer["R"])))/((pd.to_datetime(max(User_layer["R"]))-pd.to_datetime(min(User_layer["R"]))))User_layer["FF"]=(User_layer["F"]-min(User_layer["F"]))/(max(User_layer["F"])-min(User_layer["F"]))User_layer["MM"]=(User_layer["M"]-min(User_layer["M"]))/(max(User_layer["M"])-min(User_layer["M"]))User_layer["score"]=100*(0.25*User_layer["FF"]+0.6*User_layer["MM"]+0.15*User_layer["RR"])User_layer
用户IDFMRR1RRFFMMscore
01323517.02021/01/25-3300.9805190.6739130.66395371.392801
110292600.02021/01/23-3280.9779220.6086960.49034559.306896
2100313906.02021/01/11-3160.9623380.6521740.73759974.995376
31000242255.02021/01/07-3120.9571430.5000000.42502852.358847
410009171.02019/04/093270.1272730.0000000.0115492.602010
..............................
59249977119.02020/10/27-2400.8636360.0000000.00170413.056779
5925998343634.02021/02/06-3420.9961040.7173910.68610474.042566
5926999333356.02021/01/27-3320.9831170.6956520.63347270.146388
592799931125.02020/11/04-2480.8740260.0000000.02177214.416713
592899941135.02020/08/22-1740.7779220.0000000.02366513.088748

5929 rows × 9 columns

# 根据权重score四分位分组的依据User_describe=User_layer.describe()User_describe
FMR1RRFFMMscore
count5929.0000005929.0000005929.0000005929.0000005929.0000005929.0000005929.000000
mean10.9790861148.957160-117.5140830.7045640.2169370.21563028.929667
std13.6568801440.872815228.5950260.2968770.2968890.27278926.743116
min1.00000010.000000-345.0000000.0000000.0000000.0000000.185039
25%1.000000112.000000-318.0000000.4896100.0000000.0193118.927758
50%2.000000198.000000-208.0000000.8220780.0217390.03559314.496745
75%26.0000002679.00000048.0000000.9649350.5434780.50530158.371452
max47.0000005292.000000425.0000001.0000001.0000001.00000097.960474
# df遍历后的结果最好用List/dict存放起来。df本身一个一个添加数据遍历效率不高。而且难取数,最好用List/dict整体取赋值List_level=[]for index ,rows in User_layer.iterrows():#     print("索引是:",index,"数据是:",rows)    if  rows["score"]<=User_describe.loc["25%"]["score"]:        rows["level"]="四等"            elif  rows["score"]<=User_describe.loc["50%"]["score"]:        rows["level"]="三等"            elif  rows["score"]<=User_describe.loc["75%"]["score"]:        rows["level"]="二等"            else:        rows["level"]="一等"      List_level.append(rows["level"])    User_layer['level']=List_levelUser_layer
用户IDFMRR1RRFFMMscorelevel
01323517.02021/01/25-3300.9805190.6739130.66395371.392801一等
110292600.02021/01/23-3280.9779220.6086960.49034559.306896一等
2100313906.02021/01/11-3160.9623380.6521740.73759974.995376一等
31000242255.02021/01/07-3120.9571430.5000000.42502852.358847二等
410009171.02019/04/093270.1272730.0000000.0115492.602010四等
.................................
59249977119.02020/10/27-2400.8636360.0000000.00170413.056779三等
5925998343634.02021/02/06-3420.9961040.7173910.68610474.042566一等
5926999333356.02021/01/27-3320.9831170.6956520.63347270.146388一等
592799931125.02020/11/04-2480.8740260.0000000.02177214.416713三等
592899941135.02020/08/22-1740.7779220.0000000.02366513.088748三等

5929 rows × 10 columns

用户标签
# 用户打标签sql='''select 用户ID,count(*) as order_num,count(date_format(付款时间,"%M")) 月数,                  sum(case when 取车类型='自取' then 1 else 0 end) as 自取次数,                  sum(case when 优惠金额>1 then 1 else 0 end) as 优惠券使用次数,                  sum(case when 非网点还车费>0 then 1 else 0 end) as  还车次数                  from paper_data                   group by 用户ID                  ;'''bq=get_MysqL_data(DB, sql)data_total=pd.merge(User_layer,bq,on="用户ID")data_total# print(data_total["FF"].dtypes,data_total["自取次数"].dtypes)for index,row in data_total.iterrows():# 取车    if row["FF"]==0:        row["FF"]= row["FF"]+0.0001# Int/浮点数会报错,注意转换类型        if float(row["自取次数"])/row["FF"]>0.5:            data_total["取车偏好"]="时租居多"        else:            data_total["取车偏好"]="日租居多"    #       网点还车          if float(row["还车次数"])/row["FF"]>0.5:            data_total["还车偏好"]="非网点还车"        else:            data_total["还车偏好"]="网点还车"    #     月均订单        if row["FF"]/row["月数"]>6:            data_total["月均单量"]="月均单量大于6"        else:            data_total["月均单量"]="月均单量小于6"data_total   
用户IDFMRR1RRFFMMscorelevelorder_num月数自取次数优惠券使用次数还车次数取车偏好还车偏好月均单量
01323517.02021/01/25-3300.9805190.6739130.66395371.392801一等3232112313时租居多网点还车月均单量小于6
110292600.02021/01/23-3280.9779220.6086960.49034559.306896一等2929162412时租居多网点还车月均单量小于6
2100313906.02021/01/11-3160.9623380.6521740.73759974.995376一等313112217时租居多网点还车月均单量小于6
31000242255.02021/01/07-3120.9571430.5000000.42502852.358847二等2424131511时租居多网点还车月均单量小于6
410009171.02019/04/093270.1272730.0000000.0115492.602010四等11101时租居多网点还车月均单量小于6
.........................................................
59249977119.02020/10/27-2400.8636360.0000000.00170413.056779三等11010时租居多网点还车月均单量小于6
5925998343634.02021/02/06-3420.9961040.7173910.68610474.042566一等3434182413时租居多网点还车月均单量小于6
5926999333356.02021/01/27-3320.9831170.6956520.63347270.146388一等3333121910时租居多网点还车月均单量小于6
592799931125.02020/11/04-2480.8740260.0000000.02177214.416713三等11011时租居多网点还车月均单量小于6
592899941135.02020/08/22-1740.7779220.0000000.02366513.088748三等11100时租居多网点还车月均单量小于6

5929 rows × 18 columns

用户生命周期
def user_life(data,a,b,c):    List_1=[]    for index,row in data.iterrows():#         print(row[a])#         print(index)        if row[a]==1:            row[c]="引入期"        elif row[a]>1 and row[a]<3 and row[b]<=200:            row[c]="成长期"        elif row[a]>=3 and row[b]<=300:            row[c]="休眠期"        elif row[b]>300:            row[c]="流失期"        else:            row[c]="其他"#         print(row[c])        List_1.append(row)    data_1=pd.DataFrame(List_1)#     print(data_1)    return data_1Userlife=pd.DataFrame()Userlife=user_life(data_total,"F","R1","生命周期")Userlife.head(10)
用户IDFMRR1RRFFMMscorelevelorder_num月数自取次数优惠券使用次数还车次数取车偏好还车偏好月均单量生命周期
01323517.02021/01/25-3300.9805190.6739130.66395371.392801一等3232112313时租居多网点还车月均单量小于6休眠期
110292600.02021/01/23-3280.9779220.6086960.49034559.306896一等2929162412时租居多网点还车月均单量小于6休眠期
2100313906.02021/01/11-3160.9623380.6521740.73759974.995376一等313112217时租居多网点还车月均单量小于6休眠期
31000242255.02021/01/07-3120.9571430.5000000.42502852.358847二等2424131511时租居多网点还车月均单量小于6休眠期
410009171.02019/04/093270.1272730.0000000.0115492.602010四等11101时租居多网点还车月均单量小于6引入期
51001283074.02021/01/24-3290.9792210.5869570.58008364.167223一等2828121812时租居多网点还车月均单量小于6休眠期
610014149.02020/08/25-1770.7818180.0000000.00738412.170287三等11110时租居多网点还车月均单量小于6引入期
710016167.02020/07/19-1400.7337660.0000000.01079111.653976三等11010时租居多网点还车月均单量小于6引入期
81002293316.02021/01/03-3080.9519480.6086960.62589967.050569一等2929121915时租居多网点还车月均单量小于6休眠期
910023181.02020/09/09-1920.8012990.0000000.01344212.825993三等11110时租居多网点还车月均单量小于6引入期
"休眠期" in Userlife["生命周期"].values
True
zq=Userlife.groupby("生命周期")yr=zq.get_group("引入期")["生命周期"].count()cz=zq.get_group("成长期")["生命周期"].count()xm=zq.get_group("休眠期")["生命周期"].count()ls=zq.get_group("流失期")["生命周期"].count()x=["引入期","成长期","休眠期","流失期"]y=[yr,cz,xm,ls]for i in range(len(x)):    print(x[i],y[i])my_Font = Font_manager.FontPropertIEs(fname="C:/Users/wty_pc/Anaconda3/lib/site-packages/matplotlib/mpl-data/Fonts/ttf/STSONG.TTF",size = 18)plt.figure(figsize=(25,10),dpi=80)plt.rcParams['Font.sans-serif']=['STSONG'] #中文标签设置# plt.rcParams['axes.unicode_minus']=False  #用来正常显示坐标负号rects=plt.bar(x,y,0.3,color=["r","g","b","y"])plt.xticks(x,x,FontpropertIEs=my_Font)for rect in rects:    height=rect.get_height()    plt.text(rect.get_x()+rect.get_wIDth()/2,height+1,str(height),Fontsize=18,ha="center")plt.show()
引入期 2528成长期 895休眠期 2414流失期 32

用户评分统计
# 评分 #0分代表未参与评分,可以统计用户的参与度sql='''select date_format(付款时间,"%Y/%m") 付款年月,评分,count(distinct 用户ID) as user_num,count(*) as order_num                    from paper_data group by date_format(付款时间,"%Y/%m"),评分;'''pinfen_num=get_MysqL_data(DB,sql)pinfen_num["评分"]=pinfen_num["评分"].astype(int)pinfen_num["参与度"]=pinfen_num["user_num"]/pinfen_num["order_num"]pinfen_num.dtypes
付款年月          object评分             int32user_num       int64order_num      int64参与度          float64dtype: object
# 用户评分参与度=有评分的用户/下单用户量import numpy as np# 按评分分组pf_group=pinfen_num.groupby(["评分"])# ID_name# 取出分组的值# 方法一List_pf=[]for key,value in pf_group:    List_pf.append(value.reset_index(drop="true"))# 方法二格式化成列表或者字典,详见后面一个代码块x=List_pf[0]["付款年月"]y=[]for i in range(len(List_pf)):    y.append(List_pf[i]["参与度"])# print("x:",x.values)# print("y:",y[1])my_Font = Font_manager.FontPropertIEs(fname="C:/Users/wty_pc/Anaconda3/lib/site-packages/matplotlib/mpl-data/Fonts/ttf/STSONG.TTF",size = 15)fig=plt.figure(figsize=(20,40),dpi=80)colors=["r","g","b"]for i in range(1,12):    ax=fig.add_subplot(11,1,i)    ax.plot(x,y[i-1],color=colors[np.random.randint(3)],label="{}分".format(i-1))    x_label="评分为{}的趋势".format(i-1)    ax.set_xlabel(x_label,FontpropertIEs=my_Font)    ax.set_ylabel("参与度",FontpropertIEs=my_Font)    ax.legend() #调整子图间距    plt.tight_layout()plt.show()

groupby取数的技巧
# groupby 格式化成列表 为转换成字典做铺垫a=List(pinfen_num.groupby(["评分"]))# 列表的每个元素是个元组,每个元组的值为分组键和值(datafarme)a[0][1].reset_index(drop = True)# print(type(a[1]))
付款年月评分user_numorder_num参与度
02019/0102302410.954357
12019/0202062130.967136
22019/0302392580.926357
32019/0402092160.967593
42019/0502132230.955157
52019/0602172300.943478
62019/0702292390.958159
72019/0802162220.972973
82019/0902142290.934498
92019/1002322440.950820
102019/1102252350.957447
112019/1202392470.967611
122020/0102712870.944251
132020/0201872000.935000
142020/0302212300.960870
152020/0402162300.939130
162020/0502192270.964758
172020/0602072180.949541
182020/0702242320.965517
192020/0802392500.956000
202020/0902242330.961373
212020/1002402500.960000
222020/1102202270.969163
232020/1202182300.947826
242021/0102382510.948207
252021/02056570.982456
# groupby 格式化成字典a=dict(List(pinfen_num.groupby(["评分"])))# type(a[0])a[0].reset_index(drop=True)
付款年月评分user_numorder_num参与度
02019/0102302410.954357
12019/0202062130.967136
22019/0302392580.926357
32019/0402092160.967593
42019/0502132230.955157
52019/0602172300.943478
62019/0702292390.958159
72019/0802162220.972973
82019/0902142290.934498
92019/1002322440.950820
102019/1102252350.957447
112019/1202392470.967611
122020/0102712870.944251
132020/0201872000.935000
142020/0302212300.960870
152020/0402162300.939130
162020/0502192270.964758
172020/0602072180.949541
182020/0702242320.965517
192020/0802392500.956000
202020/0902242330.961373
212020/1002402500.960000
222020/1102202270.969163
232020/1202182300.947826
242021/0102382510.948207
252021/02056570.982456
pinfen_num.groupby(["评分"]).size()
评分0     261     262     263     264     265     266     267     268     269     2610    26dtype: int64
pinfen_num.groupby(["评分"]).count()
付款年月user_numorder_num参与度
评分
026262626
126262626
226262626
326262626
426262626
526262626
626262626
726262626
826262626
926262626
1026262626
# 每个评分最小参与度pinfen_num["参与度"].groupby(pinfen_num["评分"]).min()pinfen_num.groupby(pinfen_num["评分"]).min()["参与度"]pinfen_num.groupby(pinfen_num["评分"])["参与度"].min()
评分0     0.9263571     0.9236952     0.9177493     0.9256204     0.9120005     0.9346946     0.9163187     0.9299618     0.9295779     0.94009210    0.928571name: 参与度, dtype: float64
# 按评分选取多列# pinfen_num.groupby(pinfen_num["评分"])[["参与度","order_num"]].min()pinfen_num[["参与度","order_num"]].groupby(pinfen_num["评分"]).min()
参与度order_num
评分
00.92635757
10.92369547
20.91774967
30.92562061
40.91200062
50.93469468
60.91631852
70.92996162
80.92957765
90.94009248
100.92857166
# 多维分组用unstack可以解成二维表(不堆叠)a=pinfen_num["参与度"].groupby([pinfen_num["付款年月"],pinfen_num["评分"]]).min().unstack()a# a.shape
评分012345678910
付款年月
2019/010.9543570.9500000.9710740.9668050.9678710.9763780.9375000.9615380.9357430.9506170.957806
2019/020.9671360.9389670.9724770.9520000.9463410.9439250.9313730.9444440.9345790.9545450.944954
2019/030.9263570.9451480.9280000.9587160.9397590.9409280.9668050.9306120.9563490.9653680.937799
2019/040.9675930.9668250.9704430.9469030.9255810.9786320.9907410.9495800.9776790.9699570.944700
2019/050.9551570.9539750.9469030.9471540.9745760.9521740.9576270.9491530.9626170.9442060.970588
2019/060.9434780.9236950.9485980.9369370.9488190.9581590.9570310.9652170.9533900.9513270.945378
2019/070.9581590.9504130.9688890.9422220.9500000.9541670.9578060.9358490.9483390.9579830.977876
2019/080.9729730.9375000.9451480.9568970.9736840.9680370.9215690.9707110.9617020.9489360.958506
2019/090.9344980.9778760.9488370.9592760.9728510.9739130.9707320.9508930.9547510.9465650.970588
2019/100.9508200.9699570.9824560.9519650.9274190.9448820.9541280.9605260.9464290.9502070.967611
2019/110.9574470.9495800.9827590.9608700.9423080.9655170.9655170.9601590.9346940.9644670.965217
2019/120.9676110.9406390.9489360.9372380.9670780.9405940.9586780.9299610.9466670.9617020.930736
2020/010.9442510.9456070.9618640.9778760.9434780.9769590.9419090.9655170.9510200.9482070.928571
2020/020.9350000.9678900.9631340.9631150.9733330.9444440.9729730.9347830.9694320.9484980.967136
2020/030.9608700.9859150.9414230.9434780.9414410.9775780.9739130.9636360.9468600.9648440.955157
2020/040.9391300.9272030.9382240.9314520.9521740.9459460.9617020.9591840.9455250.9400920.942387
2020/050.9647580.9354840.9177490.9446640.9780700.9592760.9333330.9638550.9547170.9635630.974684
2020/060.9495410.9590910.9527900.9256200.9515420.9638550.9530520.9641260.9581590.9626170.941909
2020/070.9655170.9662450.9492190.9517540.9120000.9457010.9245280.9556450.9674420.9427480.942652
2020/080.9560000.9397590.9260870.9906100.9310340.9442060.9163180.9547330.9763780.9613730.966038
2020/090.9613730.9600000.9589040.9563320.9449150.9809520.9353450.9372380.9600000.9553570.945607
2020/100.9600000.9353450.9432310.9523810.9267400.9417040.9649120.9360000.9314520.9547510.956140
2020/110.9691630.9688720.9523810.9466670.9189190.9601590.9422220.9647580.9295770.9551570.947581
2020/120.9478260.9515420.9579830.9466670.9675930.9517540.9481130.9581750.9517540.9487180.959016
2021/010.9482070.9363640.9612070.9317270.9501920.9346940.9670780.9566930.9610890.9574470.944444
2021/020.9824560.9787230.9701490.9836071.0000000.9705881.0000001.0000000.9692311.0000000.984848
#多维分组堆叠成多索引 b=pinfen_num["参与度"].groupby([pinfen_num["付款年月"],pinfen_num["评分"]]).min()b# b.shape
付款年月     评分2019/01  0     0.954357         1     0.950000         2     0.971074         3     0.966805         4     0.967871                 ...   2021/02  6     1.000000         7     1.000000         8     0.969231         9     1.000000         10    0.984848name: 参与度, Length: 286, dtype: float64
pinfen_num["参与度"].groupby(pinfen_num["评分"]).describe().unstack()
       评分count  0     26.000000       1     26.000000       2     26.000000       3     26.000000       4     26.000000               ...    max    6      1.000000       7      1.000000       8      0.977679       9      1.000000       10     0.984848Length: 88, dtype: float64
pinfen_num["参与度"].groupby(pinfen_num["评分"]).describe()
countmeanstdmin25%50%75%max
评分
026.00.9553720.0131880.9263570.9479210.9567230.9653270.982456
126.00.9524080.0166140.9236950.9391650.9502070.9666800.985915
226.00.9541870.0164780.9177490.9455860.9525850.9674500.982759
326.00.9524200.0155170.9256200.9437750.9518600.9591360.990610
426.00.9510660.0211230.9120000.9401800.9494090.9678021.000000
526.00.9575050.0143770.9346940.9445540.9561630.9699500.980952
626.00.9540350.0204230.9163180.9386020.9573290.9664831.000000
726.00.9547300.0152480.9299610.9456210.9574340.9638011.000000
826.00.9532910.0132830.9295770.9464880.9540530.9615490.977679
926.00.9565100.0118250.9400920.9487730.9549540.9623881.000000
1026.00.9549200.0148030.9285710.9445080.9556490.9668620.984848
总结

以上是内存溢出为你收集整理的python连接Mysql共享汽车行业案例分析全部内容,希望文章能够帮你解决python连接Mysql共享汽车行业案例分析所遇到的程序开发问题。

如果觉得内存溢出网站内容还不错,欢迎将内存溢出网站推荐给程序员好友。

欢迎分享,转载请注明来源:内存溢出

原文地址: http://outofmemory.cn/langs/1189004.html

(0)
打赏 微信扫一扫 微信扫一扫 支付宝扫一扫 支付宝扫一扫
上一篇 2022-06-03
下一篇 2022-06-03

发表评论

登录后才能评论

评论列表(0条)

保存