# 注:下面的几个例子改编自蔡驰聪《Python数据分析从入门到精通》。我最近在跟着这本书学习,所以一边阅读,一边根据自己的理解修改了书中的案例。
import pandas as pd
import numpy as np  # kept from the original tutorial; not used below

# from IPython.display import display

# Show every column when printing a DataFrame.
pd.set_option('display.max_columns', None)
# Show every row when printing a DataFrame.
pd.set_option('display.max_rows', None)
# Show up to 100 characters per cell value.
pd.set_option('display.max_colwidth', 100)


# --- Data used by the selection examples (slicing and .loc) ---

# "school" and "goal" are listed in `columns` but are not keys of the data
# dict, so pandas creates them as all-NaN columns.
students = pd.DataFrame(
    {
        "name": ["lucy", "rick", "mary", "john"],
        "age": [12, 14, 19, 20],
        "score": [72, 65, 76, 99],
    },
    index=[1, 2, 3, 4],
    columns=["name", "school", "age", "score", "goal"],
)

# Figure 4.8 of the book, built two ways. Both pair each sales figure with a
# model. The author's own version uses a dict of columns, which the author
# finds easier to read; the book's version uses a list of rows.
values = {
    "销量": [10, 11, 13, 10, 12, 12],
    "型号": ["A", "B", "C", "D", "E", "F"],
}
# A list of two equally long lists as `index` produces a two-level MultiIndex
# (month, product).
sales = pd.DataFrame(
    values,
    index=[
        ["一月", "一月", "二月", "二月", "三月", "三月"],
        ["冰箱", "电视", "冰箱", "电视", "冰箱", "电视"],
    ],
    columns=["销量", "型号"],
)

# The book's version: one inner list per row.
values2 = [[10, "A"], [11, "B"], [13, "C"], [10, "D"], [12, "E"], [12, "F"]]
sales2 = pd.DataFrame(
    values2,
    columns=["销量", "型号"],
    index=[
        ["一月", "一月", "二月", "二月", "三月", "三月"],
        ["冰箱", "电视", "冰箱", "电视", "冰箱", "电视"],
    ],
)

# Keyword advertising report used by the .iloc and iteration examples.
shuju = {
    "关键词": ['包装盒厂', '纸箱厂', '纸箱打样', '包装厂家', '纸箱定做',
            '纸箱制造', '五层纸箱', '纸箱包装厂', '纸箱厂家', '纸箱生产设备'],
    "展现": [405, 856, 21, 1155, 80, 378, 17, 24, 380, 128],
    "点击": [19, 23, 1, 36, 10, 5, 0, 2, 12, 6],
    "消费": [129.58, 189.89, 6.08, 232.77, 130.42, 45.61, 0, 6, 91.72, 22.05],
    "点击率": [4.69, 2.69, 4.76, 3.12, 12.50, 1.32, 0.00, 8.33, 3.16, 4.69],
    "平均点击价格": [6.82, 8.26, 6.08, 6.47, 13.04, 9.12, 0, 3, 7.64, 3.68],
    "首页平均排名": [1.92, 1.91, 1.75, 1.95, 2.15, 1.65, 1.75, 2.17, 1.83, 2.18],
    # "网页转化": [1, 3, 0, 5, 1, 0, 0, 0, 1, 0],
    # "商桥转化": [1, 3, 0, 5, 1, 0, 0, 0, 1, 0],
    "出价": [4.98, 5.64, 4.07, 5.12, 6, 5.98, 0.57, 2.94, 6, 1.3],
}

# Optional two-level row index (campaign name, ad-group name). Every level
# must have the same length as the data, so a repeated value has to be
# written out once per row; it cannot be abbreviated.
shujulie = [
    ['纸箱-2019'] * 10,
    ['纸箱-厂家词'] * 10,
]

# To try other row labels, pass `index=shujulie` or
# `index=list(range(1, 11))`. The default RangeIndex (0..9) is used here
# because the .loc/.iloc comparisons in main() rely on it.
toufang = pd.DataFrame(
    shuju,
    columns=["关键词", "展现", "点击", "消费", "点击率", "平均点击价格",
             "首页平均排名",
             # "网页转化", "商桥转化",
             "出价"],
)


def ddd(x):
    """Return twice the ``score`` entry of ``x``.

    Written for ``DataFrame.apply(ddd, axis=1)``, where ``x`` is one row, but
    any mapping-like object with a ``'score'`` key works.
    """
    return x['score'] * 2


def main():
    """Print every selection example in order, then export the tables.

    Side effects: writes ``woshiexcel.xlsx``, ``woshiexcel_sales2.xlsx`` and
    ``woshiexcel_toufang.xlsx`` into the current working directory.
    """
    print("--------------------", '\t\n', students, '\n\t', "--------------------")
    print(sales, "\n⬆️my method")
    print(sales2)

    print("------------------读取dataframe的单列数据-----------------")
    print(students["name"])

    print("------------------读取dataframe的多列数据-----------------")
    # Selecting several columns needs a list inside the brackets: [[...]].
    print(students[["name", "age"]])

    print("------------------读取dataframe的多行数据-----------------")
    # A plain integer slice is positional and, like a list slice, excludes
    # the end position 3.
    print(students[1:3])

    print("------------------读取dataframe的某行数据-----------------")
    print(students[0:1], "\n0000000000---00000000")
    print(students.loc[1], "\n11111------------111")
    print(students.loc[1, ["name", "age"]], "\n2222--------2222")
    print(students.loc[[1, 3], ["name", "age"]], "\n333333--------333333")
    # .loc slices by label and INCLUDES the end label, unlike list slicing:
    # [1:3] returns the rows labelled 1, 2 and 3.
    print(students.loc[1:3], "\n444444--------444444")
    # [0:3] returns the same rows as [1:3] because no row is labelled 0.
    print(students.loc[0:3], "\n444444--------444444")
    print(students.loc[1:2], "\ntttt1-----444444--------444444")
    # A .loc slice may extend past the existing labels without raising:
    # students has labels 1..4, so [0:5] yields the same rows as [1:4]
    # or [0:4].
    print(students.loc[0:5], "\ntttt0-----444444--------444444")
    print(students.loc[0:4], "\ntttt0000---4444tttt-----444444--------444444")
    print(students.loc[1:4], "\ntttt1111-44444tttt-----444444--------444444")

    # `% 2` is the remainder after dividing by 2, so `== 0` keeps even labels.
    print(students.loc[lambda x: x.index % 2 == 0],
          "\n用匿名函数取索引为偶数的行值,%2表示除以2取余数")
    # Same idea with an explicit boolean mask instead of a callable.
    print(students.loc[students.index % 2 == 0],
          "\n用数据框架的变量取索引为偶数的行值,%2表示除以2取余数")
    print(students.loc[lambda x: x['age'] > 18], "\n用匿名函数取年龄大于18的行值")
    print(students.loc[lambda x: x['score'] > 70], "\n用匿名函数取分数大于70行值")

    print("------------------按条件筛选后选择其中某列或某几列读取dataframe的数据-----------------")
    # Filter the rows first, then keep only the listed columns.
    print(students.loc[lambda x: x['age'] > 18, ["name", "score"]],
          "\n用匿名函数取年龄大于18的行值,然后再选出其中符合条件的人名")

    # .loc on a MultiIndex: one key selects the outer level, two keys select
    # a single (outer, inner) row.
    print(sales.loc["一月"], "\n这是我自己想的方法")
    print(sales2.loc["一月"], "\n这是书本的方法")
    print(sales.loc["一月", "电视"], "\n这是我自己想的方法")
    print(sales2.loc["一月", "电视"], "\n这是书本的方法")

    print("\n这是print打印的\n", toufang, "\n这是print打印的")
    # display(toufang, "\n这是display打印的")
    # Keywords with more than 20 clicks.
    print(toufang.loc[lambda x: x["点击"] > 20], "\n-----------------")

    # --- Selecting data with .iloc (purely positional) ---

    # iloc[1] is the second row (positions start at 0); loc[1] is the row
    # labelled 1. They coincide here only because of the default RangeIndex.
    print(toufang.iloc[1], "\n------iloc[1]-----------")
    print(toufang.loc[1], "\n--------loc[1]---------")
    # iloc slices like a list (end excluded) while loc includes the end
    # label. [:3] is the same as [0:3].
    print(toufang.iloc[:3], "\n-------iloc[:3]----------")
    print(toufang.loc[:3], "\n-------loc[:3]----------")
    # [0, 1] is (row position 0, column position 1): first row, second column.
    print(toufang.iloc[0, 1], "\n-------iloc[0,1]----------")
    # Rows 1-5 and columns 2-5 (counting from 1).
    print(toufang.iloc[0:5, 1:5], "\n-------iloc[0:5,1:5]----------")
    # Explicit lists: rows 2, 4, 6 and columns 3, 5, 7 (counting from 1).
    print(toufang.iloc[[1, 3, 5], [2, 4, 6]],
          "\n-------iloc[[1,3,5],[2,4,6]]----------")
    # Whole rows: row positions 0-2 with every column.
    print(toufang.iloc[0:3, :])
    # Whole columns: column positions 0-2 with every row.
    print(toufang.iloc[:, 0:3], "\n-------iloc[:, 0:3]----------")

    # --- Iterating over a DataFrame ---

    # Row by row: iterrows() yields (index label, row as a Series).
    print("-----iterrows-----------")
    for index, row in toufang.iterrows():
        print(f'index:{index}')
        print(row)

    # Column by column: items() yields (column label, column as a Series).
    print("-------items---------")
    for label, column in toufang.items():
        print(label)
        print(column)

    # Writing .xlsx files requires an Excel engine such as openpyxl.
    sales.to_excel("woshiexcel.xlsx")
    sales2.to_excel("woshiexcel_sales2.xlsx")
    toufang.to_excel("woshiexcel_toufang.xlsx")

    # apply(..., axis=1) calls ddd once per row and returns a Series. It is
    # stored under its own name so `students` remains a DataFrame.
    doubled_scores = students.apply(ddd, axis=1)
    print(doubled_scores)


if __name__ == "__main__":
    main()
# 欢迎分享,转载请注明来源:内存溢出
# 评论列表(0条)