对象
Series对象(一维)
Numpy与Series
index行索引 创建Series对象 Dataframe对象(二维)
创建Dataframe对象
columns Index对象
交 并 异或 数据取值与选择
Series数据选择方式
keys、items 索引器
loc、iloc Dataframe数据选择方式 数值运算
Series
保留索引索引对齐 DataframeDataframe与Series运算 处理缺失值
Pandas的缺失值
python对象类型的缺失值数值类型的缺失值NaN:不是一个数字Pandas中NaN与None的差异 处理缺失值
发现缺失值:isnull()和notnull()剔除缺失值:dropna()填充缺失值:fillna() 层级索引MultiIndex
多级索引Series
笨方法好方法:Pandas多级索引高维数据的多级索引 多级索引的创建方法
显式地创建多级索引多级索引的等级名称多级列索引 多级索引的取值与切片
Series多级索引Dataframe多级索引 多级索引行列转换
有序的索引和无序的索引索引stack与unstack索引的设置与重置 多级索引的数据累计方法 合并数据集:Concat与Append操作
Numpy数组合并通过pd.concat实现简易合并
索引重复
对象 Series对象(一维)Series对象是由带索引的数据构成的一维数组,可以看作通用型的NumPy数组,也可以看作特殊的Python字典
import pandas as pd data = pd.Series([0.25,0.5,0.75,1.0]) print(data) '''数据和索引绑定在一起 0 0.25 1 0.50 2 0.75 3 1.00 dtype: float64 ''' print(data.values) # [0.25 0.5 0.75 1. ] print(data.index) # RangeIndex(start=0, stop=4, step=1) print(data[1]) # 0.5 print(data[1:3]) ''' 1 0.50 2 0.75 dtype: float64 '''Numpy与Series index行索引
NumPy数组是通过隐式定义的整数索引获取数值,Series是通过显式定义的索引获取数值
import pandas as pd data = pd.Series([0.25,0.5,.75,1.0], index=['a','b','c','d']) # 第二个参数可省 print(data) ''' a 0.25 b 0.50 c 0.75 d 1.00 dtype: float64 ''' print(data['b']) # 0.5 data = pd.Series([0.25,0.5,.75,1.0], index=[2,5,3,7]) print(data) ''' 2 0.25 5 0.50 3 0.75 7 1.00 dtype: float64 ''' print(data[5]) # 0.5创建Series对象
pd.Series(data, index=index),其中第二个参数index可省略
data可以是数组
data = pd.Series([0.25,0.5,0.75,1.0])
data可以是标量
data = pd.Series(5,index=[100,200,300]); print(data) ''' 100 5 200 5 300 5 dtype: int64 '''
data可以是字典,与字典不同的是它还支持数组型的操作,比如切片
# Series是特殊的字典 popu = {'aa':123, 'bb':456, 'cc':789} po = pd.Series(popu) print(po) ''' aa 123 bb 456 cc 789 dtype: int64 ''' print(po['bb']) # 456 # 与字典不同的是它还支持数组型的操作,比如切片 print(po['aa':'bb']) ''' aa 123 bb 456 dtype: int64 '''
data = pd.Series({2:'a',1:'b',3:'c'}, index=[3,2]) print(data) ''' 3 c 2 a dtype: object '''Dataframe对象(二维)
DataFrame可以看作通用型的NumPy二维数组,也可以看作特殊的Python字典;
按字典方式取值时,索引顺序为[列][行],不能调换顺序,否则出错
import numpy as np import pandas as pd ages = {'Li':15, 'Liu':26, 'Wan':19} age = pd.Series(ages) scores = {'Li':80, 'Liu':88, 'Wan':92} score = pd.Series(scores) stu = pd.DataFrame({'age':age,'score':score}) print(stu) ''' age score Li 15 80 Liu 26 88 Wan 19 92 ''' # index获取索引标签 print(stu.index) # Index(['Li', 'Liu', 'Wan'], dtype='object') # 存放标签的index对象 print(stu.columns) # Index(['age', 'score'], dtype='object') print(stu['age']) ''' Li 15 Liu 26 Wan 19 Name: age, dtype: int64 ''' # 索引'age'和'Li'的顺序不能调换 print(stu['age']['Li']) # 15创建Dataframe对象 columns
创建单列
ages = {'Li':15, 'Liu':26, 'Wan':19} age = pd.Series(ages) print(pd.DataFrame(age,columns=['age'])) ''' age Li 15 Liu 26 Wan 19 '''
通过字典列表创建
data = [{'a':i,'b':2*i}for i in range(3)] print(pd.DataFrame(data)) ''' a b 0 0 0 1 1 2 2 2 4 '''
缺失用NaN表示
print(pd.DataFrame([{'a':1,'b':2}, {'c':3,'d':4}])) ''' a b c d 0 1.0 2.0 NaN NaN 1 NaN NaN 3.0 4.0 '''
通过Series字典创建
ages = {'Li':15, 'Liu':26, 'Wan':19} age = pd.Series(ages) scores = {'Li':80, 'Liu':88, 'Wan':92} score = pd.Series(scores) print(pd.DataFrame({'age':age,'score':score})) ''' age score Li 15 80 Liu 26 88 Wan 19 92 '''
通过Numpy二维数组创建
print(pd.DataFrame(np.random.rand(3,2), columns=['foo','bar'], index=['a','b','c'])) ''' foo bar a 0.321252 0.393929 b 0.006765 0.450808 c 0.783284 0.667144 '''
通过Numpy结构化数组创建
a = np.zeros(3,dtype=[('a','i8'),('b','f8')]) print(a) # [(0, 0.) (0, 0.) (0, 0.)] print(pd.DataFrame(a)) ''' a b 0 0 0.0 1 0 0.0 2 0 0.0 '''Index对象
该对象是一个不可变数组或有序集合
ind = pd.Index([2,3,5,7,11]) print(ind) # Int64Index([2, 3, 5, 7, 11], dtype='int64') print(ind[1]) # 3 print(ind[1:3]) # Int64Index([3, 5], dtype='int64') print(ind.size,ind.shape,ind.dtype) # 5 (5,) int64交 并 异或
a = pd.Index([1,3,5,7,9]) b = pd.Index([2,3,5,6]) print(a&b) # Int64Index([3, 5], dtype='int64') print(a|b) # Int64Index([1, 2, 3, 5, 6, 7, 9], dtype='int64') print(a^b) # Int64Index([1, 2, 6, 7, 9], dtype='int64')数据取值与选择 Series数据选择方式 keys、items
将Series看做字典,可用字典表达式和方法来检测键/索引和值
data = pd.Series([0.25,0.5,.75,1.0], index=['a','b','c','d']) print('a' in data) # True print(data.keys()) # Index(['a', 'b', 'c', 'd'], dtype='object') print(list(data.items())) # [('a', 0.25), ('b', 0.5), ('c', 0.75), ('d', 1.0)] data['e']=1.25 print(data) ''' a 0.25 b 0.50 c 0.75 d 1.00 e 1.25 dtype: float64 '''
Series看做一维数组
print(data['a':'c']) ''' a 0.25 b 0.50 c 0.75 dtype: float64 ''' print(data[0:2]) ''' a 0.25 b 0.50 dtype: float64 ''' print(data[(data>0.3)&(data<0.8)]) ''' b 0.50 c 0.75 dtype: float64 ''' print(data[['a','e']]) ''' a 0.25 e 1.25 dtype: float64 '''索引器 loc、iloc
data = pd.Series(['a','b','c'],index=[1,3,5]) print(data) ''' 1 a 3 b 5 c dtype: object ''' print(data[1])# a print(data[1:3]) ''' 3 b 5 c dtype: object ''' print(data.loc[1]) # a print(data.loc[1:3]) ''' 1 a 3 b dtype: object ''' print(data.iloc[1]) # b print(data.iloc[1:3]) ''' 3 b 5 c dtype: object '''Dataframe数据选择方式
# 将Dataframe看做字典 ages = {'Li':15, 'Liu':26, 'Wan':19} age = pd.Series(ages) scores = {'Li':80, 'Liu':88, 'Wan':92} score = pd.Series(scores) stu = pd.Dataframe({'age':age,'score':score}) print(stu) ''' age score Li 15 80 Liu 26 88 Wan 19 92 ''' print(stu['age']) ''' Li 15 Liu 26 Wan 19 Name: age, dtype: int64 ''' print(stu.age) ''' Li 15 Liu 26 Wan 19 Name: age, dtype: int64 ''' # 如果列名不是纯字符串,或者列名与Dataframe的方法名相同,就不能用属性索引 print(stu.age is stu['age']) # True # 避免对属性的形式选择的列直接赋值,可以stu['age']=z,不要用stu.age=z stu['a']=stu['age']/stu['score'] print(stu) ''' age score a Li 15 80 0.187500 Liu 26 88 0.295455 Wan 19 92 0.206522 ''' # 将Dataframe看作二维数组 print(stu.values) ''' [[15. 80. 0.1875 ] [26. 88. 0.29545455] [19. 92. 0.20652174]] ''' print(stu.T) ''' Li Liu Wan age 15.0000 26.000000 19.000000 score 80.0000 88.000000 92.000000 a 0.1875 0.295455 0.206522 '''
ages = {'Li':15, 'Liu':26, 'Wan':19} age = pd.Series(ages) scores = {'Li':80, 'Liu':88, 'Wan':92} score = pd.Series(scores) stu = pd.Dataframe({'age':age,'score':score}) print(stu) ''' age score Li 15 80 Liu 26 88 Wan 19 92 ''' print(stu.values[0]) # [15 80] print(stu['age']) ''' Li 15 Liu 26 Wan 19 Name: age, dtype: int64 ''' print(stu.iloc[:2,:1]) ''' age Li 15 Liu 26 ''' print(stu.loc[:'Liu',:'age']) ''' age Li 15 Liu 26 '''
print(stu) ''' age score Li 15 80 Liu 26 88 Wan 19 92 ''' stu.iloc[0,1] = 100 print(stu) ''' age score Li 15 100 Liu 26 88 Wan 19 92 ''' # 其他取值方式 print(stu['Li':'Wan']) ''' age score Li 15 100 Liu 26 88 Wan 19 92 ''' print(stu[1:3]) ''' age score Liu 26 88 Wan 19 92 ''' print(stu[stu.age>20]) ''' age score Liu 26 88 '''数值运算 Series 保留索引
两对象都可以使用Numpy通用函数
rng = np.random.RandomState(42) # 种子 ser = pd.Series(rng.randint(0,10,4)) # 生成4个0~10之间的数 print(ser) ''' 0 6 1 3 2 7 3 4 dtype: int32 ''' df = pd.Dataframe(rng.randint(0,10,(3,4)),columns=['a','b','c','d']) print(df) ''' a b c d 0 6 9 2 6 1 7 4 3 7 2 7 2 5 4 ''' print(np.exp(ser)) # 指数函数,e的ser次方 ''' 0 403.428793 1 20.085537 2 1096.633158 3 54.598150 dtype: float64 ''' print(df*np.pi/4) ''' a b c d 0 4.712389 7.068583 1.570796 4.712389 1 5.497787 3.141593 2.356194 5.497787 2 5.497787 1.570796 3.926991 3.141593 '''索引对齐
# Series age = pd.Series({'Li':15,'Liu':26,'Wan':19},name='age') score = pd.Series({'Chen':88,'Li':80,'Wan':92},name='score') print(age/score) '''索引为两数组的并集 Chen NaN Li 0.187500 Liu NaN Wan 0.206522 dtype: float64 ''' print(age.index|score.index) # Index(['Chen', 'Li', 'Liu', 'Wan'], dtype='object')
NaN表示此处没有数
a = pd.Series([2,4,6],index=[0,1,2]) b = pd.Series([1,3,5],index=[1,2,3]) print(a+b) ''' 0 NaN 1 5.0 2 9.0 3 NaN dtype: float64 '''
如果NaN不是想要的结果,可以用适当的对象方法代替运算符:a.add(b)等价于a+b,并且可以通过fill_value参数自定义a或b中缺失数据的填充值
print(a.add(b,fill_value=0)) # 没有共同的加fill_value,即加0 ''' 0 2.0 1 5.0 2 9.0 3 5.0 dtype: float64 '''Dataframe
rng = np.random.RandomState(42) A = pd.Dataframe(rng.randint(0,20,(2,2)),columns=list('AB')) print(A) ''' A B 0 6 19 1 14 10 ''' B = pd.Dataframe(rng.randint(0,10,(3,3)),columns=list('BAC')) print(B) ''' B A C 0 7 4 6 1 9 2 6 2 7 4 3 ''' print(A+B) ''' A B C 0 10.0 26.0 NaN 1 16.0 19.0 NaN 2 NaN NaN NaN ''' fill = A.stack().mean() # 计算A的均值 print(fill) # 12.25 print(A.add(B,fill_value=fill))# 没有共同的加fill,即加12.25 ''' A B C 0 10.00 26.00 18.25 1 16.00 19.00 18.25 2 16.25 19.25 15.25 '''
DataFrame与Series的运算规则,与NumPy中二维数组与一维数组的运算规则(广播)是一样的
rng = np.random.RandomState(42) A = rng.randint(0,10,(3,4)) print(A) ''' [[6 3 7 4] [6 9 2 6] [7 4 3 7]] ''' print(A-A[0]) ''' [[ 0 0 0 0] [ 0 6 -5 2] [ 1 1 -4 3]] ''' df = pd.Dataframe(A,columns=list('QRST')) print(df) ''' Q R S T 0 6 3 7 4 1 6 9 2 6 2 7 4 3 7 ''' print(df-df.iloc[0]) ''' Q R S T 0 0 0 0 0 1 0 6 -5 2 2 1 1 -4 3 ''' print(df.subtract(df['R'],axis=0)) ''' Q R S T 0 3 0 4 1 1 -3 0 -7 -3 2 3 0 -1 3 ''' halfrow = df.iloc[0,::2] print(halfrow) ''' Q 6 S 7 Name: 0, dtype: int32 ''' print(df-halfrow) ''' Q R S T 0 0.0 NaN 0.0 NaN 1 0.0 NaN -5.0 NaN 2 1.0 NaN -4.0 NaN '''处理缺失值
选择缺失值处理方法:用覆盖全局的掩码表示缺失值(带来额外的存储和计算负担),或用标签值表示缺失值(带来额外的CPU或GPU计算逻辑)
Pandas的缺失值 python对象类型的缺失值None是Python对象,不能作为任意NumPy/Pandas数组的缺失值,只能用于'object'类型的数组。dtype=object表示NumPy认为这个数组是由Python对象构成的;对于常用的快速操作,这种类型的操作会耗费更多资源
vals1 = np.array([1, None, 3, 4]) print(vals1) # [1 None 3 4] #print(vals1.sum()) # 出错,Python中没有定义整数与None之间的运算数值类型的缺失值NaN:不是一个数字
NumPy会为这个数组选择一个原生浮点类型,这意味着该数组仍然可以通过编译后的底层代码实现快速操作
vals2 = np.array([1, np.nan, 3, 4]) print(vals2.dtype) # float64 print(vals2) # [ 1. nan 3. 4.]
任何数字与NaN进行任何操作都会变成NaN
print(1+np.nan) # nan print(vals2.sum(),vals2.min(),vals2.max()) # nan nan nan
Numpy提供了一些特殊的累计函数,忽略NaN缺失值影响
print(np.nansum(vals2), np.nanmin(vals2), np.nanmax(vals2)) # 8.0 1.0 4.0Pandas中NaN与None的差异
在适当的时候,Pandas会将NaN与None等价交换
print(pd.Series([1,np.nan,3,None])) ''' 0 1.0 1 NaN 2 3.0 3 NaN dtype: float64 '''
对于没有可用标签值的数据类型,Pandas会在出现缺失值时自动进行类型转换(例如把整型数组转换为浮点型,并用NaN表示缺失值)
x = pd.Series([1,2],dtype=int) print(x) ''' 0 1 1 2 dtype: int32 ''' x[0]=None print(x) ''' 0 NaN 1 2.0 dtype: float64 '''
Pandas对不同类型缺失值的转换规则
两种方法发现缺失值:isnull()和notnull(),返回布尔类型的掩码数据,该方法两对象都适用
data = pd.Series([1,np.nan,'hello',None]) print(data.isnull()) ''' 0 False 1 True 2 False 3 True dtype: bool ''' print(data.notnull()) ''' 0 True 1 False 2 True 3 False dtype: bool '''剔除缺失值:dropna()
Series使用该方法简单
data = pd.Series([1,np.nan,'hello',None]) print(data.dropna()) ''' 0 1 2 hello dtype: object '''
DataFrame需要一些参数,dropna()剔除整行或整列,默认为剔除整行,axis='columns'或axis=1剔除整列
df = pd.DataFrame([[1,np.nan,2], [2,3,5], [np.nan,4,6]]) print(df) ''' 0 1 2 0 1.0 NaN 2 1 2.0 3.0 5 2 NaN 4.0 6 ''' # 剔除整行或整列,默认为剔除整行 print(df.dropna()) ''' 0 1 2 1 2.0 3.0 5 ''' print(df.dropna(axis=1)) # 等价于axis='columns',剔除整列 ''' 2 0 2 1 5 2 6 '''
how和thresh参数可以设置剔除行或列缺失值的数量阈值
默认设置是how='any',即只要含有缺失值就剔除整行或整列(行或列由axis参数决定)
how='all',剔除全部是缺失值的行或列
df[3]=np.nan print(df) ''' 0 1 2 3 0 1.0 NaN 2 NaN 1 2.0 3.0 5 NaN 2 NaN 4.0 6 NaN ''' print(df.dropna(axis='columns',how='all')) ''' 0 1 2 0 1.0 NaN 2 1 2.0 3.0 5 2 NaN 4.0 6 ''' # thresh参数设置行或列中非缺失值的最小数量 print(df.dropna(axis='rows',thresh=3)) ''' 0 1 2 3 1 2.0 3.0 5 NaN '''填充缺失值:fillna()
method参数表示如何填充
method='ffill'用前面的填充后面的
method='bfill'用后面的填充前面的
data = pd.Series([1,np.nan,2,None,3],index=list('abcde')) print(data) ''' a 1.0 b NaN c 2.0 d NaN e 3.0 dtype: float64 ''' print(data.fillna(0)) ''' a 1.0 b 0.0 c 2.0 d 0.0 e 3.0 dtype: float64 ''' # 用前面的填充后面的,前面没有则仍然为缺失值 print(data.fillna(method='ffill')) ''' a 1.0 b 1.0 c 2.0 d 2.0 e 3.0 dtype: float64 ''' # 用后面的填充前面的,后面没有则仍然为缺失值 print(data.fillna(method='bfill')) ''' a 1.0 b 2.0 c 2.0 d 3.0 e 3.0 dtype: float64 ''' df = pd.Dataframe([[1,np.nan,2,np.nan], [2,3,5,np.nan], [np.nan,4,6,np.nan]]) print(df.fillna(method='ffill',axis=1)) ''' 0 1 2 3 0 1.0 1.0 2.0 2.0 1 2.0 3.0 5.0 5.0 2 NaN 4.0 6.0 6.0 '''层级索引MultiIndex 多级索引Series
用一维Series对象表示二维数据
笨方法index = [('California',2000),('California',2010), ('New York',2000),('New York', 2010), ('Texas',2000),('Texas',2010)] populations = [11111,22222,33333,44444,55555,66666] pop = pd.Series(populations, index=index) print(pop) ''' (California, 2000) 11111 (California, 2010) 22222 (New York, 2000) 33333 (New York, 2010) 44444 (Texas, 2000) 55555 (Texas, 2010) 66666 dtype: int64 ''' print(pop[('California',2010):('Texas',2000)]) # 切片 ''' (California, 2010) 22222 (New York, 2000) 33333 (New York, 2010) 44444 (Texas, 2000) 55555 dtype: int64 ''' print(pop[[i for i in pop.index if i[1] == 2010]]) # 选择2010数据 ''' (California, 2010) 22222 (New York, 2010) 44444 (Texas, 2010) 66666 dtype: int64 '''好方法:Pandas多级索引
index = [('California',2000),('California',2010), ('New York',2000),('New York', 2010), ('Texas',2000),('Texas',2010)] populations = [11111,22222,33333,44444,55555,66666] pop = pd.Series(populations, index=index) index = pd.MultiIndex.from_tuples(index) print(index) ''' MultiIndex([('California', 2000), ('California', 2010), ( 'New York', 2000), ( 'New York', 2010), ( 'Texas', 2000), ( 'Texas', 2010)], ) ''' pop = pop.reindex(index) print(pop) ''' California 2000 11111 2010 22222 New York 2000 33333 2010 44444 Texas 2000 55555 2010 66666 dtype: int64 ''' print(pop[:,2010]) # 直接使用第二个索引获取2010的全部数据 ''' California 22222 New York 44444 Texas 66666 dtype: int64 '''高维数据的多级索引
populations = [11111,22222,33333,44444,55555,66666] index = pd.MultiIndex.from_tuples([('California',2000),('California',2010), ('New York',2000),('New York', 2010), ('Texas',2000),('Texas',2010)]) pop = pd.Series(populations, index=index) pop_df = pop.unstack() print(pop_df) ''' 2000 2010 California 11111 22222 New York 33333 44444 Texas 55555 66666 ''' print(pop_df.stack()) ''' California 2000 11111 2010 22222 New York 2000 33333 2010 44444 Texas 2000 55555 2010 66666 dtype: int64 ''' pop_df = pd.Dataframe({'total':pop, 'under18':[11111,22222,33333,44444,55555,66666]}) print(pop_df) ''' total under18 California 2000 11111 11111 2010 22222 22222 New York 2000 33333 33333 2010 44444 44444 Texas 2000 55555 55555 2010 66666 66666 ''' f_u18 = pop_df['under18']/pop_df['total'] print(f_u18.unstack()) ''' 2000 2010 California 1.0 1.0 New York 1.0 1.0 Texas 1.0 1.0 '''多级索引的创建方法
df = pd.Dataframe(np.random.rand(4,2),index=[['a','a','b','b'],[1,2,1,2]],columns=['data1','data2']) print(df) ''' data1 data2 a 1 0.045858 0.391234 2 0.631418 0.924928 b 1 0.534416 0.216372 2 0.300895 0.523091 ''' data = {('California',2000):11111,('California',2010):22222, ('New York',2000):33333,('New York', 2010):44444, ('Texas',2000):55555,('Texas',2010):66666} print(pd.Series(data)) ''' California 2000 11111 2010 22222 New York 2000 33333 2010 44444 Texas 2000 55555 2010 66666 dtype: int64 '''显示地创建多级索引
print(pd.MultiIndex.from_arrays([['a','a','b','b'],[1,2,1,2]])) # 若干简单数组组成的列表创建 ''' MultiIndex([('a', 1), ('a', 2), ('b', 1), ('b', 2)], ) ''' print(pd.MultiIndex.from_tuples([('a',1),('a',2),('b',1),('b',2)])) # 包含多个索引元组构成的列表创建 ''' MultiIndex([('a', 1), ('a', 2), ('b', 1), ('b', 2)], ) ''' print(pd.MultiIndex.from_product([['a','b'],[1,2]])) # 用两个索引的笛卡尔积创建 ''' MultiIndex([('a', 1), ('a', 2), ('b', 1), ('b', 2)], ) '''
注意:从pandas 0.24起,MultiIndex构造函数的labels参数已更名为codes,pandas 1.0起不再接受labels,因此下面使用codes
print(pd.MultiIndex(levels=[['a','b'],[1,2]],codes=[[0,0,1,1],[0,1,0,1]])) # 直接提供levels和codes(旧版本中为labels)创建 多级索引的等级名称
populations = [11111,22222,33333,44444,55555,66666] index = pd.MultiIndex.from_tuples([('California',2000),('California',2010), ('New York',2000),('New York', 2010), ('Texas',2000),('Texas',2010)]) pop = pd.Series(populations, index=index) pop.index.names = ['state','year'] print(pop) ''' state year California 2000 11111 2010 22222 New York 2000 33333 2010 44444 Texas 2000 55555 2010 66666 dtype: int64 '''多级列索引
# 多级行列索引 index = pd.MultiIndex.from_product([[2013,2014],[1,2]],names=['year','visit']) columns = pd.MultiIndex.from_product([['Bob','Guido','Sue'],['HR','Temp']],names=['subject','type']) # 模拟数据 data = np.round(np.random.randn(4,6),1) data[:,::2] *= 10 data += 37 # 创建Dataframe health_data = pd.Dataframe(data,index=index,columns=columns) print(health_data) ''' subject Bob Guido Sue type HR Temp HR Temp HR Temp year visit 2013 1 49.0 37.7 29.0 36.2 45.0 36.4 2 51.0 36.9 13.0 36.3 50.0 39.2 2014 1 41.0 36.2 21.0 37.4 43.0 37.1 2 42.0 35.3 37.0 35.8 41.0 35.9 type HR Temp ''' print(health_data['Guido']) ''' year visit 2013 1 29.0 36.2 2 13.0 36.3 2014 1 21.0 37.4 2 37.0 35.8 '''多级索引的取片与切片 Series多级索引
populations = [11111,22222,33333,44444,55555,66666] index = pd.MultiIndex.from_tuples([('California',2000),('California',2010), ('New York',2000),('New York', 2010), ('Texas',2000),('Texas',2010)]) pop = pd.Series(populations, index=index) pop.index.names = ['state','year'] print(pop) ''' state year California 2000 11111 2010 22222 New York 2000 33333 2010 44444 Texas 2000 55555 2010 66666 dtype: int64 ''' print(pop['California']) ''' year 2000 11111 2010 22222 dtype: int64 ''' print(pop['California',2000]) # 11111 print(pop.loc['California':'New York']) ''' state year California 2000 11111 2010 22222 New York 2000 33333 2010 44444 dtype: int64 ''' print(pop[:,2000]) ''' state California 11111 New York 33333 Texas 55555 dtype: int64 ''' print(pop[pop>22222]) ''' state year New York 2000 33333 2010 44444 Texas 2000 55555 2010 66666 dtype: int64 ''' print(pop[['California','Texas']]) ''' state year California 2000 11111 2010 22222 Texas 2000 55555 2010 66666 dtype: int64 '''Dataframe多级索引
index = pd.MultiIndex.from_product([[2013,2014],[1,2]],names=['year','visit']) columns = pd.MultiIndex.from_product([['Bob','Guido','Sue'],['HR','Temp']],names=['subject','type']) # 模拟数据 data = np.round(np.random.randn(4,6),1) data[:,::2] *= 10 data += 37 # 创建Dataframe health_data = pd.Dataframe(data,index=index,columns=columns) print(health_data) ''' subject Bob Guido Sue type HR Temp HR Temp HR Temp year visit 2013 1 51.0 38.7 27.0 35.8 18.0 37.2 2 40.0 37.4 41.0 36.0 34.0 36.6 2014 1 17.0 35.6 39.0 36.0 41.0 36.4 2 26.0 35.8 32.0 36.2 47.0 36.9 ''' print(health_data['Guido','HR']) ''' year visit 2013 1 48.0 2 35.0 2014 1 30.0 2 48.0 Name: (Guido, HR), dtype: float64 ''' print(health_data.iloc[:2,:2]) ''' subject Bob type HR Temp year visit 2013 1 24.0 38.6 2 42.0 38.2 ''' print(health_data.loc[:,('Bob','HR')]) ''' year visit 2013 1 36.0 2 49.0 2014 1 55.0 2 35.0 Name: (Bob, HR), dtype: float64 ''' idx = pd.IndexSlice print(health_data.loc[idx[:,1],idx[:,'HR']]) ''' subject Bob Guido Sue type HR HR HR year visit 2013 1 46.0 41.0 37.0 2014 1 37.0 51.0 43.0 '''多级索引行列转换 有序的索引和无序的索引
如果MultiIndex不是有序索引,那么大多数切片操作都会失败
index = pd.MultiIndex.from_product([['a','c','b'],[1,2]]) data = pd.Series(np.random.rand(6),index=index) data.index.names=['char','int'] print(data) ''' char int a 1 0.252805 2 0.934107 c 1 0.154999 2 0.363860 b 1 0.391106 2 0.316172 dtype: float64 ''' # 对索引使用局部切片,由于无序会出错 try: print(data['a':'b']) except KeyError as e: print(e) ''' 'Key length (1) was greater than MultiIndex lexsort depth (0)' ''' # Pandas提供了排序操作,如sort_index()(旧版本中还有sortlevel(),新版本已移除) data = data.sort_index() print(data) ''' char int a 1 0.839251 2 0.853601 b 1 0.698143 2 0.926872 c 1 0.632588 2 0.959209 dtype: float64 ''' print(data['a':'b']) ''' char int a 1 0.839251 2 0.853601 b 1 0.698143 2 0.926872 dtype: float64 '''索引stack与unstack
populations = [11111,22222,33333,44444,55555,66666] index = pd.MultiIndex.from_tuples([('California',2000),('California',2010), ('New York',2000),('New York', 2010), ('Texas',2000),('Texas',2010)]) pop = pd.Series(populations, index=index) pop.index.names = ['state','year'] print(pop.unstack(level=0)) ''' state California New York Texas year 2000 11111 33333 55555 2010 22222 44444 66666 ''' print(pop.unstack(level=1)) ''' year 2000 2010 state California 11111 22222 New York 33333 44444 Texas 55555 66666 ''' print(pop.unstack().stack()) # stack()与unstack()互为逆操作,先后使用两者,数据不变 ''' state year California 2000 11111 2010 22222 New York 2000 33333 2010 44444 Texas 2000 55555 2010 66666 dtype: int64 '''索引的设置与重置
populations = [11111,22222,33333,44444,55555,66666] index = pd.MultiIndex.from_tuples([('California',2000),('California',2010), ('New York',2000),('New York', 2010), ('Texas',2000),('Texas',2010)]) pop = pd.Series(populations, index=index) pop.index.names = ['state','year'] pop_flat = pop.reset_index(name='population') print(pop_flat) ''' state year population 0 California 2000 11111 1 California 2010 22222 2 New York 2000 33333 3 New York 2010 44444 4 Texas 2000 55555 5 Texas 2010 66666 ''' print(pop_flat.set_index(['state','year'])) ''' population state year California 2000 11111 2010 22222 New York 2000 33333 2010 44444 Texas 2000 55555 2010 66666 '''多级索引的数据累计方法
# Pandas自带数据累计方法,如:mean()、sum()、max(),对于层级索引数据,可以设置参数level实现对数据子集的累计操作 index = pd.MultiIndex.from_product([[2013,2014],[1,2]],names=['year','visit']) columns = pd.MultiIndex.from_product([['Bob','Guido','Sue'],['HR','Temp']],names=['subject','type']) # 模拟数据 data = np.round(np.random.randn(4,6),1) data[:,::2] *= 10 data += 37 # 创建DataFrame health_data = pd.DataFrame(data,index=index,columns=columns) print(health_data) ''' subject Bob Guido Sue type HR Temp HR Temp HR Temp year visit 2013 1 30.0 36.5 42.0 37.6 16.0 37.5 2 34.0 37.8 53.0 37.1 38.0 38.0 2014 1 35.0 35.7 27.0 37.6 45.0 36.8 2 42.0 39.3 48.0 36.8 31.0 36.6 ''' # 计算每一年各项平均值 data_mean = health_data.mean(level='year') print(data_mean) ''' subject Bob Guido Sue type HR Temp HR Temp HR Temp year 2013 32.0 37.15 47.5 37.35 27.0 37.75 2014 38.5 37.50 37.5 37.20 38.0 36.70 ''' # 设置axis参数,可以对列索引进行类似的累计操作 print(data_mean.mean(axis=1,level='type')) ''' type HR Temp year 2013 35.5 37.416667 2014 38.0 37.133333 '''合并数据集:Concat与Append操作
# 定义一个快速创建特定形式DataFrame的函数 def make_df(cols, ind): data = {c:[str(c)+str(i) for i in ind] for c in cols} return pd.DataFrame(data,ind) print(make_df('ABC',range(3))) ''' A B C 0 A0 B0 C0 1 A1 B1 C1 2 A2 B2 C2 '''Numpy数组合并
x=[1,2,3] y=[4,5,6] z=[7,8,9] print(np.concatenate([x,y,z])) # [1 2 3 4 5 6 7 8 9] x = [[1,2],[3,4]] print(np.concatenate([x,x],axis=1)) ''' [[1 2 1 2] [3 4 3 4]] '''通过pd.concat实现简易合并
# 定义一个快速创建特定形式DataFrame的函数 def make_df(cols, ind): data = {c:[str(c)+str(i) for i in ind] for c in cols} return pd.DataFrame(data,ind) ## 通过pd.concat实现简易合并 ser1 = pd.Series(['A','B','C'],index=[1,2,3]) ser2 = pd.Series(['D','E','F'],index=[4,5,6]) print(pd.concat([ser1,ser2])) ''' 1 A 2 B 3 C 4 D 5 E 6 F dtype: object ''' df1 = make_df('AB',[1,2]) df2 = make_df('AB',[3,4]) print(df1) ''' A B 1 A1 B1 2 A2 B2 ''' print(df2) ''' A B 3 A3 B3 4 A4 B4 ''' print(pd.concat([df1,df2])) ''' A B 1 A1 B1 2 A2 B2 3 A3 B3 4 A4 B4 ''' df3 = make_df('AB',[0,1]) df4 = make_df('CD',[0,1]) print(df3) ''' A B 0 A0 B0 1 A1 B1 ''' print(df4) ''' C D 0 C0 D0 1 C1 D1 ''' print(pd.concat([df3,df4],axis=1)) ''' A B C D 0 A0 B0 C0 D0 1 A1 B1 C1 D1 '''索引重复
欢迎分享,转载请注明来源:内存溢出
评论列表(0条)