pandas 基础应用_随笔

pandas 基础应用

文章目录

读取文件查看基本信息基本 *** 作Group by数值运算Merge *** 作显示设置Pivot时间 *** 作常用 *** 作

pip install --upgrade numpy==1.13.3
pip install --upgrade pandas==1.0.5

读取文件查看基本信息

print(df.shape)
# (174, 69)
print(df.info())
# 
# RangeIndex: 174 entries, 0 to 173
# Data columns (total 69 columns):
# Fab                          174 non-null object
# Family                       0 non-null float64
# ...
print(df.index)     
print(df.columns)
print(df.values)

基本 *** 作

设置index

df = df.set_index('insert_time')  # 可以设置index

简单计算

df['Wafer Number'] = df['Wafer Number'] + 100
df['Wafer Number'] = df['Wafer Number'] - 100
df['Wafer Number'] = df['Wafer Number'] * 100
df['Wafer Number'].max()

简单统计

print(df.describe())

查找位置： loc & iloc

a = df.loc['MRB463_03',:]   # 索引的名字
b = df.iloc[2,:]   # 索引所在的行数

查找数据，可以使用query()，参考文章；

赋值

df.loc['MRB463_03','Wafer Number'] = 2   # 直接赋值即可

布尔类型索引
可以作为删选数据的方法；

df = df.set_index('Wafer')
df[df['Wafer Number']>10][:5]
# 省略
df[df['Wafer Number']==10][:5]
# 所有wafer number 是10的数据；
df.loc[df['Wafer Number']==10,'yield (%)'].mean()
# 93.4
ratio = df[df['yield (%)']>95]['yield (%)'].count()/df['yield (%)'].count()
# 大于95 良率的wafer的占比；

Group by

单一列的单一统计值

df.groupby('Source Lot')['Program'].count()
# group by S.LOT 的Wafer 数量；
df.groupby('Source Lot')['yield (%)'].mean()
# S.LOT的平均良率值

单一列的多个统计值
以下代码，pandas 必须为0.25以上版本；

df.groupby('Source Lot')['yield (%)'].agg(mean = "mean",min = "min", max="max")
df.groupby('Source Lot')['yield (%)'].agg(mean = np.mean,min = np.min, max=np.max)
# 以上两句，得出结果完全相同；
#                  mean        min        max
# Source Lot
# MRB463.S    92.328675  87.993292  94.030184
# MRB466.S    93.339296  91.626607  95.226383
# MRB513.S    90.696385  84.326439  93.694802
# MRB516.S    92.161431  86.193404  94.712130
# MRB556.S    90.844047  88.060369  93.817775
# MRB557.S    92.387703  88.384572  94.488541
# MRB559.S    92.118055  88.194522  94.264952

不同列的不同统计值

df.groupby('Source Lot').agg(mean_yield = ('yield (%)','mean'),Wafer_Count =('Program','count'))
df.groupby('Source Lot').agg(mean_yield = ('yield (%)',np.mean),Wafer_Count =('Program',np.count_nonzero))
# 以上两句执行结果相同；
#             mean_yield  Wafer_Count
# Source Lot
# MRB463.S     92.328675           25
# MRB466.S     93.339296           25
# MRB513.S     90.696385           24
# MRB516.S     92.161431           25
# MRB556.S     90.844047           25
# MRB557.S     92.387703           25
# MRB559.S     92.118055           25

数值运算

df['35 - F - ER-SRT (%)'].sum()
df['35 - F - ER-SRT (%)'].mean()
df.sum(axis=0)  # 将列求和；
df.sum(axis=1)  # 将行求和；
df.max(axis=0)  # 求每列的最大值；
df.min()
df.median()

df.cov()    # 求出协方差
df.corr()    # 求出相关系数

等于某个值的个数；

df['Source Lot'].value_counts() 
# MRB557.S    25
# MRB516.S    25
df['Source Lot'].value_counts(ascending=True)   # 升序
df['43 - F - VGNDA2 (cnt)'].value_counts(ascending=True,bins=8)  # 自动bin 分类
# (18.375, 21.0]                      2
# (15.75, 18.375]                     3
# (13.125, 15.75]                     5
# (5.25, 7.875]                       6
# (7.875, 10.5]                       6
# (10.5, 13.125]                      8
# (2.625, 5.25]                      25
# (-0.022000000000000002, 2.625]    119

df['43 - F - VGNDA2 (cnt)'].value_counts(normalize = True, ascending=True,bins=8)
# (18.375, 21.0]                    0.011494
# (15.75, 18.375]                   0.017241
# (13.125, 15.75]                   0.028736
# (5.25, 7.875]                     0.034483
# (7.875, 10.5]                     0.034483
# (10.5, 13.125]                    0.045977
# (2.625, 5.25]                     0.143678
# (-0.022000000000000002, 2.625]    0.683908

改 *** 作；

df['Wafer Number'].replace(to_replace=10,value=100,inplace=True) # 将列中的数据替换，并修改原数据

df.rename(index= {'MRB463_11':'MRB463_110'},inplace=True)

增 *** 作；

df1 = pd.Dataframe([[1,2],[3,4],[5,6]],index=[1,2,3],columns=['A','B'])
df2 = pd.Dataframe([[1,2],[3,4],[5,6]],index=[1,4,5],columns=['A','B'])
print(df1.append(df2))

df1 = pd.Dataframe([[1,2],[3,4],[5,6]],index=[1,2,3],columns=['A','B'])
df2 = pd.Dataframe([[1,2],[3,4],[5,6]],index=[1,4,5],columns=['C','B'])
print(df1.append(df2))

df1 = pd.Dataframe([[1,2],[3,4],[5,6]],index=['a','b','c'],columns=['A','B'])
df1.loc['d'] = [7,8]
print(df1)

df1 = pd.Dataframe([[1,2,3],[4,5,6]],index=['a','b'],columns=['A','B','C'])
df2 = pd.Dataframe([[2,2,3],[5,5,6]],index=['a','c'],columns=['A','B','D'])
df3 = pd.concat([df1,df2])
df4 = pd.concat([df1,df2],axis=1)

print('df3:n',df3)
print('df4:n',df4)

删 *** 作
删除列

df1 = pd.Dataframe([[1,2],[3,4],[5,6]],index=['a','b','c'],columns=['A','B'])
del df1['A']

df1 = pd.Dataframe([[1,2],[3,4],[5,6]],index=['a','b','c'],columns=['A','B'])
df1.drop(['A'],axis=1,inplace=True)  # 删除列
df1.drop(['a'],axis=0,inplace=True)  # 删除行

Merge *** 作

df1 = pd.Dataframe([[1,2,3],[4,5,6]],index=['a','b'],columns=['A','B','C'])
df2 = pd.Dataframe([[2,2,3],[5,5,6]],index=['a','c'],columns=['A','B','D'])
df3 =pd.merge(df1,df2,on='B')

print('df1:n',df1)
print('df2:n',df2)
print('df3:n',df3)

也可以根据多个值，只要传列表给on就可以了~
默认的连接方式为inner；可以通过how= 来修改连接方式；如下：

df1 = pd.Dataframe([[1,2,3],[4,5,6]],index=['a','b'],columns=['A','B','C'])
df2 = pd.Dataframe([[1,2,3],[5,5,6]],index=['a','c'],columns=['A','B','D'])
df3 =pd.merge(df1,df2,on=['B','A'])  #默认为inner
df_outer =pd.merge(df1,df2,on=['B','A'],how='outer')  
df_left =pd.merge(df1,df2,on=['B','A'],how='left')  
df_right =pd.merge(df1,df2,on=['B','A'],how='right')

可以增加指示器，来标注数据来源，如下：

df1 = pd.Dataframe([[1,2,3],[4,5,6]],index=['a','b'],columns=['A','B','C'])
df2 = pd.Dataframe([[1,2,3],[5,5,6]],index=['a','c'],columns=['A','B','D'])
df_outer =pd.merge(df1,df2,on=['B','A'],how='outer',indicator=True)

显示设置

查看设置：

pd.get_option('display.max_rows')

更改设置：

pd.set_option('display.max_columns',12)  # 
pd.set_option('display.max_rows',20)
pd.set_option('display.max_colwidth',10)
pd.set_option('display.width',10000)
pd.set_option('display.precision',4)  # 显示精度
pd.set_option('display.max_info_columns',5)

Pivot

df_pivot = df.pivot_table(index = 'Wafer Number',columns='Source Lot',values= 'yield (%)') # 默认是求平均

df_pivot = df.pivot_table(index = 'Wafer Number',values= 'yield (%)',aggfunc='min')

df_pivot = df.pivot_table(index = 'Wafer Number',values= ['yield (%)'],aggfunc=['min','count'])

df_pivot = df.pivot_table(index = 'Wafer Number',values= ['yield (%)'],aggfunc=['min','count'],margins =True)

df_pivot = df.pivot_table(index = 'Source Lot',values =['yield (%)','25 - F - FAIL1H (cnt)'] ,aggfunc={'yield (%)':[max,min],'25 - F - FAIL1H (cnt)':np.mean})

时间 *** 作

ts = pd.Timestamp('2020-01-12')
print(ts)
# 2020-01-12 00:00:00
print(ts + pd.Timedelta('5 days'))
# 2020-01-17 00:00:00

各种格式的日期时间字符串，都可以通过to_datetime 转换成datetime格式；

ts= pd.to_datetime('3/2/2020')
# 2020-03-02 00:00:00

datetime 类型有一些性质，可以方便的调出各种时间的属性值；

print(ts.hour)
print(ts.weekday())

按规定生成时间序列

print(pd.date_range(start='2020-01-10',end='2020-01-20'))
# DatetimeIndex(['2020-01-10', '2020-01-11', '2020-01-12', '2020-01-13', '2020-01-14', '2020-01-15', '2020-01-16', '2020-01-17', '2020-01-18', '2020-01-19', '2020-01-20'], dtype='datetime64[ns]', freq='D')
print(pd.date_range(start='2020-01-10',periods=5,freq='2W'))
# DatetimeIndex(['2020-01-12', '2020-01-26', '2020-02-09', '2020-02-23', '2020-03-08'], dtype='datetime64[ns]', freq='2W-SUN')

对时间进行处理的时候，需要先设置成index，才可以；

df['str_year'] = pd.to_datetime(df['start_time'])
df = df.set_index('str_year')
df['day']=df.index.day
print(df['day'].value_counts())  # 可以打印出day的统计
df['2021-12-04':'2021-12-05']   # 可以打印出选定的时间；
df[(df.index.day > 13) & (df.index.day < 16)] # 指定的日期 & 时间；
df.between_time('08:00','12:00')   # 指定的时间（只能是时间）

按时间重采样

df.resample('D').mean()  # 按天重采样，并求出平均值；
df.resample('3D').mean()  # 按3天重采样，并求出平均值；

常用 *** 作

排序

df.sort_values(by=['Wafer Number','yield (%)'],ascending=[False,False],inplace=True)
print(df.loc[:,['Wafer Number','yield (%)']])

删去重复值

df.drop_duplicates()   # 一行数据中，全部重复则删除
df.drop_duplicates(subset='Wafer Number')  # 指定列进行去重

同类项合并：mapping 方法

定义一个映射字典，然后通过mapping调用该字典，如果没有对应关系，则为NaN；

mapping = {
    1:'Head',
    2:'Head',
    3:'Head',
    4:'Head',
    5:'Head',
    21:'Tail',
    22:'Tail',
    23:'Tail',
    24:'Tail',
    25:'Tail'
}

df['cate'] = df['Wafer Number'].map(mapping)

分类；按区间进行分类
例如，我想要按照yield 在 [0,90,92,100]的区间上进行分类；

bins = [0,90,92,100]
bins_cut = pd.cut(df['yield (%)'],bins)
# (92, 100]    101
# (90, 92]      48
# (0, 90]       25

可以增加标签：

bins = [0,90,92,100]
names= ['Low Yield','Normal Yield','High Yield']
df['bins_cut'] = pd.cut(df['yield (%)'],bins,labels=names)
print(df['bins_cut'])
d = pd.value_counts(df['bins_cut'])
print(d)
# High Yield      101
# Normal Yield     48
# Low Yield        25

apply 函数：对每行或者每列做相同的 *** 作；

缺失值处理

df.isnull().all(axis=0) # 对列进行检查，如果全为null，则返回True;
df.isnull().all(axis=1) # 对行进行检查，如果全为null，则返回True;
df.fillna(5) # 用5填充null值；

字符串 *** 作

df['Source Lot'].str.lower()   # 大小写转换
df['Source Lot'].str.upper()   # 大小写转换
df['Source Lot'].str.len()   # 字符串长度
df['Source Lot'].str.strip()  # 去两边空格
df['Source Lot'].str.lstrip()  # 去左边空格
df['Source Lot'].str.rstrip()  #去右边空格
df['Source Lot'].str.replace('.S','_R')  # 替换字符
df['Source Lot'].str.split('.')   # 将一列分割成多列，每行成为一个[]
df['Source Lot'].str.split('.',expand=True)  # 分割成多列，且命名为0，1...
df['Source Lot'].str.split('.',expand=True,n=1) # 只切一次，分割成两部分
df['Source Lot'].str.contains('2')  # 是否包含 2

用.分割每个值，且

df_1 = pd.Dataframe([['a,b,c'],['a,c'],['c,b']],index=[1,2,3],columns=['A'])
a = df_1['A'].str.get_dummies(sep=',')
print(a)

isin()

df_1['A'].isin(['d','a'])

欢迎分享，转载请注明来源：内存溢出

原文地址: http://outofmemory.cn/zaji/5711074.html

pandas 基础应用

发表评论

评论列表（0条）