读取文件查看基本信息基本 *** 作Group by数值运算Merge *** 作显示设置Pivot时间 *** 作常用 *** 作
pip install --upgrade numpy==1.13.3 pip install --upgrade pandas==1.0.5读取文件 查看基本信息
print(df.shape) # (174, 69) print(df.info()) #基本 *** 作# RangeIndex: 174 entries, 0 to 173 # Data columns (total 69 columns): # Fab 174 non-null object # Family 0 non-null float64 # ... print(df.index) print(df.columns) print(df.values)
设置index
df = df.set_index('insert_time') # 可以设置index
简单计算
df['Wafer Number'] = df['Wafer Number'] + 100 df['Wafer Number'] = df['Wafer Number'] - 100 df['Wafer Number'] = df['Wafer Number'] * 100 df['Wafer Number'].max()
简单统计
print(df.describe())
查找位置: loc & iloc
a = df.loc['MRB463_03',:] # 索引的名字 b = df.iloc[2,:] # 索引所在的行数
查找数据,可以使用query(),参考文章;
赋值
df.loc['MRB463_03','Wafer Number'] = 2 # 直接赋值即可
布尔类型索引
可以作为删选数据的方法;
df = df.set_index('Wafer') df[df['Wafer Number']>10][:5] # 省略 df[df['Wafer Number']==10][:5] # 所有wafer number 是10的数据; df.loc[df['Wafer Number']==10,'yield (%)'].mean() # 93.4 ratio = df[df['yield (%)']>95]['yield (%)'].count()/df['yield (%)'].count() # 大于95 良率的wafer的占比;Group by
单一列的单一统计值
df.groupby('Source Lot')['Program'].count() # group by S.LOT 的Wafer 数量; df.groupby('Source Lot')['yield (%)'].mean() # S.LOT的平均良率值
单一列的多个统计值
以下代码,pandas 必须为0.25以上版本;
df.groupby('Source Lot')['yield (%)'].agg(mean = "mean",min = "min", max="max") df.groupby('Source Lot')['yield (%)'].agg(mean = np.mean,min = np.min, max=np.max) # 以上两句,得出结果完全相同; # mean min max # Source Lot # MRB463.S 92.328675 87.993292 94.030184 # MRB466.S 93.339296 91.626607 95.226383 # MRB513.S 90.696385 84.326439 93.694802 # MRB516.S 92.161431 86.193404 94.712130 # MRB556.S 90.844047 88.060369 93.817775 # MRB557.S 92.387703 88.384572 94.488541 # MRB559.S 92.118055 88.194522 94.264952
不同列的不同统计值
df.groupby('Source Lot').agg(mean_yield = ('yield (%)','mean'),Wafer_Count =('Program','count')) df.groupby('Source Lot').agg(mean_yield = ('yield (%)',np.mean),Wafer_Count =('Program',np.count_nonzero)) # 以上两句执行结果相同; # mean_yield Wafer_Count # Source Lot # MRB463.S 92.328675 25 # MRB466.S 93.339296 25 # MRB513.S 90.696385 24 # MRB516.S 92.161431 25 # MRB556.S 90.844047 25 # MRB557.S 92.387703 25 # MRB559.S 92.118055 25数值运算
df['35 - F - ER-SRT (%)'].sum() df['35 - F - ER-SRT (%)'].mean() df.sum(axis=0) # 将列求和; df.sum(axis=1) # 将行求和; df.max(axis=0) # 求每列的最大值; df.min() df.median()
df.cov() # 求出协方差 df.corr() # 求出相关系数
等于某个值的个数;
df['Source Lot'].value_counts() # MRB557.S 25 # MRB516.S 25 df['Source Lot'].value_counts(ascending=True) # 升序 df['43 - F - VGNDA2 (cnt)'].value_counts(ascending=True,bins=8) # 自动bin 分类 # (18.375, 21.0] 2 # (15.75, 18.375] 3 # (13.125, 15.75] 5 # (5.25, 7.875] 6 # (7.875, 10.5] 6 # (10.5, 13.125] 8 # (2.625, 5.25] 25 # (-0.022000000000000002, 2.625] 119 df['43 - F - VGNDA2 (cnt)'].value_counts(normalize = True, ascending=True,bins=8) # (18.375, 21.0] 0.011494 # (15.75, 18.375] 0.017241 # (13.125, 15.75] 0.028736 # (5.25, 7.875] 0.034483 # (7.875, 10.5] 0.034483 # (10.5, 13.125] 0.045977 # (2.625, 5.25] 0.143678 # (-0.022000000000000002, 2.625] 0.683908
改 *** 作;
df['Wafer Number'].replace(to_replace=10,value=100,inplace=True) # 将列中的数据替换,并修改原数据
df.rename(index= {'MRB463_11':'MRB463_110'},inplace=True)
增 *** 作;
df1 = pd.Dataframe([[1,2],[3,4],[5,6]],index=[1,2,3],columns=['A','B']) df2 = pd.Dataframe([[1,2],[3,4],[5,6]],index=[1,4,5],columns=['A','B']) print(df1.append(df2))
df1 = pd.Dataframe([[1,2],[3,4],[5,6]],index=[1,2,3],columns=['A','B']) df2 = pd.Dataframe([[1,2],[3,4],[5,6]],index=[1,4,5],columns=['C','B']) print(df1.append(df2))
df1 = pd.Dataframe([[1,2],[3,4],[5,6]],index=['a','b','c'],columns=['A','B']) df1.loc['d'] = [7,8] print(df1)
df1 = pd.Dataframe([[1,2,3],[4,5,6]],index=['a','b'],columns=['A','B','C']) df2 = pd.Dataframe([[2,2,3],[5,5,6]],index=['a','c'],columns=['A','B','D']) df3 = pd.concat([df1,df2]) df4 = pd.concat([df1,df2],axis=1) print('df3:n',df3) print('df4:n',df4)
删 *** 作
删除列
df1 = pd.Dataframe([[1,2],[3,4],[5,6]],index=['a','b','c'],columns=['A','B']) del df1['A']
df1 = pd.Dataframe([[1,2],[3,4],[5,6]],index=['a','b','c'],columns=['A','B']) df1.drop(['A'],axis=1,inplace=True) # 删除列 df1.drop(['a'],axis=0,inplace=True) # 删除行Merge *** 作
df1 = pd.Dataframe([[1,2,3],[4,5,6]],index=['a','b'],columns=['A','B','C']) df2 = pd.Dataframe([[2,2,3],[5,5,6]],index=['a','c'],columns=['A','B','D']) df3 =pd.merge(df1,df2,on='B') print('df1:n',df1) print('df2:n',df2) print('df3:n',df3)
也可以根据多个值,只要传列表给on就可以了~
默认的连接方式为inner; 可以通过how= 来修改连接方式;如下:
df1 = pd.Dataframe([[1,2,3],[4,5,6]],index=['a','b'],columns=['A','B','C']) df2 = pd.Dataframe([[1,2,3],[5,5,6]],index=['a','c'],columns=['A','B','D']) df3 =pd.merge(df1,df2,on=['B','A']) #默认为inner df_outer =pd.merge(df1,df2,on=['B','A'],how='outer') df_left =pd.merge(df1,df2,on=['B','A'],how='left') df_right =pd.merge(df1,df2,on=['B','A'],how='right')
可以增加指示器,来标注数据来源,如下:
df1 = pd.Dataframe([[1,2,3],[4,5,6]],index=['a','b'],columns=['A','B','C']) df2 = pd.Dataframe([[1,2,3],[5,5,6]],index=['a','c'],columns=['A','B','D']) df_outer =pd.merge(df1,df2,on=['B','A'],how='outer',indicator=True)显示设置
查看设置:
pd.get_option('display.max_rows')
更改设置:
pd.set_option('display.max_columns',12) # pd.set_option('display.max_rows',20) pd.set_option('display.max_colwidth',10) pd.set_option('display.width',10000) pd.set_option('display.precision',4) # 显示精度 pd.set_option('display.max_info_columns',5)Pivot
df_pivot = df.pivot_table(index = 'Wafer Number',columns='Source Lot',values= 'yield (%)') # 默认是求平均
df_pivot = df.pivot_table(index = 'Wafer Number',values= 'yield (%)',aggfunc='min')
df_pivot = df.pivot_table(index = 'Wafer Number',values= ['yield (%)'],aggfunc=['min','count'])
df_pivot = df.pivot_table(index = 'Wafer Number',values= ['yield (%)'],aggfunc=['min','count'],margins =True)
df_pivot = df.pivot_table(index = 'Source Lot',values =['yield (%)','25 - F - FAIL1H (cnt)'] ,aggfunc={'yield (%)':[max,min],'25 - F - FAIL1H (cnt)':np.mean})
ts = pd.Timestamp('2020-01-12') print(ts) # 2020-01-12 00:00:00 print(ts + pd.Timedelta('5 days')) # 2020-01-17 00:00:00
各种格式的日期时间字符串,都可以通过to_datetime 转换成datetime格式;
ts= pd.to_datetime('3/2/2020') # 2020-03-02 00:00:00
datetime 类型有一些性质,可以方便的调出各种时间的属性值;
print(ts.hour) print(ts.weekday())
按规定生成时间序列
print(pd.date_range(start='2020-01-10',end='2020-01-20')) # DatetimeIndex(['2020-01-10', '2020-01-11', '2020-01-12', '2020-01-13', '2020-01-14', '2020-01-15', '2020-01-16', '2020-01-17', '2020-01-18', '2020-01-19', '2020-01-20'], dtype='datetime64[ns]', freq='D') print(pd.date_range(start='2020-01-10',periods=5,freq='2W')) # DatetimeIndex(['2020-01-12', '2020-01-26', '2020-02-09', '2020-02-23', '2020-03-08'], dtype='datetime64[ns]', freq='2W-SUN')
对时间进行处理的时候,需要先设置成index,才可以;
df['str_year'] = pd.to_datetime(df['start_time']) df = df.set_index('str_year') df['day']=df.index.day print(df['day'].value_counts()) # 可以打印出day的统计 df['2021-12-04':'2021-12-05'] # 可以打印出选定的时间; df[(df.index.day > 13) & (df.index.day < 16)] # 指定的日期 & 时间; df.between_time('08:00','12:00') # 指定的时间(只能是时间)
按时间重采样
df.resample('D').mean() # 按天重采样,并求出平均值; df.resample('3D').mean() # 按3天重采样,并求出平均值;常用 *** 作
排序
df.sort_values(by=['Wafer Number','yield (%)'],ascending=[False,False],inplace=True) print(df.loc[:,['Wafer Number','yield (%)']])
删去重复值
df.drop_duplicates() # 一行数据中,全部重复则删除 df.drop_duplicates(subset='Wafer Number') # 指定列进行去重
同类项合并 :mapping 方法
定义一个映射字典,然后通过mapping调用该字典,如果没有对应关系,则为NaN;
mapping = { 1:'Head', 2:'Head', 3:'Head', 4:'Head', 5:'Head', 21:'Tail', 22:'Tail', 23:'Tail', 24:'Tail', 25:'Tail' } df['cate'] = df['Wafer Number'].map(mapping)
分类;按区间进行分类
例如,我想要按照yield 在 [0,90,92,100]的区间上进行分类;
bins = [0,90,92,100] bins_cut = pd.cut(df['yield (%)'],bins) # (92, 100] 101 # (90, 92] 48 # (0, 90] 25
可以增加标签:
bins = [0,90,92,100] names= ['Low Yield','Normal Yield','High Yield'] df['bins_cut'] = pd.cut(df['yield (%)'],bins,labels=names) print(df['bins_cut']) d = pd.value_counts(df['bins_cut']) print(d) # High Yield 101 # Normal Yield 48 # Low Yield 25
apply 函数:对每行或者每列做相同的 *** 作;
缺失值处理
df.isnull().all(axis=0) # 对列进行检查,如果全为null,则返回True; df.isnull().all(axis=1) # 对行进行检查,如果全为null,则返回True; df.fillna(5) # 用5填充null值;
字符串 *** 作
df['Source Lot'].str.lower() # 大小写转换 df['Source Lot'].str.upper() # 大小写转换 df['Source Lot'].str.len() # 字符串长度 df['Source Lot'].str.strip() # 去两边空格 df['Source Lot'].str.lstrip() # 去左边空格 df['Source Lot'].str.rstrip() #去右边空格 df['Source Lot'].str.replace('.S','_R') # 替换字符 df['Source Lot'].str.split('.') # 将一列分割成多列,每行成为一个[] df['Source Lot'].str.split('.',expand=True) # 分割成多列,且命名为0,1... df['Source Lot'].str.split('.',expand=True,n=1) # 只切一次,分割成两部分 df['Source Lot'].str.contains('2') # 是否包含 2
用.分割每个值,且
df_1 = pd.Dataframe([['a,b,c'],['a,c'],['c,b']],index=[1,2,3],columns=['A']) a = df_1['A'].str.get_dummies(sep=',') print(a)
isin()
df_1['A'].isin(['d','a'])
欢迎分享,转载请注明来源:内存溢出
评论列表(0条)