Solution:
print (merged_data)
    id    user  product  price[78]  price[79]  Source
0  105  dummya      egg         22       28.0  sheet1
1  119  dummy1     soya         67        NaN  sheet1
2  567  dummya  spinach         22       28.0  sheet2
3  897  dummy1     rose         67       99.0  sheet2
4  345  dummya      egg         87       98.0  sheet3
5  121  dummy1   potato         98       99.0  sheet3

print (Condition)
    Sheet   ID  price1_col1  price1_col2                    price1_out  \
0  sheet1  yes           78          NaN                   price1_col1
1  sheet2  yes           78         79.0       price1_col1+price1_col2
2  sheet3  yes           78         79.0  max(price1_col1,price1_col2)

   price2_col1  price2_col2                    price2_out sheetname
0           78          NaN                   price2_col1       yes
1           78         79.0       price2_col1+price2_col2       yes
2           79         78.0  min(price2_col1,price2_col2)        no
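If you want to run the steps below, the two input frames printed above can be rebuilt like this (a minimal reconstruction of the shown data, not part of the original answer):

import numpy as np
import pandas as pd

#minimal reconstruction of the merged_data frame printed above
merged_data = pd.DataFrame({
    'id': [105, 119, 567, 897, 345, 121],
    'user': ['dummya', 'dummy1', 'dummya', 'dummy1', 'dummya', 'dummy1'],
    'product': ['egg', 'soya', 'spinach', 'rose', 'egg', 'potato'],
    'price[78]': [22, 67, 22, 67, 87, 98],
    'price[79]': [28.0, np.nan, 28.0, 99.0, 98.0, 99.0],
    'Source': ['sheet1', 'sheet1', 'sheet2', 'sheet2', 'sheet3', 'sheet3'],
})

#minimal reconstruction of the Condition frame printed above
Condition = pd.DataFrame({
    'Sheet': ['sheet1', 'sheet2', 'sheet3'],
    'ID': ['yes', 'yes', 'yes'],
    'price1_col1': [78, 78, 78],
    'price1_col2': [np.nan, 79.0, 79.0],
    'price1_out': ['price1_col1', 'price1_col1+price1_col2',
                   'max(price1_col1,price1_col2)'],
    'price2_col1': [78, 78, 79],
    'price2_col2': [np.nan, 79.0, 78.0],
    'price2_out': ['price2_col1', 'price2_col1+price2_col2',
                   'min(price2_col1,price2_col2)'],
    'sheetname': ['yes', 'yes', 'no'],
})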
#merge data together by a left join
df = merged_data.merge(Condition.rename(columns={'Sheet':'Source'}), on='Source', how='left')

#replace values by empty strings where the flag is not 'yes', and drop the sheetname and ID helper columns
df['Source'] = np.where(df.pop('sheetname') == 'yes', df['Source'], '')
df['id'] = np.where(df.pop('ID') == 'yes', df['id'], '')

#filter columns with integers between [] into a new DataFrame
df1 = df.filter(regex=r'\[\d+\]').copy()

#filter all columns containing price, excluding the df1 columns
df2 = df[df.filter(regex='price').columns.difference(df1.columns)].copy()

#convert column names to the integers between []
df1.columns = df1.columns.str.extract(r'\[(\d+)\]', expand=False).astype(int)

#helper column for matching missing values
df1['a'] = np.nan

#split columns without/with the _col1/_col2 suffix
mask = df2.columns.str.endswith(('_col1','_col2'))
final_cols = df2.columns[~mask]
removed_cols = df2.columns[mask]

#replace the column labels stored in df2 by the matching values looked up in df1
for c in removed_cols:
    df2[c] = df1.lookup(df1.index, df2[c].fillna('a'))
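Note: DataFrame.lookup was deprecated in pandas 1.2 and removed in pandas 2.0, so the last loop only works on older versions. On newer pandas the same per-row lookup can be approximated with numpy fancy indexing; this is an untested sketch assuming df1, df2 and removed_cols exactly as built above:

#sketch of a lookup replacement for pandas >= 2.0 (assumption, not part of the original answer)
rows = np.arange(len(df1))
for c in removed_cols:
    #map each stored column label (78, 79 or 'a' for NaN) to its position in df1
    cols = df1.columns.get_indexer(df2[c].fillna('a'))
    df2[c] = df1.to_numpy()[rows, cols]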
print (df2)
   price1_col1  price1_col2                    price1_out  price2_col1  \
0           22          NaN                   price1_col1         22.0
1           67          NaN                   price1_col1         67.0
2           22         28.0       price1_col1+price1_col2         22.0
3           67         99.0       price1_col1+price1_col2         67.0
4           87         98.0  max(price1_col1,price1_col2)         98.0
5           98         99.0  max(price1_col1,price1_col2)         99.0

   price2_col2                    price2_out
0          NaN                   price2_col1
1          NaN                   price2_col1
2         28.0       price2_col1+price2_col2
3         99.0       price2_col1+price2_col2
4         87.0  min(price2_col1,price2_col2)
5         98.0  min(price2_col1,price2_col2)
#create a MultiIndex to separate each price group
df2.columns = df2.columns.str.split('_', expand=True)

def f(x):
    #remove the first level
    x.columns = x.columns.droplevel(0)
    out = []
    #loop over each row
    for v in x.itertuples(index=False):
        #remove the group prefix, e.g. 'price1_'
        t = v.out.replace(x.name + '_', '')
        #loop over the namedtuple fields and replace names by their values
        for k1, v1 in v._asdict().items():
            t = t.replace(k1, str(v1))
        #pd.eval cannot evaluate min/max calls, so handle them separately
        if t.startswith('min'):
            out.append(min(pd.eval(t[3:])))
        elif t.startswith('max'):
            out.append(max(pd.eval(t[3:])))
        #handle + - * /
        else:
            out.append(pd.eval(t))
    #return one Series per group
    return pd.Series(out)

#overwrite the original _out columns
df[final_cols] = df2.groupby(level=0, axis=1).apply(f).add_suffix('_out')

#if necessary remove the helper columns
df = df.drop(removed_cols, axis=1)
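To make the string handling inside f concrete, this traces row 3 of the price1 group (values taken from the print-outs above); the min/max branch works the same way except that pd.eval only sees the operand tuple:

import pandas as pd

#row 3 of the price1 group: out = 'price1_col1+price1_col2', col1 = 67.0, col2 = 99.0
t = 'price1_col1+price1_col2'.replace('price1_', '')   #-> 'col1+col2'
t = t.replace('col1', '67.0').replace('col2', '99.0')  #-> '67.0+99.0'
print(pd.eval(t))                                      #166.0, the value written to price1_out
#for 'min(...)'/'max(...)' formulas f strips the leading 3 characters and
#applies Python's built-in min()/max() to the pd.eval result of the rest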
print (df)
    id    user  product  price[78]  price[79]  Source  price1_out  price2_out
0  105  dummya      egg         22       28.0  sheet1        22.0        22.0
1  119  dummy1     soya         67        NaN  sheet1        67.0        67.0
2  567  dummya  spinach         22       28.0  sheet2        50.0        50.0
3  897  dummy1     rose         67       99.0  sheet2       166.0       166.0
4  345  dummya      egg         87       98.0                98.0        87.0
5  121  dummy1   potato         98       99.0                99.0        98.0