import pandas as pd import numpy as np
输入数据
# ===Python=== df = pd.Dataframe({'x': [1, 2, 3], 'y': [4, 5, 6]}) # *===Stata=== # input x, y # 1 2 # 3 4 # 5 6 # end
读取外部数据
# ===Python=== url = ("https://raw.github.com/pandas-dev/pandas/master/pandas/tests/io/data/csv/tips.csv") df = pd.read_csv(url) # *===Stata=== # import delimited tips.csv
限定输出
# ===Python=== tips.head(5) # *===Stata=== # list in 1/5
导出数据
# ===Python=== tips.to_csv("tips2.csv") # *===Stata=== # export delimited tips2.csv
对列进行 *** 作
# ===Python=== tips['total_bill'] = tips['total_bill'] - 2 tips['new_bill'] = tips['total_bill'] / 2 tips = tips.drop("new_bill", axis=1) # axis=1表示对列 *** 作 # *===Stata=== # replace total_bill = total_bill - 2 # generate new_bill = total_bill / 2 # drop new_bill
对行进行筛选
# ===Python=== tips[tips['total_bill']>10] # *===Stata=== # list if total_bill > 10
使用if逻辑创建新列
# ===Python=== tips['bucket'] = np.where(tips['total_bill']<10, 'low', 'high') # *===Stata=== # generate bucket = 'low' if total_bill < 10 # replace bucket = 'high' if total_bil >= 10
日期函数
# ===Python=== tips['date1'] = pd.Timestamp('2013-01-15') tips['date2'] = pd.Timestamp('2015-02-15') tips['date1_year'] = tips['date1'].dt.year tips['date2_month'] = tips['date2'].dt.month tips['date1_next'] = tips['date1'] + pd.offsets.MonthBegin() tips['months_between'] = tips['date2'].dt.to_period('M') - tips['date1'].dt.to_period('M') # *===Stata=== # gen date1 = mdy(1, 15, 2013) # gen date2 = date('Feb152015', 'MDY') # gen date1_year = year(date1) # gen date2_month = month(date2) # gen date1_next = mdy(month(date1)+1, 1, year(date1)) if month(date1) != 12 # replace date1_next = mdy(1, 1, year(date1)+1) if month(date1) == 12 # gen months_between = mofd(date2)- mofd(date1)
保留、删除或重命名特定的列
# ===Python=== tips[['sex', 'total_bill', 'tip']] tips.drop('sex', axis=1) tips.rename(columns={'total_bill': 'total_bill_2'}) # *===Stata=== # keep sex total_bill tip # drop sex # rename total_bill total_bill_2
以值为依据排序
# ===Python=== tips = tips.sort_values(['sex', 'total_bill']) # *===Stata=== # sort sex total_bill
确定字符串长度
# ===Python=== tips['time'].str.len() # 确定ASCII编码的字符串长度 tips['time'].str.rstrip().str.len() # 确定Unicode编码的字符串长度 # *===Stata=== # gen streln_time = strlen(time) # gen ustrlen_time = ustrlen(time)
找到某个子字符串在字符串中的位置
# ===Python=== tips['sex'].str.find('ale') # *===Stata=== # gen str_position = strpos(sex, 'ale')
以位置为依据提取子字符串
# ===Python=== tips['sex'].str[0:1] # *===Stata=== # gen short_sex = substr(sex, 1, 1)
提取第n个单词
# ===Python=== firstlast = pd.Dataframe({'String': ['John Smith', 'Jane Cook']}) firstlast['First_Name'] = firstlast['String'].str.split(' ', expand=True)[0] firstlast['Last_Name'] = firstlast['String'].str.split(' ', expand=True)[1] # *===Stata=== # input str20 string # 'John Smith' # 'Jane Cook' # end # # gen first_name = word(string, 1) # gen last_name = word(string, -1)
改变大小写
# ===Python=== firstlast = pd.Dataframe({'String': ['John Smith', 'Jane Cook']}) firstlast['upper'] = firstlast['String'].str.upper() firstlast['lower'] = firstlast['String'].str.lower() firstlast['title'] = firstlast['String'].str.title() # *===Stata=== # input str20 string # 'John Smith' # 'Jane Cook' # end # # gen upper = strupper(string) # gen lower = strlower(string) # gen title = strproper(string)
筛选缺失值
# ===Python=== outer_join[outer_join['value_x'].isna()] # 空值 outer_join[outer_join['value_x'].notna()] # 非空值 # *===Stata=== # list if value_x == . # list if value_x != .
删除有缺漏值的行
# ===Python=== outer_join.dropna() # *===Stata=== # keep if value_x!=. & if value_y!=.
使用上一行的值填充缺漏值
# ===Python=== outer_join.fillna(method='ffill') # *===Stata=== # replace value_x = value_X[_n-1] if value_x == . # replace value_y = value_y[_n-1] if value_y == .
使用特定值取代缺漏值
# ===Python=== outer_join['value_x'].fillna(outer_join['value_x'].mean()) # *===Stata=== # egen mean_value_x = mean(value_x) # replace value_x = mean_value_x if value_x == .
分组求和
# ===Python=== tips_summed = tips.groupby(['sex ', 'smoker'])[['total_bill', 'tip']].sum() # *===Stata=== # collapse (sum) total_bill tip, by (sex smoker)
分组求特定的统计量
# ===Python=== gb = tips.groupby('smoker')['total_bill'] tips['adj_total_bill'] = tips['total_bill'] - gb.transform('mean') # *===Stata=== # bysort sex smoker: egen group_bill = mean(total_bill) # gen adj_total_bill = total_bill - group_bill
对每组取第一条记录
# ===Python=== tips.groupby(['sex', 'smoker']).first() # *===Stata=== # bysort sex smoker: list if _n == 1
将数据进行横向合并:增加变量
# ===Python=== left_join = df1.merge(df2, on=['key'], how='left') # 左联接 right_join = df1.merge(df2, on=['key'], how='right') # 右联接 inner_join = df1.merge(df2, on=['key'], how='inner') # 内联接 outer_join = df1.merge(df2, on=['key'], how='outer') # 外联接 # *===Stata=== # * First create df2 and save to disk # clear # input str1 key # B # D # D # E # end # generate value = rnormal() # save df2.dta # * Now create df1 in memory # clear # input str1 key # A # B # C # D # end # generate value = rnormal() #preserve #* Left join # merge 1:n key using df2.dta # keep if _merge == 1 # * Right join # restore, preserve # merge 1:n key using df2.dta # keep if _merge == 2 # * Inner join # restore, preserve # merge 1:n key using df2.dta # keep if _merge == 3 # * Outer join # restore # merge 1:n key using df2.dta
将数据进行纵向合并:追加样本
# ===pandas=== append_u = pd.read_stata('append_u.dta', index=False) append_m = pd.read_stata('append_m.dta', index=False) append_um = pd.concat([append_u, append_m], axis=0]) # *===Stata=== # use append_m.dta, clear # append using append_u.dta
离群值
# ===pandas=== nlsw88 = pd.read_stata('work/nlsw88.dta') wage_q99 = nlsw88['wage'].quantile(0.99) wage_q1 = nlsw88['wage'].quantile(0.01) # 将离群值替换为分位数 nlsw88['wage'] = nlsw88['wage'].apply(lambda x: wage_q1 if xwage_q99 else x) # 将离群值替换为空值 # nlsw88['wage'] = nlsw88['wage'].apply(lambda x: '' if x wage_q99 else x) # *===Stata=== # *将离群值替换为分位数 # winsor2 wage, cuts(1 99) replace # *将离群值替换为空值 # winsor2 wage, cuts(1 99) trim
欢迎分享,转载请注明来源:内存溢出
评论列表(0条)