Stata与pandas的常用功能_随笔

Stata与pandas的常用功能 Stata与pandas+numpy的常用功能

import pandas as pd
import numpy as np

输入数据

# ===Python===
df = pd.Dataframe({'x': [1, 2, 3], 'y': [4, 5, 6]})
# *===Stata===
# input x, y
# 1 2
# 3 4
# 5 6
# end

读取外部数据

# ===Python===
url = ("https://raw.github.com/pandas-dev/pandas/master/pandas/tests/io/data/csv/tips.csv")
df = pd.read_csv(url)

# *===Stata===
# import delimited tips.csv

限定输出

# ===Python===
tips.head(5)

# *===Stata===
# list in 1/5

导出数据

# ===Python===
tips.to_csv("tips2.csv")

# *===Stata===
# export delimited tips2.csv

对列进行 *** 作

# ===Python===
tips['total_bill'] = tips['total_bill'] - 2
tips['new_bill'] = tips['total_bill'] / 2
tips = tips.drop("new_bill", axis=1) # axis=1表示对列 *** 作

# *===Stata===
# replace total_bill = total_bill - 2
# generate new_bill = total_bill / 2
# drop new_bill

对行进行筛选

# ===Python===
tips[tips['total_bill']>10]

# *===Stata===
# list if total_bill > 10

使用if逻辑创建新列

# ===Python===
tips['bucket'] = np.where(tips['total_bill']<10, 'low', 'high')

# *===Stata===
# generate bucket = 'low' if total_bill < 10
# replace bucket = 'high' if total_bil >= 10

日期函数

# ===Python===
tips['date1'] = pd.Timestamp('2013-01-15')
tips['date2'] = pd.Timestamp('2015-02-15')
tips['date1_year'] = tips['date1'].dt.year
tips['date2_month'] = tips['date2'].dt.month
tips['date1_next'] = tips['date1'] + pd.offsets.MonthBegin()
tips['months_between'] = tips['date2'].dt.to_period('M') - tips['date1'].dt.to_period('M')

# *===Stata===
# gen date1 = mdy(1, 15, 2013)
# gen date2 = date('Feb152015', 'MDY')
# gen date1_year = year(date1)
# gen date2_month = month(date2)
# gen date1_next = mdy(month(date1)+1, 1, year(date1)) if month(date1) != 12
# replace date1_next = mdy(1, 1, year(date1)+1) if month(date1) == 12
# gen months_between = mofd(date2)- mofd(date1)

保留、删除或重命名特定的列

# ===Python===
tips[['sex', 'total_bill', 'tip']]
tips.drop('sex', axis=1)
tips.rename(columns={'total_bill': 'total_bill_2'})

# *===Stata===
# keep sex total_bill tip
# drop sex
# rename total_bill total_bill_2

以值为依据排序

# ===Python===
tips = tips.sort_values(['sex', 'total_bill'])

# *===Stata===
# sort sex total_bill

确定字符串长度

# ===Python===
tips['time'].str.len() # 确定ASCII编码的字符串长度
tips['time'].str.rstrip().str.len() # 确定Unicode编码的字符串长度

# *===Stata===
# gen streln_time = strlen(time)
# gen ustrlen_time = ustrlen(time)

找到某个子字符串在字符串中的位置

# ===Python===
tips['sex'].str.find('ale')

# *===Stata===
# gen str_position = strpos(sex, 'ale')

以位置为依据提取子字符串

# ===Python===
tips['sex'].str[0:1]

# *===Stata===
# gen short_sex = substr(sex, 1, 1)

提取第n个单词

# ===Python===
firstlast = pd.Dataframe({'String': ['John Smith', 'Jane Cook']})
firstlast['First_Name'] = firstlast['String'].str.split(' ', expand=True)[0]
firstlast['Last_Name'] = firstlast['String'].str.split(' ', expand=True)[1]

# *===Stata===
# input str20 string
# 'John Smith'
# 'Jane Cook'
# end
# 
# gen first_name = word(string, 1)
# gen last_name = word(string, -1)

改变大小写

# ===Python===
firstlast = pd.Dataframe({'String': ['John Smith', 'Jane Cook']})
firstlast['upper'] = firstlast['String'].str.upper()
firstlast['lower'] = firstlast['String'].str.lower()
firstlast['title'] = firstlast['String'].str.title()

# *===Stata===
# input str20 string
# 'John Smith'
# 'Jane Cook'
# end
# 
# gen upper = strupper(string)
# gen lower = strlower(string)
# gen title = strproper(string)

筛选缺失值

# ===Python===
outer_join[outer_join['value_x'].isna()] # 空值
outer_join[outer_join['value_x'].notna()] # 非空值

# *===Stata===
# list if value_x == .
# list if value_x != .

删除有缺漏值的行

# ===Python===
outer_join.dropna()

# *===Stata===
# keep if value_x!=. & if value_y!=.

使用上一行的值填充缺漏值

# ===Python===
outer_join.fillna(method='ffill')

# *===Stata===
# replace value_x = value_X[_n-1] if value_x == .
# replace value_y = value_y[_n-1] if value_y == .

使用特定值取代缺漏值

# ===Python===
outer_join['value_x'].fillna(outer_join['value_x'].mean())

# *===Stata===
# egen mean_value_x = mean(value_x)
# replace value_x = mean_value_x if value_x == .

分组求和

# ===Python===
tips_summed = tips.groupby(['sex ', 'smoker'])[['total_bill', 'tip']].sum()

# *===Stata===
# collapse (sum) total_bill tip, by (sex smoker)

分组求特定的统计量

# ===Python===
gb = tips.groupby('smoker')['total_bill']
tips['adj_total_bill'] = tips['total_bill'] - gb.transform('mean')

# *===Stata===
# bysort sex smoker: egen group_bill = mean(total_bill)
# gen adj_total_bill = total_bill - group_bill

对每组取第一条记录

# ===Python===
tips.groupby(['sex', 'smoker']).first()

# *===Stata===
# bysort sex smoker: list if _n == 1

将数据进行横向合并：增加变量

# ===Python===
left_join = df1.merge(df2, on=['key'], how='left') # 左联接
right_join = df1.merge(df2, on=['key'], how='right') # 右联接
inner_join = df1.merge(df2, on=['key'], how='inner') # 内联接
outer_join = df1.merge(df2, on=['key'], how='outer') # 外联接

# *===Stata===
# * First create df2 and save to disk
# clear
# input str1 key
# B
# D
# D
# E
# end
# generate value = rnormal()
# save df2.dta

# * Now create df1 in memory
# clear
# input str1 key
# A
# B
# C
# D
# end
# generate value = rnormal()

#preserve

#* Left join
# merge 1:n key using df2.dta
# keep if _merge == 1

# * Right join
# restore, preserve
# merge 1:n key using df2.dta
# keep if _merge == 2

# * Inner join
# restore, preserve
# merge 1:n key using df2.dta
# keep if _merge == 3

# * Outer join
# restore
# merge 1:n key using df2.dta

将数据进行纵向合并：追加样本

# ===pandas===
append_u = pd.read_stata('append_u.dta', index=False)
append_m = pd.read_stata('append_m.dta', index=False)
append_um = pd.concat([append_u, append_m], axis=0])

# *===Stata===
# use append_m.dta, clear
# append using append_u.dta

离群值

# ===pandas===
nlsw88 = pd.read_stata('work/nlsw88.dta')
wage_q99 = nlsw88['wage'].quantile(0.99)
wage_q1 = nlsw88['wage'].quantile(0.01)
# 将离群值替换为分位数
nlsw88['wage'] = nlsw88['wage'].apply(lambda x: wage_q1 if xwage_q99 else x)
# 将离群值替换为空值
# nlsw88['wage'] = nlsw88['wage'].apply(lambda x: '' if xwage_q99 else x)

# *===Stata===
# *将离群值替换为分位数
# winsor2 wage, cuts(1 99) replace
# *将离群值替换为空值
# winsor2 wage, cuts(1 99) trim

欢迎分享，转载请注明来源：内存溢出

原文地址: http://outofmemory.cn/zaji/5572243.html

Stata与pandas的常用功能

发表评论

评论列表（0条）