Pandas 库是一个免费、开源的第三方 Python 库,是 Python 数据分析必不可少的工具之一,它为 Python 数据分析提供了高性能且易于使用的数据结构,即 Series 和 DataFrame。Pandas 库基于 Python NumPy 库开发而来,因此,它可以与 Python 的科学计算库配合使用。Pandas 提供了两种数据结构,分别是 Series(一维数组结构)与 DataFrame(二维数组结构),这两种数据结构极大地增强了 Pandas 的数据分析能力。
中文教程:http://c.biancheng.net/pandas/
官方文档:https://pandas.pydata.org/docs/
样例代码
import pandas as pd # https://pandas.pydata.org/
import numpy as np
import random as rd
import matplotlib.pyplot as plt
# Build a one-dimensional Series of five integers with an explicit 0..4 index.
sr = pd.Series([20,30,50,70,25], index=[0, 1, 2, 3, 4]) # a single column
# Type conversions: export the Series to several other representations
print('\nlist = \n',sr.to_list())
print('\ndict = \n',sr.to_dict())
print('\nnumpy = \n',sr.to_numpy())
print('\njson = \n',sr.to_json())
print('\nlatex = \n',sr.to_latex())
# NOTE: to_markdown() relies on the optional `tabulate` package being installed.
print('\nmarkdown = \n',sr.to_markdown())
print('\nunique = \n',sr.unique()) # distinct values, returned as an np.array (or a pandas DatetimeArray for datetime data)
# Build a five-row table with three columns: Name (str), Age (int), Sex (str).
df = pd.DataFrame( # a data table: several columns
{
"Name": [
"Braund, Mr. Owen Harris",
"Allen, Mr. William Henry",
"Bonnell, Miss. Elizabeth",
"Allen, Mr. Elizabeth",
"Braund, Miss. Elizabeth",
],
"Age": [22, 35, 58, 12, 11],
"Sex": ["male", "male", "female", "male", "female"],
}
)
# Export the whole table to the same set of representations used for the Series.
print('\ndict = \n',df.to_dict())
print('\nnumpy = \n',df.to_numpy())
print('\njson = \n',df.to_json())
print('\nlatex = \n',df.to_latex())
# NOTE: to_markdown() relies on the optional `tabulate` package being installed.
print('\nmarkdown = \n',df.to_markdown())
# Inspect the table: info() prints its own summary (index, columns, dtypes).
print('\n属性:')
df.info() # table properties
print('\nshape = ',df.shape) # table size as (rows, columns)
print('\ndf = \n',df)
print('\nsr = \n',sr)
# Both assignments below overwrite the existing Age column with the Series sr.
df['Age'] = sr # replace a column (bracket indexing)
df.Age = sr # replace a column (attribute access)
df['Age2'] = 2*df.Age - df.Age # create a new column (2*Age - Age, i.e. a copy of Age)
print('\ndf = \n',df)
df = df.rename( # rename columns; rename() returns a new table, so rebind df
columns={
"Age2": "age",
}
)
print('\ndf = \n',df)
df.Age += 0.5 # element-wise arithmetic, similar to a numpy array
print('\ndf = \n',df)
# Round-trip through an Excel file: writes ./data.xlsx in the working
# directory, then reads the same sheet back into df2.
# NOTE(review): Excel I/O needs an Excel engine such as openpyxl installed.
df.to_excel('./data.xlsx', sheet_name="123", index=False)
df2 = pd.read_excel('./data.xlsx', sheet_name="123")
print('\ndf2 = \n',df2)
print('\ndf2.head(2) = \n',df2.head(2))
age_sex = df[["Age", "Sex"]] # select a subset of columns
print('\nage - sex = \n',age_sex)
df.iloc[1:4, 1] = 123 # assign to a positional slice: rows 1..3 of column 1 (Age)
print('\ndf = \n',df)
age25 = df[df["Age"] > 25] # filter rows with a boolean mask
print('\nage > 25 = \n',age25)
# Summary statistics
print('\nvalue_counts = \n',df["Age"].value_counts(),'\n')
print('max = ',df["Age"].max())
# NOTE: despite the 'maxargmin' label, the value printed is argmin(),
# i.e. the integer position of the smallest Age.
print('maxargmin = ',df["Age"].argmin())
print('\ndescribe = \n', type(df.describe()), '\n',df.describe())
# Statistics grouped by an attribute
print('mean = ',df["Age"].mean())
print('\nmean_by_Sex = \n',df.groupby("Sex")["Age"].mean())
# Add a prefix / suffix to the index labels
# NOTE(review): the prefix is passed as the int 123, while pandas documents
# it as a str; presumably it is formatted into each label - verify on the
# pandas version in use.
print('\nprefix = \n',df['Age'].add_prefix(123))
print('\nsuffix = \n',df['Age'].add_suffix('a'))
# Sorting
df3 = df.sort_values(by=["Name","Age"]) # Name is the primary sort key, Age the secondary
print('\nsort(df) = \n',df3)
# The lambda's parameter `sr` is local to the lambda and only shadows the
# module-level Series `sr` inside it.
df3 = df.sort_values(by="Age", key=lambda sr: abs(sr-60)) # custom sort key: ages closest to 60 come first
print('\nsort(df) = \n',df3)
# Set the index
df3 = df.set_index(['Name','age']) # Name and age jointly form the index
print('\ndf.set_index = \n',df3)
# Pivot: Name becomes the index, Sex values become columns, for both Age and age
df3 = df.pivot(index="Name",columns="Sex", values=["Age","age"])
print('\ndf.pivot = \n',df3)
# Pivot table: mean of `age` per Age (rows) and Sex (columns);
# margins=True adds the "All" row and column
df3 = df.pivot_table(values="age", index="Age", columns="Sex", aggfunc="mean", margins=True)
print('\ndf.pivot_table = \n',df3)
# Move the Age index back into an ordinary column.
df3 = df3.reset_index()
print('\ndf.pivot_table.reset_index = \n',df3)
# Convert to `long format`: Name stays as the id column, every other column
# is unpivoted into (variable, value) rows
df3 = df.melt(id_vars="Name")
print('\ndf.melt = \n',df3)
# Concatenate tables that share the same structure
df3 = pd.concat([df, df], axis=0) # axis=0: stack vertically; axis=1: place side by side
print('\nconcat = \n',df3)
df3 = pd.concat([df, df], keys=["PM25", "NO2"]) # keys: add an outer index level labelling each input table
print('\nconcat = \n',df3)
# Second table sharing the Name column with df; rebinds df2 (previously the
# table read back from Excel).
df2 = pd.DataFrame( # a data table: several columns
{
"Name": [
"Braund, Mr. Owen Harris",
"Allen, Mr. William Henry",
"Bonnell, Miss. Elizabeth",
"Allen, Mr. Elizabeth",
"Braund, Miss. Elizabeth",
],
"Work": [0,0,1,1,0],
"Address": [3,1,5,3,5],
"Time": [
'2019-06-21 00:00:00+00:00',
'2019-06-20 23:00:00+00:00',
'2019-06-19 22:00:00+00:00',
'2019-06-22 01:00:00+00:00',
'2019-06-20 09:00:00+00:00',
]
}
)
# Left-join the two tables using the Name column as the key
df3 = pd.merge(df, df2, how="left", on="Name")
print('\nmerge = \n',df3)
# Parse the text timestamps into datetime values
print(pd.to_datetime(df2['Time']))
# Value replacement: map "male"/"female" to "M"/"F" (returns a new table; df is unchanged)
df3 = df.replace({"male": "M", "female": "F"})
print('\ndf = \n',df3)
# Plotting: two separate plot calls of Age against Sex - a line plot
# (c='b', dashed line style) and a scatter plot (c='r', '*' markers) -
# followed by plt.show() to display them.
df.plot(x="Sex", y="Age", c='b', linestyle='--')
df.plot.scatter(x="Sex", y="Age", c='r', marker='*')
plt.show()
测试结果
list =
[20, 30, 50, 70, 25]
dict =
{0: 20, 1: 30, 2: 50, 3: 70, 4: 25}
numpy =
[20 30 50 70 25]
json =
{"0":20,"1":30,"2":50,"3":70,"4":25}
latex =
\begin{tabular}{lr}
\toprule
{} & 0 \\
\midrule
0 & 20 \\
1 & 30 \\
2 & 50 \\
3 & 70 \\
4 & 25 \\
\bottomrule
\end{tabular}
markdown =
| | 0 |
|---:|----:|
| 0 | 20 |
| 1 | 30 |
| 2 | 50 |
| 3 | 70 |
| 4 | 25 |
unique =
[20 30 50 70 25]
dict =
{'Name': {0: 'Braund, Mr. Owen Harris', 1: 'Allen, Mr. William Henry', 2: 'Bonnell, Miss. Elizabeth', 3: 'Allen, Mr. Elizabeth', 4: 'Braund, Miss. Elizabeth'}, 'Age': {0: 22, 1: 35, 2: 58, 3: 12, 4: 11}, 'Sex': {0: 'male', 1: 'male', 2: 'female', 3: 'male', 4: 'female'}}
numpy =
[['Braund, Mr. Owen Harris' 22 'male']
['Allen, Mr. William Henry' 35 'male']
['Bonnell, Miss. Elizabeth' 58 'female']
['Allen, Mr. Elizabeth' 12 'male']
['Braund, Miss. Elizabeth' 11 'female']]
json =
{"Name":{"0":"Braund, Mr. Owen Harris","1":"Allen, Mr. William Henry","2":"Bonnell, Miss. Elizabeth","3":"Allen, Mr. Elizabeth","4":"Braund, Miss. Elizabeth"},"Age":{"0":22,"1":35,"2":58,"3":12,"4":11},"Sex":{"0":"male","1":"male","2":"female","3":"male","4":"female"}}
latex =
\begin{tabular}{llrl}
\toprule
{} & Name & Age & Sex \\
\midrule
0 & Braund, Mr. Owen Harris & 22 & male \\
1 & Allen, Mr. William Henry & 35 & male \\
2 & Bonnell, Miss. Elizabeth & 58 & female \\
3 & Allen, Mr. Elizabeth & 12 & male \\
4 & Braund, Miss. Elizabeth & 11 & female \\
\bottomrule
\end{tabular}
markdown =
| | Name | Age | Sex |
|---:|:-------------------------|------:|:-------|
| 0 | Braund, Mr. Owen Harris | 22 | male |
| 1 | Allen, Mr. William Henry | 35 | male |
| 2 | Bonnell, Miss. Elizabeth | 58 | female |
| 3 | Allen, Mr. Elizabeth | 12 | male |
| 4 | Braund, Miss. Elizabeth | 11 | female |
属性:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 3 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 Name 5 non-null object
1 Age 5 non-null int64
2 Sex 5 non-null object
dtypes: int64(1), object(2)
memory usage: 248.0+ bytes
shape = (5, 3)
df =
Name Age Sex
0 Braund, Mr. Owen Harris 22 male
1 Allen, Mr. William Henry 35 male
2 Bonnell, Miss. Elizabeth 58 female
3 Allen, Mr. Elizabeth 12 male
4 Braund, Miss. Elizabeth 11 female
sr =
0 20
1 30
2 50
3 70
4 25
dtype: int64
df =
Name Age Sex Age2
0 Braund, Mr. Owen Harris 20 male 20
1 Allen, Mr. William Henry 30 male 30
2 Bonnell, Miss. Elizabeth 50 female 50
3 Allen, Mr. Elizabeth 70 male 70
4 Braund, Miss. Elizabeth 25 female 25
df =
Name Age Sex age
0 Braund, Mr. Owen Harris 20 male 20
1 Allen, Mr. William Henry 30 male 30
2 Bonnell, Miss. Elizabeth 50 female 50
3 Allen, Mr. Elizabeth 70 male 70
4 Braund, Miss. Elizabeth 25 female 25
df =
Name Age Sex age
0 Braund, Mr. Owen Harris 20.5 male 20
1 Allen, Mr. William Henry 30.5 male 30
2 Bonnell, Miss. Elizabeth 50.5 female 50
3 Allen, Mr. Elizabeth 70.5 male 70
4 Braund, Miss. Elizabeth 25.5 female 25
df2 =
Name Age Sex age
0 Braund, Mr. Owen Harris 20.5 male 20
1 Allen, Mr. William Henry 30.5 male 30
2 Bonnell, Miss. Elizabeth 50.5 female 50
3 Allen, Mr. Elizabeth 70.5 male 70
4 Braund, Miss. Elizabeth 25.5 female 25
df2.head(2) =
Name Age Sex age
0 Braund, Mr. Owen Harris 20.5 male 20
1 Allen, Mr. William Henry 30.5 male 30
age - sex =
Age Sex
0 20.5 male
1 30.5 male
2 50.5 female
3 70.5 male
4 25.5 female
df =
Name Age Sex age
0 Braund, Mr. Owen Harris 20.5 male 20
1 Allen, Mr. William Henry 123.0 male 30
2 Bonnell, Miss. Elizabeth 123.0 female 50
3 Allen, Mr. Elizabeth 123.0 male 70
4 Braund, Miss. Elizabeth 25.5 female 25
age > 25 =
Name Age Sex age
1 Allen, Mr. William Henry 123.0 male 30
2 Bonnell, Miss. Elizabeth 123.0 female 50
3 Allen, Mr. Elizabeth 123.0 male 70
4 Braund, Miss. Elizabeth 25.5 female 25
value_counts =
123.0 3
20.5 1
25.5 1
Name: Age, dtype: int64
max = 123.0
maxargmin = 0
describe =
<class 'pandas.core.frame.DataFrame'>
Age age
count 5.000000 5.000000
mean 83.000000 39.000000
std 54.800776 20.736441
min 20.500000 20.000000
25% 25.500000 25.000000
50% 123.000000 30.000000
75% 123.000000 50.000000
max 123.000000 70.000000
mean = 83.0
mean_by_Sex =
Sex
female 74.250000
male 88.833333
Name: Age, dtype: float64
prefix =
1230 20.5
1231 123.0
1232 123.0
1233 123.0
1234 25.5
Name: Age, dtype: float64
suffix =
0a 20.5
1a 123.0
2a 123.0
3a 123.0
4a 25.5
Name: Age, dtype: float64
sort(df) =
Name Age Sex age
3 Allen, Mr. Elizabeth 123.0 male 70
1 Allen, Mr. William Henry 123.0 male 30
2 Bonnell, Miss. Elizabeth 123.0 female 50
4 Braund, Miss. Elizabeth 25.5 female 25
0 Braund, Mr. Owen Harris 20.5 male 20
sort(df) =
Name Age Sex age
4 Braund, Miss. Elizabeth 25.5 female 25
0 Braund, Mr. Owen Harris 20.5 male 20
1 Allen, Mr. William Henry 123.0 male 30
2 Bonnell, Miss. Elizabeth 123.0 female 50
3 Allen, Mr. Elizabeth 123.0 male 70
df.set_index =
Age Sex
Name age
Braund, Mr. Owen Harris 20 20.5 male
Allen, Mr. William Henry 30 123.0 male
Bonnell, Miss. Elizabeth 50 123.0 female
Allen, Mr. Elizabeth 70 123.0 male
Braund, Miss. Elizabeth 25 25.5 female
df.pivot =
Age age
Sex female male female male
Name
Allen, Mr. Elizabeth NaN 123.0 NaN 70.0
Allen, Mr. William Henry NaN 123.0 NaN 30.0
Bonnell, Miss. Elizabeth 123.0 NaN 50.0 NaN
Braund, Miss. Elizabeth 25.5 NaN 25.0 NaN
Braund, Mr. Owen Harris NaN 20.5 NaN 20.0
df.pivot_table =
Sex female male All
Age
20.5 NaN 20.0 20.0
25.5 25.0 NaN 25.0
123.0 50.0 50.0 50.0
All 37.5 40.0 39.0
df.pivot_table.reset_index =
Sex Age female male All
0 20.5 NaN 20.0 20.0
1 25.5 25.0 NaN 25.0
2 123.0 50.0 50.0 50.0
3 All 37.5 40.0 39.0
df.melt =
Name variable value
0 Braund, Mr. Owen Harris Age 20.5
1 Allen, Mr. William Henry Age 123.0
2 Bonnell, Miss. Elizabeth Age 123.0
3 Allen, Mr. Elizabeth Age 123.0
4 Braund, Miss. Elizabeth Age 25.5
5 Braund, Mr. Owen Harris Sex male
6 Allen, Mr. William Henry Sex male
7 Bonnell, Miss. Elizabeth Sex female
8 Allen, Mr. Elizabeth Sex male
9 Braund, Miss. Elizabeth Sex female
10 Braund, Mr. Owen Harris age 20
11 Allen, Mr. William Henry age 30
12 Bonnell, Miss. Elizabeth age 50
13 Allen, Mr. Elizabeth age 70
14 Braund, Miss. Elizabeth age 25
concat =
Name Age Sex age
0 Braund, Mr. Owen Harris 20.5 male 20
1 Allen, Mr. William Henry 123.0 male 30
2 Bonnell, Miss. Elizabeth 123.0 female 50
3 Allen, Mr. Elizabeth 123.0 male 70
4 Braund, Miss. Elizabeth 25.5 female 25
0 Braund, Mr. Owen Harris 20.5 male 20
1 Allen, Mr. William Henry 123.0 male 30
2 Bonnell, Miss. Elizabeth 123.0 female 50
3 Allen, Mr. Elizabeth 123.0 male 70
4 Braund, Miss. Elizabeth 25.5 female 25
concat =
Name Age Sex age
PM25 0 Braund, Mr. Owen Harris 20.5 male 20
1 Allen, Mr. William Henry 123.0 male 30
2 Bonnell, Miss. Elizabeth 123.0 female 50
3 Allen, Mr. Elizabeth 123.0 male 70
4 Braund, Miss. Elizabeth 25.5 female 25
NO2 0 Braund, Mr. Owen Harris 20.5 male 20
1 Allen, Mr. William Henry 123.0 male 30
2 Bonnell, Miss. Elizabeth 123.0 female 50
3 Allen, Mr. Elizabeth 123.0 male 70
4 Braund, Miss. Elizabeth 25.5 female 25
merge =
Name Age Sex age Work Address Time
0 Braund, Mr. Owen Harris 20.5 male 20 0 3 2019-06-21 00:00:00+00:00
1 Allen, Mr. William Henry 123.0 male 30 0 1 2019-06-20 23:00:00+00:00
2 Bonnell, Miss. Elizabeth 123.0 female 50 1 5 2019-06-19 22:00:00+00:00
3 Allen, Mr. Elizabeth 123.0 male 70 1 3 2019-06-22 01:00:00+00:00
4 Braund, Miss. Elizabeth 25.5 female 25 0 5 2019-06-20 09:00:00+00:00
0 2019-06-21 00:00:00+00:00
1 2019-06-20 23:00:00+00:00
2 2019-06-19 22:00:00+00:00
3 2019-06-22 01:00:00+00:00
4 2019-06-20 09:00:00+00:00
Name: Time, dtype: datetime64[ns, UTC]
df =
Name Age Sex age
0 Braund, Mr. Owen Harris 20.5 M 20
1 Allen, Mr. William Henry 123.0 M 30
2 Bonnell, Miss. Elizabeth 123.0 F 50
3 Allen, Mr. Elizabeth 123.0 M 70
4 Braund, Miss. Elizabeth 25.5 F 25
欢迎分享,转载请注明来源:内存溢出
评论列表(0条)