- 6.Pandas导入导出
- 6.1导入数据
- 6.2导出数据
- 7.Pandas合并 *** 作
- 7.1 Pandas 合并 concat
- 7.2Pandas 合并 merge
- 7.2.1定义资料集并打印出
- 7.2.2两列合并
- 7.2.3Indicator设置合并列名称
- 7.2.4 依据 index 合并
- 7.2.5 解决 overlapping 的问题
- 7.2.6Indicator设置合并列名称
- 7.2.7 依据 index 合并
- 7.2.8 解决 overlapping 的问题
- 8.Pandas plot 出图
import pandas as pd # CSV文件不可以直接改后缀名,需要另存是保存为CSV格式 data = pd.read_csv('student.csv') print("~~~~~~~~~~~~~~~~~~读取整个表格~~~~~~~~~~~~~~~~~~") print(data) # Student ID name age gender # 0 1100 Kelly 22 Famle # 1 1101 Clo 21 Famle # 2 1102 Tilly 22 Famle # 3 1103 Tony 24 Male # 4 1104 David 20 Male # 5 1105 Catty 22 Famle # 6 1106 M 3 Famle # 7 1107 N 43 Male # 8 1108 A 13 Male # 9 1109 S 12 Male # 10 1110 David 33 Male # 11 1111 Dw 3 Famle # 12 1112 Q 23 Male # 13 1113 W 11 Famle print("~~~~~~~~~~~~~~~~~~前三行~~~~~~~~~~~~~~~~~~") # 前三行 print(data.head(3)) print("~~~~~~~~~~~~~~~~~~后三行~~~~~~~~~~~~~~~~~~") print(data.tail(3))6.2导出数据
print("~~~~~~~~~~~~~~~~~~将资料存取成pickle~~~~~~~~~~~~~~~~~~") data.to_pickle('student.pickle') # 读取pickle文件并打印 print(pd.read_pickle('student.pickle')) # Student ID name age gender # 0 1100 Kelly 22 Famle # 1 1101 Clo 21 Famle # 2 1102 Tilly 22 Famle # 3 1103 Tony 24 Male # 4 1104 David 20 Male # 5 1105 Catty 22 Famle # 6 1106 M 3 Famle # 7 1107 N 43 Male # 8 1108 A 13 Male # 9 1109 S 12 Male # 10 1110 David 33 Male # 11 1111 Dw 3 Famle # 12 1112 Q 23 Male # 13 1113 W 11 Famle7.Pandas合并 *** 作 7.1 Pandas 合并 concat
pdf1 = pd.Dataframe(np.ones((3, 4)) * 0, columns=['a', 'b', 'c', 'd']) pdf2 = pd.Dataframe(np.ones((3, 4)) * 1, columns=['a', 'b', 'c', 'd']) pdf3 = pd.Dataframe(np.ones((3, 4)) * 2, columns=['a', 'b', 'c', 'd']) print("~~~~~~~~~~~~~~~~~~打印pdf1~~~~~~~~~~~~~~~~~~") print(pdf1) # ~~~~~~~~~~~~~~~~~~打印pdf1~~~~~~~~~~~~~~~~~~ # a b c d # 0 0.0 0.0 0.0 0.0 # 1 0.0 0.0 0.0 0.0 # 2 0.0 0.0 0.0 0.0 print("~~~~~~~~~~~~~~~~~~打印pdf2~~~~~~~~~~~~~~~~~~") print(pdf2) # ~~~~~~~~~~~~~~~~~~打印pdf2~~~~~~~~~~~~~~~~~~ # a b c d # 0 1.0 1.0 1.0 1.0 # 1 1.0 1.0 1.0 1.0 # 2 1.0 1.0 1.0 1.0 print("~~~~~~~~~~~~~~~~~~打印pdf3~~~~~~~~~~~~~~~~~~") print(pdf3) # ~~~~~~~~~~~~~~~~~~打印pdf3~~~~~~~~~~~~~~~~~~ # a b c d # 0 2.0 2.0 2.0 2.0 # 1 2.0 2.0 2.0 2.0 # 2 2.0 2.0 2.0 2.0 # concat合并 # 将axis=0,为纵向合并,累加行 # 但是他的index行索引会重复 print("~~~~~~~~~~~~~~~~~~打印res(三个矩阵以行合并)~~~~~~~~~~~~~~~~~~") res = pd.concat([pdf1, pdf2, pdf3], axis=0) print(res) # ~~~~~~~~~~~~~~~~~~打印res(三个矩阵以行合并)~~~~~~~~~~~~~~~~~~ # a b c d # 0 0.0 0.0 0.0 0.0 # 1 0.0 0.0 0.0 0.0 # 2 0.0 0.0 0.0 0.0 # 0 1.0 1.0 1.0 1.0 # 1 1.0 1.0 1.0 1.0 # 2 1.0 1.0 1.0 1.0 # 0 2.0 2.0 2.0 2.0 # 1 2.0 2.0 2.0 2.0 # 2 2.0 2.0 2.0 2.0 # index不重复的方法 res = pd.concat([pdf1, pdf2, pdf3], axis=0, ignore_index=True) print("~~~~~~~~~~~~~~~~~~打印res(三个矩阵以行合并后index不重复)~~~~~~~~~~~~~~~~~~") print(res) # ~~~~~~~~~~~~~~~~~~打印res(三个矩阵以行合并后index不重复)~~~~~~~~~~~~~~~~~~ # a b c d # 0 0.0 0.0 0.0 0.0 # 1 0.0 0.0 0.0 0.0 # 2 0.0 0.0 0.0 0.0 # 3 1.0 1.0 1.0 1.0 # 4 1.0 1.0 1.0 1.0 # 5 1.0 1.0 1.0 1.0 # 6 2.0 2.0 2.0 2.0 # 7 2.0 2.0 2.0 2.0 # 8 2.0 2.0 2.0 2.0 pd_1 = pd.Dataframe(np.ones((3, 4)) * 0, columns=['a', 'b', 'c', 'd'], index=[1, 2, 3, ]) # a b c d # 1 0.0 0.0 0.0 0.0 # 2 0.0 0.0 0.0 0.0 # 3 0.0 0.0 0.0 0.0 pd_2 = pd.Dataframe(np.ones((3, 4)) * 1, columns=['b', 'c', 'd', 'e'], index=[2, 3, 4]) # b c d e # 2 1.0 1.0 1.0 1.0 # 3 1.0 1.0 1.0 1.0 # 4 1.0 1.0 1.0 1.0 ''' join='outer',函数默认为join='outer'。此方法是依照column来做纵向合并,有相同的column上下合并在一起, 其他独自的column各自成列,原来没有值的位置皆为NaN填充。 ''' res = pd.concat([pd_1, pd_2], axis=0, join='outer') print('join合并后index并没有修改') print(res) # join合并后index并没有修改 # a b c d e # 1 0.0 0.0 0.0 0.0 NaN # 2 0.0 0.0 0.0 0.0 NaN # 3 0.0 0.0 0.0 0.0 NaN # 2 NaN 1.0 1.0 1.0 1.0 # 3 NaN 1.0 1.0 1.0 1.0 # 4 NaN 1.0 1.0 1.0 1.0 print('添加ignore_index后修改index') # 通过将ignore_index选项设置为True清除现有索引并在结果中将其重置 res = pd.concat([pd_1, pd_2], axis=0, join='outer',ignore_index=True) print(res) # 添加ignore_index后修改index # a b c d e # 0 0.0 0.0 0.0 0.0 NaN # 1 0.0 0.0 0.0 0.0 NaN # 2 0.0 0.0 0.0 0.0 NaN # 3 NaN 1.0 1.0 1.0 1.0 # 4 NaN 1.0 1.0 1.0 1.0 # 5 NaN 1.0 1.0 1.0 1.0 # join = 'inner'合并相同的列字段 # 纵向“内”合并pd_1和pd_2 res = pd.concat([pd_1, pd_2], axis=0, join='inner') print('合并pd_1和pd_2相同的列头:') print(res) # b c d # 1 0.0 0.0 0.0 # 2 0.0 0.0 0.0 # 3 0.0 0.0 0.0 # 2 1.0 1.0 1.0 # 3 1.0 1.0 1.0 # 4 1.0 1.0 1.0 pd_1 = pd.Dataframe(np.ones((3, 4))*0, columns=['a', 'b', 'c', 'd'], index=[1,2,3]) pd_2 = pd.Dataframe(np.ones((3, 4))*0, columns=['b', 'c', 'd', 'e'], index=[2,3,4]) print('依照pd_1.index进行横向合并') # axis - 要连接的轴 # join – 如何处理其他轴上的索引 # join_axes – 用于其他 n - 1 轴的特定索引,而不是执行内部/外部集合逻辑 # ignore_index – 如果为 True,则不使用沿串联轴的索引值。 # 结果轴将被标记为 0, ..., n - 1。 # 如果您在连接轴没有有意义的索引信息的情况下连接对象,这将非常有用。 # 请注意其他轴上的索引值在连接中仍然有效。 # keys - 如果通过了多个级别,则应包含元组。 使用传递的键作为最外层构建分层索引 # levels – 用于构建 MultiIndex 的特定级别(唯一值)。 否则他们将从密钥中推断出来 # names - 生成的分层索引中级别的名称 # verify_integrity – 检查新的连接轴是否包含重复项。 相对于实际的数据连接,这可能非常昂贵 # sort – 如果在“join”为“outer”时尚未对齐,则对非串联轴进行排序。 # 当前默认的排序已被弃用,并将在未来版本的 Pandas 中更改为不排序。 # 显式传递 ``sort=True`` 以消除警告和排序。 # 显式传递 ``sort=False`` 以消除警告而不是排序。 # 这在 ``join='inner'`` 时不起作用,它已经保留了非串联轴的顺序。 .. 版本已添加:: 0.23.0 # copy – 如果为 False,则不要不必要地复制数据 res = pd.concat([pd_1, pd_2], axis=1, join_axes=[pd_1.index]) print(res) # 依照pd_1.index进行横向合并 # a b c d b c d e # 1 0.0 0.0 0.0 0.0 NaN NaN NaN NaN # 2 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 # 3 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 # append(添加数据) # append只有纵向合并,没有横向合并 # 定义资料集 pd_1 = pd.Dataframe(np.ones((3, 4)) * 0, columns=['a', 'b', 'c', 'd']) pd_2 = pd.Dataframe(np.ones((3, 4)) * 1, columns=['a', 'b', 'c', 'd']) pd_3 = pd.Dataframe(np.ones((3, 4)) * 2, columns=['a', 'b', 'c', 'd']) s1 = pd.Series([1, 2, 3 ,4], index = ['a', 'b', 'c', 'd']) # 将pd_2合并到pd_1下面,以及重置index,并打印结果 res = pd_1.append(pd_2, ignore_index=True) print('将pd_2合并到pd_1下面,以及重置index:') print(res) # 将pd_2合并到pd_1下面,以及重置index: # a b c d # 0 0.0 0.0 0.0 0.0 # 1 0.0 0.0 0.0 0.0 # 2 0.0 0.0 0.0 0.0 # 3 1.0 1.0 1.0 1.0 # 4 1.0 1.0 1.0 1.0 # 5 1.0 1.0 1.0 1.0 res = pd_1.append([pd_2, pd_3], ignore_index=True) print('将pd_2和pd_3合并到pd_1下面,以及重置index:') print(res) # 将pd_2和pd_3合并到pd_1下面,以及重置index: # a b c d # 0 0.0 0.0 0.0 0.0 # 1 0.0 0.0 0.0 0.0 # 2 0.0 0.0 0.0 0.0 # 3 1.0 1.0 1.0 1.0 # 4 1.0 1.0 1.0 1.0 # 5 1.0 1.0 1.0 1.0 # 6 2.0 2.0 2.0 2.0 # 7 2.0 2.0 2.0 2.0 # 8 2.0 2.0 2.0 2.0 print('两种不同的合并矩阵的方式:') res = pd.concat([pd_1, pd_2, pd_3], axis=0, ignore_index=True) res_1 = pd_1.append([pd_2, pd_3], ignore_index=True) print('conact方法:') print(res) # conact方法: # a b c d # 0 0.0 0.0 0.0 0.0 # 1 0.0 0.0 0.0 0.0 # 2 0.0 0.0 0.0 0.0 # 3 1.0 1.0 1.0 1.0 # 4 1.0 1.0 1.0 1.0 # 5 1.0 1.0 1.0 1.0 # 6 2.0 2.0 2.0 2.0 # 7 2.0 2.0 2.0 2.0 # 8 2.0 2.0 2.0 2.0 print('append方法:') print(res_1) # append方法: # a b c d # 0 0.0 0.0 0.0 0.0 # 1 0.0 0.0 0.0 0.0 # 2 0.0 0.0 0.0 0.0 # 3 1.0 1.0 1.0 1.0 # 4 1.0 1.0 1.0 1.0 # 5 1.0 1.0 1.0 1.0 # 6 2.0 2.0 2.0 2.0 # 7 2.0 2.0 2.0 2.0 # 8 2.0 2.0 2.0 2.07.2Pandas 合并 merge 7.2.1定义资料集并打印出
left = pd.Dataframe({'key' : ['K0', 'K1', 'K2', 'K3'], 'A' : ['A0', 'A1', 'A2', 'A3'], 'B' : ['B0', 'B1', 'B2', 'B3'] }) print(left) # key A B # 0 K0 A0 B0 # 1 K1 A1 B1 # 2 K2 A2 B2 # 3 K3 A3 B3 right = pd.Dataframe({'key' : ['K0', 'K1', 'K2', 'K3'], 'C' : ['C0', 'C1', 'C2', 'C3'], 'D' : ['D0', 'D1', 'D2', 'D3'] }) print(right) # key C D # 0 K0 C0 D0 # 1 K1 C1 D1 # 2 K2 C2 D2 # 3 K3 C3 D37.2.2两列合并
# 依据key1与key2 columns进行合并, # 并打印出四种结果['left', 'right', 'outer', 'inner'] pd_1 = pd.Dataframe({'lkey' : ['foo', 'bar', 'baz', 'foo'], 'value': [1, 2, 3, 5]}) print(pd_1) # lkey value # 0 foo 1 # 1 bar 2 # 2 baz 3 # 3 foo 5 pd_2 = pd.Dataframe({'rkey' : ['foo', 'bar', 'baz', 'foo'], 'value': [5, 6, 7, 8]}) print(pd_2) # rkey value # 0 foo 5 # 1 bar 6 # 2 baz 7 # 3 foo 8 # 将右边与左边进行并集 *** 作 # foo(1, 5)(1, 8)(5, 5)(5,8) # bar(2,6) # baz(3,7) pd = pd_1.merge(pd_2, left_on='lkey', right_on='rkey') print(pd) # lkey value_x rkey value_y # 0 foo 1 foo 5 # 1 foo 1 foo 8 # 2 foo 5 foo 5 # 3 foo 5 foo 8 # 4 bar 2 bar 6 # 5 baz 3 baz 7 # 会进行报错 # pd = pd_1.merge(pd_2, left_on='lkey', right_on='rkey', suffixes=(False, False)) # '{rename}'.format(rename=to_rename)) # ValueError: columns overlap but no suffix specified: Index(['value'], dtype='object') # print(pd) import pandas as pd df_1 = pd.Dataframe({'a': ['foo', 'bar'], 'b': [1, 2]}) print(df_1) # a b # 0 foo 1 # 1 bar 2 df_2 = pd.Dataframe({'a': ['foo', 'baz'], 'c': [3, 4]}) print(df_2) # a c # 0 foo 3 # 1 baz 4 pd = df_1.merge(df_2, how='inner', on='a') print(pd) # a b c # 0 foo 1 3 pd = df_1.merge(df_2, how='left', on='a') print(pd) # a b c # 0 foo 1 3.0 # 1 bar 2 NaN import pandas as pd df1 = pd.Dataframe({'left': ['foo', 'bar']}) print(df1) # left # 0 foo # 1 bar df2 = pd.Dataframe({'right': [7, 8]}) print(df2) # right # 0 7 # 1 8 df = df1.merge(df2, how='cross') print(df) # left right # 0 foo 7 # 1 foo 8 # 2 bar 7 # 3 bar 87.2.3Indicator设置合并列名称
pdf_1 = pd.Dataframe({'coll' : [0, 1], 'col_left' : ['a', 'b']}) print(pdf_1) # coll col_left # 0 0 a # 1 1 b pdf_2 = pd.Dataframe({'coll' : [1, 2, 2], 'col_right' : [2, 2, 2]}) print(pdf_2) # coll col_right # 0 1 2 # 1 2 2 # 2 2 2 # # 依据col1进行合并,并启用indicator=True,最后打印 # how:使用来自两个帧的键的联合,类似于 SQL 全外连接;按字典顺序对键进行排序。 res = pd.merge(pdf_1, pdf_2, on='coll', how='outer', indicator=True) print(res) # coll col_left col_right _merge # 0 0 a NaN left_only # 1 1 b 2.0 both # 2 2 NaN 2.0 right_only # 3 2 NaN 2.0 right_only # 自定义indicator column的名称,并打印出 res = pd.merge(pdf_1, pdf_2, on='coll', how='outer', indicator='indicator_column') print(res) # coll col_left col_right indicator_column # 0 0 a NaN left_only # 1 1 b 2.0 both # 2 2 NaN 2.0 right_only # 3 2 NaN 2.0 right_only7.2.4 依据 index 合并
# 依据index合并 # 定义资料集并打印出 left = pd.Dataframe({'A': ['A0', 'A1', 'A2'], 'B': ['B0', 'B1', 'B2']}, index=['K0', 'K1', 'K2']) right = pd.Dataframe({'C': ['C0', 'C2', 'C3'], 'D': ['D0', 'D2', 'D3']}, index=['K0', 'K2', 'K3']) print(left) # A B # K0 A0 B0 # K1 A1 B1 # K2 A2 B2 print(right) # C D # K0 C0 D0 # K2 C2 D2 # K3 C3 D3 # 依据左右资料集的index进行合并,how='outer',并打印 res = pd.merge(left,right,left_index=True,right_index=True,how='outer') print(res) # A B C D # K0 A0 B0 C0 D0 # K1 A1 B1 NaN NaN # K2 A2 B2 C2 D2 # K3 NaN NaN C3 D3 # 依据左右资料集的index进行合并,how='inner',并打印 res = pd.merge(left,right,left_index=True,right_index=True,how='inner') print(res) # A B C D # K0 A0 B0 C0 D0 # K2 A2 B2 C2 D27.2.5 解决 overlapping 的问题
# 解决overlapping的问题 # 定义资料集 boys = pd.Dataframe({'k': ['K0', 'K1', 'K2'], 'age': [1, 2, 3]}) girls = pd.Dataframe({'k': ['K0', 'K0', 'K3'], 'age': [4, 5, 6]}) print(boys) # k age # 0 K0 1 # 1 K1 2 # 2 K2 3 print(girls) # k age # 0 K0 4 # 1 K0 5 # 2 K3 6 # 使用suffixes解决overlapping的问题 # 比如将上面两个合并时,age重复了,则可通过suffixes设置,以此保证不重复,不同名 res = pd.merge(boys,girls,on='k',suffixes=['_boy','_girl'],how='inner') print(res) # k age_boy age_girl # 0 K0 1 4 # 1 K0 1 57.2.6Indicator设置合并列名称
pdf_1 = pd.Dataframe({'coll' : [0, 1], 'col_left' : ['a', 'b']}) print(pdf_1) # coll col_left # 0 0 a # 1 1 b pdf_2 = pd.Dataframe({'coll' : [1, 2, 2], 'col_right' : [2, 2, 2]}) print(pdf_2) # coll col_right # 0 1 2 # 1 2 2 # 2 2 2 # # 依据col1进行合并,并启用indicator=True,最后打印 # how:使用来自两个帧的键的联合,类似于 SQL 全外连接;按字典顺序对键进行排序。 res = pd.merge(pdf_1, pdf_2, on='coll', how='outer', indicator=True) print(res) # coll col_left col_right _merge # 0 0 a NaN left_only # 1 1 b 2.0 both # 2 2 NaN 2.0 right_only # 3 2 NaN 2.0 right_only # 自定义indicator column的名称,并打印出 res = pd.merge(pdf_1, pdf_2, on='coll', how='outer', indicator='indicator_column') print(res) # coll col_left col_right indicator_column # 0 0 a NaN left_only # 1 1 b 2.0 both # 2 2 NaN 2.0 right_only # 3 2 NaN 2.0 right_only7.2.7 依据 index 合并
# 依据index合并 # 定义资料集并打印出 left = pd.Dataframe({'A': ['A0', 'A1', 'A2'], 'B': ['B0', 'B1', 'B2']}, index=['K0', 'K1', 'K2']) right = pd.Dataframe({'C': ['C0', 'C2', 'C3'], 'D': ['D0', 'D2', 'D3']}, index=['K0', 'K2', 'K3']) print(left) # A B # K0 A0 B0 # K1 A1 B1 # K2 A2 B2 print(right) # C D # K0 C0 D0 # K2 C2 D2 # K3 C3 D3 # 依据左右资料集的index进行合并,how='outer',并打印 res = pd.merge(left,right,left_index=True,right_index=True,how='outer') print(res) # A B C D # K0 A0 B0 C0 D0 # K1 A1 B1 NaN NaN # K2 A2 B2 C2 D2 # K3 NaN NaN C3 D3 # 依据左右资料集的index进行合并,how='inner',并打印 res = pd.merge(left,right,left_index=True,right_index=True,how='inner') print(res) # A B C D # K0 A0 B0 C0 D0 # K2 A2 B2 C2 D27.2.8 解决 overlapping 的问题
# 解决overlapping的问题 # 定义资料集 boys = pd.Dataframe({'k': ['K0', 'K1', 'K2'], 'age': [1, 2, 3]}) girls = pd.Dataframe({'k': ['K0', 'K0', 'K3'], 'age': [4, 5, 6]}) print(boys) # k age # 0 K0 1 # 1 K1 2 # 2 K2 3 print(girls) # k age # 0 K0 4 # 1 K0 5 # 2 K3 6 # 使用suffixes解决overlapping的问题 # 比如将上面两个合并时,age重复了,则可通过suffixes设置,以此保证不重复,不同名 res = pd.merge(boys,girls,on='k',suffixes=['_boy','_girl'],how='inner') print(res) # k age_boy age_girl # 0 K0 1 4 # 1 K0 1 58.Pandas plot 出图
import pandas as pd import numpy as np from matplotlib import pyplot as plt
print('~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~') data = pd.Series(np.random.randn(1000), index=np.arange(1000)) print(data) print(data.cumsum()) # data本来就是一个数据, data.plot() plt.show()
# 随机生成1000行4列数据 arr = np.random.randn(1000, 4) Hang = np.arange(1000) colunm = list("ABCD") data = pd.Dataframe(arr, index = Hang, columns=colunm) data.cumsum() data.plot() plt.show()
# scatter # y 与 x 的散点图,具有不同的标记大小和/或颜色。 ax = data.plot.scatter(x='A', y='B', color='DarkBlue', label='Class1') data.plot.scatter(x='A', y='C', color='LightGreen', label='Class2', ax=ax) plt.show()
欢迎分享,转载请注明来源:内存溢出
评论列表(0条)