Pandas(一)[快速入门]_随笔

Pandas(一)[快速入门]

1.Series
2.DataFrame
3.pandas 选择数据
4.Pandas 设置值
5.Pandas 处理丢失数据

1.Series

import numpy as np
import pandas as pd

# Series(data=None, index=None, dtype=None, name=None, copy=False, fastpath=False)
# data: 类似数组、可迭代、字典或标量值。包含存储在系列中的数据。如果数据是字典，则保持参数顺序
# index: 值必须是可散列的并且与数据具有相同的长度。允许非唯一索引值。
#        如果未提供，将默认为 RangeIndex (0, 1, 2, ..., n)。
#        如果 data 是 dict-like 并且 index 是 None，则数据中的键用作索引。
#        如果索引不是 None，则使用索引值重新索引生成的系列。
# dtype: 输出系列的数据类型。如果没有指定，这将从数据中推断出来。
# name: 给系列起的名字
# copy: 复制输入数据。仅影响系列或一维 ndarray 输入。默认：False
# s = pd.Series([1,3,6,np.nan,44,1])

# Build a Series from a dict while also passing an explicit index.
d = dict(a=1, b=2, c=3)
ser = pd.Series(d, index=list('abc'))
print(ser)
# The dict keys match the requested index labels, so the index argument
# changes nothing here.
# ser
# a   1
# b   2
# c   3
# dtype: int64
print('~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~')
# The index is first built from the dict keys; the Series is then reindexed
# with the labels passed in. None of 'x', 'y', 'z' is a key of d, so every
# value comes back as NaN.
ser = pd.Series(d, index=list('xyz'))
print(ser)
# ser
# x   NaN
# y   NaN
# z   NaN
# dtype: float64
print('~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~')

# Case 1: the input is a plain Python list.
x = [1, 2]
# copy=False is requested, but because of the input type the Series still
# ends up holding its own copy of the data.
ser = pd.Series(data=x, copy=False)
# iloc is purely integer-position based indexing; overwrite position 0.
ser.iloc[0] = 999
# Only the Series changed; the list x keeps its original values.
print(x, ser, sep='\n')
# [1, 2]
# 0    999
# 1      2
# dtype: int64
print('~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~')

# Case 2: the input is a one-dimensional ndarray (a multi-dimensional array
# would make the Series constructor raise).
arr = np.array([1, 2])
ser = pd.Series(data=arr, copy=False)
# With ndarray input the Series is a view on the original buffer, so in the
# article's run the write below is visible through arr as well.
ser.iloc[0] = 999
print(arr, ser, sep='\n')
# [999   2]
# 0    999
# 1      2
# dtype: int32   (integer width is platform dependent)
print('~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~')
arr = np.arange(1, 4)
ser = pd.Series(data=arr)
# T returns the transpose, which for a Series is by definition the Series
# itself. A Series only accepts one-dimensional data; anything else raises.
print(ser.T)
# 0    1
# 1    2
# 2    3
# dtype: int32   (integer width is platform dependent)
# array: 支持此系列或索引的数据的 ExtensionArray。
# print(pd.Series([1, 2, 3]).array)
# The ExtensionArray of the data backing this Series or Index.
# at: 访问行/列标签对的单个值。
# attrs: 此数据集的全局属性字典。
# axes: 返回行轴标签列表。
# dtype: 返回底层数据的 dtype 对象。
# dtypes: 返回底层数据的 dtype 对象。
# flags: 获取与此 Pandas 对象关联的属性。
# hasnans: 如果存在任何 NaN 则返回 True；可用于启用各种性能加速。
# iat: 按整数位置访问行列对的单个值。
# iloc: 用于按位置选择的纯整数位置索引
# index: 系列的索引（轴标签）。
# is_monotonic: 如果对象中的值是 monotonic_increasing，则返回布尔值。
# is_monotonic_decreasing: 如果对象中的值是 monotonic_decreasing，则返回布尔值。
# is_monotonic_increasing: is_monotonic 的别名。
# is_unique: 如果对象中的值是唯一的，则返回布尔值。
# loc: 通过标签或布尔数组访问一组行和列。
# name: 返回系列的名称。
# nbytes: 返回底层数据中的字节数。
# ndim: 基础数据的维数，根据定义 1。
# shape: 返回基础数据形状的元组。
# size: 返回基础数据中的元素数。
# values: 根据 dtype 将系列作为 ndarray 或 ndarray-like 返回。

print('~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~')
# pd.date_range(start, end, periods, freq, ...) builds a DatetimeIndex.
# periods: number of periods to generate.
dates = pd.date_range(start='2021-11-28', periods=6)
print(dates)
# DatetimeIndex(['2021-11-28', '2021-11-29', '2021-11-30', '2021-12-01',
#                '2021-12-02', '2021-12-03'],
#               dtype='datetime64[ns]', freq='D')
print('~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~')
# numpy.random.randn(d0, d1, ..., dn) returns one or more samples drawn from
# the standard normal distribution.
# numpy.random.rand(d0, d1, ..., dn) returns random samples in [0, 1).
# (6, 4) means 6 rows by 4 columns.
arr = np.random.randn(6, 4)
print(arr)
# 二维的、大小可变的、潜在的异构表格数据。
# DataFrame([data, index, columns, dtype, copy])
# data: 传入数据
# index: 行索引（行头）
# columns: 列索引（列头）
# copy: 默认为False

2.DataFrame

print('未指定行标签和列标签的数据')
# Without explicit labels, rows and columns are numbered 0, 1, 2, 3, ...
# NOTE: the class is spelled pd.DataFrame (capital F); pd.Dataframe does not
# exist and raises AttributeError.
df1 = pd.DataFrame(np.arange(12).reshape(3, 4))
print(df1)
#    0  1   2   3
# 0  0  1   2   3
# 1  4  5   6   7
# 2  8  9  10  11
print('指定行标签和列标签的数据:')
# 6x4 standard-normal samples, labelled with dates as the row index and
# 'a'..'d' as the column names.
arr = np.random.randn(6, 4)
dates = pd.date_range('2021-11-28', periods=6)
df = pd.DataFrame(arr, index=dates, columns=['a', 'b', 'c', 'd'])
print("整个DateForm:")
print(df)
# Sample output (values are random and differ on every run):
#                    a         b         c         d
# 2021-11-28  0.925212 -0.032742 -0.397622 -0.685095
# 2021-11-29 -1.127364  1.058101  0.871153 -0.541429
# 2021-11-30 -0.459550  2.011948  0.264811  2.138359
# 2021-12-01 -0.434759  0.605623 -0.403327 -0.917852
# 2021-12-02  0.763358 -1.655734 -1.995348 -0.275367
# 2021-12-03 -0.474699  1.052355 -0.807073 -1.188520
print("整个DateForm的‘b’列")
# Indexing with a column name returns that column as a Series.
print(df['b'])
# 2021-11-28   -0.032742
# 2021-11-29    1.058101
# 2021-11-30    2.011948
# 2021-12-01    0.605623
# 2021-12-02   -1.655734
# 2021-12-03    1.052355
# Freq: D, Name: b, dtype: float64
print('~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~')

# Another way to build a DataFrame: a dict mapping column name -> column data.
# Scalars (the Timestamp and 'foo') are broadcast to every row.
# NOTE: the class is spelled pd.DataFrame (capital F); pd.Dataframe does not
# exist and raises AttributeError.
df2 = pd.DataFrame({
    'A': [1, 2, 3, 4],
    'B': pd.Timestamp('20211128'),
    'C': pd.Series([1, 6, 9, 10], dtype='float32'),
    'D': np.array([3] * 4, dtype='int32'),
    'E': pd.Categorical(['test', 'train', 'test', 'train']),
    'F': 'foo',
})

print(df2)
#    A          B     C  D      E    F
# 0  1 2021-11-28   1.0  3   test  foo
# 1  2 2021-11-28   6.0  3  train  foo
# 2  3 2021-11-28   9.0  3   test  foo
# 3  4 2021-11-28  10.0  3  train  foo
print(df2.index)
# RangeIndex(start=0, stop=4, step=1)
print(df2.columns)
# Index(['A', 'B', 'C', 'D', 'E', 'F'], dtype='object')
print(df2.values)
# [[1 Timestamp('2021-11-28 00:00:00') 1.0 3 'test' 'foo']
#  [2 Timestamp('2021-11-28 00:00:00') 6.0 3 'train' 'foo']
#  [3 Timestamp('2021-11-28 00:00:00') 9.0 3 'test' 'foo']
#  [4 Timestamp('2021-11-28 00:00:00') 10.0 3 'train' 'foo']]

# Summary statistics. Sample output from the original article; newer pandas
# releases may additionally summarize the datetime column B.
print(df2.describe())
#               A          C    D
# count  4.000000   4.000000  4.0
# mean   2.500000   6.500000  3.0
# std    1.290994   4.041452  0.0
# min    1.000000   1.000000  3.0
# 25%    1.750000   4.750000  3.0
# 50%    2.500000   7.500000  3.0
# 75%    3.250000   9.250000  3.0
# max    4.000000  10.000000  3.0

# Transpose: rows become columns and vice versa.
print(df2.T)
#                      0         ...                             3
# A                    1         ...                             4
# B  2021-11-28 00:00:00         ...           2021-11-28 00:00:00
# C                    1         ...                            10
# D                    3         ...                             3
# E                 test         ...                         train
# F                  foo         ...                           foo
#
# [6 rows x 4 columns]

# sort_index(axis=1) orders the COLUMN labels (axis=0 would order the row
# labels). Ascending order of column labels:
apd = df2.sort_index(axis=1, ascending=True)
print(apd)
#    A          B     C  D      E    F
# 0  1 2021-11-28   1.0  3   test  foo
# 1  2 2021-11-28   6.0  3  train  foo
# 2  3 2021-11-28   9.0  3   test  foo
# 3  4 2021-11-28  10.0  3  train  foo
# Descending order of column labels:
apd_1 = df2.sort_index(axis=1, ascending=False)
print(apd_1)
#      F      E  D     C          B  A
# 0  foo   test  3   1.0 2021-11-28  1
# 1  foo  train  3   6.0 2021-11-28  2
# 2  foo   test  3   9.0 2021-11-28  3
# 3  foo  train  3  10.0 2021-11-28  4

# Sort the rows by the values of one column (here: C, descending).
print(df2.sort_values(by='C', ascending=False))
#    A          B     C  D      E    F
# 3  4 2021-11-28  10.0  3  train  foo
# 2  3 2021-11-28   9.0  3   test  foo
# 1  2 2021-11-28   6.0  3  train  foo
# 0  1 2021-11-28   1.0  3   test  foo

print('~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~')

3.pandas 选择数据

import pandas as pd
import numpy as np

# Demo frame: 6 consecutive days as the row index, columns A-D, values 0..23.
# NOTE: the class is spelled pd.DataFrame (capital F); pd.Dataframe does not
# exist and raises AttributeError.
dates = pd.date_range('20211129', periods=6)
df = pd.DataFrame(np.arange(24).reshape((6, 4)), index=dates, columns=['A', 'B', 'C', 'D'])
print(df)
print('~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~')
# Two equivalent ways to get a single column.
print(df.A)
print(df['A'])
# 2021-11-29     0
# 2021-11-30     4
# 2021-12-01     8
# 2021-12-02    12
# 2021-12-03    16
# 2021-12-04    20
# Freq: D, Name: A, dtype: int32   (integer width is platform dependent)

# Selecting a span of rows.
# Positional slice: the first 3 rows.
print(df[0:3])

# Label slice on the date index: both end points are included.
print(df['2021-11-29' : '2021-12-01'])
#             A  B   C   D
# 2021-11-29  0  1   2   3
# 2021-11-30  4  5   6   7
# 2021-12-01  8  9  10  11

# Label-based selection with loc.
# A single row, by its index label.
print(df.loc['2021-12-02'])
# A    12
# B    13
# C    14
# D    15
# Name: 2021-12-02 00:00:00, dtype: int32

# Selecting columns with loc, two ways.
# Columns from A through C; label slices include both end points.
print(df.loc[:, 'A' : 'C'])
#              A   B   C
# 2021-11-29   0   1   2
# 2021-11-30   4   5   6
# 2021-12-01   8   9  10
# 2021-12-02  12  13  14
# 2021-12-03  16  17  18
# 2021-12-04  20  21  22

# Only columns A and C.
print(df.loc[:, ['A', 'C']])
#              A   C
# 2021-11-29   0   2
# 2021-11-30   4   6
# 2021-12-01   8  10
# 2021-12-02  12  14
# 2021-12-03  16  18
# 2021-12-04  20  22

# Rows and columns at the same time:
# columns 'A' and 'B' of row '20211202'.
print(df.loc['20211202', ['A', 'B']])
# A    12
# B    13
# Name: 2021-12-02 00:00:00, dtype: int32

# Position-based selection with iloc.
# Rows at positions 3-4, columns at positions 1-2 (end is exclusive).
print(df.iloc[3:5, 1:3])
#              B   C
# 2021-12-02  13  14
# 2021-12-03  17  18

# Non-contiguous rows:
# rows at positions 1, 3, 5
# columns at positions 1, 2
print(df.iloc[[1, 3, 5], 1:3])
#              B   C
# 2021-11-30   5   6
# 2021-12-02  13  14
# 2021-12-04  21  22

# Mixed position/label selection used to be done with .ix, which is no
# longer available; the iloc call below gives the same result as
# df.ix[:3, ['A', 'C']] did.
print(df.iloc[:3, [0, 2]])

#             A   C
# 2021-11-29  0   2
# 2021-11-30  4   6
# 2021-12-01  8  10

# Boolean filtering:
# keep only the rows whose A value is greater than 8.
print(df.loc[df.A > 8])
#              A   B   C   D
# 2021-12-02  12  13  14  15
# 2021-12-03  16  17  18  19
# 2021-12-04  20  21  22  23


# One row by label, columns A through B (inclusive label slice).
print(df.loc['20211202','A':'B'])
# A    12
# B    13
# Name: 2021-12-02 00:00:00, dtype: int32
# First row by position, columns at positions 0-1.
print(df.iloc[0,0:2])
# A    0
# B    1
# Name: 2021-11-29 00:00:00, dtype: int32

4.Pandas 设置值

print('~~~~~~~~~~~~~创建数据~~~~~~~~~~~~~~')
# Demo frame: 6 consecutive days as the row index, columns A-D, values 0..23.
# NOTE: the class is spelled pd.DataFrame (capital F); pd.Dataframe does not
# exist and raises AttributeError.
dates = pd.date_range('20180820',periods=6)
pdf = pd.DataFrame(np.arange(24).reshape(6,4), index=dates, columns=['A','B','C','D'])
print(pdf)
#              A   B   C   D
# 2018-08-20   0   1   2   3
# 2018-08-21   4   5   6   7
# 2018-08-22   8   9  10  11
# 2018-08-23  12  13  14  15
# 2018-08-24  16  17  18  19
# 2018-08-25  20  21  22  23
print('~~~~~~~~~~~~~iloc通过坐标值修改数据~~~~~~~~~~~~~~')
# iloc addresses a cell by integer (row, column) position.
pdf.iloc[2,2] = 1111
print(pdf)
# iloc does not accept index labels; this would raise:
# pdf.iloc['20180823', 'B'] = 2222
#              A   B     C   D
# 2018-08-20   0   1     2   3
# 2018-08-21   4   5     6   7
# 2018-08-22   8   9  1111  11
# 2018-08-23  12  13    14  15
# 2018-08-24  16  17    18  19
# 2018-08-25  20  21    22  23
print('~~~~~~~~~~~~~loc通过index修改数据~~~~~~~~~~~~~~')
# loc addresses a cell by (row label, column label).
pdf.loc['20180824', 'B'] = 3333
print(pdf)
#              A     B     C   D
# 2018-08-20   0     1     2   3
# 2018-08-21   4     5     6   7
# 2018-08-22   8     9  1111  11
# 2018-08-23  12    13    14  15
# 2018-08-24  16  3333    18  19
# 2018-08-25  20    21    22  23
print('~~~~~~~~~~~~~按列修改数据~~~~~~~~~~~~~~')
print('①方法')
# Set B to 0 wherever A > 4. A single .loc[rows, column] call writes into
# pdf itself; the chained form pdf.B.loc[mask] = 0 assigns through an
# intermediate Series and is not guaranteed to update pdf.
pdf.loc[pdf.A > 4, 'B'] = 0
print(pdf)
#              A  B     C   D
# 2018-08-20   0  1     2   3
# 2018-08-21   4  5     6   7
# 2018-08-22   8  0  1111  11
# 2018-08-23  12  0    14  15
# 2018-08-24  16  0    18  19
# 2018-08-25  20  0    22  23
print('②方法')
# Same selection, this time rebuilding the whole column: 1 where A > 4,
# the existing B value elsewhere (again avoiding pdf.B[mask] = 1).
pdf['B'] = np.where(pdf['A'] > 4, 1, pdf['B'])
print(pdf)
#              A  B     C   D
# 2018-08-20   0  1     2   3
# 2018-08-21   4  5     6   7
# 2018-08-22   8  1  1111  11
# 2018-08-23  12  1    14  15
# 2018-08-24  16  1    18  19
# 2018-08-25  20  1    22  23

print('~~~~~~~~~~~~~添加列~~~~~~~~~~~~~~')
# A Series assigned as a new column is aligned on its index labels, so it
# is given the same date index as the frame.
pdf['E'] = pd.Series([1, 2, 3, 4, 5, 6], index = dates)
print(pdf)
#              A  B     C   D  E
# 2018-08-20   0  1     2   3  1
# 2018-08-21   4  5     6   7  2
# 2018-08-22   8  1  1111  11  3
# 2018-08-23  12  1    14  15  4
# 2018-08-24  16  1    18  19  5
# 2018-08-25  20  1    22  23  6

# Assigning a scalar creates a column filled with that value.
pdf['F'] = np.nan
print(pdf)
#              A  B     C   D  E   F
# 2018-08-20   0  1     2   3  1 NaN
# 2018-08-21   4  5     6   7  2 NaN
# 2018-08-22   8  1  1111  11  3 NaN
# 2018-08-23  12  1    14  15  4 NaN
# 2018-08-24  16  1    18  19  5 NaN
# 2018-08-25  20  1    22  23  6 NaN

print('~~~~~~~~~~~~~修改特定位置的值~~~~~~~~~~~~~~')
# Set one cell by row label and column label.
pdf.loc['20180820', 'A'] = 56
print(pdf)
#              A  B     C   D  E   F
# 2018-08-20  56  1     2   3  1 NaN
# 2018-08-21   4  5     6   7  2 NaN
# 2018-08-22   8  1  1111  11  3 NaN
# 2018-08-23  12  1    14  15  4 NaN
# 2018-08-24  16  1    18  19  5 NaN
# 2018-08-25  20  1    22  23  6 NaN

# Set one cell by row position and column position.
pdf.iloc[1,1] = 76
print(pdf)
#              A   B     C   D  E   F
# 2018-08-20  56   1     2   3  1 NaN
# 2018-08-21   4  76     6   7  2 NaN
# 2018-08-22   8   1  1111  11  3 NaN
# 2018-08-23  12   1    14  15  4 NaN
# 2018-08-24  16   1    18  19  5 NaN
# 2018-08-25  20   1    22  23  6 NaN

print('~~~~~~~~~~~~~修改特定行的值~~~~~~~~~~~~~~')
# NaN cannot be stored in an integer column. Convert to float explicitly
# instead of relying on pandas silently upcasting during the assignment
# (NOTE(review): recent pandas versions warn about or reject that implicit
# upcast - confirm against the version in use).
pdf = pdf.astype('float64')
# Overwrite a whole row, addressed by its index label.
pdf.loc['20180820'] = np.nan
print(pdf)
#                A     B       C     D    E   F
# 2018-08-20   NaN   NaN     NaN   NaN  NaN NaN
# 2018-08-21   4.0  76.0     6.0   7.0  2.0 NaN
# 2018-08-22   8.0   1.0  1111.0  11.0  3.0 NaN
# 2018-08-23  12.0   1.0    14.0  15.0  4.0 NaN
# 2018-08-24  16.0   1.0    18.0  19.0  5.0 NaN
# 2018-08-25  20.0   1.0    22.0  23.0  6.0 NaN

# Overwrite a whole row, addressed by position
# (equivalent to pdf.iloc[1, :] = np.nan).
pdf.iloc[1] = np.nan
print(pdf)
#                A    B       C     D    E   F
# 2018-08-20   NaN  NaN     NaN   NaN  NaN NaN
# 2018-08-21   NaN  NaN     NaN   NaN  NaN NaN
# 2018-08-22   8.0  1.0  1111.0  11.0  3.0 NaN
# 2018-08-23  12.0  1.0    14.0  15.0  4.0 NaN
# 2018-08-24  16.0  1.0    18.0  19.0  5.0 NaN
# 2018-08-25  20.0  1.0    22.0  23.0  6.0 NaN

5.Pandas 处理丢失数据

# Demo frame: 6 consecutive days as the row index, columns A-D, values 0..23.
# NOTE: the class is spelled pd.DataFrame (capital F); pd.Dataframe does not
# exist and raises AttributeError.
dates = pd.date_range('20211201', periods=6)
pdf = pd.DataFrame(np.arange(24).reshape((6, 4)), index=dates, columns=['A', 'B', 'C', 'D'])
print('创建矩阵：')
print(pdf)
#              A   B   C   D
# 2021-12-01   0   1   2   3
# 2021-12-02   4   5   6   7
# 2021-12-03   8   9  10  11
# 2021-12-04  12  13  14  15
# 2021-12-05  16  17  18  19
# 2021-12-06  20  21  22  23

# NaN cannot be stored in an integer column, so make the two columns that
# receive NaN float up front instead of relying on implicit upcasting
# (NOTE(review): recent pandas versions warn about or reject that implicit
# upcast - confirm against the version in use).
pdf = pdf.astype({'B': 'float64', 'C': 'float64'})
# Punch two holes into the data, addressed by (row, column) position.
pdf.iloc[0, 1] = np.nan
pdf.iloc[1, 2] = np.nan
print('将ilco位置的数值改为nan：')
print(pdf)
#              A     B     C   D
# 2021-12-01   0   NaN   2.0   3
# 2021-12-02   4   5.0   NaN   7
# 2021-12-03   8   9.0  10.0  11
# 2021-12-04  12  13.0  14.0  15
# 2021-12-05  16  17.0  18.0  19
# 2021-12-06  20  21.0  22.0  23

print('删除所有NaN的行/列(默认为行)')
# print(pdf.dropna())
# pdf.dropna() is the same as pdf.dropna(axis=0, how='any'): rows are dropped
# by default. The call below drops columns instead.
print(pdf.dropna(
    axis = 1, # 0: operate on rows; 1: operate on columns
    how = 'any' # 'any': drop if any NaN is present; 'all': drop only if every value is NaN
))
# Result with axis=0:
#              A     B     C   D
# 2021-12-03   8   9.0  10.0  11
# 2021-12-04  12  13.0  14.0  15
# 2021-12-05  16  17.0  18.0  19
# 2021-12-06  20  21.0  22.0  23
# Result with axis=1 (what the call above prints):
#              A   D
# 2021-12-01   0   3
# 2021-12-02   4   7
# 2021-12-03   8  11
# 2021-12-04  12  15
# 2021-12-05  16  19
# 2021-12-06  20  23

# Replace NaN with 0 (or any other value). fillna returns a new frame here;
# pdf itself is left unchanged.
print('将NaN位置替换为0：')
print(pdf.fillna(value=0))
#              A     B     C   D
# 2021-12-01   0   0.0   2.0   3
# 2021-12-02   4   5.0   0.0   7
# 2021-12-03   8   9.0  10.0  11
# 2021-12-04  12  13.0  14.0  15
# 2021-12-05  16  17.0  18.0  19
# 2021-12-06  20  21.0  22.0  23

# Detect missing values: isnull() marks every NaN cell with True.
print('NaN为True：')
print(pdf.isnull())
#                 A      B      C      D
# 2021-12-01  False   True  False  False
# 2021-12-02  False  False   True  False
# 2021-12-03  False  False  False  False
# 2021-12-04  False  False  False  False
# 2021-12-05  False  False  False  False
# 2021-12-06  False  False  False  False

# isna() produces the same mask as isnull().
print('NaN为True：')
print(pdf.isna())
#                 A      B      C      D
# 2021-12-01  False   True  False  False
# 2021-12-02  False  False   True  False
# 2021-12-03  False  False  False  False
# 2021-12-04  False  False  False  False
# 2021-12-05  False  False  False  False
# 2021-12-06  False  False  False  False

# Per column: does it contain at least one NaN?
print('检测列上是否有NaN：')
print(pdf.isna().any())
# A    False
# B     True
# C     True
# D    False

欢迎分享，转载请注明来源：内存溢出

原文地址: http://outofmemory.cn/zaji/5651442.html

Pandas(一)[快速入门]

发表评论

评论列表（0条）