#coding=utf-8
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

# Bar chart of how many IMDB movies fall into each genre.
file_path = 'IMDB-Movie-Data.csv'
df = pd.read_csv(file_path)

# Split each movie's comma-separated genre string into a list,
# e.g. 'Action,Sci-Fi' -> ['Action', 'Sci-Fi'].
temp_list = df['Genre'].str.split(',').tolist()  # [[...], [...], ...]
# Flatten the nested lists and deduplicate to get every distinct genre.
genre_list = list(set([i for j in temp_list for i in j]))

# One-hot indicator frame: one row per movie, one column per genre, all zeros.
zero_df = pd.DataFrame(np.zeros((df.shape[0], len(genre_list))), columns=genre_list)

# Mark each movie's genres with 1.
# BUG FIX: the assignment must be indented inside the for loop; in the
# original it sat at top level, so only one row would ever be marked.
for i in range(df.shape[0]):
    zero_df.loc[i, temp_list[i]] = 1
print(zero_df.head(3))

# Total number of movies per genre (column-wise sum of the indicators).
genre_count = zero_df.sum(axis=0)
print(genre_count)

# Sort ascending so the bar chart rises left to right.
genre_count = genre_count.sort_values()
_x = genre_count.index
_y = genre_count.values

# Plot.
plt.figure(figsize=(20, 8), dpi=80)
plt.bar(_x, _y, width=0.3)
plt.show()
数据的合并
join合并
默认情况下将行索引相同的数据合并到一起
a.join(b) :按照a的行索引
In [1]: import pandas as pd
In [2]: import numpy as np
In [3]: df1=pd.DataFrame(np.ones((2,4)),index=['A','B'],columns=list('abcd'))
In [4]: df2=pd.DataFrame(np.zeros((3,3)),index=['A','B','C'],columns=list('xyz'))
In [5]: df1
Out[5]:
a b c d
A 1.0 1.0 1.0 1.0
B 1.0 1.0 1.0 1.0
In [6]: df2
Out[6]:
x y z
A 0.0 0.0 0.0
B 0.0 0.0 0.0
C 0.0 0.0 0.0
In [7]: df1.join(df2) #以df1每行为准
Out[7]:
a b c d x y z
A 1.0 1.0 1.0 1.0 0.0 0.0 0.0
B 1.0 1.0 1.0 1.0 0.0 0.0 0.0
In [10]: df2.join(df1) #以df2每行为准,df1中没有的行填入NaN
Out[10]:
x y z a b c d
A 0.0 0.0 0.0 1.0 1.0 1.0 1.0
B 0.0 0.0 0.0 1.0 1.0 1.0 1.0
C 0.0 0.0 0.0 NaN NaN NaN NaN
merge合并
按照指定的列把数据按照一定的方式合并到一起
(1) 内连接
In [21]: df3=pd.DataFrame(np.arange(9).reshape(3,3),columns=list('fax'))
In [22]: df3
Out[22]:
f a x
0 0 1 2
1 3 4 5
2 6 7 8
In [23]: df1
Out[23]:
a b c d
A 1.0 1.0 1.0 1.0
B 1.0 1.0 1.0 1.0
In [24]: df1.merge(df3,on='a') #按照df3中的“a”列元素(1,4,7),df1中‘a’列中第一、二行都是1,所以取df3的第一行,df1的第一、二行。因为df1取了两行,所以df3重复一行变两行。
Out[24]:
a b c d f x
0 1.0 1.0 1.0 1.0 0 2
1 1.0 1.0 1.0 1.0 0 2
In [25]: df1.loc['A','a']=100
In [26]: df1
Out[26]:
a b c d
A 100.0 1.0 1.0 1.0
B 1.0 1.0 1.0 1.0
In [27]: df1.merge(df3,on='a')
Out[27]:
a b c d f x
0 1.0 1.0 1.0 1.0 0 2
(2) 外连接
In [28]: df1.merge(df3,on='a',how='inner') #默认内连接,取交集
Out[28]:
a b c d f x
0 1.0 1.0 1.0 1.0 0 2
In [29]: df1.merge(df3,on='a',how='outer') #外连接,取并集
Out[29]:
a b c d f x
0 100.0 1.0 1.0 1.0 NaN NaN
1 1.0 1.0 1.0 1.0 0.0 2.0
2 4.0 NaN NaN NaN 3.0 5.0
3 7.0 NaN NaN NaN 6.0 8.0
(3) 左连接
In [30]: df1.merge(df3,on='a',how='left') #以df1为准
Out[30]:
a b c d f x
0 100.0 1.0 1.0 1.0 NaN NaN
1 1.0 1.0 1.0 1.0 0.0 2.0
(4) 右连接
In [31]: df1.merge(df3,on='a',how='right') #以df3为准
Out[31]:
a b c d f x
0 1.0 1.0 1.0 1.0 0 2
1 4.0 NaN NaN NaN 3 5
2 7.0 NaN NaN NaN 6 8
数据分组聚合
分组、聚合
ps:pycharm设置或者取消多行注释快捷键:ctrl+/
数据分组使用:groupby();聚合使用:分组后的数据(聚合依据).count()
#coding=utf-8
import pandas as pd
import numpy as np

# Count Starbucks stores per country, then per Chinese province.
file_path = './starbucks_store_worldwide.csv'
df = pd.read_csv(file_path)
# print(df.head(1))
# print(df.info())

# Group the stores by country; groupby() returns a lazy DataFrameGroupBy.
by_country = df.groupby(by='Country')
print(by_country)

# A DataFrameGroupBy is iterable: each item is a (key, sub-DataFrame) pair.
# for country, sub_df in by_country:
#     print(country)            # the group key, i.e. the country code
#     print('-' * 100)
#     print(sub_df)             # the rows belonging to that country
#     print('*' * 100)
# Equivalent single-country selection: df[df["Country"] == "US"]

# Aggregate: number of non-null 'Brand' entries (i.e. stores) per country.
store_counts = by_country['Brand'].count()
print(store_counts['US'])  # store count in the United States
print(store_counts['CN'])  # store count in China

# Store count per Chinese state/province.
cn_rows = df[df['Country'] == 'CN']
province_counts = cn_rows.groupby(by='State/Province').count()['Brand']
print(province_counts)
更复杂的分组、聚合
# Group by two keys at once (country and state/province) and count stores.
# Selecting one column with single brackets makes the aggregation a Series
# whose index is a MultiIndex built from the two grouping keys.
series_counts = df['Brand'].groupby(by=[df['Country'], df['State/Province']]).count()
print(series_counts)
print(type(series_counts))  # <class 'pandas.core.series.Series'>

# Selecting with double brackets ([['Brand']]) keeps a DataFrame instead.
frame_counts = df[['Brand']].groupby(by=[df['Country'], df['State/Province']]).count()
# Equivalent spellings:
# frame_counts = df.groupby(by=[df['Country'], df['State/Province']])[['Brand']].count()
# frame_counts = df.groupby(by=[df['Country'], df['State/Province']]).count()[['Brand']]
print(frame_counts)
print(type(frame_counts))  # <class 'pandas.core.frame.DataFrame'>
数据的索引学习
简单的索引
In [32]: df1
Out[32]:
a b c d
A 100.0 1.0 1.0 1.0
B 1.0 1.0 1.0 1.0
In [33]: df1.index #获取索引
Out[33]: Index(['A', 'B'], dtype='object')
In [34]: df1.index=['A','B']
In [35]: df1
Out[35]:
a b c d
A 100.0 1.0 1.0 1.0
B 1.0 1.0 1.0 1.0
In [36]: df1.index
Out[36]: Index(['A', 'B'], dtype='object')
In [37]: df1.index=['a','b'] #指定索引
In [38]: df1.index
Out[38]: Index(['a', 'b'], dtype='object')
In [39]: df1.reindex(['a','f']) #重新设置索引,原本就存在的索引按照原数据写上去,没有的索引用NaN
Out[39]:
a b c d
a 100.0 1.0 1.0 1.0
f NaN NaN NaN NaN
In [40]: df1.set_index('a') #指定某一列作为索引
Out[40]:
b c d
a
100.0 1.0 1.0 1.0
1.0 1.0 1.0 1.0
In [41]: df1.set_index('a').index
Out[41]: Float64Index([100.0, 1.0], dtype='float64', name='a')
In [42]: df1.set_index('a',drop=False) #drop默认为True,意为指定某列为索引,该列不再参与在该数据中,False意为该列数据还存在
Out[42]:
a b c d
a
100.0 100.0 1.0 1.0 1.0
1.0 1.0 1.0 1.0 1.0
In [43]: df1['d'].unique() #返回某列的值(去重)
Out[43]: array([1.])
In [44]: df1['a'].unique()
Out[44]: array([100., 1.])
In [45]: df1.set_index('b').index
Out[45]: Float64Index([1.0, 1.0], dtype='float64', name='b')
In [46]: df1.set_index('b')
Out[46]:
a c d
b
1.0 100.0 1.0 1.0
1.0 1.0 1.0 1.0
In [47]: df1.set_index('b').index.unique() #返回索引的唯一值
Out[47]: Float64Index([1.0], dtype='float64', name='b')
In [48]: len(df1.set_index('b').index.unique()) #返回某列去重后的索引长度
Out[48]: 1
In [49]: len(df1.set_index('b').index)
Out[49]: 2
In [50]: list(df1.set_index('b').index) #将某列索引变为列表
Out[50]: [1.0, 1.0]
复合索引
In [51]: df1
Out[51]:
a b c d
a 100.0 1.0 1.0 1.0
b 1.0 1.0 1.0 1.0
In [52]: df1.set_index(['a','b']) #将某两列数据作为复合索引
Out[52]:
c d
a b
100.0 1.0 1.0 1.0
1.0 1.0 1.0 1.0
In [53]: df1.set_index(['a','b']).index
Out[53]:
MultiIndex([(100.0, 1.0),
( 1.0, 1.0)],
names=['a', 'b'])
In [54]: df1.set_index(['a','b','d'],drop=False) #被选为索引的数据依旧存在
Out[54]:
a b c d
a b d
100.0 1.0 1.0 100.0 1.0 1.0 1.0
1.0 1.0 1.0 1.0 1.0 1.0 1.0
In [55]: df1.set_index(['a','b','d'],drop=False).index
Out[55]:
MultiIndex([(100.0, 1.0, 1.0),
( 1.0, 1.0, 1.0)],
names=['a', 'b', 'd'])
In [58]: a=pd.DataFrame({'a':range(7),'b':range(7,0,-1),'c':['one','one','one','two','two','two','two'],'d':list('hjklm
...: no')})
In [59]: a
Out[59]:
a b c d
0 0 7 one h
1 1 6 one j
2 2 5 one k
3 3 4 two l
4 4 3 two m
5 5 2 two n
6 6 1 two o
In [60]: b=a.set_index(['c','d'])
In [61]: b
Out[61]:
a b
c d
one h 0 7
j 1 6
k 2 5
two l 3 4
m 4 3
n 5 2
o 6 1
数据分组聚合练习
数据分组聚合
In [58]: a=pd.DataFrame({'a':range(7),'b':range(7,0,-1),'c':['one','one','one','two','two','two','two'],'d':list('hjklm
...: no')})
In [59]: a
Out[59]:
a b c d
0 0 7 one h
1 1 6 one j
2 2 5 one k
3 3 4 two l
4 4 3 two m
5 5 2 two n
6 6 1 two o
In [60]: b=a.set_index(['c','d'])
In [61]: b
Out[61]:
a b
c d
one h 0 7
j 1 6
k 2 5
two l 3 4
m 4 3
n 5 2
o 6 1
In [63]: c=b['a']
In [64]: c
Out[64]:
c d
one h 0
j 1
k 2
two l 3
m 4
n 5
o 6
Name: a, dtype: int64
In [65]: type(c)
Out[65]: pandas.core.series.Series
In [66]: c['one']['j']
Out[66]: 1
In [67]: c['one']
Out[67]:
d
h 0
j 1
k 2
Name: a, dtype: int64
In [68]: d=a.set_index(['d','c'])['a']
In [69]: d
Out[69]:
d c
h one 0
j one 1
k one 2
l two 3
m two 4
n two 5
o two 6
Name: a, dtype: int64
In [70]: d.index
Out[70]:
MultiIndex([('h', 'one'),
('j', 'one'),
('k', 'one'),
('l', 'two'),
('m', 'two'),
('n', 'two'),
('o', 'two')],
names=['d', 'c'])
In [71]: d.swaplevel()['one'] #交换索引
Out[71]:
d
h 0
j 1
k 2
Name: a, dtype: int64
In [72]: b
Out[72]:
a b
c d
one h 0 7
j 1 6
k 2 5
two l 3 4
m 4 3
n 5 2
o 6 1
In [73]: b.loc['one'].loc['h'] #使用loc来取值
Out[73]:
a 0
b 7
Name: h, dtype: int64
In [74]: b.swaplevel().loc['h']
Out[74]:
a b
c
one 0 7
小练习
(1)练习一
#coding=utf-8
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt

# Bar chart of the ten countries with the most Starbucks stores.
df = pd.read_csv('./starbucks_store_worldwide.csv')

# Count stores per country, sort descending, keep the top ten.
top10 = df.groupby(by='Country').count()['Brand'].sort_values(ascending=False)[:10]
positions = range(len(top10))

# Draw evenly spaced bars and label each tick with its country code.
plt.figure(figsize=(20, 8), dpi=80)
plt.bar(positions, top10.values)
plt.xticks(positions, top10.index)
plt.show()
(2)练习二
#coding=utf-8
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import matplotlib

# SimHei lets matplotlib render the Chinese city names on the tick labels.
matplotlib.rc("font", family='SimHei')

file_path = './starbucks_store_worldwide.csv'
df = pd.read_csv(file_path)

# Keep only the stores located in China.
df = df[df["Country"] == "CN"]

# Count stores per city, sort descending, keep the 25 largest.
city_counts = df.groupby(by='City').count()['Brand'].sort_values(ascending=False)[:25]
positions = range(len(city_counts))

plt.figure(figsize=(20, 8), dpi=80)
plt.bar(positions, city_counts.values, width=0.3, color='orange')
plt.xticks(positions, city_counts.index)
plt.show()
(3)练习三
#coding=utf-8
import pandas as pd
from matplotlib import pyplot as plt

file_path = './books.csv'
df = pd.read_csv(file_path)

# Drop rows whose publication year is missing before any grouping.
known_year = df[pd.notnull(df['original_publication_year'])]

# Number of books per publication year.
books_per_year = known_year.groupby(by='original_publication_year').count()['title']
print(books_per_year)

# Mean rating per publication year.
mean_rating = known_year['average_rating'].groupby(by=known_year['original_publication_year']).mean()
print(mean_rating)

_x = mean_rating.index
_y = mean_rating.values

# Line plot: x positions are ordinal, ticks show every tenth year.
plt.figure(figsize=(20, 8), dpi=80)
plt.plot(range(len(_x)), _y)
# astype(int) so labels read e.g. 2000 rather than 2000.0.
plt.xticks(list(range(len(_x)))[::10], _x[::10].astype(int), rotation=45)
plt.show()
总结
欢迎分享,转载请注明来源:内存溢出
评论列表(0条)