数据分析——从入门到精通

数据分析——从入门到精通,第1张

import numpy as np
import pandas as pd
from pandas import Series,DataFrame
加载文本数据
  • pd.read_csv()
  • pd.read_table() # deprecated在当前版本过时
  • pd.read_json()
pd.read_csv('data/adults.txt')
ageworkclassfinal_weighteducationeducation_nummarital_statusoccupationrelationshipracesexcapital_gaincapital_losshours_per_weeknative_countrysalary
039State-gov77516Bachelors13Never-marriedAdm-clericalNot-in-familyWhiteMale2174040United-States<=50K
150Self-emp-not-inc83311Bachelors13Married-civ-spouseExec-managerialHusbandWhiteMale0013United-States<=50K
238Private215646HS-grad9DivorcedHandlers-cleanersNot-in-familyWhiteMale0040United-States<=50K
353Private23472111th7Married-civ-spouseHandlers-cleanersHusbandBlackMale0040United-States<=50K
428Private338409Bachelors13Married-civ-spouseProf-specialtyWifeBlackFemale0040Cuba<=50K
................................................
3255627Private257302Assoc-acdm12Married-civ-spouseTech-supportWifeWhiteFemale0038United-States<=50K
3255740Private154374HS-grad9Married-civ-spouseMachine-op-inspctHusbandWhiteMale0040United-States>50K
3255858Private151910HS-grad9WidowedAdm-clericalUnmarriedWhiteFemale0040United-States<=50K
3255922Private201490HS-grad9Never-marriedAdm-clericalOwn-childWhiteMale0020United-States<=50K
3256052Self-emp-inc287927HS-grad9Married-civ-spouseExec-managerialWifeWhiteFemale15024040United-States>50K

32561 rows × 15 columns

pd.read_csv('data/seeds.tsv')
15.26\t14.84\t0.871\t5.763\t3.312\t2.221\t5.22\tKama
014.88\t14.57\t0.8811\t5.554\t3.333\t1.018\t4.9...
114.29\t14.09\t0.905\t5.291\t3.337\t2.699\t4.82...
213.84\t13.94\t0.8955\t5.324\t3.379\t2.259\t4.8...
316.14\t14.99\t0.9034\t5.658\t3.562\t1.355\t5.1...
414.38\t14.21\t0.8951\t5.386\t3.312\t2.462\t4.9...
......
20412.19\t13.2\t0.8783\t5.137\t2.981\t3.631\t4.87...
20511.23\t12.88\t0.8511\t5.14\t2.795\t4.325\t5.00...
20613.2\t13.66\t0.8883\t5.236\t3.232\t8.315\t5.05...
20711.84\t13.21\t0.8521\t5.175\t2.836\t3.598\t5.0...
20812.3\t13.34\t0.8684\t5.243\t2.974\t5.637\t5.06...

209 rows × 1 columns

pd.read_csv('data/seeds.tsv',sep='\t',header=None)
01234567
015.2614.840.87105.7633.3122.2215.220Kama
114.8814.570.88115.5543.3331.0184.956Kama
214.2914.090.90505.2913.3372.6994.825Kama
313.8413.940.89555.3243.3792.2594.805Kama
416.1414.990.90345.6583.5621.3555.175Kama
...........................
20512.1913.200.87835.1372.9813.6314.870Canadian
20611.2312.880.85115.1402.7954.3255.003Canadian
20713.2013.660.88835.2363.2328.3155.056Canadian
20811.8413.210.85215.1752.8363.5985.044Canadian
20912.3013.340.86845.2432.9745.6375.063Canadian

210 rows × 8 columns

pd.read_table('data/seeds.tsv',header=None)
01234567
015.2614.840.87105.7633.3122.2215.220Kama
114.8814.570.88115.5543.3331.0184.956Kama
214.2914.090.90505.2913.3372.6994.825Kama
313.8413.940.89555.3243.3792.2594.805Kama
416.1414.990.90345.6583.5621.3555.175Kama
...........................
20512.1913.200.87835.1372.9813.6314.870Canadian
20611.2312.880.85115.1402.7954.3255.003Canadian
20713.2013.660.88835.2363.2328.3155.056Canadian
20811.8413.210.85215.1752.8363.5985.044Canadian
20912.3013.340.86845.2432.9745.6375.063Canadian

210 rows × 8 columns

pd.read_csv('data/usa_election.txt',low_memory=False)
---------------------------------------------------------------------------

NameError                                 Traceback (most recent call last)

 in 
----> 1 pd.read_csv('data/usa_election.txt',low_memory=False)


NameError: name 'pd' is not defined
pd.read_csv('data/seeds.tsv',sep='\t',header=None).to_json('data/seeds.json')
# 此时,在data文件夹里就会多出来一个seeds.json文件
pd.read_json('data/seeds.json')
01234567
015.2614.840.87105.7633.3122.2215.220Kama
114.8814.570.88115.5543.3331.0184.956Kama
214.2914.090.90505.2913.3372.6994.825Kama
313.8413.940.89555.3243.3792.2594.805Kama
416.1414.990.90345.6583.5621.3555.175Kama
...........................
20512.1913.200.87835.1372.9813.6314.870Canadian
20611.2312.880.85115.1402.7954.3255.003Canadian
20713.2013.660.88835.2363.2328.3155.056Canadian
20811.8413.210.85215.1752.8363.5985.044Canadian
20912.3013.340.86845.2432.9745.6375.063Canadian

210 rows × 8 columns

# 重新调整索引编号
pd.read_json('data/seeds.json').reset_index()
index01234567
0015.2614.840.87105.7633.3122.2215.220Kama
1114.8814.570.88115.5543.3331.0184.956Kama
2214.2914.090.90505.2913.3372.6994.825Kama
3313.8413.940.89555.3243.3792.2594.805Kama
4416.1414.990.90345.6583.5621.3555.175Kama
..............................
20520512.1913.200.87835.1372.9813.6314.870Canadian
20620611.2312.880.85115.1402.7954.3255.003Canadian
20720713.2013.660.88835.2363.2328.3155.056Canadian
20820811.8413.210.85215.1752.8363.5985.044Canadian
20920912.3013.340.86845.2432.9745.6375.063Canadian

210 rows × 9 columns

pd.read_json('data/seeds.json').reset_index().sort_values('index').set_index('index')
01234567
index
015.2614.840.87105.7633.3122.2215.220Kama
114.8814.570.88115.5543.3331.0184.956Kama
214.2914.090.90505.2913.3372.6994.825Kama
313.8413.940.89555.3243.3792.2594.805Kama
416.1414.990.90345.6583.5621.3555.175Kama
...........................
20512.1913.200.87835.1372.9813.6314.870Canadian
20611.2312.880.85115.1402.7954.3255.003Canadian
20713.2013.660.88835.2363.2328.3155.056Canadian
20811.8413.210.85215.1752.8363.5985.044Canadian
20912.3013.340.86845.2432.9745.6375.063Canadian

210 rows × 8 columns


加载excel的数据
  • pd.read_excel()
pd.read_excel('data/students.xlsx')
idnameagesexheight(cm)weight(kg)
01吴果2016550
12李平2515845
23王和2117975
34刘言2718080
45王平2016482
56宁国柱1917885
67刘佳敏2518390
78张先3016555
89刘四七2717061
910关小弟2919094
# 把id作为索引值
pd.read_excel('data/students.xlsx',index_col='id')
nameagesexheight(cm)weight(kg)
id
1吴果2016550
2李平2515845
3王和2117975
4刘言2718080
5王平2016482
6宁国柱1917885
7刘佳敏2518390
8张先3016555
9刘四七2717061
10关小弟2919094
加载网络数据
  • pd.read_csv()/table()/json()都可以加载网络资源,主要看数据的结构

在data文件夹下,新建data_IO文件夹,添加数据



按windows+R,输入cmd,回车,执行下面代码:

启动后,打开页面,可以看到,数据可以加载:

然后在jupyter notebook里面继续 *** 作:

df = pd.read_csv('http://127.0.0.1:8000/testSet.txt')
df
-0.017612\t14.053064\t0
0-1.395634\t4.662541\t1
1-0.752157\t6.538620\t0
2-1.322371\t7.152853\t0
30.423363\t11.054677\t0
40.406704\t7.067335\t1
......
940.677983\t2.556666\t1
950.761349\t10.693862\t0
96-2.168791\t0.143632\t1
971.388610\t9.341997\t0
980.317029\t14.739025\t0

99 rows × 1 columns

数据库加载
  • pd.read_sql()
加载sqlite3数据库
import sqlite3
# 新建数据文件  放置在data里,命名为persin.db
# 如果文件不存在,会自动创建
conn = sqlite3.connect('data/person.db')

# 如何给db文件写数据
cursor = conn.cursor()     # 打开一个游标,游标主要是与数据库进行交互的
# 创建一张表
# sqlite3 微型数据库,不需要去声明字段
cursor.execute('create table tb_person(id integer primary key,name,age,sex,phone)')

help(cursor.execute)
Help on built-in function execute:

execute(...) method of sqlite3.Cursor instance
    Executes a SQL statement.
# pymysql/mysqlclitent的包中,cursor的cursor.execute(sql,args=()|{})  argsd两种类型,一个是元组,一个是字典,args常用元组的方式实现
# 但是sqlite3里面就没有这个参数,所以执行语句的时候应该怎么做呢?
# sqlite3插入数据
cursor.execute("insert into tb_person(name,age,sex,phone) values ('Jack',18,'男','12123423434')")

# 此时数据库就有数据了,且只有一条数据
# 再插入一条
cursor.execute("insert into tb_person(name,age,sex,phone) values ('Blus',18,'女','12123423544')")

# 查询数据,并全部显示
cursor.execute('select * from tb_person')
list(cursor.fetchall())
[(1, 'Jack', 18, '男', '12123423434'), (2, 'Blus', 18, '女', '12123423544')]
# 关闭连接
conn.close()
# 再次连接
conn =sqlite3.connect('data/person.db')
# 读取
pd.read_sql('select * from tb_person',conn)
# 此时没有数据:原因:刚才关闭了连接,所以没有提交事务
idnameagesexphone
# 重新打开游标,执行sql
cursor = conn.cursor()
# 插入sql
cursor.execute("insert into tb_person(name,age,sex,phone) values ('Jack',18,'男','12123423434')")
cursor.execute("insert into tb_person(name,age,sex,phone) values ('Blus',18,'女','12123423544')")
cursor.execute("insert into tb_person(name,age,sex,phone) values ('Blua',18,'女','12123423544')")
cursor.execute("insert into tb_person(name,age,sex,phone) values ('Blux',18,'女','12123423544')")
# 提交事务
conn.commit()
# 读取
pd.read_sql('select * from tb_person',conn)
idnameagesexphone
01Jack1812123423434
12Blus1812123423544
23Blua1812123423544
34Blux1812123423544
# 再次插入sql
cursor.execute("insert into tb_person(name,age,sex,phone) values ('Blan',22,'女','2123423784')")
cursor.execute("insert into tb_person(name,age,sex,phone) values ('Blro',21,'女','2125453784')")
# 提交事务
conn.commit()
# 读取
pd.read_sql('select * from tb_person',conn)
idnameagesexphone
01Jack1812123423434
12Blus1812123423544
23Blua1812123423544
34Blux1812123423544
45Blan222123423784
56Blro212125453784
# SQLite的cursor.execute不能传参,无法写入参数
pd.read_sql("select * from tb_person where phone like '12%'",conn)
idnameagesexphone
01Jack1812123423434
12Blus1812123423544
23Blua1812123423544
34Blux1812123423544
df
-0.017612\t14.053064\t0
0-1.395634\t4.662541\t1
1-0.752157\t6.538620\t0
2-1.322371\t7.152853\t0
30.423363\t11.054677\t0
40.406704\t7.067335\t1
......
940.677983\t2.556666\t1
950.761349\t10.693862\t0
96-2.168791\t0.143632\t1
971.388610\t9.341997\t0
980.317029\t14.739025\t0

99 rows × 1 columns

# 将df数据写入到数据库中,命名为test_set
df.to_sql('test_set',conn)
# 读取
pd.read_sql('select * from test_set',conn)
index-0.017612\t14.053064\t0
00-1.395634\t4.662541\t1
11-0.752157\t6.538620\t0
22-1.322371\t7.152853\t0
330.423363\t11.054677\t0
440.406704\t7.067335\t1
.........
94940.677983\t2.556666\t1
95950.761349\t10.693862\t0
9696-2.168791\t0.143632\t1
97971.388610\t9.341997\t0
98980.317029\t14.739025\t0

99 rows × 2 columns

# 读取时指定索引列
pd.read_sql('select * from test_set',conn,index_col='index')
-0.017612\t14.053064\t0
index
0-1.395634\t4.662541\t1
1-0.752157\t6.538620\t0
2-1.322371\t7.152853\t0
30.423363\t11.054677\t0
40.406704\t7.067335\t1
......
940.677983\t2.556666\t1
950.761349\t10.693862\t0
96-2.168791\t0.143632\t1
971.388610\t9.341997\t0
980.317029\t14.739025\t0

99 rows × 1 columns

欢迎分享,转载请注明来源:内存溢出

原文地址: http://outofmemory.cn/langs/579744.html

(0)
打赏 微信扫一扫 微信扫一扫 支付宝扫一扫 支付宝扫一扫
上一篇 2022-04-11
下一篇 2022-04-11

发表评论

登录后才能评论

评论列表(0条)

保存