import numpy as np
import pandas as pd
from pandas import Series,DataFrame
加载文本数据
- pd.read_csv()
- pd.read_table() # deprecated在当前版本过时
- pd.read_json()
pd.read_csv('data/adults.txt')
| age | workclass | final_weight | education | education_num | marital_status | occupation | relationship | race | sex | capital_gain | capital_loss | hours_per_week | native_country | salary |
---|
0 | 39 | State-gov | 77516 | Bachelors | 13 | Never-married | Adm-clerical | Not-in-family | White | Male | 2174 | 0 | 40 | United-States | <=50K |
1 | 50 | Self-emp-not-inc | 83311 | Bachelors | 13 | Married-civ-spouse | Exec-managerial | Husband | White | Male | 0 | 0 | 13 | United-States | <=50K |
2 | 38 | Private | 215646 | HS-grad | 9 | Divorced | Handlers-cleaners | Not-in-family | White | Male | 0 | 0 | 40 | United-States | <=50K |
3 | 53 | Private | 234721 | 11th | 7 | Married-civ-spouse | Handlers-cleaners | Husband | Black | Male | 0 | 0 | 40 | United-States | <=50K |
4 | 28 | Private | 338409 | Bachelors | 13 | Married-civ-spouse | Prof-specialty | Wife | Black | Female | 0 | 0 | 40 | Cuba | <=50K |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
32556 | 27 | Private | 257302 | Assoc-acdm | 12 | Married-civ-spouse | Tech-support | Wife | White | Female | 0 | 0 | 38 | United-States | <=50K |
32557 | 40 | Private | 154374 | HS-grad | 9 | Married-civ-spouse | Machine-op-inspct | Husband | White | Male | 0 | 0 | 40 | United-States | >50K |
32558 | 58 | Private | 151910 | HS-grad | 9 | Widowed | Adm-clerical | Unmarried | White | Female | 0 | 0 | 40 | United-States | <=50K |
32559 | 22 | Private | 201490 | HS-grad | 9 | Never-married | Adm-clerical | Own-child | White | Male | 0 | 0 | 20 | United-States | <=50K |
32560 | 52 | Self-emp-inc | 287927 | HS-grad | 9 | Married-civ-spouse | Exec-managerial | Wife | White | Female | 15024 | 0 | 40 | United-States | >50K |
32561 rows × 15 columns
pd.read_csv('data/seeds.tsv')
| 15.26\t14.84\t0.871\t5.763\t3.312\t2.221\t5.22\tKama |
---|
0 | 14.88\t14.57\t0.8811\t5.554\t3.333\t1.018\t4.9... |
1 | 14.29\t14.09\t0.905\t5.291\t3.337\t2.699\t4.82... |
2 | 13.84\t13.94\t0.8955\t5.324\t3.379\t2.259\t4.8... |
3 | 16.14\t14.99\t0.9034\t5.658\t3.562\t1.355\t5.1... |
4 | 14.38\t14.21\t0.8951\t5.386\t3.312\t2.462\t4.9... |
... | ... |
204 | 12.19\t13.2\t0.8783\t5.137\t2.981\t3.631\t4.87... |
205 | 11.23\t12.88\t0.8511\t5.14\t2.795\t4.325\t5.00... |
206 | 13.2\t13.66\t0.8883\t5.236\t3.232\t8.315\t5.05... |
207 | 11.84\t13.21\t0.8521\t5.175\t2.836\t3.598\t5.0... |
208 | 12.3\t13.34\t0.8684\t5.243\t2.974\t5.637\t5.06... |
209 rows × 1 columns
pd.read_csv('data/seeds.tsv',sep='\t',header=None)
| 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 |
---|
0 | 15.26 | 14.84 | 0.8710 | 5.763 | 3.312 | 2.221 | 5.220 | Kama |
1 | 14.88 | 14.57 | 0.8811 | 5.554 | 3.333 | 1.018 | 4.956 | Kama |
2 | 14.29 | 14.09 | 0.9050 | 5.291 | 3.337 | 2.699 | 4.825 | Kama |
3 | 13.84 | 13.94 | 0.8955 | 5.324 | 3.379 | 2.259 | 4.805 | Kama |
4 | 16.14 | 14.99 | 0.9034 | 5.658 | 3.562 | 1.355 | 5.175 | Kama |
... | ... | ... | ... | ... | ... | ... | ... | ... |
205 | 12.19 | 13.20 | 0.8783 | 5.137 | 2.981 | 3.631 | 4.870 | Canadian |
206 | 11.23 | 12.88 | 0.8511 | 5.140 | 2.795 | 4.325 | 5.003 | Canadian |
207 | 13.20 | 13.66 | 0.8883 | 5.236 | 3.232 | 8.315 | 5.056 | Canadian |
208 | 11.84 | 13.21 | 0.8521 | 5.175 | 2.836 | 3.598 | 5.044 | Canadian |
209 | 12.30 | 13.34 | 0.8684 | 5.243 | 2.974 | 5.637 | 5.063 | Canadian |
210 rows × 8 columns
pd.read_table('data/seeds.tsv',header=None)
| 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 |
---|
0 | 15.26 | 14.84 | 0.8710 | 5.763 | 3.312 | 2.221 | 5.220 | Kama |
1 | 14.88 | 14.57 | 0.8811 | 5.554 | 3.333 | 1.018 | 4.956 | Kama |
2 | 14.29 | 14.09 | 0.9050 | 5.291 | 3.337 | 2.699 | 4.825 | Kama |
3 | 13.84 | 13.94 | 0.8955 | 5.324 | 3.379 | 2.259 | 4.805 | Kama |
4 | 16.14 | 14.99 | 0.9034 | 5.658 | 3.562 | 1.355 | 5.175 | Kama |
... | ... | ... | ... | ... | ... | ... | ... | ... |
205 | 12.19 | 13.20 | 0.8783 | 5.137 | 2.981 | 3.631 | 4.870 | Canadian |
206 | 11.23 | 12.88 | 0.8511 | 5.140 | 2.795 | 4.325 | 5.003 | Canadian |
207 | 13.20 | 13.66 | 0.8883 | 5.236 | 3.232 | 8.315 | 5.056 | Canadian |
208 | 11.84 | 13.21 | 0.8521 | 5.175 | 2.836 | 3.598 | 5.044 | Canadian |
209 | 12.30 | 13.34 | 0.8684 | 5.243 | 2.974 | 5.637 | 5.063 | Canadian |
210 rows × 8 columns
pd.read_csv('data/usa_election.txt',low_memory=False)
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
in
----> 1 pd.read_csv('data/usa_election.txt',low_memory=False)
NameError: name 'pd' is not defined
pd.read_csv('data/seeds.tsv',sep='\t',header=None).to_json('data/seeds.json')
# 此时,在data文件夹里就会多出来一个seeds.json文件
pd.read_json('data/seeds.json')
| 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 |
---|
0 | 15.26 | 14.84 | 0.8710 | 5.763 | 3.312 | 2.221 | 5.220 | Kama |
1 | 14.88 | 14.57 | 0.8811 | 5.554 | 3.333 | 1.018 | 4.956 | Kama |
2 | 14.29 | 14.09 | 0.9050 | 5.291 | 3.337 | 2.699 | 4.825 | Kama |
3 | 13.84 | 13.94 | 0.8955 | 5.324 | 3.379 | 2.259 | 4.805 | Kama |
4 | 16.14 | 14.99 | 0.9034 | 5.658 | 3.562 | 1.355 | 5.175 | Kama |
... | ... | ... | ... | ... | ... | ... | ... | ... |
205 | 12.19 | 13.20 | 0.8783 | 5.137 | 2.981 | 3.631 | 4.870 | Canadian |
206 | 11.23 | 12.88 | 0.8511 | 5.140 | 2.795 | 4.325 | 5.003 | Canadian |
207 | 13.20 | 13.66 | 0.8883 | 5.236 | 3.232 | 8.315 | 5.056 | Canadian |
208 | 11.84 | 13.21 | 0.8521 | 5.175 | 2.836 | 3.598 | 5.044 | Canadian |
209 | 12.30 | 13.34 | 0.8684 | 5.243 | 2.974 | 5.637 | 5.063 | Canadian |
210 rows × 8 columns
# 重新调整索引编号
pd.read_json('data/seeds.json').reset_index()
| index | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 |
---|
0 | 0 | 15.26 | 14.84 | 0.8710 | 5.763 | 3.312 | 2.221 | 5.220 | Kama |
1 | 1 | 14.88 | 14.57 | 0.8811 | 5.554 | 3.333 | 1.018 | 4.956 | Kama |
2 | 2 | 14.29 | 14.09 | 0.9050 | 5.291 | 3.337 | 2.699 | 4.825 | Kama |
3 | 3 | 13.84 | 13.94 | 0.8955 | 5.324 | 3.379 | 2.259 | 4.805 | Kama |
4 | 4 | 16.14 | 14.99 | 0.9034 | 5.658 | 3.562 | 1.355 | 5.175 | Kama |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
205 | 205 | 12.19 | 13.20 | 0.8783 | 5.137 | 2.981 | 3.631 | 4.870 | Canadian |
206 | 206 | 11.23 | 12.88 | 0.8511 | 5.140 | 2.795 | 4.325 | 5.003 | Canadian |
207 | 207 | 13.20 | 13.66 | 0.8883 | 5.236 | 3.232 | 8.315 | 5.056 | Canadian |
208 | 208 | 11.84 | 13.21 | 0.8521 | 5.175 | 2.836 | 3.598 | 5.044 | Canadian |
209 | 209 | 12.30 | 13.34 | 0.8684 | 5.243 | 2.974 | 5.637 | 5.063 | Canadian |
210 rows × 9 columns
pd.read_json('data/seeds.json').reset_index().sort_values('index').set_index('index')
| 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 |
---|
index | | | | | | | | |
---|
0 | 15.26 | 14.84 | 0.8710 | 5.763 | 3.312 | 2.221 | 5.220 | Kama |
1 | 14.88 | 14.57 | 0.8811 | 5.554 | 3.333 | 1.018 | 4.956 | Kama |
2 | 14.29 | 14.09 | 0.9050 | 5.291 | 3.337 | 2.699 | 4.825 | Kama |
3 | 13.84 | 13.94 | 0.8955 | 5.324 | 3.379 | 2.259 | 4.805 | Kama |
4 | 16.14 | 14.99 | 0.9034 | 5.658 | 3.562 | 1.355 | 5.175 | Kama |
... | ... | ... | ... | ... | ... | ... | ... | ... |
205 | 12.19 | 13.20 | 0.8783 | 5.137 | 2.981 | 3.631 | 4.870 | Canadian |
206 | 11.23 | 12.88 | 0.8511 | 5.140 | 2.795 | 4.325 | 5.003 | Canadian |
207 | 13.20 | 13.66 | 0.8883 | 5.236 | 3.232 | 8.315 | 5.056 | Canadian |
208 | 11.84 | 13.21 | 0.8521 | 5.175 | 2.836 | 3.598 | 5.044 | Canadian |
209 | 12.30 | 13.34 | 0.8684 | 5.243 | 2.974 | 5.637 | 5.063 | Canadian |
210 rows × 8 columns
加载excel的数据
pd.read_excel('data/students.xlsx')
| id | name | age | sex | height(cm) | weight(kg) |
---|
0 | 1 | 吴果 | 20 | 男 | 165 | 50 |
1 | 2 | 李平 | 25 | 女 | 158 | 45 |
2 | 3 | 王和 | 21 | 男 | 179 | 75 |
3 | 4 | 刘言 | 27 | 男 | 180 | 80 |
4 | 5 | 王平 | 20 | 女 | 164 | 82 |
5 | 6 | 宁国柱 | 19 | 男 | 178 | 85 |
6 | 7 | 刘佳敏 | 25 | 女 | 183 | 90 |
7 | 8 | 张先 | 30 | 女 | 165 | 55 |
8 | 9 | 刘四七 | 27 | 女 | 170 | 61 |
9 | 10 | 关小弟 | 29 | 男 | 190 | 94 |
# 把id作为索引值
pd.read_excel('data/students.xlsx',index_col='id')
| name | age | sex | height(cm) | weight(kg) |
---|
id | | | | | |
---|
1 | 吴果 | 20 | 男 | 165 | 50 |
2 | 李平 | 25 | 女 | 158 | 45 |
3 | 王和 | 21 | 男 | 179 | 75 |
4 | 刘言 | 27 | 男 | 180 | 80 |
5 | 王平 | 20 | 女 | 164 | 82 |
6 | 宁国柱 | 19 | 男 | 178 | 85 |
7 | 刘佳敏 | 25 | 女 | 183 | 90 |
8 | 张先 | 30 | 女 | 165 | 55 |
9 | 刘四七 | 27 | 女 | 170 | 61 |
10 | 关小弟 | 29 | 男 | 190 | 94 |
加载网络数据
- pd.read_csv()/table()/json()都可以加载网络资源,主要看数据的结构
在data文件夹下,新建data_IO文件夹,添加数据
按windows+R,输入cmd,回车,执行下面代码:
启动后,打开页面,可以看到,数据可以加载:
然后在jupyter notebook里面继续 *** 作:
df = pd.read_csv('http://127.0.0.1:8000/testSet.txt')
df
| -0.017612\t14.053064\t0 |
---|
0 | -1.395634\t4.662541\t1 |
1 | -0.752157\t6.538620\t0 |
2 | -1.322371\t7.152853\t0 |
3 | 0.423363\t11.054677\t0 |
4 | 0.406704\t7.067335\t1 |
... | ... |
94 | 0.677983\t2.556666\t1 |
95 | 0.761349\t10.693862\t0 |
96 | -2.168791\t0.143632\t1 |
97 | 1.388610\t9.341997\t0 |
98 | 0.317029\t14.739025\t0 |
99 rows × 1 columns
数据库加载
加载sqlite3数据库
import sqlite3
# 新建数据文件 放置在data里,命名为persin.db
# 如果文件不存在,会自动创建
conn = sqlite3.connect('data/person.db')
# 如何给db文件写数据
cursor = conn.cursor() # 打开一个游标,游标主要是与数据库进行交互的
# 创建一张表
# sqlite3 微型数据库,不需要去声明字段
cursor.execute('create table tb_person(id integer primary key,name,age,sex,phone)')
help(cursor.execute)
Help on built-in function execute:
execute(...) method of sqlite3.Cursor instance
Executes a SQL statement.
# pymysql/mysqlclitent的包中,cursor的cursor.execute(sql,args=()|{}) argsd两种类型,一个是元组,一个是字典,args常用元组的方式实现
# 但是sqlite3里面就没有这个参数,所以执行语句的时候应该怎么做呢?
# sqlite3插入数据
cursor.execute("insert into tb_person(name,age,sex,phone) values ('Jack',18,'男','12123423434')")
# 此时数据库就有数据了,且只有一条数据
# 再插入一条
cursor.execute("insert into tb_person(name,age,sex,phone) values ('Blus',18,'女','12123423544')")
# 查询数据,并全部显示
cursor.execute('select * from tb_person')
list(cursor.fetchall())
[(1, 'Jack', 18, '男', '12123423434'), (2, 'Blus', 18, '女', '12123423544')]
# 关闭连接
conn.close()
# 再次连接
conn =sqlite3.connect('data/person.db')
# 读取
pd.read_sql('select * from tb_person',conn)
# 此时没有数据:原因:刚才关闭了连接,所以没有提交事务
# 重新打开游标,执行sql
cursor = conn.cursor()
# 插入sql
cursor.execute("insert into tb_person(name,age,sex,phone) values ('Jack',18,'男','12123423434')")
cursor.execute("insert into tb_person(name,age,sex,phone) values ('Blus',18,'女','12123423544')")
cursor.execute("insert into tb_person(name,age,sex,phone) values ('Blua',18,'女','12123423544')")
cursor.execute("insert into tb_person(name,age,sex,phone) values ('Blux',18,'女','12123423544')")
# 提交事务
conn.commit()
# 读取
pd.read_sql('select * from tb_person',conn)
| id | name | age | sex | phone |
---|
0 | 1 | Jack | 18 | 男 | 12123423434 |
1 | 2 | Blus | 18 | 女 | 12123423544 |
2 | 3 | Blua | 18 | 女 | 12123423544 |
3 | 4 | Blux | 18 | 女 | 12123423544 |
# 再次插入sql
cursor.execute("insert into tb_person(name,age,sex,phone) values ('Blan',22,'女','2123423784')")
cursor.execute("insert into tb_person(name,age,sex,phone) values ('Blro',21,'女','2125453784')")
# 提交事务
conn.commit()
# 读取
pd.read_sql('select * from tb_person',conn)
| id | name | age | sex | phone |
---|
0 | 1 | Jack | 18 | 男 | 12123423434 |
1 | 2 | Blus | 18 | 女 | 12123423544 |
2 | 3 | Blua | 18 | 女 | 12123423544 |
3 | 4 | Blux | 18 | 女 | 12123423544 |
4 | 5 | Blan | 22 | 女 | 2123423784 |
5 | 6 | Blro | 21 | 女 | 2125453784 |
# SQLite的cursor.execute不能传参,无法写入参数
pd.read_sql("select * from tb_person where phone like '12%'",conn)
| id | name | age | sex | phone |
---|
0 | 1 | Jack | 18 | 男 | 12123423434 |
1 | 2 | Blus | 18 | 女 | 12123423544 |
2 | 3 | Blua | 18 | 女 | 12123423544 |
3 | 4 | Blux | 18 | 女 | 12123423544 |
df
| -0.017612\t14.053064\t0 |
---|
0 | -1.395634\t4.662541\t1 |
1 | -0.752157\t6.538620\t0 |
2 | -1.322371\t7.152853\t0 |
3 | 0.423363\t11.054677\t0 |
4 | 0.406704\t7.067335\t1 |
... | ... |
94 | 0.677983\t2.556666\t1 |
95 | 0.761349\t10.693862\t0 |
96 | -2.168791\t0.143632\t1 |
97 | 1.388610\t9.341997\t0 |
98 | 0.317029\t14.739025\t0 |
99 rows × 1 columns
# 将df数据写入到数据库中,命名为test_set
df.to_sql('test_set',conn)
# 读取
pd.read_sql('select * from test_set',conn)
| index | -0.017612\t14.053064\t0 |
---|
0 | 0 | -1.395634\t4.662541\t1 |
1 | 1 | -0.752157\t6.538620\t0 |
2 | 2 | -1.322371\t7.152853\t0 |
3 | 3 | 0.423363\t11.054677\t0 |
4 | 4 | 0.406704\t7.067335\t1 |
... | ... | ... |
94 | 94 | 0.677983\t2.556666\t1 |
95 | 95 | 0.761349\t10.693862\t0 |
96 | 96 | -2.168791\t0.143632\t1 |
97 | 97 | 1.388610\t9.341997\t0 |
98 | 98 | 0.317029\t14.739025\t0 |
99 rows × 2 columns
# 读取时指定索引列
pd.read_sql('select * from test_set',conn,index_col='index')
| -0.017612\t14.053064\t0 |
---|
index | |
---|
0 | -1.395634\t4.662541\t1 |
1 | -0.752157\t6.538620\t0 |
2 | -1.322371\t7.152853\t0 |
3 | 0.423363\t11.054677\t0 |
4 | 0.406704\t7.067335\t1 |
... | ... |
94 | 0.677983\t2.556666\t1 |
95 | 0.761349\t10.693862\t0 |
96 | -2.168791\t0.143632\t1 |
97 | 1.388610\t9.341997\t0 |
98 | 0.317029\t14.739025\t0 |
99 rows × 1 columns
评论列表(0条)