虽然很多时候一些模块内置的数据集使用起来非常方便,搭建模型、做实例项目都非常快,但是这种形式的数据建模本质上并不通用。我们使用模型大多是为了解决现实生活中的实际问题,在不同的应用场景和业务问题中都会遇上各式各样的数据集,没有办法像那些使用起来很方便的内置数据集一样直接导入使用。这时候我们就要把问题的处理过程抽象成一个标准的数据流程。抽象出标准流程的好处是,之后任何问题我们都可以朝着标准流程的形式进行转化,从而将一个复杂的问题标准化。就像很多时候搭建模型一样,明确需要定义的部分就是输入层的shape,这跟这里的标准数据流程是一个性质的。
背景闲话就说这些,接下来回到正题,本文主要的内容是将经典的手写数字识别数据集转化为标准的图像进行存储,之后建模的数据加载会选择从本地加载图像数据的形式进行实施,这里主要是为了数据流程的标准化,本质上你不需要转化也可以直接使用mnist数据集,远比加载图像要快得多。
二进制形式的mnist数据集地址为 http://yann.lecun.com/exdb/mnist ,首页截图如下所示:
数据集详细介绍如下:
There are 4 files:
train-images-idx3-ubyte: training set images
train-labels-idx1-ubyte: training set labels
t10k-images-idx3-ubyte: test set images
t10k-labels-idx1-ubyte: test set labels
The training set contains 60000 examples, and the test set 10000 examples.
The first 5000 examples of the test set are taken from the original NIST training set. The last 5000 are taken from the original NIST test set. The first 5000 are cleaner and easier than the last 5000.
TRAINING SET LABEL FILE (train-labels-idx1-ubyte):
[offset] [type] [value] [description]
0000 32 bit integer 0x00000801(2049) magic number (MSB first)
0004 32 bit integer 60000 number of items
0008 unsigned byte ?? label
0009 unsigned byte ?? label
........
xxxx unsigned byte ?? label
The labels values are 0 to 9.
TRAINING SET IMAGE FILE (train-images-idx3-ubyte):
[offset] [type] [value] [description]
0000 32 bit integer 0x00000803(2051) magic number
0004 32 bit integer 60000 number of images
0008 32 bit integer 28 number of rows
0012 32 bit integer 28 number of columns
0016 unsigned byte ?? pixel
0017 unsigned byte ?? pixel
........
xxxx unsigned byte ?? pixel
Pixels are organized row-wise. Pixel values are 0 to 255. 0 means background (white), 255 means foreground (black).
TEST SET LABEL FILE (t10k-labels-idx1-ubyte):
[offset] [type] [value] [description]
0000 32 bit integer 0x00000801(2049) magic number (MSB first)
0004 32 bit integer 10000 number of items
0008 unsigned byte ?? label
0009 unsigned byte ?? label
........
xxxx unsigned byte ?? label
The labels values are 0 to 9.
TEST SET IMAGE FILE (t10k-images-idx3-ubyte):
[offset] [type] [value] [description]
0000 32 bit integer 0x00000803(2051) magic number
0004 32 bit integer 10000 number of images
0008 32 bit integer 28 number of rows
0012 32 bit integer 28 number of columns
0016 unsigned byte ?? pixel
0017 unsigned byte ?? pixel
........
xxxx unsigned byte ?? pixel
Pixels are organized row-wise. Pixel values are 0 to 255. 0 means background (white), 255 means foreground (black).
需要的可以自行下载使用。
Python转化解析代码整体实现如下所示:
#!/usr/bin/env python
# encoding:utf-8
from __future__ import division
"""
__Author__:沂水寒城
功能: tensorflow 二进制 Mnist 数据集处理
数据集下载地址为:
http://yann.lecun.com/exdb/mnist
"""
import os
import time
import json
import numpy as np
import struct
from PIL import Image
# Default locations of the four MNIST IDX files, relative to the working
# directory. NOTE(review): the official archive names use a dash before
# "idx" (e.g. train-images-idx3-ubyte); these paths use a dot -- confirm
# they match how the files were extracted locally.
(
    train_images_idx3_ubyte_file,  # training set images
    train_labels_idx1_ubyte_file,  # training set labels
    test_images_idx3_ubyte_file,  # test set images
    test_labels_idx1_ubyte_file,  # test set labels
) = (
    "./mnist/train-images.idx3-ubyte",
    "./mnist/train-labels.idx1-ubyte",
    "./mnist/t10k-images.idx3-ubyte",
    "./mnist/t10k-labels.idx1-ubyte",
)
def decode_idx3_ubyte(idx3_ubyte_file):
    """
    Parse an IDX3 (image) file into a NumPy array.

    The file starts with four big-endian 32-bit integers (magic number,
    image count, rows, columns) followed by one unsigned byte per pixel.

    :param idx3_ubyte_file: path of the IDX3 file
    :return: float array of shape (num_images, num_rows, num_cols) holding
        the raw pixel values
    :raises struct.error: if the file is shorter than its header announces
    """
    # Read the whole file; the context manager closes the handle right away
    # instead of leaving it to the garbage collector.
    with open(idx3_ubyte_file, "rb") as idx_file:
        bin_data = idx_file.read()

    # Header: magic number, image count, image height, image width.
    header = struct.Struct(">iiii")
    magic_number, num_images, num_rows, num_cols = header.unpack_from(bin_data, 0)
    print(
        "魔数:%d, 图片数量: %d张, 图片大小: %d*%d"
        % (magic_number, num_images, num_rows, num_cols)
    )

    # Every image has the same layout, so compile its format once instead of
    # re-parsing the format string on each iteration.
    image_size = num_rows * num_cols
    image_struct = struct.Struct(">" + str(image_size) + "B")
    offset = header.size

    images = np.empty((num_images, num_rows, num_cols))
    for i in range(num_images):
        # Progress message every 10000 images.
        if (i + 1) % 10000 == 0:
            print("已解析 %d" % (i + 1) + "张")
        images[i] = np.array(image_struct.unpack_from(bin_data, offset)).reshape(
            (num_rows, num_cols)
        )
        offset += image_struct.size
    return images
def decode_idx1_ubyte(idx1_ubyte_file):
    """
    Parse an IDX1 (label) file into a NumPy array.

    The file starts with two big-endian 32-bit integers (magic number,
    item count) followed by one unsigned byte per label.

    :param idx1_ubyte_file: path of the IDX1 file
    :return: 1-D float array of length num_items holding the label values
    :raises struct.error: if the file is shorter than its header announces
    """
    # Read the whole file; the context manager closes the handle right away
    # instead of leaving it to the garbage collector.
    with open(idx1_ubyte_file, "rb") as idx_file:
        bin_data = idx_file.read()

    # Header: magic number and number of labels.
    header = struct.Struct(">ii")
    magic_number, num_images = header.unpack_from(bin_data, 0)
    print("魔数:%d, 图片数量: %d张" % (magic_number, num_images))

    # Each label is a single unsigned byte; compile the format once.
    label_struct = struct.Struct(">B")
    offset = header.size

    labels = np.empty(num_images)
    for i in range(num_images):
        # Progress message every 10000 labels.
        if (i + 1) % 10000 == 0:
            print("已解析 %d" % (i + 1) + "张")
        labels[i] = label_struct.unpack_from(bin_data, offset)[0]
        offset += label_struct.size
    return labels
def load_train_images(idx_ubyte_file=train_images_idx3_ubyte_file):
    """
    Load the training images by decoding an IDX3 file.

    :param idx_ubyte_file: path of the IDX3 file (defaults to the training
        image file configured at module level)
    :return: np.array of shape (n, rows, cols), n being the number of images
    """
    train_images = decode_idx3_ubyte(idx_ubyte_file)
    return train_images
def load_train_labels(idx_ubyte_file=train_labels_idx1_ubyte_file):
    """
    Load the training labels by decoding an IDX1 file.

    :param idx_ubyte_file: path of the IDX1 file (defaults to the training
        label file configured at module level)
    :return: 1-D np.array with one label per image
    """
    train_labels = decode_idx1_ubyte(idx_ubyte_file)
    return train_labels
def load_test_images(idx_ubyte_file=test_images_idx3_ubyte_file):
    """
    Load the test images by decoding an IDX3 file.

    :param idx_ubyte_file: path of the IDX3 file (defaults to the test
        image file configured at module level)
    :return: np.array of shape (n, rows, cols), n being the number of images
    """
    test_images = decode_idx3_ubyte(idx_ubyte_file)
    return test_images
def load_test_labels(idx_ubyte_file=test_labels_idx1_ubyte_file):
    """
    Load the test labels by decoding an IDX1 file.

    :param idx_ubyte_file: path of the IDX1 file (defaults to the test
        label file configured at module level)
    :return: 1-D np.array with one label per image
    """
    test_labels = decode_idx1_ubyte(idx_ubyte_file)
    return test_labels
def _save_split_as_images(images, labels, split_dir):
    """
    Write every image of one dataset split to ``split_dir/<label>/<k>.jpg``.

    :param images: sequence of 2-D pixel arrays
    :param labels: sequence of labels, indexed in parallel with ``images``
    :param split_dir: directory that receives one sub-directory per label
    """
    # Next file index per label directory. It is seeded once from the number
    # of entries already present, so numbering continues after existing files
    # without listing the directory again for every single image.
    next_index = {}
    for i, one_img in enumerate(images):
        one_label = labels[i]
        print("one_label: ", one_label)
        label_dir = os.path.join(split_dir, str(int(one_label)))
        if label_dir not in next_index:
            if not os.path.exists(label_dir):
                os.makedirs(label_dir)
            next_index[label_dir] = len(os.listdir(label_dir))
        print("one_img_shape: ", one_img.shape)
        one_pic = Image.fromarray(np.uint8(one_img))
        one_pic.save(os.path.join(label_dir, str(next_index[label_dir]) + ".jpg"))
        next_index[label_dir] += 1


def parseMnist2Img(resDir="binary-mnist/"):
    """
    Convert the binary MNIST dataset into JPEG files on disk.

    Images are written to ``<resDir>/train/<label>/<k>.jpg`` and
    ``<resDir>/test/<label>/<k>.jpg``; missing directories are created.

    :param resDir: output root directory, with or without a trailing slash
    """
    train_images = load_train_images(idx_ubyte_file=train_images_idx3_ubyte_file)
    train_labels = load_train_labels(idx_ubyte_file=train_labels_idx1_ubyte_file)
    test_images = load_test_images(idx_ubyte_file=test_images_idx3_ubyte_file)
    test_labels = load_test_labels(idx_ubyte_file=test_labels_idx1_ubyte_file)

    # os.path.join keeps the path correct even if resDir has no trailing
    # separator (plain concatenation would yield e.g. "binary-mnisttrain/").
    print("train_images_nums: ", len(train_images))
    _save_split_as_images(train_images, train_labels, os.path.join(resDir, "train"))

    print("test_images_nums: ", len(test_images))
    _save_split_as_images(test_images, test_labels, os.path.join(resDir, "test"))
# Script entry point: print a start banner, then convert the dataset into
# image files under binary-mnist/.
if __name__ == "__main__":
    print("=========================================Loading binaryHandle===========================================")
    parseMnist2Img("binary-mnist/")
运行会自动创建binary-mnist目录,以及其下级目录train和test,以train为例,结果截图如下所示:
欢迎分享,转载请注明来源:内存溢出
评论列表(0条)