Using PCA to select among 16 features: let the number of retained features be K, requiring a cumulative contribution rate >= 90%

Assignment notes
# PCA method
# (2) Use PCA to select among the 16 features; let the number of features be K, requiring a cumulative contribution rate >= 90%

import numpy as np
import math
import cv2

def read_train_MNIST_dataset(filename=r"..\dataset\mnist.npz"):
    f = np.load(filename)
    x_train, y_train = f['x_train'], f['y_train']
    f.close()
    return (x_train, y_train)

def read_std_digit_images(src_dir):
    filenames_list =["0.png","1.png","2.png","3.png","4.png","5.png","6.png","7.png","8.png","9.png"]
    images_list=[None]*len(filenames_list)
    for idx,filename in enumerate(filenames_list):
        pathfilename = src_dir + "/" + filename   # "/" also works as a path separator on Windows
        img_data = cv2.imread(pathfilename,cv2.IMREAD_GRAYSCALE)
        images_list[idx] = img_data
    return images_list
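# A more portable sketch for building the path inside read_std_digit_images
# (my own variant, assuming "import os" at the top of the file):
#   pathfilename = os.path.join(src_dir, filename)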


# load the MNIST training set and the 10 standard digit templates
(x_train_data, y_train_data) = read_train_MNIST_dataset()
std_digits_imgs=read_std_digit_images(src_dir=r"../dataset/std_digits")

N = 5000                          # number of MNIST training samples to use
M = 16                            # number of features (one count per cell of a 4x4 grid)
GRID_LINES = int(math.sqrt(M))    # 4x4 grid over the 16x16 patch (each cell is 4x4 pixels)
threshold = 128                   # grey-level threshold separating foreground from background
ans_list = [0]*N                  # predicted digit for each sample
correct_num = 0                   # number of correct predictions
features_matrix = np.empty(shape=[0, M])
std_digits_features_matrix = np.empty(shape=[0, M])


# extract the 16 zoning features for each of the first N training images
for i in range(N):
    img_matrix = x_train_data[i][6:22, 6:22]   # central 16x16 patch of the 28x28 image
    features_vect1 = [0]*M
    for row in range(M):
        for col in range(M):
            features_idx1 = 4*(row//GRID_LINES) + (col//GRID_LINES)   # cell index in the 4x4 grid
            if img_matrix[row][col] >= threshold:                     # MNIST strokes are bright on a dark background
                features_vect1[features_idx1] += 1
    features_matrix = np.append(features_matrix, np.array(features_vect1).reshape(1, M), axis=0)

# extract the same 16 zoning features for the 10 standard digit templates
for z in range(10):
    features_vect2 = [0]*M   # count of dark (ink) pixels in each of the M cells
    img = std_digits_imgs[z][6:22, 6:22]
    for row_s in range(M):
        for col_s in range(M):
            features_idx2 = 4*(row_s//GRID_LINES) + (col_s//GRID_LINES)
            if img[row_s][col_s] < threshold:                         # templates are dark strokes on a light background
                features_vect2[features_idx2] += 1
    std_digits_features_matrix = np.append(std_digits_features_matrix, np.array(features_vect2).reshape(1, M), axis=0)
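# The two loops above can also be written more compactly with NumPy. A minimal
# vectorized sketch of the same 4x4 zoning features (my own helper, assuming a
# 16x16 uint8 patch and the thresholds used above):
def zoning_features(patch, foreground_is_bright=True, threshold=128, grid=4):
    # mark foreground pixels: bright strokes for MNIST, dark strokes for the templates
    mask = (patch >= threshold) if foreground_is_bright else (patch < threshold)
    # split the patch into a grid x grid arrangement of cells and count foreground pixels per cell
    cells = mask.reshape(grid, patch.shape[0]//grid, grid, patch.shape[1]//grid)
    return cells.sum(axis=(1, 3)).reshape(-1)   # length-16 feature vector, row-major cell order
# e.g. features_vect1 = zoning_features(x_train_data[i][6:22, 6:22])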


features_mean = np.mean(features_matrix, axis=0)   # for a 2-D array, axis=0 gives the column (per-feature) means
# center all samples: subtract the per-feature mean so that PCA measures variance about the mean
features_zeromean = features_matrix - features_mean
# rowvar=0: each row is a sample, each column is a variable
covar_matrix = np.cov(features_zeromean, rowvar=0)

e, Ev = np.linalg.eig(covar_matrix)   # e: eigenvalues, Ev: eigenvectors (one per column)
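# np.linalg.eigh is the variant intended for symmetric matrices such as a
# covariance matrix; a sketch of the equivalent call (note it returns the
# eigenvalues in ascending order, so they are flipped here):
#   e_asc, Ev_asc = np.linalg.eigh(covar_matrix)
#   e, Ev = e_asc[::-1], Ev_asc[:, ::-1]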

m = 12                                # reduce to m dimensions
idx_vect = np.argsort(-e)             # indices that sort the eigenvalues from largest to smallest
n_sorted_vect = idx_vect[0:m]         # keep the m largest eigenvalues
n_e = e[n_sorted_vect]
n_eigVector = Ev[:, n_sorted_vect]
LowDimData = np.mat(features_zeromean) * np.mat(n_eigVector)   # project the centered samples onto the m principal axes
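# To compare samples with the templates in the same subspace, the std-digit
# features would be centered with the same mean and projected onto the same
# basis (a sketch; LowDimStd is my own name and is not used later in this listing):
#   LowDimStd = np.mat(std_digits_features_matrix - features_mean) * np.mat(n_eigVector)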

print("(P27)前m=%d个特征值累积贡献率(即主成分保留信息量)=%.2f%%"%(m,100.0*sum(n_e)/sum(e)))

Original article: https://outofmemory.cn/langs/570186.html