学习来源:日撸 Java 三百行(51-60天,kNN 与 NB)_闵帆的博客——CSDN博客
一、基于M-distance的推荐
1.之前的KNN算法原理是基于K个(距离最近的K个)邻居对目标的种类进行投票来决定的。虽然KNN综合性能比较好,但在某些情况下(如样本不平衡的时候),KNN对稀有类别的预测准确度较低。
2.在这里对KNN进行改进,不再将K个距离最小的数据项作为邻居,而是以一个指定的距离范围(半径)作为指标:对于要预测的电影,把同一用户评过分、且平均分与该电影平均分之差小于半径的其他电影都作为邻居,再用该用户对这些邻居评分的平均值作为预测评分;如果没有邻居,则使用默认评分。
3.由于使用的是leave-one-out交叉验证的方式,在对某条评分进行预测的时候,要先从对应电影的平均分中去掉这条评分,再根据剩余的评分来预测。
package JavaDay12;
/**
* @author Ke-Xiong Wang
*
* @date 2022年5月4日
*
* 基于M-distance的推荐
*/
import java.io.*;
/**
 * Rating predictor based on M-distance, evaluated with leave-one-out.
 *
 * <p>Two items are neighbors when the absolute difference of their average
 * ratings is smaller than a radius. The rating of a (user, item) pair is
 * predicted as the mean of the ratings the same user gave to the neighbors of
 * that item.
 *
 * <p>The rating file holds one rating per line in the form
 * {@code user,item,rating} (integers, 0-based ids), sorted by user id in
 * non-decreasing order.
 */
public class M_distance {
    /** Prediction used when a rating has no neighbor at all. */
    public static final double DEFAULT_RATING = 3.0;

    /** Number of users. */
    private int numUsers;

    /** Number of items (movies). */
    private int numItems;

    /** Number of ratings. */
    private int numRatings;

    /** Predicted value for every rating, aligned with compressedRatingMatrix. */
    private double[] predictions;

    /** One row per rating: {user, item, rating}. */
    private int[][] compressedRatingMatrix;

    /** Number of ratings given by each user. */
    private int[] userDegrees;

    /** Average rating given by each user (NaN for a user without ratings). */
    private double[] userAverageRatings;

    /** Number of ratings received by each item. */
    private int[] itemDegrees;

    /** Average rating received by each item (NaN for an item without ratings). */
    private double[] itemAverageRatings;

    /**
     * Row index of the first rating of each user; entry numUsers is the total
     * number of ratings, so user u owns rows [start[u], start[u + 1]).
     */
    private int[] userStartingIndices;

    /** Number of ratings that had no neighbor in the last prediction run. */
    private int numNonNeighbors;

    /** Radius below which two item averages are considered neighbors. */
    private double radius;

    /**
     *************************
     * Read the rating file and build the compressed rating matrix together
     * with the per-user and per-item statistics.
     *
     * @param paraRatingFilename path of the rating file
     * @param paraNumUsers       number of users
     * @param paraNumItems       number of items
     * @param paraNumRatings     number of ratings expected in the file
     * @throws FileNotFoundException    if the rating file does not exist
     * @throws IllegalArgumentException if a line is malformed, an id is out of
     *                                  range, the lines are not sorted by user,
     *                                  or the number of ratings does not match
     * @throws Exception                on any other I/O failure
     *************************
     */
    public M_distance(String paraRatingFilename, int paraNumUsers, int paraNumItems, int paraNumRatings)
            throws Exception {
        numItems = paraNumItems;
        numUsers = paraNumUsers;
        numRatings = paraNumRatings;

        userDegrees = new int[numUsers];
        userStartingIndices = new int[numUsers + 1];
        userAverageRatings = new double[numUsers];
        itemDegrees = new int[numItems];
        compressedRatingMatrix = new int[numRatings][3];
        itemAverageRatings = new double[numItems];
        predictions = new double[numRatings];

        System.out.println("Reading " + paraRatingFilename);

        File tempFile = new File(paraRatingFilename);
        if (!tempFile.exists()) {
            // Report the problem to the caller instead of terminating the JVM.
            throw new FileNotFoundException("File " + paraRatingFilename + " does not exist.");
        } // Of if

        double[] tempUserTotalScore = new double[numUsers];
        double[] tempItemTotalScore = new double[numItems];
        int tempIndex = 0;

        // try-with-resources closes the reader even when a line fails to parse.
        try (BufferedReader tempBufReader = new BufferedReader(new FileReader(tempFile))) {
            String tempString;
            while ((tempString = tempBufReader.readLine()) != null) {
                // Tolerate blank lines, e.g. a trailing newline at the end of the file.
                if (tempString.trim().isEmpty()) {
                    continue;
                } // Of if

                if (tempIndex >= numRatings) {
                    throw new IllegalArgumentException(
                            "File " + paraRatingFilename + " contains more than " + numRatings + " ratings.");
                } // Of if

                // Each line has three values: user, item, rating.
                String[] tempStrArray = tempString.split(",");
                if (tempStrArray.length < 3) {
                    throw new IllegalArgumentException("Malformed rating line: " + tempString);
                } // Of if
                int tempUser = Integer.parseInt(tempStrArray[0].trim());
                int tempItem = Integer.parseInt(tempStrArray[1].trim());
                int tempRating = Integer.parseInt(tempStrArray[2].trim());

                if (tempUser < 0 || tempUser >= numUsers || tempItem < 0 || tempItem >= numItems) {
                    throw new IllegalArgumentException("User or item id out of range: " + tempString);
                } // Of if

                // The per-user row ranges are only valid when the rows are grouped by ascending user id.
                if (tempIndex > 0 && tempUser < compressedRatingMatrix[tempIndex - 1][0]) {
                    throw new IllegalArgumentException("Ratings are not sorted by user id: " + tempString);
                } // Of if

                compressedRatingMatrix[tempIndex][0] = tempUser;
                compressedRatingMatrix[tempIndex][1] = tempItem;
                compressedRatingMatrix[tempIndex][2] = tempRating;

                userDegrees[tempUser]++;
                itemDegrees[tempItem]++;
                tempUserTotalScore[tempUser] += tempRating;
                tempItemTotalScore[tempItem] += tempRating;

                tempIndex++;
            } // Of while
        } // Of try

        if (tempIndex != numRatings) {
            throw new IllegalArgumentException(
                    "Expected " + numRatings + " ratings but found " + tempIndex + " in " + paraRatingFilename);
        } // Of if

        // Prefix sum of the degrees. Unlike recording the index at which a new
        // user id shows up, this also gives a correct (empty) range to users
        // without any rating and keeps the range of the preceding user intact.
        userStartingIndices[0] = 0;
        for (int i = 0; i < numUsers; i++) {
            userStartingIndices[i + 1] = userStartingIndices[i] + userDegrees[i];
        } // Of for i

        for (int i = 0; i < numUsers; i++) {
            userAverageRatings[i] = tempUserTotalScore[i] / userDegrees[i];
        } // Of for i

        for (int i = 0; i < numItems; i++) {
            itemAverageRatings[i] = tempItemTotalScore[i] / itemDegrees[i];
        } // Of for i
    }// Of M_distance

    /**
     *************************
     * Set the radius used to decide whether two items are neighbors.
     *
     * @param paraRadius the radius; non-positive values are replaced by 0.1
     *************************
     */
    public void setRadius(double paraRadius) {
        if (paraRadius > 0) {
            radius = paraRadius;
        } else {
            radius = 0.1;
        } // Of if
    }// Of setRadius

    /**
     *************************
     * Predict every rating with leave-one-out: the rating under test is
     * removed from the average of its item, then the prediction is the mean
     * of the ratings the same user gave to items whose average lies within
     * the radius of that adjusted average. Ratings without any neighbor get
     * DEFAULT_RATING and are counted in numNonNeighbors.
     *************************
     */
    public void leaveOneOutPrediction() {
        System.out.println("\r\nLeaveOneOutPrediction for radius " + radius);

        numNonNeighbors = 0;
        for (int i = 0; i < numRatings; i++) {
            int tempUser = compressedRatingMatrix[i][0];
            int tempItem = compressedRatingMatrix[i][1];
            int tempRating = compressedRatingMatrix[i][2];

            int tempNeighbors = 0;
            double tempTotal = 0;

            // An item whose only rating is the one left out has no average,
            // hence no neighbors; skip the search instead of dividing by zero.
            if (itemDegrees[tempItem] > 1) {
                // Average of the item without the rating being predicted.
                double tempItemAverageRating = (itemAverageRatings[tempItem] * itemDegrees[tempItem] - tempRating)
                        / (itemDegrees[tempItem] - 1);

                // Scan the other items rated by the same user.
                for (int j = userStartingIndices[tempUser]; j < userStartingIndices[tempUser + 1]; j++) {
                    int tempComparedItem = compressedRatingMatrix[j][1];
                    if (tempItem == tempComparedItem) {
                        continue;
                    } // Of if

                    if (Math.abs(tempItemAverageRating - itemAverageRatings[tempComparedItem]) < radius) {
                        tempTotal += compressedRatingMatrix[j][2];
                        tempNeighbors++;
                    } // Of if
                } // Of for j
            } // Of if

            // The mean rating of the neighbors is the prediction.
            if (tempNeighbors > 0) {
                predictions[i] = tempTotal / tempNeighbors;
            } else {
                predictions[i] = DEFAULT_RATING;
                numNonNeighbors++;
            } // Of if
        } // Of for i
    }// Of leaveOneOutPrediction

    /**
     *************************
     * Compute the mean absolute error of the current predictions.
     *
     * @return the average of |prediction - rating| over all ratings
     *************************
     */
    public double computeMAE() throws Exception {
        double tempTotalError = 0;
        for (int i = 0; i < predictions.length; i++) {
            tempTotalError += Math.abs(predictions[i] - compressedRatingMatrix[i][2]);
        } // Of for i

        return tempTotalError / predictions.length;
    }// Of computeMAE

    /**
     *************************
     * Compute the root mean square error of the current predictions.
     *
     * @return the square root of the average squared prediction error
     *************************
     */
    public double computeRSME() throws Exception {
        double tempTotalError = 0;
        for (int i = 0; i < predictions.length; i++) {
            double tempError = predictions[i] - compressedRatingMatrix[i][2];
            tempTotalError += tempError * tempError;
        } // Of for i

        double tempAverage = tempTotalError / predictions.length;
        return Math.sqrt(tempAverage);
    }// Of computeRSME

    /**
     *************************
     * Entry point: evaluate the predictor for the radii 0.2, 0.3, 0.4 and 0.5.
     *
     * @param args not used
     *************************
     */
    public static void main(String[] args) {
        try {
            M_distance tempRecommender = new M_distance("D:/movielens-943u1682m.txt", 943, 1682, 100000);

            // Integer loop counter: repeatedly adding 0.1 to a double drifts
            // (0.30000000000000004, ...) and makes the end condition fragile.
            for (int tempStep = 2; tempStep < 6; tempStep++) {
                double tempRadius = tempStep / 10.0;
                tempRecommender.setRadius(tempRadius);

                tempRecommender.leaveOneOutPrediction();
                double tempMAE = tempRecommender.computeMAE();
                double tempRSME = tempRecommender.computeRSME();

                System.out.println("Radius = " + tempRadius + ", MAE = " + tempMAE + ", RSME = " + tempRSME
                        + ", numNonNeighbors = " + tempRecommender.numNonNeighbors);
            } // Of for tempStep
        } catch (Exception ee) {
            System.out.println(ee);
        } // Of try
    }// Of main
}// Of class M_distance
运行结果
二、KMeans聚类算法
1.数据使用的是KNN中的iris.arff,因为是根据距离来聚类,所以第五个属性(类别标签)没有用到。
2.和M-distance相似也是根据算距离来分类。
3.不断重复①选取新的中心点②根据距离分类这两个操作,直到各点的分类结果不再变动,此时分类完成。
package JavaDay12;
/**
* @author Kexiong Wang
*
* @date 2022年5月4日
*
* KMeans聚类算法
*/
import weka.core.Instances;
import java.io.FileReader;
import java.util.Arrays;
import java.util.Random;
/**
 * K-means clustering over a Weka dataset.
 *
 * <p>All loops use {@code dataset.numAttributes() - 1} attributes, i.e. the
 * last attribute is left out of the distance and center computations
 * (presumably it is the class label -- confirm for other datasets).
 */
public class KMeans {
    /** Manhattan distance. */
    public static final int MANHATTAN = 0;

    /** Euclidean distance. */
    public static final int EUCLIDEAN = 1;

    /** The distance measure in use, MANHATTAN or EUCLIDEAN. */
    public int distanceMeasure = EUCLIDEAN;

    /** Shared random number generator. */
    public static final Random random = new Random();

    /** The data. */
    Instances dataset;

    /** Number of clusters. */
    int numClusters = 2;

    /** Clustering result: clusters[c] holds the instance indices of cluster c. */
    int[][] clusters;

    /**
     *******************************
     * Load the dataset from a file. On failure a message is printed and the
     * JVM exits.
     *
     * @param paraFilename the data file
     *******************************
     */
    public KMeans(String paraFilename) {
        dataset = null;
        // try-with-resources closes the reader even when parsing fails.
        try (FileReader fileReader = new FileReader(paraFilename)) {
            dataset = new Instances(fileReader);
        } catch (Exception ee) {
            System.out.println("Cannot read the file: " + paraFilename + "\r\n" + ee);
            System.exit(0);
        } // Of try
    }// Of KMeans

    /**
     *******************************
     * Set the number of clusters.
     *
     * @param paraNumClusters the number of clusters
     *******************************
     */
    public void setNumClusters(int paraNumClusters) {
        numClusters = paraNumClusters;
    }// Of the setter

    /**
     *********************
     * Build a random permutation of 0 .. paraLength - 1.
     *
     * @param paraLength length of the sequence
     * @return the indices in random order
     *********************
     */
    public static int[] getRandomIndices(int paraLength) {
        int[] resultIndices = new int[paraLength];
        for (int i = 0; i < paraLength; i++) {
            resultIndices[i] = i;
        } // Of for i

        // Fisher-Yates shuffle: every permutation is equally likely, which
        // swapping randomly chosen pairs does not guarantee.
        for (int i = paraLength - 1; i > 0; i--) {
            int tempOther = random.nextInt(i + 1);
            int tempValue = resultIndices[i];
            resultIndices[i] = resultIndices[tempOther];
            resultIndices[tempOther] = tempValue;
        } // Of for i

        return resultIndices;
    }// Of getRandomIndices

    /**
     *********************
     * Distance between an instance and a point given as an array.
     *
     * <p>For EUCLIDEAN the squared distance is returned (no square root is
     * taken); the ordering of distances is the same. For an unsupported
     * measure a message is printed and 0 is returned.
     *
     * @param paraI     index of the instance in the dataset
     * @param paraArray coordinates of the other point
     * @return the distance
     *********************
     */
    public double distance(int paraI, double[] paraArray) {
        // Must be a double: with an int accumulator the compound assignments
        // below silently truncate every partial sum.
        double resultDistance = 0;
        double tempDifference;
        switch (distanceMeasure) {
        case MANHATTAN:
            for (int i = 0; i < dataset.numAttributes() - 1; i++) {
                tempDifference = dataset.instance(paraI).value(i) - paraArray[i];
                resultDistance += Math.abs(tempDifference);
            } // Of for i
            break;
        case EUCLIDEAN:
            for (int i = 0; i < dataset.numAttributes() - 1; i++) {
                tempDifference = dataset.instance(paraI).value(i) - paraArray[i];
                resultDistance += tempDifference * tempDifference;
            } // Of for i
            break;
        default:
            System.out.println("Unsupported distance measure: " + distanceMeasure);
        }// Of switch

        return resultDistance;
    }// Of distance

    /**
     *******************************
     * Run k-means: pick numClusters random instances as initial centers, then
     * alternate between assigning every instance to its nearest center and
     * moving every center to the mean of its instances, until the assignment
     * no longer changes. The result is stored in clusters.
     *
     * @throws IllegalStateException if numClusters is smaller than 1 or
     *                               larger than the number of instances
     *******************************
     */
    public void clustering() {
        int tempNumInstances = dataset.numInstances();
        int tempNumFeatures = dataset.numAttributes() - 1;
        if (numClusters < 1 || numClusters > tempNumInstances) {
            throw new IllegalStateException(
                    "numClusters must be between 1 and " + tempNumInstances + " but is " + numClusters);
        } // Of if

        // Make the two arrays differ so that the loop body runs at least once.
        int[] tempOldClusterArray = new int[tempNumInstances];
        tempOldClusterArray[0] = -1;
        int[] tempClusterArray = new int[tempNumInstances];
        double[][] tempCenters = new double[numClusters][tempNumFeatures];

        // Initialize the centers with randomly chosen instances.
        int[] tempRandomOrders = getRandomIndices(tempNumInstances);
        for (int i = 0; i < numClusters; i++) {
            for (int j = 0; j < tempNumFeatures; j++) {
                tempCenters[i][j] = dataset.instance(tempRandomOrders[i]).value(j);
            } // Of for j
        } // Of for i

        int[] tempClusterLengths = null;
        while (!Arrays.equals(tempOldClusterArray, tempClusterArray)) {
            System.out.println("New loop ...");
            tempOldClusterArray = tempClusterArray;
            tempClusterArray = new int[tempNumInstances];

            // Attach every instance to its nearest center.
            for (int i = 0; i < tempNumInstances; i++) {
                int tempNearestCenter = -1;
                double tempNearestDistance = Double.MAX_VALUE;

                for (int j = 0; j < numClusters; j++) {
                    double tempDistance = distance(i, tempCenters[j]);
                    if (tempNearestDistance > tempDistance) {
                        tempNearestDistance = tempDistance;
                        tempNearestCenter = j;
                    } // Of if
                } // Of for j
                tempClusterArray[i] = tempNearestCenter;
            } // Of for i

            // Sum the coordinates of the members of every cluster.
            tempClusterLengths = new int[numClusters];
            double[][] tempNewCenters = new double[numClusters][tempNumFeatures];
            for (int i = 0; i < tempNumInstances; i++) {
                for (int j = 0; j < tempNumFeatures; j++) {
                    tempNewCenters[tempClusterArray[i]][j] += dataset.instance(i).value(j);
                } // Of for j
                tempClusterLengths[tempClusterArray[i]]++;
            } // Of for i

            // The mean of the members is the new center.
            for (int i = 0; i < numClusters; i++) {
                if (tempClusterLengths[i] == 0) {
                    // An empty cluster has no mean; keep its previous center
                    // rather than dividing by zero and ending up with NaN.
                    tempNewCenters[i] = tempCenters[i];
                    continue;
                } // Of if
                for (int j = 0; j < tempNumFeatures; j++) {
                    tempNewCenters[i][j] /= tempClusterLengths[i];
                } // Of for j
            } // Of for i

            System.out.println("Now the new centers are: " + Arrays.deepToString(tempNewCenters));
            tempCenters = tempNewCenters;
        } // Of while

        // Convert the per-instance labels into per-cluster index lists.
        clusters = new int[numClusters][];
        int[] tempCounters = new int[numClusters];
        for (int i = 0; i < numClusters; i++) {
            clusters[i] = new int[tempClusterLengths[i]];
        } // Of for i

        for (int i = 0; i < tempClusterArray.length; i++) {
            clusters[tempClusterArray[i]][tempCounters[tempClusterArray[i]]] = i;
            tempCounters[tempClusterArray[i]]++;
        } // Of for i

        System.out.println("The clusters are: " + Arrays.deepToString(clusters));
    }// Of clustering

    /**
     *******************************
     * Test the clustering with three clusters.
     *******************************
     */
    public static void testClustering() {
        KMeans tempKMeans = new KMeans("D:/iris.arff");
        tempKMeans.setNumClusters(3);
        tempKMeans.clustering();
    }// Of testClustering

    /**
     *************************
     * Entry point.
     *
     * @param args not used
     *************************
     */
    public static void main(String args[]) {
        testClustering();
    }// Of main
}// Of class KMeans
运行结果
这里设置分簇的数量为3,所以最后的clusters[][]数组有三行,每一行的元素为被分为一类的点。
欢迎分享,转载请注明来源:内存溢出
评论列表(0条)