JavaDay12

JavaDay12,第1张

学习来源:日撸 Java 三百行(51-60天,kNN 与 NB)_闵帆的博客——CSDN博客

一、基于M-distance的推荐

1.之前的KNN算法原理是基于K个(距离最近的K个)邻居对目标的种类进行投票来决定的。虽然KNN综合性能比较好,但在某些情况下如样本不平衡的时候,KNN对稀有类别的预测准确度较低。

2.在这里对KNN进行改进,不再将距离最小的K个数据项作为邻居,而是指定一个距离范围(半径),将所有落在该范围内的数据项作为邻居,再根据这些邻居的评分(取平均值)来预测目标的评分。

3.由于使用的是leave-one-out交叉验证的方式,在对某项进行预测的时候要减去该项目的评分,根据剩余的评分来预测。

package JavaDay12;

/**
 * @author Ke-Xiong Wang
 *
 * @date 2022年5月4日
 *
 * 基于M-distance的推荐
 */

import java.io.*;

public class M_distance {
    /** Fallback prediction used when a rating has no neighbors (the original notes say ratings range from 1 to 5). */
    public static final double DEFAULT_RATING = 3.0;

    /** Number of users. */
    private int numUsers;

    /** Number of items (movies). */
    private int numItems;

    /** Total number of ratings. */
    private int numRatings;

    /** Leave-one-out prediction for every rating, indexed like {@link #compressedRatingMatrix}. */
    private double[] predictions;

    /** Compressed rating matrix: one row per rating holding {user, item, rating}. */
    private int[][] compressedRatingMatrix;

    /** Number of items rated by each user. */
    private int[] userDegrees;

    /** Average rating given by each user (NaN for a user without ratings). */
    private double[] userAverageRatings;

    /** Number of users who rated each item. */
    private int[] itemDegrees;

    /** Average rating received by each item (NaN for an item without ratings). */
    private double[] itemAverageRatings;

    /**
     * Row index of each user's first rating in {@link #compressedRatingMatrix}.
     * Entry {@code numUsers} equals {@code numRatings}, so the ratings of user
     * {@code u} occupy rows {@code [userStartingIndices[u], userStartingIndices[u + 1])}.
     */
    private int[] userStartingIndices;

    /** Number of ratings for which the last prediction run found no neighbor. */
    private int numNonNeighbors;

    /** Two items are neighbors when their average ratings differ by less than this value. */
    private double radius;

    /**
     *************************
     * Reads the rating file and builds the compressed rating matrix together
     * with the per-user and per-item statistics.
     * <p>
     * Every non-blank line must have the form {@code user,item,rating} (all
     * integers, zero-based ids), and lines must be grouped by ascending user id
     * because each user's ratings are later addressed as one contiguous row range.
     * If the file does not exist, a message is printed and the JVM exits.
     *
     * @param paraRatingFilename  name of the rating file
     * @param paraNumUsers  number of users
     * @param paraNumItems  number of items
     * @param paraNumRatings  number of ratings contained in the file
     * @throws IllegalArgumentException if a line is malformed, an id is out of
     *         range, the lines are not sorted by user, or the number of ratings
     *         differs from {@code paraNumRatings}
     * @throws Exception if the file cannot be read
     *************************
     */
    public M_distance(String paraRatingFilename, int paraNumUsers, int paraNumItems, int paraNumRatings) throws Exception {
        numItems = paraNumItems;
        numUsers = paraNumUsers;
        numRatings = paraNumRatings;

        userDegrees = new int[numUsers];
        userStartingIndices = new int[numUsers + 1];
        userAverageRatings = new double[numUsers];
        itemDegrees = new int[numItems];
        compressedRatingMatrix = new int[numRatings][3];
        itemAverageRatings = new double[numItems];

        predictions = new double[numRatings];

        System.out.println("Reading " + paraRatingFilename);

        File tempFile = new File(paraRatingFilename);
        if (!tempFile.exists()) {
            System.out.println("File " + paraRatingFilename + " does not exists.");
            System.exit(0);
        }//Of if

        double[] tempUserTotalScore = new double[numUsers];
        double[] tempItemTotalScore = new double[numItems];
        int tempIndex = 0;
        int tempPreviousUser = -1;

        // try-with-resources closes the reader even when a line fails to parse.
        try (BufferedReader tempBufReader = new BufferedReader(new FileReader(tempFile))) {
            String tempString;
            while ((tempString = tempBufReader.readLine()) != null) {
                // Tolerate blank lines (e.g. a trailing empty line).
                if (tempString.trim().isEmpty()) {
                    continue;
                }//Of if

                if (tempIndex >= numRatings) {
                    throw new IllegalArgumentException(paraRatingFilename
                            + " contains more than the declared " + numRatings + " ratings.");
                }//Of if

                // Each line holds three values: user, item, rating.
                String[] tempStrArray = tempString.split(",");
                if (tempStrArray.length < 3) {
                    throw new IllegalArgumentException("Malformed rating line: " + tempString);
                }//Of if
                int tempUser = Integer.parseInt(tempStrArray[0].trim());
                int tempItem = Integer.parseInt(tempStrArray[1].trim());
                int tempRating = Integer.parseInt(tempStrArray[2].trim());

                if (tempUser < 0 || tempUser >= numUsers || tempItem < 0 || tempItem >= numItems) {
                    throw new IllegalArgumentException("User or item index out of range: " + tempString);
                }//Of if
                if (tempUser < tempPreviousUser) {
                    throw new IllegalArgumentException("Ratings must be sorted by user: " + tempString);
                }//Of if
                tempPreviousUser = tempUser;

                compressedRatingMatrix[tempIndex][0] = tempUser;
                compressedRatingMatrix[tempIndex][1] = tempItem;
                compressedRatingMatrix[tempIndex][2] = tempRating;

                userDegrees[tempUser]++;
                itemDegrees[tempItem]++;
                tempUserTotalScore[tempUser] += tempRating;
                tempItemTotalScore[tempItem] += tempRating;

                tempIndex++;
            }//Of while
        }//Of try

        if (tempIndex != numRatings) {
            throw new IllegalArgumentException("Expected " + numRatings + " ratings but "
                    + paraRatingFilename + " contains " + tempIndex + ".");
        }//Of if

        // Prefix sums of the degrees give every user's first row, including
        // users without any rating (their range is simply empty).
        userStartingIndices[0] = 0;
        for (int i = 0; i < numUsers; i++) {
            userStartingIndices[i + 1] = userStartingIndices[i] + userDegrees[i];
        }//Of for i

        // A degree of 0 yields 0.0 / 0 = NaN, i.e. "no average available".
        for (int i = 0; i < numUsers; i++) {
            userAverageRatings[i] = tempUserTotalScore[i] / userDegrees[i];
        }//Of for i
        for (int i = 0; i < numItems; i++) {
            itemAverageRatings[i] = tempItemTotalScore[i] / itemDegrees[i];
        }//Of for i
    }//Of M_distance

    /**
     *************************
     * Sets the neighborhood radius.
     *
     * @param paraRadius  the radius; values that are not positive are replaced by 0.1
     *************************
     */
    public void setRadius(double paraRadius) {
        if (paraRadius > 0) {
            radius = paraRadius;
        } else {
            radius = 0.1;
        }//Of if
    }//Of setRadius

    /**
     *************************
     * Predicts every rating with leave-one-out cross validation.
     * <p>
     * For each rating, the item's average is recomputed without that rating.
     * The other items rated by the same user whose average lies within
     * {@code radius} of it are the neighbors, and the prediction is the mean of
     * the user's ratings on those neighbors. Without neighbors the prediction
     * is {@link #DEFAULT_RATING} and {@code numNonNeighbors} is incremented.
     *************************
     */
    public void leaveOneOutPrediction() {
        double tempItemAverageRating;
        int tempUser, tempItem, tempRating;
        System.out.println("\r\nLeaveOneOutPrediction for radius " + radius);

        numNonNeighbors = 0;
        for (int i = 0; i < numRatings; i++) {
            tempUser = compressedRatingMatrix[i][0];
            tempItem = compressedRatingMatrix[i][1];
            tempRating = compressedRatingMatrix[i][2];

            // Removing the only rating of an item leaves no average to compare
            // with, so no neighbor can be found.
            if (itemDegrees[tempItem] <= 1) {
                predictions[i] = DEFAULT_RATING;
                numNonNeighbors++;
                continue;
            }//Of if

            // Average rating of the current item without the rating being predicted.
            tempItemAverageRating = (itemAverageRatings[tempItem] * itemDegrees[tempItem] - tempRating)
                    / (itemDegrees[tempItem] - 1);

            // Collect the neighbors among the other items rated by this user.
            int tempNeighbors = 0;
            double tempTotal = 0;
            int tempComparedItem;
            for (int j = userStartingIndices[tempUser]; j < userStartingIndices[tempUser + 1]; j++) {
                tempComparedItem = compressedRatingMatrix[j][1];
                if (tempItem == tempComparedItem) {
                    continue;
                }//Of if

                if (Math.abs(tempItemAverageRating - itemAverageRatings[tempComparedItem]) < radius) {
                    tempTotal += compressedRatingMatrix[j][2];
                    tempNeighbors++;
                }//Of if
            }//Of for j

            // The mean rating on the neighbors is the prediction.
            if (tempNeighbors > 0) {
                predictions[i] = tempTotal / tempNeighbors;
            } else {
                predictions[i] = DEFAULT_RATING;
                numNonNeighbors++;
            }//Of if
        }//Of for i
    }//Of leaveOneOutPrediction

    /**
     *************************
     * Computes the mean absolute error between predictions and actual ratings.
     *
     * @return the mean absolute error
     *************************
     */
    public double computeMAE() throws Exception {
        double tempTotalError = 0;
        for (int i = 0; i < predictions.length; i++) {
            tempTotalError += Math.abs(predictions[i] - compressedRatingMatrix[i][2]);
        }//Of for i

        return tempTotalError / predictions.length;
    }//Of computeMAE

    /**
     *************************
     * Computes the root of the mean squared error between predictions and
     * actual ratings.
     *
     * @return the root mean squared error
     *************************
     */
    public double computeRSME() throws Exception {
        double tempTotalError = 0;
        for (int i = 0; i < predictions.length; i++) {
            double tempError = predictions[i] - compressedRatingMatrix[i][2];
            tempTotalError += tempError * tempError;
        }//Of for i

        double tempAverage = tempTotalError / predictions.length;

        return Math.sqrt(tempAverage);
    }//Of computeRSME

    /**
     *************************
     * Program entry: evaluates the radii 0.2, 0.3, 0.4 and 0.5.
     *
     * @param args  optional; args[0] overrides the default rating file path
     *************************
     */
    public static void main(String[] args) {
        String tempFilename = args.length > 0 ? args[0] : "D:/movielens-943u1682m.txt";
        try {
            M_distance tempRecommender = new M_distance(tempFilename, 943, 1682, 100000);

            // Integer steps avoid the drift of repeatedly adding 0.1 to a double.
            for (int tempStep = 2; tempStep < 6; tempStep++) {
                double tempRadius = tempStep / 10.0;
                tempRecommender.setRadius(tempRadius);

                tempRecommender.leaveOneOutPrediction();
                double tempMAE = tempRecommender.computeMAE();
                double tempRSME = tempRecommender.computeRSME();

                System.out.println("Radius = " + tempRadius + ", MAE = " + tempMAE + ", RSME = " + tempRSME + ", numNonNeighbors = " + tempRecommender.numNonNeighbors);
            }//Of for tempStep
        } catch (Exception ee) {
            System.out.println(ee);
        }//Of try
    }//Of main
}//Of class M_distance

运行结果

二、KMeans聚类算法

1.数据使用的是KNN中的iris.arff。聚类是无监督学习,只根据前四个属性计算距离,所以第五个属性(类别标签)没有用到。

2.和M-distance相似也是根据算距离来分类。

3.不断重复①根据距离分类②选取新的中心点这两个操作,直到各点的分类结果不再变动,此时分类完成。

package JavaDay12;

/**
 * @author Kexiong Wang
 *
 * @date 2022年5月4日
 *
 * KMeans聚类算法
 */

import weka.core.Instances;

import java.io.FileReader;
import java.util.Arrays;
import java.util.Random;

public class KMeans {
    /** Manhattan distance. */
    public static final int MANHATTAN = 0;

    /** Euclidean distance. */
    public static final int EUCLIDEAN = 1;

    /** The distance measure in use. */
    public int distanceMeasure = EUCLIDEAN;

    /** Shared random number generator. */
    public static final Random random = new Random();

    /** The data set. */
    Instances dataset;

    /**
     * The number of clusters.
     */
    int numClusters = 2;

    /**
     * The clustering result: clusters[i] holds the instance indices of cluster i.
     */
    int[][] clusters;

    /**
     *******************************
     * Loads the data set. If the file cannot be read, a message is printed and
     * the JVM exits.
     *
     * @param paraFilename  the data file
     *******************************
     */
    public KMeans(String paraFilename) {
        dataset = null;
        // try-with-resources closes the reader even when parsing fails.
        try (FileReader fileReader = new FileReader(paraFilename)) {
            dataset = new Instances(fileReader);
        } catch (Exception ee) {
            System.out.println("Cannot read the file: " + paraFilename + "\r\n" + ee);
            System.exit(0);
        }//Of try
    }//Of KMeans

    /**
     *******************************
     * Sets the number of clusters.
     *
     * @param paraNumClusters  the number of clusters, at least 1
     * @throws IllegalArgumentException if paraNumClusters is smaller than 1
     *******************************
     */
    public void setNumClusters(int paraNumClusters) {
        if (paraNumClusters < 1) {
            throw new IllegalArgumentException("The number of clusters must be positive: " + paraNumClusters);
        }//Of if
        numClusters = paraNumClusters;
    }//Of the setter

    /**
     *********************
     * Generates a random permutation of 0 .. paraLength - 1 using the
     * Fisher-Yates shuffle.
     *
     * @param paraLength  the length of the sequence
     * @return  the shuffled indices
     *********************
     */
    public static int[] getRandomIndices(int paraLength) {
        int[] resultIndices = new int[paraLength];

        // Start from the identity permutation.
        for (int i = 0; i < paraLength; i++) {
            resultIndices[i] = i;
        }//Of for i

        // Swap each position with a random position at or before it.
        int tempOther, tempValue;
        for (int i = paraLength - 1; i > 0; i--) {
            tempOther = random.nextInt(i + 1);

            tempValue = resultIndices[i];
            resultIndices[i] = resultIndices[tempOther];
            resultIndices[tempOther] = tempValue;
        }//Of for i

        return resultIndices;
    }//Of getRandomIndices

    /**
     *********************
     * Computes the distance between an instance and a point, ignoring the last
     * attribute of the data set. For EUCLIDEAN the squared distance is
     * returned (no square root is taken), which is sufficient for comparing
     * distances. An unsupported measure prints a message and yields 0.
     *
     * @param paraI  the index of the instance
     * @param paraArray  the coordinates of the point
     * @return the distance
     *********************
     */
    public double distance(int paraI, double[] paraArray) {
        // Must be a double: an int accumulator silently truncates on "+=".
        double resultDistance = 0;
        double tempDifference;
        switch (distanceMeasure) {
            case MANHATTAN:
                for (int i = 0; i < dataset.numAttributes() - 1; i++) {
                    tempDifference = dataset.instance(paraI).value(i) - paraArray[i];
                    resultDistance += Math.abs(tempDifference);
                }//Of for i
                break;

            case EUCLIDEAN:
                for (int i = 0; i < dataset.numAttributes() - 1; i++) {
                    tempDifference = dataset.instance(paraI).value(i) - paraArray[i];
                    resultDistance += tempDifference * tempDifference;
                }//Of for i
                break;
            default:
                System.out.println("Unsupported distance measure: " + distanceMeasure);
        }//Of switch

        return resultDistance;
    }//Of distance

    /**
     *******************************
     * Runs k-means clustering and stores the result in {@code clusters}.
     * <p>
     * The initial centers are randomly chosen instances. Assigning every
     * instance to its nearest center and recomputing the centers as cluster
     * means is repeated until the assignment no longer changes.
     *
     * @throws IllegalStateException if the number of clusters is smaller than 1
     *         or exceeds the number of instances
     *******************************
     */
    public void clustering() {
        int tempNumInstances = dataset.numInstances();
        // The last attribute is not used for clustering.
        int tempNumFeatures = dataset.numAttributes() - 1;
        if (numClusters < 1 || numClusters > tempNumInstances) {
            throw new IllegalStateException("Cannot build " + numClusters + " clusters from "
                    + tempNumInstances + " instances.");
        }//Of if

        // The old assignment starts out invalid so that the loop runs at least once.
        int[] tempOldClusterArray = new int[tempNumInstances];
        Arrays.fill(tempOldClusterArray, -1);
        int[] tempClusterArray = new int[tempNumInstances];
        double[][] tempCenters = new double[numClusters][tempNumFeatures];

        // Pick distinct random instances as the initial centers.
        int[] tempRandomOrders = getRandomIndices(tempNumInstances);
        for (int i = 0; i < numClusters; i++) {
            for (int j = 0; j < tempNumFeatures; j++) {
                tempCenters[i][j] = dataset.instance(tempRandomOrders[i]).value(j);
            }//Of for j
        }//Of for i

        int[] tempClusterLengths = null;
        while (!Arrays.equals(tempOldClusterArray, tempClusterArray)) {
            System.out.println("New loop ...");
            tempOldClusterArray = tempClusterArray;
            tempClusterArray = new int[tempNumInstances];

            // Assign every instance to its nearest center.
            int tempNearestCenter;
            double tempNearestDistance;
            double tempDistance;

            for (int i = 0; i < tempNumInstances; i++) {
                tempNearestCenter = -1;
                tempNearestDistance = Double.MAX_VALUE;

                for (int j = 0; j < numClusters; j++) {
                    tempDistance = distance(i, tempCenters[j]);
                    if (tempNearestDistance > tempDistance) {
                        tempNearestDistance = tempDistance;
                        tempNearestCenter = j;
                    }//Of if
                }//Of for j
                tempClusterArray[i] = tempNearestCenter;
            }//Of for i

            // Sum up the members of every cluster.
            tempClusterLengths = new int[numClusters];
            double[][] tempNewCenters = new double[numClusters][tempNumFeatures];
            for (int i = 0; i < tempNumInstances; i++) {
                for (int j = 0; j < tempNumFeatures; j++) {
                    tempNewCenters[tempClusterArray[i]][j] += dataset.instance(i).value(j);
                }//Of for j
                tempClusterLengths[tempClusterArray[i]]++;
            }//Of for i

            // The mean of the members becomes the new center.
            for (int i = 0; i < numClusters; i++) {
                if (tempClusterLengths[i] == 0) {
                    // An empty cluster has no mean; keep its previous center
                    // instead of producing NaN coordinates.
                    tempNewCenters[i] = tempCenters[i];
                    continue;
                }//Of if
                for (int j = 0; j < tempNumFeatures; j++) {
                    tempNewCenters[i][j] /= tempClusterLengths[i];
                }//Of for j
            }//Of for i

            System.out.println("Now the new centers are: " + Arrays.deepToString(tempNewCenters));
            tempCenters = tempNewCenters;
        }//Of while

        // Convert the per-instance assignment into per-cluster index lists.
        clusters = new int[numClusters][];
        int[] tempCounters = new int[numClusters];
        for (int i = 0; i < numClusters; i++) {
            clusters[i] = new int[tempClusterLengths[i]];
        }//Of for i

        for (int i = 0; i < tempClusterArray.length; i++) {
            clusters[tempClusterArray[i]][tempCounters[tempClusterArray[i]]] = i;
            tempCounters[tempClusterArray[i]]++;
        }//Of for i

        System.out.println("The clusters are: " + Arrays.deepToString(clusters));
    }//Of clustering

    /**
     *******************************
     * Tests clustering with three clusters on D:/iris.arff.
     *******************************
     */
    public static void testClustering() {
        KMeans tempKMeans = new KMeans("D:/iris.arff");
        tempKMeans.setNumClusters(3);
        tempKMeans.clustering();
    }//Of testClustering

    /**
     *************************
     * Program entry.
     *
     * @param args  not used
     *************************
     */
    public static void main(String[] args) {
        testClustering();
    }//Of main
}//Of class KMeans

运行结果

这里设置分簇的数量为3,所以最后的clusters[][]数组有三行,每一行的元素为被分为一类的点。

欢迎分享,转载请注明来源:内存溢出

原文地址: http://outofmemory.cn/langs/797835.html

(0)
打赏 微信扫一扫 微信扫一扫 支付宝扫一扫 支付宝扫一扫
上一篇 2022-05-06
下一篇 2022-05-06

发表评论

登录后才能评论

评论列表(0条)

保存