【聚类4】K-Means

【聚类4】K-Means,第1张

文章目录
  • 1. K-Means算法原理
  • 2. 西瓜数据集例题结果
  • 3. Java代码
    • 3.1 xigua.arff
    • 3.2 KMeans.java
    • 3.3 输出示例
  • 4. 代码补充

1. K-Means算法原理

    【聚类2】原型聚类——K-Means算法

2. 西瓜数据集例题结果
  • 西瓜数据集

  • 结果
"这个数据集应该得到3个簇:"
		1. C1 = {6,7,8,10,11,12,15,18,19,20}
		2. C2 = {1,2,4,22,23,24,25,26,27,28,29,30}
		3. C3 = {3,5,9,13,14,16,17,21}
3. Java代码
聚类是无监督学习,不应该有标签,程序里面的属性种类减了1
3.1 xigua.arff
	//我晕了,这个文件明明内容一模一样,复制到文本文件,改后缀为arff,但是却不行,bug吧!!!
	//为了你们,不需要下载weka这个软件,我把我电脑上测试得起的xigua.arff文件,免费上传了。
@relation xigua

@attribute 密度 numeric
@attribute 含糖量 numeric
@attribute 好瓜 {是,否}

% The class labels were lost when this page was extracted; "?" marks the missing value.
@data
0.697,0.46,?
0.774,0.376,?
0.634,0.264,?
0.608,0.318,?
0.556,0.215,?
0.403,0.237,?
0.481,0.149,?
0.437,0.211,?
0.666,0.091,?
0.243,0.267,?
0.245,0.057,?
0.343,0.099,?
0.639,0.161,?
0.657,0.198,?
0.36,0.37,?
0.593,0.042,?
0.719,0.103,?
0.359,0.188,?
0.339,0.241,?
0.282,0.257,?
0.748,0.232,?
0.714,0.346,?
0.483,0.312,?
0.478,0.437,?
0.525,0.369,?
0.751,0.489,?
0.532,0.472,?
0.473,0.376,?
0.725,0.445,?
0.446,0.459,?
3.2 KMeans.java
package cluster;

import java.io.FileReader;
import java.util.Arrays;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Set;

import weka.core.Instances;

/**
 * K-means clustering over a Weka {@link Instances} data set.
 *
 * <p>The last attribute of the data set is treated as a label and is ignored
 * by every distance and mean computation (all loops stop at
 * {@code numAttributes() - 1}).
 */
public class KMeans {
	/** Data set loaded from the ARFF file. */
	Instances dataset;
	/** Number of clusters to build. */
	int k;
	/** Current partition: {@code clusters[c]} holds the instance indices assigned to cluster {@code c}. */
	int[][] clusters;
	/** Upper bound on the number of refinement passes made by {@link #clustering()}. */
	static int T = 5;

	/**
	 * Loads the ARFF file and remembers the requested number of clusters.
	 * If the file cannot be read, the error is printed and the JVM exits.
	 *
	 * @param fileURL     path of the ARFF file
	 * @param numClusters number of clusters to build
	 */
	public KMeans(String fileURL, int numClusters) {
		dataset = null;
		k = numClusters;
		// try-with-resources closes the reader even when parsing throws.
		try (FileReader fileReader = new FileReader(fileURL)) {
			dataset = new Instances(fileReader);
		} catch (Exception e) {
			System.out.println("Cannot read the file: " + fileURL + "\r\n" + e);
			// Non-zero status: this is a failure, not a normal termination.
			System.exit(1);
		}
	}

	/**
	 * Picks {@code num} distinct random indices from {@code [0, length)}.
	 *
	 * @param length number of candidates; indices are drawn from {@code 0 .. length - 1}
	 * @param num    how many distinct indices to return
	 * @return an array of {@code num} pairwise different indices
	 * @throws IllegalArgumentException if {@code num} is negative or larger than {@code length}
	 */
	public int[] getRandomIndices(int length, int num) {
		if (num < 0 || num > length) {
			throw new IllegalArgumentException(
					"Cannot pick " + num + " distinct indices out of " + length);
		}
		// The set silently drops duplicates, so its size is the number of distinct picks.
		Set<Integer> set = new HashSet<>(num);
		// Add one value at a time: adding several per pass could overshoot num
		// and a "size != num" test would then never become false.
		while (set.size() < num) {
			set.add((int) (Math.random() * length));
		}

		int[] result = new int[num];
		int next = 0;
		for (int index : set) {
			result[next++] = index;
		}
		return result;
	}

	/**
	 * Euclidean distance between two instances given by their indices
	 * (typically a prototype instance and another instance).
	 *
	 * @param prototypeIndex index of the first instance
	 * @param otherIndex     index of the second instance
	 * @return the Euclidean distance over all attributes except the last one
	 */
	public double eudistance(int prototypeIndex, int otherIndex) {
		double result = 0.0;
		for (int i = 0; i < dataset.numAttributes() - 1; i++) {
			double diff = dataset.instance(prototypeIndex).value(i)
					- dataset.instance(otherIndex).value(i);
			result += diff * diff;
		}
		return Math.sqrt(result);
	}

	/**
	 * Euclidean distance between two vectors (typically a mean vector and an
	 * instance vector). Only the first {@code numAttributes() - 1} components
	 * are read.
	 *
	 * @param vector1 first vector
	 * @param vector2 second vector
	 * @return the Euclidean distance between the two vectors
	 */
	public double eudistance(double[] vector1, double[] vector2) {
		double result = 0.0;
		int n = dataset.numAttributes() - 1;
		for (int i = 0; i < n; i++) {
			double diff = vector1[i] - vector2[i];
			result += diff * diff;
		}
		return Math.sqrt(result);
	}

	/**
	 * Runs k-means and stores the final partition in {@link #clusters}.
	 *
	 * <p>Starts from {@code k} randomly chosen instances as prototypes, then
	 * repeatedly recomputes the mean vectors and reassigns every instance to
	 * its nearest mean. Stops as soon as a pass leaves the partition unchanged,
	 * or after {@link #T} passes, whichever comes first. Intermediate results
	 * are printed to standard output.
	 */
	public void clustering() {
		int length = dataset.numInstances();
		System.out.println("Beginning ...");
		// Initial step: random prototypes, distances to them, first partition.
		int[] prototypeIndex = getRandomIndices(length, k);
		System.out.println("原型向量样本:" + Arrays.toString(prototypeIndex));
		double[][] tempDistances = getDistances(prototypeIndex);
		System.out.println("距离:" + Arrays.deepToString(tempDistances));
		clusters = selectCluster(tempDistances);
		System.out.println("聚类:" + Arrays.deepToString(clusters));

		// A local counter keeps the static budget T intact for later runs.
		for (int pass = 0; pass < T; pass++) {
			System.out.println("New loop ...");
			int[][] oldClusters = clusters;
			double[][] meanVector = getMeanVector(clusters);
			System.out.println("均值向量" + Arrays.deepToString(meanVector));
			double[][] newDistances = getDistances(meanVector);
			System.out.println("距离:" + Arrays.deepToString(newDistances));
			clusters = selectCluster(newDistances);
			System.out.println("聚类:" + Arrays.deepToString(clusters));
			// deepEquals compares the member lists; Arrays.equals on int[][]
			// would only compare the row references and never report equality.
			if (Arrays.deepEquals(clusters, oldClusters)) {
				break;
			}
		}
	}

	/**
	 * Computes the mean vector of every cluster.
	 *
	 * @param clusters partition to average; {@code clusters[c]} lists the instance indices of cluster {@code c}
	 * @return a {@code k x (numAttributes() - 1)} matrix of mean vectors; the
	 *         row of an empty cluster contains {@code NaN} (0.0 divided by 0)
	 */
	public double[][] getMeanVector(int[][] clusters) {
		int n = dataset.numAttributes() - 1;
		double[][] result = new double[k][n];
		for (int i = 0; i < k; i++) {
			int[] members = clusters[i];
			for (int j = 0; j < n; j++) {
				double sum = 0.0;
				for (int t = 0; t < members.length; t++) {
					sum += dataset.instance(members[t]).value(j);
				}
				result[i][j] = sum / members.length;
			}
		}
		return result;
	}

	/**
	 * Assigns every instance to the cluster with the smallest distance.
	 * Ties go to the cluster with the lower index; an instance whose distances
	 * never compare below the initial bound (for example all {@code NaN})
	 * stays in cluster 0.
	 *
	 * @param distances matrix with one row per instance and one column per cluster
	 * @return the partition: row {@code c} lists, in ascending order, the indices of the instances in cluster {@code c}
	 */
	public int[][] selectCluster(double[][] distances) {
		int length = dataset.numInstances();
		int[][] result = new int[k][];
		// 1. Nearest cluster of every instance.
		int[] minIndex = new int[length];
		for (int i = 0; i < length; i++) {
			// Infinity instead of a fixed bound, so arbitrarily large distances still compare.
			double min = Double.POSITIVE_INFINITY;
			for (int j = 0; j < k; j++) {
				if (min > distances[i][j]) {
					min = distances[i][j];
					minIndex[i] = j;
				}
			}
		}
		// 2. Collect the members of each cluster: count first, then fill.
		for (int c = 0; c < k; c++) {
			int size = 0;
			for (int i = 0; i < length; i++) {
				if (minIndex[i] == c) {
					size++;
				}
			}
			int[] members = new int[size];
			int t = 0;
			for (int i = 0; i < length; i++) {
				if (minIndex[i] == c) {
					members[t++] = i;
				}
			}
			result[c] = members;
		}
		return result;
	}

	/**
	 * Builds the distance matrix between every instance and the prototype
	 * instances, e.g. 30 rows by 3 columns for 30 instances and 3 prototypes.
	 *
	 * @param prototypeIndex indices of the prototype instances, one per cluster
	 * @return matrix where entry {@code [i][j]} is the distance from instance {@code i} to prototype {@code j}
	 */
	public double[][] getDistances(int[] prototypeIndex) {
		int length = dataset.numInstances();
		double[][] distances = new double[length][k];
		for (int i = 0; i < length; i++) {
			for (int j = 0; j < k; j++) {
				distances[i][j] = eudistance(i, prototypeIndex[j]);
			}
		}
		return distances;
	}

	/**
	 * Builds the distance matrix between every instance and the mean vectors,
	 * e.g. 30 rows by 3 columns for 30 instances and 3 clusters.
	 *
	 * @param meanVector mean vectors, one row per cluster
	 * @return matrix where entry {@code [i][j]} is the distance from instance {@code i} to mean vector {@code j}
	 */
	public double[][] getDistances(double[][] meanVector) {
		int length = dataset.numInstances();
		int n = dataset.numAttributes() - 1;
		double[][] distances = new double[length][k];
		for (int i = 0; i < length; i++) {
			// Copy the instance into a plain vector once, then reuse it for all clusters.
			double[] vector = new double[n];
			for (int j = 0; j < n; j++) {
				vector[j] = dataset.instance(i).value(j);
			}
			for (int j = 0; j < k; j++) {
				distances[i][j] = eudistance(vector, meanVector[j]);
			}
		}
		return distances;
	}

	/**
	 * Clusters the data set stored at a fixed local path into 3 clusters.
	 *
	 * @param args unused
	 */
	public static void main(String[] args) {
		KMeans kmeans = new KMeans("D:/data/xigua.arff", 3);
		kmeans.clustering();
	}

}// KMeans
3.3 输出示例
Beginning ...
原型向量样本:[28, 29, 25]
距离:[[0.03176476034853721, 0.2510019920239678, 0.06129437168288785], [0.08462860036654278, 0.3383385878081305, 0.11531695452100701], [0.20258825237411965, 0.2708671260969112, 0.25360205046489664], [0.17267889274604467, 0.2147673159491453, 0.2229125389025929], [0.2854137347781287, 0.26764902391004536, 0.33630492116530203], [0.383337970986439, 0.22612607103118387, 0.4296603309592357], [0.38360396244043155, 0.31196954979613, 0.4341658669218482], [0.37107950630558945, 0.2481632527188504, 0.4193804954930547], [0.35888298928759493, 0.4287470116513933, 0.40697542923375607], [0.5138170880770705, 0.2794154612758571, 0.5543897545950863], [0.6172066104636275, 0.4494496634774578, 0.6653269872776844], [0.5154027551342736, 0.3744449225186529, 0.564414741125708], [0.2967355725220689, 0.35503943442947294, 0.3465948643589515], [0.25618938307431866, 0.335621810971814, 0.30580549373744087], [0.3726258176777342, 0.12376186811776885, 0.4087077195258245], [0.42406721165400185, 0.4421515577265334, 0.4741023096336908], [0.3420526275297414, 0.4486256791580259, 0.3873241536491108], [0.44721918563496355, 0.2846225570821821, 0.49423172702690793], [0.436591342103803, 0.2428435710493486, 0.4808825220363077], [0.4812411038138783, 0.2601922366251538, 0.5232446846361652], [0.21423818520515897, 0.37780021175219053, 0.25701750913118737], [0.0996092365195116, 0.2908487579481817, 0.1477091737164622], [0.2761394575210142, 0.15158495967608399, 0.3211744074486633], [0.24712952069714375, 0.03883297567789519, 0.27790825824361537], [0.21395326592506128, 0.11975391434103524, 0.2558827856656246], [0.051107729356722545, 0.3064718584144391, 0.0], [0.1948794499171218, 0.08697700845625815, 0.21965882636488793], [0.2612757164376361, 0.0872811548961172, 0.30008832033253147], [0.0, 0.2793510336476312, 0.051107729356722545], [0.2793510336476312, 0.0, 0.3064718584144391]]
聚类:[[0, 1, 2, 3, 8, 12, 13, 15, 16, 20, 21, 28], [4, 5, 6, 7, 9, 10, 11, 14, 17, 18, 19, 22, 23, 24, 26, 27, 29], [25]]
New loop ...
均值向量[[0.6811666666666666, 0.25300000000000006], [0.4108823529411764, 0.2774117647058823], [0.751, 0.489]]
距离:[[0.20760465901430158, 0.33941386481152713, 0.06129437168288785], [0.15410070661024816, 0.3762632931126085, 0.11531695452100701], [0.04843236980000499, 0.22352037907446318, 0.25360205046489664], [0.09786910192247139, 0.20125300401807075, 0.2229125389025929], [0.13080785314515492, 0.15796949028718082, 0.33630492116530203], [0.27862644247171586, 0.04117331920713786, 0.4296603309592357], [0.22557192742990967, 0.14630811920035275, 0.4341658669218482], [0.24775262079564583, 0.07136283331846313, 0.4193804954930547], [0.1627084133589219, 0.31596575741936295, 0.40697542923375607], [0.43839026879913484, 0.16820490264363994, 0.5543897545950863], [0.47818130568970496, 0.27585920509938083, 0.6653269872776844], [0.3715813429714204, 0.19088942251025212, 0.564414741125708], [0.101202903998738, 0.25610419727445255, 0.3465948643589515], [0.060075184375728535, 0.2586119188426436, 0.30580549373744087], [0.34181431768985, 0.10564845079654196, 0.4087077195258245], [0.2286796036184931, 0.29763322450993257, 0.4741023096336908], [0.1546976441679418, 0.35405641936969007, 0.3873241536491108], [0.3286584261982508, 0.10337428217179, 0.49423172702690793], [0.34237702577389406, 0.08057846656121218, 0.4808825220363077], [0.39918670791720723, 0.13048870080609384, 0.5232446846361652], [0.07005493875840915, 0.34016251458998237, 0.25701750913118737], [0.09862569532215104, 0.3107807168716212, 0.1477091737164622], [0.20676321669430892, 0.07998312970909735, 0.3211744074486633], [0.2741034374911128, 0.17312765056744542, 0.27790825824361537], [0.19453541522760767, 0.1463258084362868, 0.2558827856656246], [0.24611520563436226, 0.40056159970185534, 0.0], [0.2649748939889294, 0.22920311024055642, 0.21965882636488793], [0.24178991110282305, 0.11652571482094794, 0.30008832033253147], [0.19693999368109844, 0.35602768544421964, 0.051107729356722545], [0.3126329494968677, 0.18495279487528693, 0.3064718584144391]]
聚类:[[2, 3, 4, 8, 12, 13, 15, 16, 20, 21], [5, 6, 7, 9, 10, 11, 14, 17, 18, 19, 22, 23, 24, 27, 29], [0, 1, 25, 26, 28]]
New loop ...
均值向量[[0.6534000000000001, 0.19699999999999998], [0.39313333333333333, 0.2686], [0.6958, 0.4484]]
距离:[[0.26658949716746155, 0.35912241800131484, 0.011661903789690599], [0.21583641954035465, 0.39571982232101766, 0.10656922632730337], [0.06975213258388598, 0.24091058737861878, 0.1944803331959301], [0.129236836853894, 0.22047232126605926, 0.157203689524133], [0.09904928066371815, 0.17145994025168423, 0.27206543330603394], [0.25357476215112584, 0.03310454819373182, 0.3611395852021763], [0.17895742510440865, 0.14840724750197043, 0.3684825640379745], [0.2168523921933997, 0.07240196436868579, 0.35119253978409054], [0.10674624115162086, 0.32557330630409154, 0.35864020968095595], [0.4163269868745, 0.15014185884615183, 0.4877845836022291], [0.4317297302711502, 0.2582983632244782, 0.59700469009883], [0.32550293393455004, 0.17685449135125494, 0.4965362021041366], [0.038773186611368436, 0.26838065835260516, 0.29295904150580504], [0.0037363083384538277, 0.2731482706842161, 0.2533882396639591], [0.340606165534331, 0.10667604125471557, 0.3448306830895418], [0.16635251726379138, 0.3021493743902913, 0.41920019083965127], [0.11462704741901006, 0.3655303604961487, 0.34617827777028415], [0.29453753580825665, 0.08752967750680021, 0.425725733307255], [0.317463950709368, 0.060763293012951296, 0.4126996486550479], [0.37621531069322534, 0.11173709221998658, 0.45592148446854314], [0.10086704119780646, 0.35674908705014385, 0.22260682828700473], [0.16085198164772477, 0.3300699589144364, 0.10400480758118832], [0.20557519305597172, 0.099797684230536, 0.2527623389668643], [0.2972627793720567, 0.18857600884288306, 0.21809814304573982], [0.21464053671196415, 0.16573767760463456, 0.18835339126227588], [0.30787945693079294, 0.42029122178688333, 0.06852298884316124], [0.30060432465285664, 0.24628339593060491, 0.16549138950410677], [0.2541361052664498, 0.1338411164195982, 0.23426822234353512], [0.25812896001804986, 0.3758356614857675, 0.029397278785629127], [0.33415379692590663, 0.19760325008573226, 0.25002479877004197]]
聚类:[[2, 3, 4, 8, 12, 13, 15, 16, 20], [5, 6, 7, 9, 10, 11, 14, 17, 18, 19, 22, 23, 24, 27, 29], [0, 1, 21, 25, 26, 28]]
New loop ...
均值向量[[0.6466666666666667, 0.18044444444444444], [0.39313333333333333, 0.2686], [0.6988333333333333, 0.4313333333333333]]
距离:[[0.2840506171202938, 0.35912241800131484, 0.028725230876163427], [0.23335756487934936, 0.39571982232101766, 0.09333705349728776], [0.08451020831025079, 0.24091058737861878, 0.17945418790196993], [0.14288681526057137, 0.22047232126605926, 0.14524165686499477], [0.09702850542081709, 0.17145994025168423, 0.25923246753102164], [0.2501439091975697, 0.03310454819373182, 0.35395311208626984], [0.16862442744414055, 0.14840724750197043, 0.3565998769240144], [0.21188145998746513, 0.07240196436868579, 0.3422038460073501], [0.09151003453038953, 0.32557330630409154, 0.3419134474623008], [0.41284215140330416, 0.15014185884615183, 0.48455079426436004], [0.4202078556801487, 0.2582983632244782, 0.5882942621587337], [0.31439885810115253, 0.17685449135125494, 0.48689095858883585], [0.02090129655143107, 0.26838065835260516, 0.276875674064893], [0.020370942752901137, 0.2731482706842161, 0.23705373277428515], [0.3436700254892084, 0.10667604125471557, 0.3443396659630655], [0.14848223903431002, 0.3021493743902913, 0.4034614465954447], [0.10597052932971385, 0.3655303604961487, 0.3289520819545336], [0.2877658727696254, 0.08752967750680021, 0.41796866575803926], [0.3135693752368362, 0.060763293012951296, 0.4070710080017435], [0.37261579524249583, 0.11173709221998658, 0.45182091462092466], [0.11369441390449406, 0.35674908705014385, 0.20530742531357427], [0.17872442405302752, 0.3300699589144364, 0.08667067298432356], [0.2099848613003058, 0.099797684230536, 0.246626179109644], [0.3070328932392492, 0.18857600884288306, 0.22090602577164395], [0.2244013710043724, 0.16573767760463456, 0.18467125445564667], [0.3257176312523502, 0.42029122178688333, 0.07776120855256535], [0.31329392975248194, 0.24628339593060491, 0.17171819614964767], [0.26153792539467985, 0.1338411164195982, 0.23251338073801736], [0.2759089579669709, 0.3758356614857675, 0.029520708362473686], [0.3433078919016796, 0.19760325008573226, 0.25434256208682193]]
聚类:[[2, 3, 4, 8, 12, 13, 15, 16, 20], [5, 6, 7, 9, 10, 11, 14, 17, 18, 19, 22, 23, 24, 27, 29], [0, 1, 21, 25, 26, 28]]
New loop ...
均值向量[[0.6466666666666667, 0.18044444444444444], [0.39313333333333333, 0.2686], [0.6988333333333333, 0.4313333333333333]]
距离:[[0.2840506171202938, 0.35912241800131484, 0.028725230876163427], [0.23335756487934936, 0.39571982232101766, 0.09333705349728776], [0.08451020831025079, 0.24091058737861878, 0.17945418790196993], [0.14288681526057137, 0.22047232126605926, 0.14524165686499477], [0.09702850542081709, 0.17145994025168423, 0.25923246753102164], [0.2501439091975697, 0.03310454819373182, 0.35395311208626984], [0.16862442744414055, 0.14840724750197043, 0.3565998769240144], [0.21188145998746513, 0.07240196436868579, 0.3422038460073501], [0.09151003453038953, 0.32557330630409154, 0.3419134474623008], [0.41284215140330416, 0.15014185884615183, 0.48455079426436004], [0.4202078556801487, 0.2582983632244782, 0.5882942621587337], [0.31439885810115253, 0.17685449135125494, 0.48689095858883585], [0.02090129655143107, 0.26838065835260516, 0.276875674064893], [0.020370942752901137, 0.2731482706842161, 0.23705373277428515], [0.3436700254892084, 0.10667604125471557, 0.3443396659630655], [0.14848223903431002, 0.3021493743902913, 0.4034614465954447], [0.10597052932971385, 0.3655303604961487, 0.3289520819545336], [0.2877658727696254, 0.08752967750680021, 0.41796866575803926], [0.3135693752368362, 0.060763293012951296, 0.4070710080017435], [0.37261579524249583, 0.11173709221998658, 0.45182091462092466], [0.11369441390449406, 0.35674908705014385, 0.20530742531357427], [0.17872442405302752, 0.3300699589144364, 0.08667067298432356], [0.2099848613003058, 0.099797684230536, 0.246626179109644], [0.3070328932392492, 0.18857600884288306, 0.22090602577164395], [0.2244013710043724, 0.16573767760463456, 0.18467125445564667], [0.3257176312523502, 0.42029122178688333, 0.07776120855256535], [0.31329392975248194, 0.24628339593060491, 0.17171819614964767], [0.26153792539467985, 0.1338411164195982, 0.23251338073801736], [0.2759089579669709, 0.3758356614857675, 0.029520708362473686], [0.3433078919016796, 0.19760325008573226, 0.25434256208682193]]
聚类:[[2, 3, 4, 8, 12, 13, 15, 16, 20], [5, 6, 7, 9, 10, 11, 14, 17, 18, 19, 22, 23, 24, 27, 29], [0, 1, 21, 25, 26, 28]]
New loop ...
均值向量[[0.6466666666666667, 0.18044444444444444], [0.39313333333333333, 0.2686], [0.6988333333333333, 0.4313333333333333]]
距离:[[0.2840506171202938, 0.35912241800131484, 0.028725230876163427], [0.23335756487934936, 0.39571982232101766, 0.09333705349728776], [0.08451020831025079, 0.24091058737861878, 0.17945418790196993], [0.14288681526057137, 0.22047232126605926, 0.14524165686499477], [0.09702850542081709, 0.17145994025168423, 0.25923246753102164], [0.2501439091975697, 0.03310454819373182, 0.35395311208626984], [0.16862442744414055, 0.14840724750197043, 0.3565998769240144], [0.21188145998746513, 0.07240196436868579, 0.3422038460073501], [0.09151003453038953, 0.32557330630409154, 0.3419134474623008], [0.41284215140330416, 0.15014185884615183, 0.48455079426436004], [0.4202078556801487, 0.2582983632244782, 0.5882942621587337], [0.31439885810115253, 0.17685449135125494, 0.48689095858883585], [0.02090129655143107, 0.26838065835260516, 0.276875674064893], [0.020370942752901137, 0.2731482706842161, 0.23705373277428515], [0.3436700254892084, 0.10667604125471557, 0.3443396659630655], [0.14848223903431002, 0.3021493743902913, 0.4034614465954447], [0.10597052932971385, 0.3655303604961487, 0.3289520819545336], [0.2877658727696254, 0.08752967750680021, 0.41796866575803926], [0.3135693752368362, 0.060763293012951296, 0.4070710080017435], [0.37261579524249583, 0.11173709221998658, 0.45182091462092466], [0.11369441390449406, 0.35674908705014385, 0.20530742531357427], [0.17872442405302752, 0.3300699589144364, 0.08667067298432356], [0.2099848613003058, 0.099797684230536, 0.246626179109644], [0.3070328932392492, 0.18857600884288306, 0.22090602577164395], [0.2244013710043724, 0.16573767760463456, 0.18467125445564667], [0.3257176312523502, 0.42029122178688333, 0.07776120855256535], [0.31329392975248194, 0.24628339593060491, 0.17171819614964767], [0.26153792539467985, 0.1338411164195982, 0.23251338073801736], [0.2759089579669709, 0.3758356614857675, 0.029520708362473686], [0.3433078919016796, 0.19760325008573226, 0.25434256208682193]]
聚类:[[2, 3, 4, 8, 12, 13, 15, 16, 20], [5, 6, 7, 9, 10, 11, 14, 17, 18, 19, 22, 23, 24, 27, 29], [0, 1, 21, 25, 26, 28]]
4. 代码补充
  • 随机生成3个整数,在1~30范围,且不相同

set集合法

 
package test;
 
import java.util.HashSet;
import java.util.Set;
 
/**
 * Demo: draw {@code K} distinct random integers from {@code 1 .. length}
 * using a set to reject duplicates.
 */
public class TestRomdom {

    /** Number of distinct values to draw. */
    private static final int K = 3;

    public static void main(String[] args) {
        System.out.println(getRandomIndices(30));
    }

    /**
     * Draws {@code K} distinct random integers from the range {@code 1 .. length}.
     *
     * @param length upper bound (inclusive) of the range to draw from
     * @return a set of exactly {@code K} different integers in {@code 1 .. length}
     * @throws IllegalArgumentException if the range holds fewer than {@code K} values
     */
    public static Set<Integer> getRandomIndices(int length) {
        if (length < K) {
            throw new IllegalArgumentException(
                    "Cannot pick " + K + " distinct values out of " + length);
        }
        // The set rejects duplicates, so its size is the number of distinct picks.
        Set<Integer> set = new HashSet<>(K);
        // Add one value per iteration: adding K values per pass could push the
        // size past K, and a "size != K" test would then loop forever.
        while (set.size() < K) {
            set.add((int) (Math.random() * length + 1));
        }
        return set;
    }
}
 

交换法:

	/**
	 * Returns the indices {@code 0 .. paraLength - 1} in uniformly random order.
	 *
	 * <p>Uses a Fisher-Yates shuffle: position {@code i} is swapped with a
	 * random position in {@code [0, i]}, walking from the end to the start, so
	 * every permutation is equally likely. Swapping two arbitrary random
	 * positions a fixed number of times would leave many elements untouched
	 * and bias the result.
	 *
	 * @param paraLength number of indices to generate
	 * @return a random permutation of {@code 0 .. paraLength - 1}; the first
	 *         {@code k} entries are {@code k} distinct random indices
	 */
	public static int[] getRandomIndices(int paraLength) {
		Random random = new Random();
		int[] resultIndices = new int[paraLength];

		// Step 1. Initialize with the identity permutation.
		for (int i = 0; i < paraLength; i++) {
			resultIndices[i] = i;
		} // Of for i

		// Step 2. Fisher-Yates shuffle.
		for (int i = paraLength - 1; i > 0; i--) {
			int tempOther = random.nextInt(i + 1);

			// Swap.
			int tempValue = resultIndices[i];
			resultIndices[i] = resultIndices[tempOther];
			resultIndices[tempOther] = tempValue;
		} // Of for i

		return resultIndices;
	}// Of getRandomIndices
  • 标准化欧式距离

  • 计算准确度函数
	/**
	 * Prints the final clusters followed by the heading of the accuracy
	 * section. The accuracy itself is not computed yet.
	 */
	public void isPure() {
		String finalClusters = Arrays.deepToString(clusters);
		System.out.println("最终的簇:");
		System.out.println(finalClusters);
		System.out.println("准确率:");
		// TODO: the accuracy computation is still to be decided.
	}
  • 想法
	k-means的原理比较简单,但代码实现还是要花一些功夫,而且算法本身还有改进的空间。
k-means所属是原型聚类,原型向量是通过随机来选择的,k为所分的簇的数量。k为几就要随机选几个
样本作为原型向量。
	这就有个问题:例如,如何在1~30范围内选择3个数,并保证这3个数肯定不相同?有一个很妙的思路:
我初始化一个容量为30的数组,里面内容是1~30(或0~29),然后随机生成两个在1~30范围内的随机数,这两个随机数
决定这个数组哪两个位置来交换。这样我k=3,我就取这个数组的前3个就好,这样保证了这3个数肯定不一样。
但缺陷还是有的,所以我们还有一种算法:高斯混合聚类,这个算法是以概率函数来选择原型向量,而不是通过
随机选择。
	在距离计算时,密度和含糖量的单位不一样,相加的时候,你是否想过这个问题?所以我们可以用标准化欧式距离:在
算样本之间的距离的同时,把每一维的差除以该维的标准差(又叫均方差)。例如两个样本 (1,2) 和 (3,5):先算第一维的平均数 (1+3)/2=2;再算 (2-1)^2+(2-3)^2=2;2/2=1;sqrt(1)=1;这个1就是标准差。这样一除,单位就变成标准差了,单位一样,就可以
放心相加了。
	更新均值向量,是这个算法的关键。

欢迎分享,转载请注明来源:内存溢出

原文地址: http://outofmemory.cn/langs/874112.html

(0)
打赏 微信扫一扫 微信扫一扫 支付宝扫一扫 支付宝扫一扫
上一篇 2022-05-13
下一篇 2022-05-13

发表评论

登录后才能评论

评论列表(0条)

保存