- 1. K-Means算法原理
- 2. 西瓜数据集例题结果
- 3. Java代码
- 3.1 xigua.arff
- 3.2 KMeans.java
- 3.3 输出示例
- 4. 代码补充
【聚类2】原型聚类——K-Means算法
2. 西瓜数据集例题结果
- 西瓜数据集
- 结果
"这个数据集应该得到3个簇:"
1. C1 = {6,7,8,10,11,12,15,18,19,20}
2. C2 = {1,2,4,22,23,24,25,26,27,28,29,30}
3. C3 = {3,5,9,13,14,16,17,21}
3. Java代码
聚类是无监督学习,不应该使用标签,所以程序里在计算距离和均值时,把属性个数减了1(即去掉最后一列的类别标签"好瓜")。
3.1 xigua.arff
//我晕了,这个文件明明内容一模一样,复制到文本文件,再把后缀改为arff,却读取不了,可能是bug吧!!!
//为了让你们不需要下载weka这个软件,我把我电脑上测试通过的xigua.arff文件免费上传了。
@relation xigua
@attribute 密度 numeric
@attribute 含糖量 numeric
@attribute 好瓜 {是,否}
@data
0.697,0.46,是
0.774,0.376,是
0.634,0.264,是
0.608,0.318,是
0.556,0.215,是
0.403,0.237,是
0.481,0.149,是
0.437,0.211,否
0.666,0.091,否
0.243,0.267,否
0.245,0.057,否
0.343,0.099,否
0.639,0.161,否
0.657,0.198,否
0.36,0.37,否
0.593,0.042,否
0.719,0.103,否
0.359,0.188,否
0.339,0.241,否
0.282,0.257,否
0.748,0.232,是
0.714,0.346,是
0.483,0.312,是
0.478,0.437,是
0.525,0.369,是
0.751,0.489,是
0.532,0.472,是
0.473,0.376,是
0.725,0.445,是
0.446,0.459,是
3.2 KMeans.java
package cluster;
import java.io.FileReader;
import java.util.Arrays;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Set;
import weka.core.Instances;
/**
 * A simple K-Means (prototype-based) clustering over a Weka data set.
 *
 * <p>The last attribute of the data set is treated as the class label and is
 * excluded from every distance and mean computation, because clustering is
 * unsupervised.
 */
public class KMeans {
    /** The loaded data set. */
    Instances dataset;
    /** Number of clusters to form. */
    int k;
    /** Current partition: {@code clusters[c]} holds the instance indices assigned to cluster c. */
    int[][] clusters;
    /** Upper bound on the number of mean-update iterations performed by {@link #clustering()}. */
    static int T = 5;

    /**
     * Reads the ARFF file and stores the requested number of clusters.
     *
     * <p>If the file cannot be read, the error is printed and the JVM exits with a
     * non-zero status so that the failure is visible to the calling process.
     *
     * @param fileURL path of the ARFF file
     * @param numClusters number of clusters to form
     */
    public KMeans(String fileURL, int numClusters) {
        dataset = null;
        k = numClusters;
        // try-with-resources closes the reader even when parsing fails.
        try (FileReader fileReader = new FileReader(fileURL)) {
            dataset = new Instances(fileReader);
        } catch (Exception e) {
            System.out.println("Cannot read the file: " + fileURL + "\r\n" + e);
            System.exit(1);
        }
    }

    /**
     * Randomly picks {@code num} distinct indices from {@code [0, length)}.
     *
     * @param length size of the index range
     * @param num how many distinct indices to pick
     * @return an array of {@code num} pairwise different indices
     * @throws IllegalArgumentException if {@code num} is negative or larger than {@code length}
     */
    public int[] getRandomIndices(int length, int num) {
        if (num < 0 || num > length) {
            throw new IllegalArgumentException(
                    "Cannot pick " + num + " distinct indices from a range of size " + length);
        }
        // The set guarantees uniqueness.
        Set<Integer> chosen = new HashSet<>(num);
        // Add one value at a time: adding in batches could overshoot num, after
        // which a "size != num" condition would never become false.
        while (chosen.size() < num) {
            chosen.add((int) (Math.random() * length));
        }
        int[] result = new int[num];
        int position = 0;
        for (int index : chosen) {
            result[position++] = index;
        }
        return result;
    }

    /**
     * Euclidean distance between two instances given by index
     * (prototype instance vs. another instance), ignoring the last attribute.
     *
     * @param prototypeIndex index of the prototype instance
     * @param otherIndex index of the other instance
     * @return the Euclidean distance
     */
    public double eudistance(int prototypeIndex, int otherIndex) {
        double sumOfSquares = 0.0;
        for (int i = 0; i < dataset.numAttributes() - 1; i++) {
            double diff = dataset.instance(prototypeIndex).value(i)
                    - dataset.instance(otherIndex).value(i);
            sumOfSquares += diff * diff;
        }
        return Math.sqrt(sumOfSquares);
    }

    /**
     * Euclidean distance between two vectors (mean vector vs. sample vector).
     * Only the first {@code numAttributes() - 1} components are used.
     *
     * @param vector1 first vector
     * @param vector2 second vector
     * @return the Euclidean distance
     */
    public double eudistance(double[] vector1, double[] vector2) {
        double sumOfSquares = 0.0;
        int n = dataset.numAttributes() - 1;
        for (int i = 0; i < n; i++) {
            double diff = vector1[i] - vector2[i];
            sumOfSquares += diff * diff;
        }
        return Math.sqrt(sumOfSquares);
    }

    /**
     * Runs K-Means: picks k random instances as initial prototypes, assigns every
     * instance to its nearest prototype, then repeatedly recomputes the cluster
     * means and reassigns until the partition stops changing or {@link #T}
     * iterations have been performed. The final partition is left in
     * {@link #clusters}.
     */
    public void clustering() {
        int length = dataset.numInstances();
        System.out.println("Beginning ...");
        // 1. Choose the initial prototypes.
        int[] prototypeIndex = getRandomIndices(length, k);
        System.out.println("原型向量样本:" + Arrays.toString(prototypeIndex));
        // 2. Distance from every instance to every prototype.
        double[][] tempDistances = getDistances(prototypeIndex);
        System.out.println("距离:" + Arrays.deepToString(tempDistances));
        // 3. Initial partition.
        clusters = selectCluster(tempDistances);
        System.out.println("聚类:" + Arrays.deepToString(clusters));

        // A local counter is used so the static T is not consumed by one run.
        int iteration = 0;
        boolean changed = true;
        while (changed && iteration < T) {
            System.out.println("New loop ...");
            // 1. Recompute the mean vector of every cluster.
            double[][] meanVector = getMeanVector(clusters);
            System.out.println("均值向量" + Arrays.deepToString(meanVector));
            // 2. Distance from every instance to every mean vector.
            double[][] newDistances = getDistances(meanVector);
            System.out.println("距离:" + Arrays.deepToString(newDistances));
            // 3. Reassign. deepEquals compares the contents of the nested arrays;
            // Arrays.equals would only compare the inner arrays by reference.
            int[][] updated = selectCluster(newDistances);
            changed = !Arrays.deepEquals(clusters, updated);
            clusters = updated;
            System.out.println("聚类:" + Arrays.deepToString(clusters));
            // 4. Count the iteration.
            iteration++;
        }
    }

    /**
     * Computes the mean vector of every cluster.
     *
     * <p>An empty cluster has no mean (0/0 would produce NaN and the cluster could
     * never attract an instance again), so it is re-seeded with a randomly chosen
     * instance instead.
     *
     * @param clusters the partition; {@code clusters[c]} lists the member indices of cluster c
     * @return a k x (numAttributes - 1) matrix of mean vectors
     */
    public double[][] getMeanVector(int[][] clusters) {
        int n = dataset.numAttributes() - 1;
        double[][] result = new double[k][n];
        for (int i = 0; i < k; i++) {
            int[] members = clusters[i];
            if (members.length == 0) {
                int seed = (int) (Math.random() * dataset.numInstances());
                for (int j = 0; j < n; j++) {
                    result[i][j] = dataset.instance(seed).value(j);
                }
                continue;
            }
            for (int j = 0; j < n; j++) {
                double sum = 0.0;
                for (int t = 0; t < members.length; t++) {
                    sum += dataset.instance(members[t]).value(j);
                }
                result[i][j] = sum / members.length;
            }
        }
        return result;
    }

    /**
     * Assigns every instance to the cluster with the smallest distance.
     * Ties go to the cluster with the lowest index.
     *
     * @param distances matrix with one row per instance and one column per cluster
     * @return the partition; row c lists, in ascending order, the instances assigned to cluster c
     */
    public int[][] selectCluster(double[][] distances) {
        if (k == 0) {
            return new int[0][];
        }
        int length = dataset.numInstances();
        // 1. Nearest cluster of every instance, plus the size of every cluster.
        int[] nearest = new int[length];
        int[] count = new int[k];
        for (int i = 0; i < length; i++) {
            // Start from +infinity rather than a magic bound so that arbitrarily
            // large distances are still compared correctly.
            double min = Double.POSITIVE_INFINITY;
            for (int j = 0; j < k; j++) {
                if (distances[i][j] < min) {
                    min = distances[i][j];
                    nearest[i] = j;
                }
            }
            count[nearest[i]]++;
        }
        // 2. Allocate every cluster with its exact size.
        int[][] result = new int[k][];
        for (int c = 0; c < k; c++) {
            result[c] = new int[count[c]];
        }
        // 3. Fill the clusters; visiting instances in order keeps each row sorted.
        int[] filled = new int[k];
        for (int i = 0; i < length; i++) {
            int c = nearest[i];
            result[c][filled[c]++] = i;
        }
        return result;
    }

    /**
     * Builds the distance matrix between every instance and every prototype
     * instance (e.g. 30 rows x 3 columns for 30 instances and k = 3).
     *
     * @param prototypeIndex indices of the prototype instances, one per cluster
     * @return matrix with one row per instance and one column per prototype
     */
    public double[][] getDistances(int[] prototypeIndex) {
        int length = dataset.numInstances();
        double[][] distances = new double[length][k];
        for (int i = 0; i < length; i++) {
            for (int j = 0; j < k; j++) {
                distances[i][j] = eudistance(i, prototypeIndex[j]);
            }
        }
        return distances;
    }

    /**
     * Builds the distance matrix between every instance and every mean vector
     * (e.g. 30 rows x 3 columns for 30 instances and k = 3).
     *
     * @param meanVector one mean vector per cluster
     * @return matrix with one row per instance and one column per mean vector
     */
    public double[][] getDistances(double[][] meanVector) {
        int length = dataset.numInstances();
        int n = dataset.numAttributes() - 1;
        double[][] distances = new double[length][k];
        double[] sample = new double[n];
        for (int i = 0; i < length; i++) {
            // Extract the feature vector of instance i (label excluded).
            for (int j = 0; j < n; j++) {
                sample[j] = dataset.instance(i).value(j);
            }
            for (int j = 0; j < k; j++) {
                distances[i][j] = eudistance(sample, meanVector[j]);
            }
        }
        return distances;
    }

    /**
     * Entry point. Optional arguments: the ARFF file path and the number of
     * clusters; without arguments the original defaults are used.
     *
     * @param args {@code [arffPath [numClusters]]}
     */
    public static void main(String[] args) {
        String path = args.length > 0 ? args[0] : "D:/data/xigua.arff";
        int numClusters = args.length > 1 ? Integer.parseInt(args[1]) : 3;
        KMeans kmeans = new KMeans(path, numClusters);
        kmeans.clustering();
    }
}// KMeans
3.3 输出示例
Beginning ...
原型向量样本:[28, 29, 25]
距离:[[0.03176476034853721, 0.2510019920239678, 0.06129437168288785], [0.08462860036654278, 0.3383385878081305, 0.11531695452100701], [0.20258825237411965, 0.2708671260969112, 0.25360205046489664], [0.17267889274604467, 0.2147673159491453, 0.2229125389025929], [0.2854137347781287, 0.26764902391004536, 0.33630492116530203], [0.383337970986439, 0.22612607103118387, 0.4296603309592357], [0.38360396244043155, 0.31196954979613, 0.4341658669218482], [0.37107950630558945, 0.2481632527188504, 0.4193804954930547], [0.35888298928759493, 0.4287470116513933, 0.40697542923375607], [0.5138170880770705, 0.2794154612758571, 0.5543897545950863], [0.6172066104636275, 0.4494496634774578, 0.6653269872776844], [0.5154027551342736, 0.3744449225186529, 0.564414741125708], [0.2967355725220689, 0.35503943442947294, 0.3465948643589515], [0.25618938307431866, 0.335621810971814, 0.30580549373744087], [0.3726258176777342, 0.12376186811776885, 0.4087077195258245], [0.42406721165400185, 0.4421515577265334, 0.4741023096336908], [0.3420526275297414, 0.4486256791580259, 0.3873241536491108], [0.44721918563496355, 0.2846225570821821, 0.49423172702690793], [0.436591342103803, 0.2428435710493486, 0.4808825220363077], [0.4812411038138783, 0.2601922366251538, 0.5232446846361652], [0.21423818520515897, 0.37780021175219053, 0.25701750913118737], [0.0996092365195116, 0.2908487579481817, 0.1477091737164622], [0.2761394575210142, 0.15158495967608399, 0.3211744074486633], [0.24712952069714375, 0.03883297567789519, 0.27790825824361537], [0.21395326592506128, 0.11975391434103524, 0.2558827856656246], [0.051107729356722545, 0.3064718584144391, 0.0], [0.1948794499171218, 0.08697700845625815, 0.21965882636488793], [0.2612757164376361, 0.0872811548961172, 0.30008832033253147], [0.0, 0.2793510336476312, 0.051107729356722545], [0.2793510336476312, 0.0, 0.3064718584144391]]
聚类:[[0, 1, 2, 3, 8, 12, 13, 15, 16, 20, 21, 28], [4, 5, 6, 7, 9, 10, 11, 14, 17, 18, 19, 22, 23, 24, 26, 27, 29], [25]]
New loop ...
均值向量[[0.6811666666666666, 0.25300000000000006], [0.4108823529411764, 0.2774117647058823], [0.751, 0.489]]
距离:[[0.20760465901430158, 0.33941386481152713, 0.06129437168288785], [0.15410070661024816, 0.3762632931126085, 0.11531695452100701], [0.04843236980000499, 0.22352037907446318, 0.25360205046489664], [0.09786910192247139, 0.20125300401807075, 0.2229125389025929], [0.13080785314515492, 0.15796949028718082, 0.33630492116530203], [0.27862644247171586, 0.04117331920713786, 0.4296603309592357], [0.22557192742990967, 0.14630811920035275, 0.4341658669218482], [0.24775262079564583, 0.07136283331846313, 0.4193804954930547], [0.1627084133589219, 0.31596575741936295, 0.40697542923375607], [0.43839026879913484, 0.16820490264363994, 0.5543897545950863], [0.47818130568970496, 0.27585920509938083, 0.6653269872776844], [0.3715813429714204, 0.19088942251025212, 0.564414741125708], [0.101202903998738, 0.25610419727445255, 0.3465948643589515], [0.060075184375728535, 0.2586119188426436, 0.30580549373744087], [0.34181431768985, 0.10564845079654196, 0.4087077195258245], [0.2286796036184931, 0.29763322450993257, 0.4741023096336908], [0.1546976441679418, 0.35405641936969007, 0.3873241536491108], [0.3286584261982508, 0.10337428217179, 0.49423172702690793], [0.34237702577389406, 0.08057846656121218, 0.4808825220363077], [0.39918670791720723, 0.13048870080609384, 0.5232446846361652], [0.07005493875840915, 0.34016251458998237, 0.25701750913118737], [0.09862569532215104, 0.3107807168716212, 0.1477091737164622], [0.20676321669430892, 0.07998312970909735, 0.3211744074486633], [0.2741034374911128, 0.17312765056744542, 0.27790825824361537], [0.19453541522760767, 0.1463258084362868, 0.2558827856656246], [0.24611520563436226, 0.40056159970185534, 0.0], [0.2649748939889294, 0.22920311024055642, 0.21965882636488793], [0.24178991110282305, 0.11652571482094794, 0.30008832033253147], [0.19693999368109844, 0.35602768544421964, 0.051107729356722545], [0.3126329494968677, 0.18495279487528693, 0.3064718584144391]]
聚类:[[2, 3, 4, 8, 12, 13, 15, 16, 20, 21], [5, 6, 7, 9, 10, 11, 14, 17, 18, 19, 22, 23, 24, 27, 29], [0, 1, 25, 26, 28]]
New loop ...
均值向量[[0.6534000000000001, 0.19699999999999998], [0.39313333333333333, 0.2686], [0.6958, 0.4484]]
距离:[[0.26658949716746155, 0.35912241800131484, 0.011661903789690599], [0.21583641954035465, 0.39571982232101766, 0.10656922632730337], [0.06975213258388598, 0.24091058737861878, 0.1944803331959301], [0.129236836853894, 0.22047232126605926, 0.157203689524133], [0.09904928066371815, 0.17145994025168423, 0.27206543330603394], [0.25357476215112584, 0.03310454819373182, 0.3611395852021763], [0.17895742510440865, 0.14840724750197043, 0.3684825640379745], [0.2168523921933997, 0.07240196436868579, 0.35119253978409054], [0.10674624115162086, 0.32557330630409154, 0.35864020968095595], [0.4163269868745, 0.15014185884615183, 0.4877845836022291], [0.4317297302711502, 0.2582983632244782, 0.59700469009883], [0.32550293393455004, 0.17685449135125494, 0.4965362021041366], [0.038773186611368436, 0.26838065835260516, 0.29295904150580504], [0.0037363083384538277, 0.2731482706842161, 0.2533882396639591], [0.340606165534331, 0.10667604125471557, 0.3448306830895418], [0.16635251726379138, 0.3021493743902913, 0.41920019083965127], [0.11462704741901006, 0.3655303604961487, 0.34617827777028415], [0.29453753580825665, 0.08752967750680021, 0.425725733307255], [0.317463950709368, 0.060763293012951296, 0.4126996486550479], [0.37621531069322534, 0.11173709221998658, 0.45592148446854314], [0.10086704119780646, 0.35674908705014385, 0.22260682828700473], [0.16085198164772477, 0.3300699589144364, 0.10400480758118832], [0.20557519305597172, 0.099797684230536, 0.2527623389668643], [0.2972627793720567, 0.18857600884288306, 0.21809814304573982], [0.21464053671196415, 0.16573767760463456, 0.18835339126227588], [0.30787945693079294, 0.42029122178688333, 0.06852298884316124], [0.30060432465285664, 0.24628339593060491, 0.16549138950410677], [0.2541361052664498, 0.1338411164195982, 0.23426822234353512], [0.25812896001804986, 0.3758356614857675, 0.029397278785629127], [0.33415379692590663, 0.19760325008573226, 0.25002479877004197]]
聚类:[[2, 3, 4, 8, 12, 13, 15, 16, 20], [5, 6, 7, 9, 10, 11, 14, 17, 18, 19, 22, 23, 24, 27, 29], [0, 1, 21, 25, 26, 28]]
New loop ...
均值向量[[0.6466666666666667, 0.18044444444444444], [0.39313333333333333, 0.2686], [0.6988333333333333, 0.4313333333333333]]
距离:[[0.2840506171202938, 0.35912241800131484, 0.028725230876163427], [0.23335756487934936, 0.39571982232101766, 0.09333705349728776], [0.08451020831025079, 0.24091058737861878, 0.17945418790196993], [0.14288681526057137, 0.22047232126605926, 0.14524165686499477], [0.09702850542081709, 0.17145994025168423, 0.25923246753102164], [0.2501439091975697, 0.03310454819373182, 0.35395311208626984], [0.16862442744414055, 0.14840724750197043, 0.3565998769240144], [0.21188145998746513, 0.07240196436868579, 0.3422038460073501], [0.09151003453038953, 0.32557330630409154, 0.3419134474623008], [0.41284215140330416, 0.15014185884615183, 0.48455079426436004], [0.4202078556801487, 0.2582983632244782, 0.5882942621587337], [0.31439885810115253, 0.17685449135125494, 0.48689095858883585], [0.02090129655143107, 0.26838065835260516, 0.276875674064893], [0.020370942752901137, 0.2731482706842161, 0.23705373277428515], [0.3436700254892084, 0.10667604125471557, 0.3443396659630655], [0.14848223903431002, 0.3021493743902913, 0.4034614465954447], [0.10597052932971385, 0.3655303604961487, 0.3289520819545336], [0.2877658727696254, 0.08752967750680021, 0.41796866575803926], [0.3135693752368362, 0.060763293012951296, 0.4070710080017435], [0.37261579524249583, 0.11173709221998658, 0.45182091462092466], [0.11369441390449406, 0.35674908705014385, 0.20530742531357427], [0.17872442405302752, 0.3300699589144364, 0.08667067298432356], [0.2099848613003058, 0.099797684230536, 0.246626179109644], [0.3070328932392492, 0.18857600884288306, 0.22090602577164395], [0.2244013710043724, 0.16573767760463456, 0.18467125445564667], [0.3257176312523502, 0.42029122178688333, 0.07776120855256535], [0.31329392975248194, 0.24628339593060491, 0.17171819614964767], [0.26153792539467985, 0.1338411164195982, 0.23251338073801736], [0.2759089579669709, 0.3758356614857675, 0.029520708362473686], [0.3433078919016796, 0.19760325008573226, 0.25434256208682193]]
聚类:[[2, 3, 4, 8, 12, 13, 15, 16, 20], [5, 6, 7, 9, 10, 11, 14, 17, 18, 19, 22, 23, 24, 27, 29], [0, 1, 21, 25, 26, 28]]
New loop ...
均值向量[[0.6466666666666667, 0.18044444444444444], [0.39313333333333333, 0.2686], [0.6988333333333333, 0.4313333333333333]]
距离:[[0.2840506171202938, 0.35912241800131484, 0.028725230876163427], [0.23335756487934936, 0.39571982232101766, 0.09333705349728776], [0.08451020831025079, 0.24091058737861878, 0.17945418790196993], [0.14288681526057137, 0.22047232126605926, 0.14524165686499477], [0.09702850542081709, 0.17145994025168423, 0.25923246753102164], [0.2501439091975697, 0.03310454819373182, 0.35395311208626984], [0.16862442744414055, 0.14840724750197043, 0.3565998769240144], [0.21188145998746513, 0.07240196436868579, 0.3422038460073501], [0.09151003453038953, 0.32557330630409154, 0.3419134474623008], [0.41284215140330416, 0.15014185884615183, 0.48455079426436004], [0.4202078556801487, 0.2582983632244782, 0.5882942621587337], [0.31439885810115253, 0.17685449135125494, 0.48689095858883585], [0.02090129655143107, 0.26838065835260516, 0.276875674064893], [0.020370942752901137, 0.2731482706842161, 0.23705373277428515], [0.3436700254892084, 0.10667604125471557, 0.3443396659630655], [0.14848223903431002, 0.3021493743902913, 0.4034614465954447], [0.10597052932971385, 0.3655303604961487, 0.3289520819545336], [0.2877658727696254, 0.08752967750680021, 0.41796866575803926], [0.3135693752368362, 0.060763293012951296, 0.4070710080017435], [0.37261579524249583, 0.11173709221998658, 0.45182091462092466], [0.11369441390449406, 0.35674908705014385, 0.20530742531357427], [0.17872442405302752, 0.3300699589144364, 0.08667067298432356], [0.2099848613003058, 0.099797684230536, 0.246626179109644], [0.3070328932392492, 0.18857600884288306, 0.22090602577164395], [0.2244013710043724, 0.16573767760463456, 0.18467125445564667], [0.3257176312523502, 0.42029122178688333, 0.07776120855256535], [0.31329392975248194, 0.24628339593060491, 0.17171819614964767], [0.26153792539467985, 0.1338411164195982, 0.23251338073801736], [0.2759089579669709, 0.3758356614857675, 0.029520708362473686], [0.3433078919016796, 0.19760325008573226, 0.25434256208682193]]
聚类:[[2, 3, 4, 8, 12, 13, 15, 16, 20], [5, 6, 7, 9, 10, 11, 14, 17, 18, 19, 22, 23, 24, 27, 29], [0, 1, 21, 25, 26, 28]]
New loop ...
均值向量[[0.6466666666666667, 0.18044444444444444], [0.39313333333333333, 0.2686], [0.6988333333333333, 0.4313333333333333]]
距离:[[0.2840506171202938, 0.35912241800131484, 0.028725230876163427], [0.23335756487934936, 0.39571982232101766, 0.09333705349728776], [0.08451020831025079, 0.24091058737861878, 0.17945418790196993], [0.14288681526057137, 0.22047232126605926, 0.14524165686499477], [0.09702850542081709, 0.17145994025168423, 0.25923246753102164], [0.2501439091975697, 0.03310454819373182, 0.35395311208626984], [0.16862442744414055, 0.14840724750197043, 0.3565998769240144], [0.21188145998746513, 0.07240196436868579, 0.3422038460073501], [0.09151003453038953, 0.32557330630409154, 0.3419134474623008], [0.41284215140330416, 0.15014185884615183, 0.48455079426436004], [0.4202078556801487, 0.2582983632244782, 0.5882942621587337], [0.31439885810115253, 0.17685449135125494, 0.48689095858883585], [0.02090129655143107, 0.26838065835260516, 0.276875674064893], [0.020370942752901137, 0.2731482706842161, 0.23705373277428515], [0.3436700254892084, 0.10667604125471557, 0.3443396659630655], [0.14848223903431002, 0.3021493743902913, 0.4034614465954447], [0.10597052932971385, 0.3655303604961487, 0.3289520819545336], [0.2877658727696254, 0.08752967750680021, 0.41796866575803926], [0.3135693752368362, 0.060763293012951296, 0.4070710080017435], [0.37261579524249583, 0.11173709221998658, 0.45182091462092466], [0.11369441390449406, 0.35674908705014385, 0.20530742531357427], [0.17872442405302752, 0.3300699589144364, 0.08667067298432356], [0.2099848613003058, 0.099797684230536, 0.246626179109644], [0.3070328932392492, 0.18857600884288306, 0.22090602577164395], [0.2244013710043724, 0.16573767760463456, 0.18467125445564667], [0.3257176312523502, 0.42029122178688333, 0.07776120855256535], [0.31329392975248194, 0.24628339593060491, 0.17171819614964767], [0.26153792539467985, 0.1338411164195982, 0.23251338073801736], [0.2759089579669709, 0.3758356614857675, 0.029520708362473686], [0.3433078919016796, 0.19760325008573226, 0.25434256208682193]]
聚类:[[2, 3, 4, 8, 12, 13, 15, 16, 20], [5, 6, 7, 9, 10, 11, 14, 17, 18, 19, 22, 23, 24, 27, 29], [0, 1, 21, 25, 26, 28]]
4. 代码补充
- 随机生成3个整数,在1~30范围,且不相同
set集合法
package test;
import java.util.HashSet;
import java.util.Set;
/**
 * Demonstrates the "set" approach for drawing K distinct random integers.
 */
public class TestRomdom {
    /** Number of distinct random values to draw. */
    private static final int K = 3;

    public static void main(String[] args) {
        System.out.println(getRandomIndices(30));
    }

    /**
     * Draws K pairwise different random integers from {@code 1..length}.
     *
     * @param length upper bound (inclusive) of the value range
     * @return a set containing exactly K distinct values in {@code [1, length]}
     * @throws IllegalArgumentException if the range holds fewer than K values
     */
    public static Set<Integer> getRandomIndices(int length) {
        if (length < K) {
            throw new IllegalArgumentException(
                    "Cannot draw " + K + " distinct values from 1.." + length);
        }
        // The set guarantees that no value is stored twice.
        Set<Integer> set = new HashSet<>(K);
        // Add one value per step: adding K values per pass could push the size
        // past K, after which a "size != K" condition would never become false.
        while (set.size() < K) {
            set.add((int) (Math.random() * length + 1));
        }
        return set;
    }
}
交换法:
/**
 * Produces a randomly shuffled array of the indices {@code 0 .. paraLength - 1}.
 *
 * <p>The array starts as the identity sequence; then, paraLength times, two
 * randomly drawn positions have their contents exchanged. Every index therefore
 * appears exactly once in the result.
 *
 * @param paraLength number of indices to shuffle
 * @return the shuffled index array
 */
public static int[] getRandomIndices(int paraLength) {
    Random generator = new Random();
    int[] permutation = new int[paraLength];

    // Fill with 0, 1, ..., paraLength - 1.
    int position = 0;
    while (position < paraLength) {
        permutation[position] = position;
        position++;
    }

    // Exchange the contents of two random positions, once per element.
    for (int round = paraLength; round > 0; round--) {
        int left = generator.nextInt(paraLength);
        int right = generator.nextInt(paraLength);
        int held = permutation[right];
        permutation[right] = permutation[left];
        permutation[left] = held;
    }
    return permutation;
}// Of getRandomIndices
- 标准化欧式距离
- 计算准确度函数
/**
 * Prints the final clusters followed by the accuracy heading.
 * The accuracy value itself is not computed yet.
 */
public void isPure() {
    final String renderedClusters = Arrays.deepToString(clusters);
    System.out.println("最终的簇:");
    System.out.println(renderedClusters);
    System.out.println("准确率:");
    // TODO: the accuracy computation is still to be decided.
}
- 想法
k-means的原理和代码都比较简单,但真正动手实现还是要花一些功夫,而且算法本身也还有改进的空间。
k-means所属是原型聚类,原型向量是通过随机来选择的,k为所分的簇的数量。k为几就要随机选几个
样本作为原型向量。
这就有个问题:例如,如何在1~30范围内选择3个数,并保证这3个数一定不相同?有个很妙的思路:
先初始化一个容量为30的数组,里面内容是1~30(或0~29),然后随机生成两个在1~30范围内的随机数,这两个随机数
决定数组中哪两个位置的元素交换,如此重复若干次把数组打乱。这样k=3时,取这个数组的前3个元素就好,可以保证这3个数一定不一样。
但缺陷还是有的,所以我们还有一种算法:高斯混合聚类,这个算法是以概率函数来选择原型向量,而不是通过
随机选择。
在距离计算时,密度和含糖量的单位不一样,把它们直接相加是否合理,你是否想过?所以我们可以用标准化欧式距离,在
算样本之间的距离的同时,除以标准差(又叫均方差)。(1,2),(3,5).先算第一位的平均数(1+3)/2=2,
(2-1)^2+(2-3)^2=2;2/2=1;sqrt(1)=1;这个1就是标准差。这样一除,单位就变成标准差了,单位一样,就可以
放心相加了。
更新均值向量,是这个算法的关键。
欢迎分享,转载请注明来源:内存溢出
评论列表(0条)