[code] Spectral Clustering
1. Construct a graph from the data: each node corresponds to a data point, and each edge weight encodes the similarity between two points, with 0 meaning completely dissimilar (to reduce computation, similarities below a threshold r are usually set to 0). Represent the graph by its adjacency matrix, denoted W.
2. Sum each column of W to obtain N numbers and place them on the diagonal of an N * N matrix, denoted D; all other entries of D are 0. Let L = D - W.
3. Compute the first k eigenvalues of L and their corresponding eigenvectors.
4. Arrange these k eigenvectors side by side to form an N * k matrix, treat each row as a vector in k-dimensional space, and cluster these N k-dimensional vectors (e.g. with k-means).
The result of that clustering gives the final class of each point in the graph. A minimal sketch of these steps is shown right below.
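As a quick orientation before the full class further down, here is a minimal sketch of steps 2-4 using the same Colt matrix library that the class relies on. The names SpectralEmbedding and embed are made up for illustration, and the final clustering of the embedded rows (step 4's k-means) is left to any standard implementation:

import cern.colt.matrix.DoubleFactory2D;
import cern.colt.matrix.DoubleMatrix1D;
import cern.colt.matrix.DoubleMatrix2D;
import cern.colt.matrix.linalg.EigenvalueDecomposition;

public class SpectralEmbedding {

    // Steps 2-4: given the similarity matrix W (step 1), build D and L = D - W,
    // take the eigenvectors of the k smallest eigenvalues of L, and return the
    // N * k embedding whose rows are then clustered (assumes k <= N).
    public static DoubleMatrix2D embed(DoubleMatrix2D W, int k) {
        int n = W.rows();

        // Step 2: D is diagonal with the row/column sums of W; L = D - W.
        DoubleMatrix2D L = DoubleFactory2D.dense.make(n, n);
        for (int i = 0; i < n; i++) {
            double d_i = W.viewRow(i).zSum();
            for (int j = 0; j < n; j++) {
                L.set(i, j, (i == j ? d_i : 0.0) - W.get(i, j));
            }
        }

        // Step 3: eigen-decompose L (symmetric, so the eigenvalues are real).
        EigenvalueDecomposition eig = new EigenvalueDecomposition(L);
        DoubleMatrix1D lambda = eig.getRealEigenvalues();
        DoubleMatrix2D V = eig.getV();

        // Find the indexes of the k smallest eigenvalues (simple partial selection sort).
        int[] order = new int[n];
        for (int i = 0; i < n; i++) order[i] = i;
        for (int a = 0; a < k; a++) {
            for (int b = a + 1; b < n; b++) {
                if (lambda.get(order[b]) < lambda.get(order[a])) {
                    int tmp = order[a]; order[a] = order[b]; order[b] = tmp;
                }
            }
        }

        // Step 4: stack the k eigenvectors as columns of an N * k matrix.
        DoubleMatrix2D U = DoubleFactory2D.dense.make(n, k);
        for (int j = 0; j < k; j++) {
            DoubleMatrix1D col = V.viewColumn(order[j]);
            for (int i = 0; i < n; i++) U.set(i, j, col.get(i));
        }
        return U; // each row is the k-dimensional representation of one data point
    }
}

The rows of the returned N * k matrix are then clustered; note that the full SpectralClusterer below takes a different route and recursively bisects the graph by normalized cut instead of running k-means on an embedding.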
cut(A, B) = Σ_{i∈A, j∈B} w_ij
RatioCut(A, B) = cut(A, B)/|A| + cut(A, B)/|B|
NCut(A, B) = cut(A, B)/vol(A) + cut(A, B)/vol(B), where NCut is the normalized cut and vol(A) is the total weight of the edges attached to the vertices in A (the sum of their degrees).
Both |A| and vol(A) can be regarded as measures of the "size" of A; putting such a term in the denominator effectively prevents the degenerate case of splitting off isolated points and yields a more balanced partition.
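These definitions map directly onto the helper methods of the class below (asso, Nasso, Ncut). As a small self-contained illustration, again with Colt, the three quantities can be computed as follows; CutMeasures and its method names are made up for this sketch, and a and b hold the vertex indexes of the two sets:

import cern.colt.matrix.DoubleMatrix2D;

public class CutMeasures {

    // cut(A, B): sum of all edge weights between the two vertex sets.
    static double cut(DoubleMatrix2D W, int[] a, int[] b) {
        return W.viewSelection(a, b).zSum();
    }

    // vol(A): total weight of the edges attached to the vertices in A
    // (the sum of the degrees of the vertices in A).
    static double vol(DoubleMatrix2D W, int[] a) {
        double s = 0;
        for (int i : a) s += W.viewRow(i).zSum();
        return s;
    }

    // RatioCut(A, B) = cut(A, B)/|A| + cut(A, B)/|B|
    static double ratioCut(DoubleMatrix2D W, int[] a, int[] b) {
        double c = cut(W, a, b);
        return c / a.length + c / b.length;
    }

    // NCut(A, B) = cut(A, B)/vol(A) + cut(A, B)/vol(B)
    static double ncut(DoubleMatrix2D W, int[] a, int[] b) {
        double c = cut(W, a, b);
        return c / vol(W, a) + c / vol(W, b);
    }
}

The class below never computes vol directly: it uses the identity NCut(A, B) = 2 - Nasso(A, B), where Nasso(A, B) = asso(A, A)/asso(A, A∪B) + asso(B, B)/asso(B, A∪B), which holds because asso(A, A∪B) = asso(A, A) + cut(A, B).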
import cern.colt.list.IntArrayList;
import cern.colt.map.AbstractIntDoubleMap;
import cern.colt.map.OpenIntDoubleHashMap;
import cern.colt.matrix.DoubleFactory1D;
import cern.colt.matrix.DoubleFactory2D;
import cern.colt.matrix.DoubleMatrix1D;
import cern.colt.matrix.DoubleMatrix2D;
import cern.colt.matrix.linalg.EigenvalueDecomposition;
import cern.jet.math.Functions;
import weka.core.Instance;
import weka.core.Instances;

public class SpectralClusterer {

    /** The points of the dataset (one row per instance). */
    protected DoubleMatrix2D v;

    /** The cluster number assigned to each point of the dataset. */
    protected int[] cluster;

    /** The number of different clusters found. */
    protected int numOfClusters = 0;

    /** The alpha star parameter (stopping threshold for the recursive cut). */
    protected double alpha_star = 0.5;

    /** The distance cut factor (similarities for distances >= r are left at 0; -1 disables the cut-off). */
    protected double r = -1;

    /** The sigma scaling factor of the Gaussian similarity. */
    protected double sigma = 1.0;

    /** Whether to store the similarity matrix as a sparse matrix. */
    protected boolean useSparseMatrix = false;

    // Squared Euclidean distance between x and y.
    protected static double distnorm2(DoubleMatrix1D x, DoubleMatrix1D y) {
        DoubleMatrix1D z = x.copy();
        z.assign(y, Functions.minus);
        return z.zDotProduct(z);
    }

    // Merges two disjoint point sets (points are represented by integer indexes).
    protected static int[] merge(int[] a, int[] b) {
        int[] v = new int[a.length + b.length];
        System.arraycopy(a, 0, v, 0, a.length);
        System.arraycopy(b, 0, v, a.length, b.length);
        return v;
    }

    // Note: asso is the counterpart of cut. Given two point sets a, b and the similarity
    // matrix W, it returns the sum of the weights between a and b, i.e. cut(a, b) when
    // a and b are disjoint (the association degree between two parts of the graph).
    protected static double asso(DoubleMatrix2D W, int[] a, int[] b) {
        return W.viewSelection(a, b).zSum();
    }

    // Normalized association Nasso(a, b), computed on the subgraph a ∪ b.
    protected static double Nasso(DoubleMatrix2D W, int[] a, int[] b) {
        int[] v = merge(a, b);
        return Nasso(W, a, b, v);
    }

    // Normalized association of a and b relative to the specified subgraph v.
    protected static double Nasso(DoubleMatrix2D W, int[] a, int[] b, int[] v) {
        return asso(W, a, a) / asso(W, a, v) + asso(W, b, b) / asso(W, b, v);
    }

    // Returns the normalized dissimilarity (NCut) between two parts; uses NCut = 2 - Nasso.
    protected static double Ncut(DoubleMatrix2D W, int[] a, int[] b) {
        return 2 - Nasso(W, a, b);
    }

    // Returns the normalized dissimilarity (NCut) between two parts relative to the subgraph v.
    protected static double Ncut(DoubleMatrix2D W, int[] a, int[] b, int[] v) {
        return 2 - Nasso(W, a, b, v);
    }

    // Computes the best (minimum NCut) bisection of W and returns the two resulting
    // point sets (clusters).
    protected static int[][] bestCut(DoubleMatrix2D W) {
        int n = W.columns();
        // Builds the diagonal matrices D and D^(-1/2), represented by their diagonals.
        DoubleMatrix1D d = DoubleFactory1D.dense.make(n);
        DoubleMatrix1D d_minus_1_2 = DoubleFactory1D.dense.make(n);
        for (int i = 0; i < n; i++) {
            double d_i = W.viewRow(i).zSum();
            d.set(i, d_i);                          // d is the diagonal of D from step 2 (row sums of W)
            d_minus_1_2.set(i, 1 / Math.sqrt(d_i)); // diagonal of D^(-1/2)
        }
        DoubleMatrix2D D = DoubleFactory2D.sparse.diagonal(d); // the degree matrix D of step 2
        // X = D^(-1/2) * (D - W) * D^(-1/2): the normalized Laplacian, i.e. the
        // normalized version of L = D - W from step 2.
        DoubleMatrix2D X = D.copy();
        X.assign(W, Functions.minus);
        for (int i = 0; i < n; i++)
            for (int j = 0; j < n; j++)
                X.set(i, j, X.get(i, j) * d_minus_1_2.get(i) * d_minus_1_2.get(j));
        // Computes the eigenvalues and the eigenvectors of X.
        EigenvalueDecomposition e = new EigenvalueDecomposition(X);
        DoubleMatrix1D lambda = e.getRealEigenvalues();
        // Selects the eigenvector z_2 associated with the second smallest eigenvalue:
        // creates a map that contains the pairs <index, eigenvalue> ...
        AbstractIntDoubleMap map = new OpenIntDoubleHashMap(n);
        for (int i = 0; i < n; i++)
            map.put(i, Math.abs(lambda.get(i)));
        IntArrayList list = new IntArrayList();
        // ... sorts the map on the value ...
        map.keysSortedByValue(list);
        // ... and gets the index of the second smallest eigenvalue.
        int i_2 = list.get(1);
        // y_2 = D^(-1/2) * z_2
        DoubleMatrix1D y_2 = e.getV().viewColumn(i_2).copy();
        y_2.assign(d_minus_1_2, Functions.mult);
        // Creates a map that contains the pairs <i, y_2[i]> and sorts it on the value.
        map.clear();
        for (int i = 0; i < n; i++)
            map.put(i, y_2.get(i));
        map.keysSortedByValue(list);
        // Searches, among the splits induced by this ordering, the one that minimizes
        // the normalized cut of the partition.
        double best_cut = Double.POSITIVE_INFINITY;
        int[][] partition = new int[2][];
        // The array v contains all the elements of the graph ordered by their
        // projection on vector y_2.
        int[] v = list.elements();
        // For each admissible splitting point i:
        for (int i = 1; i < n; i++) {
            // a contains the elements whose projection on y_2 is less than or equal to
            // that of the i-th element; b contains the remaining elements.
            int[] a = new int[i];
            int[] b = new int[n - i];
            System.arraycopy(v, 0, a, 0, i);
            System.arraycopy(v, i, b, 0, n - i);
            double cut = Ncut(W, a, b, v);
            if (cut < best_cut) {
                best_cut = cut;
                partition[0] = a;
                partition[1] = b;
            }
        }
        return partition;
    }

    /**
     * Recursively splits the points of the graph while the value of the best cut
     * found stays below a specified limit (the alpha star factor).
     *
     * @param W          the weight matrix of the graph
     * @param alpha_star the alpha star factor
     * @return an array of sets of points (partitions)
     */
    protected static int[][] partition(DoubleMatrix2D W, double alpha_star) {
        // If the graph contains only one point:
        if (W.columns() == 1) {
            int[][] p = new int[1][1];
            p[0][0] = 0;
            return p;
        // Otherwise:
        } else {
            // Computes the best cut.
            int[][] cut = bestCut(W);
            // Computes the value of the cut found (null selects the whole current graph).
            double cutVal = Ncut(W, cut[0], cut[1], null);
            // If the value is below alpha star:
            if (cutVal < alpha_star) {
                // Recursively partitions the first part found ...
                DoubleMatrix2D W0 = W.viewSelection(cut[0], cut[0]);
                int[][] p0 = partition(W0, alpha_star);
                // ... and the second one.
                DoubleMatrix2D W1 = W.viewSelection(cut[1], cut[1]);
                int[][] p1 = partition(W1, alpha_star);
                // Merges the partitions found in the recursive steps, mapping the local
                // indexes back to the indexes of the current graph.
                int[][] p = new int[p0.length + p1.length][];
                for (int i = 0; i < p0.length; i++) {
                    p[i] = new int[p0[i].length];
                    for (int j = 0; j < p0[i].length; j++)
                        p[i][j] = cut[0][p0[i][j]];
                }
                for (int i = 0; i < p1.length; i++) {
                    p[i + p0.length] = new int[p1[i].length];
                    for (int j = 0; j < p1[i].length; j++)
                        p[i + p0.length][j] = cut[1][p1[i][j]];
                }
                return p; // all the acceptable partitions, i.e. all the clusters found
            } else {
                // Otherwise returns the whole current graph as a single partition,
                // without any recursive invocation.
                int[][] p = new int[1][W.columns()];
                for (int i = 0; i < p[0].length; i++)
                    p[0][i] = i;
                return p;
            }
        }
    }

    public int numberOfClusters() throws java.lang.Exception {
        return numOfClusters;
    }

    // Classification: assumes the clusterer has already been built; returns the cluster
    // of the dataset point closest to the given instance.
    public int clusterInstance(Instance instance) throws java.lang.Exception {
        DoubleMatrix1D u = DoubleFactory1D.dense.make(instance.toDoubleArray());
        double min_dist = Double.POSITIVE_INFINITY;
        int c = -1;
        for (int i = 0; i < v.rows(); i++) {
            double dist = distnorm2(u, v.viewRow(i));
            if (dist < min_dist) {
                c = cluster[i];
                min_dist = dist;
            }
        }
        return c;
    }

    // Clustering.
    public void buildClusterer(Instances data) throws java.lang.Exception {
        int n = data.numInstances();
        int k = data.numAttributes();
        DoubleMatrix2D w;
        if (useSparseMatrix)
            w = DoubleFactory2D.sparse.make(n, n);
        else
            w = DoubleFactory2D.dense.make(n, n);
        double[][] v1 = new double[n][];
        for (int i = 0; i < n; i++)
            v1[i] = data.instance(i).toDoubleArray();
        v = DoubleFactory2D.dense.make(v1);
        double sigma_sq = sigma * sigma;
        // Sets up the similarity matrix W (step 1): Gaussian similarity, with entries
        // beyond the distance cut factor r left at 0. Note that distnorm2 already
        // returns the squared distance, which is squared once more here.
        for (int i = 0; i < n; i++)
            for (int j = i; j < n; j++) {
                double dist = distnorm2(v.viewRow(i), v.viewRow(j));
                if ((r == -1) || (dist < r)) {
                    double sim = Math.exp(-(dist * dist) / (2 * sigma_sq));
                    w.set(i, j, sim);
                    w.set(j, i, sim);
                }
            }
        // Partitions the points recursively.
        int[][] p = partition(w, alpha_star);
        // Deploys the clustering results.
        numOfClusters = p.length;
        cluster = new int[n];
        for (int i = 0; i < p.length; i++)
            for (int j = 0; j < p[i].length; j++)
                cluster[p[i][j]] = i;
    }
}
partition calls bestCut to split W, recursing until the value of the best cut found is no longer below alpha_star.
bestCut follows the steps at the top of this post, with some differences: it builds W and the degree matrix D, but instead of L = D - W it uses the normalized Laplacian D^(-1/2) (D - W) D^(-1/2), computes its eigenvalues and eigenvectors, and then sweeps the vertex ordering induced by the eigenvector of the second smallest eigenvalue to find the split with the lowest NCut.
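For completeness, a hedged usage sketch of the class above; the demo class name and the ARFF path are placeholders, and numeric attributes are assumed since each instance is converted with toDoubleArray:

import java.io.BufferedReader;
import java.io.FileReader;
import weka.core.Instance;
import weka.core.Instances;

public class SpectralClusterDemo {
    public static void main(String[] args) throws Exception {
        // Load a numeric dataset (the path is just a placeholder).
        Instances data = new Instances(new BufferedReader(new FileReader("data.arff")));

        SpectralClusterer sc = new SpectralClusterer();
        sc.buildClusterer(data);                 // builds W and partitions it recursively
        System.out.println("clusters found: " + sc.numberOfClusters());

        // Assign an existing instance to its nearest cluster.
        Instance first = data.instance(0);
        System.out.println("cluster of first instance: " + sc.clusterInstance(first));
    }
}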
還需要理解。。
總結