## 2013年4月8日 星期一

### [ ML In Action ] Unsupervised learning : The k-means clustering algorithm (2)

Preface:

Bisecting k-means:

1. Start with all the points in one cluster
2. While the number of clusters is less than k
3.     for every cluster
4.         measure total error
5.         perform k-means clustering with k=2 on the given cluster
6.         measure total error after k-means has split the cluster in two
7.     choose the cluster split that gives the lowest error and commit this split

1. def biKmeans(dataSet, k, distMeas=distEclud):
2.     m = shape(dataSet)[0]
3.     clusterAssment = mat(zeros((m,2)))
4.     centroid0 = mean(dataSet, axis=0).tolist()[0]
5.     centList =[centroid0] #create a list with one centroid
6.     for j in range(m): # calc initial Error
7.         clusterAssment[j,1] = distMeas(mat(centroid0), dataSet[j,:])**2
8.     while (len(centList) < k):
9.         lowestSSE = inf
10.         for i in range(len(centList)): # Search the best group to do k-means with best/smallest SSE.
11.             ptsInCurrCluster = dataSet[nonzero(clusterAssment[:,0].A==i)[0],:] #get the data points currently in cluster i
12.             centroidMat, splitClustAss = kMeans(ptsInCurrCluster, 2, distMeas)
13.             sseSplit = sum(splitClustAss[:,1]) # compare the SSE to the SSE of currrent split
14.             sseNotSplit = sum(clusterAssment[nonzero(clusterAssment[:,0].A!=i)[0],1]) # compare the SSE of instances except ones in cluster i
15.             print "sseSplit, and notSplit: ",sseSplit,sseNotSplit
16.             if (sseSplit + sseNotSplit) < lowestSSE:
17.                 bestCentToSplit = i
18.                 bestNewCents = centroidMat
19.                 bestClustAss = splitClustAss.copy()
20.                 lowestSSE = sseSplit + sseNotSplit
21.         bestClustAss[nonzero(bestClustAss[:,0].A == 1)[0],0] = len(centList) # Increase the cluster index
22.         bestClustAss[nonzero(bestClustAss[:,0].A == 0)[0],0] = bestCentToSplit
23.         print 'the bestCentToSplit is: ',bestCentToSplit
24.         print 'the len of bestClustAss is: ', len(bestClustAss)
25.         centList[bestCentToSplit] = bestNewCents[0,:].tolist()[0] # replace a centroid with two best centroids
26.         centList.append(bestNewCents[1,:].tolist()[0])
27.         clusterAssment[nonzero(clusterAssment[:,0].A == bestCentToSplit)[0],:]= bestClustAss #reassign new clusters, and SSE
28.     return mat(centList), clusterAssment
Experiment:

>>> import kMeans # 剛剛的代碼存放在 kMeans.py, 這邊載入 kMeans module
>>> from numpy import * # 載入 numpy 模組
>>> datMat3 = mat(kMeans.loadDataSet('testSet2.txt')) # 載入 test data set
>>> centList, myNewAssments=kMeans.biKmeans(datMat3, 3) # 執行 bisecting k-means, 切割成 3 個 clusters
>>> centList # 列印出 3 個 cluster 的 centroid
matrix([[-0.45965615, -2.7782156 ],
[ 2.93386365, 3.12782785],
[-2.94737575, 3.3263781 ]])

>>> import matplotlib.pyplot as plt
>>> fig = plt.figure()
>>> ax = fig.add_subplot(111) # 建立 Axes 物件 (原文缺少這行, 否則 ax 未定義)
>>> ax.scatter(x=array(datMat3[:,0]).flatten(), y=array(datMat3[:,1]).flatten())

>>> plt.show()

#-*- coding: utf-8 -*-
from numpy import *
import matplotlib.pyplot as plt
import kMeans

# 1) Execute the bisecting k-means clustering algorithm
k = 3  # How many clusters?
datMat3 = mat(kMeans.loadDataSet('testSet2.txt'))  # load the test data set (line was missing)
rst = kMeans.biKmeans(datMat3, k)  # rst = (centroid matrix, cluster assignment matrix)

# 2) Output the centroids
fig = plt.figure()
ax = fig.add_subplot(111)  # was missing: scatter() needs an Axes object
ax.scatter(x=array(rst[0][:, 0]).flatten(), y=array(rst[0][:, 1]).flatten(), c='pink', s=80)

# 3) Output the data points, colored according to their cluster
# Fixed: the original list had no commas, so the adjacent string literals
# concatenated into the single string 'redbluegreenyellow'.
color = ['red', 'blue', 'green', 'yellow']
for i in range(k):
    smc = datMat3[nonzero(rst[1][:, 0].A == i)[0]]  # sub data set of cluster i
    ax.scatter(x=array(smc[:, 0]).flatten(), y=array(smc[:, 1]).flatten(), c=color[i], s=30)
plt.show()

### [ Python 文章收集 ] List Comprehensions and Generator Expressions

Source From  Here   Preface   Do you know the difference between the following syntax?  view plain copy to clipboard print ? [x  for ...