操作方法
# Step 1: compute the Euclidean distance and draw the initial sample;
# k is the total number of clusters.
import random

import numpy as np


def calculate_distance(vector1, vector2):
    """Return the Euclidean distance between two equal-length vectors.

    Both arguments are converted with ``np.asarray`` first, so plain lists
    and tuples are accepted as well as NumPy arrays.

    Args:
        vector1: first vector (array-like of numbers).
        vector2: second vector (array-like of numbers).

    Returns:
        The square root of the summed squared component differences.
    """
    difference = np.asarray(vector1) - np.asarray(vector2)
    return np.sqrt(np.sum(np.square(difference)))


def initialize_centroids(data, k):
    """Pick ``k`` distinct samples of ``data`` at random as initial centroids.

    Args:
        data: indexable collection of samples supporting ``len()`` (a list
            or a NumPy array whose first axis enumerates the samples).
        k: number of centroids to draw.

    Returns:
        A list holding the ``k`` selected samples.

    Raises:
        ValueError: propagated from ``random.sample`` when ``k`` is larger
            than ``len(data)`` or negative.
    """
    # Sample positions instead of the samples themselves: random.sample()
    # insists on a real sequence in recent Python versions and a NumPy array
    # does not qualify. Drawing from range(len(data)) makes the same picks
    # for a given random state.
    chosen = random.sample(range(len(data)), k)
    return [data[index] for index in chosen]
# Step 2: build the new clusters by assigning every sample to its nearest
# centroid, then recompute the centroids.


def minimun_distance(data, centroidlist):
    """Group every sample under the index of its closest centroid.

    Distances are measured with ``calculate_distance``. When two centroids
    are equally close, the one with the lower index wins because only a
    strictly smaller distance replaces the current best.

    Args:
        data: iterable of sample vectors.
        centroidlist: collection of centroid vectors; a centroid's position
            in this collection is used as its cluster key.

    Returns:
        A dict mapping a centroid index to the list of samples assigned to
        it. An index that attracts no sample has no entry in the dict.
    """
    clusterdictionary = {}
    for sample in data:
        marker = 0
        # float('inf') guarantees the first real distance becomes the best.
        min_dist = float('inf')
        for index, centroid in enumerate(centroidlist):
            distance = calculate_distance(sample, centroid)
            if distance < min_dist:
                min_dist = distance
                marker = index
        clusterdictionary.setdefault(marker, []).append(sample)
    return clusterdictionary


def getcentroids(clusterdictionary):
    """Return the mean vector of every cluster as one array.

    Args:
        clusterdictionary: dict mapping a cluster key to the list of sample
            vectors belonging to that cluster.

    Returns:
        A NumPy array with one row per key of ``clusterdictionary``, rows
        ordered by ascending key. Clusters missing from the dict contribute
        no row, so the result may hold fewer rows than the centroid list the
        clusters were built from.
    """
    import numpy as np

    # Walk the keys in sorted order so that the row order does not depend on
    # the order in which clusters happened to be inserted into the dict.
    centroidlist = [
        np.mean(np.array(clusterdictionary[key]), axis=0)
        for key in sorted(clusterdictionary)
    ]
    return np.array(centroidlist)
# Step 3: load the data and run the computation; leave the loop once the
# change between two iterations drops below a threshold.


def getmsd(clusterdictionary, centroidlist):
    """Return the total distance between samples and their centroid.

    Despite the name, the value is a plain sum of Euclidean distances (as
    returned by ``calculate_distance``); nothing is squared or averaged.

    Args:
        clusterdictionary: dict mapping a centroid index to the list of
            samples assigned to that centroid.
        centroidlist: collection of centroids indexed by the dict's keys.

    Returns:
        The summed sample-to-centroid distance over all clusters, as a float.
    """
    total = 0.0
    for key in clusterdictionary:
        centroid = centroidlist[key]
        # Accumulate per cluster first, then add the subtotal to the total.
        cluster_distance = 0.0
        for sample in clusterdictionary[key]:
            cluster_distance += calculate_distance(centroid, sample)
        total += cluster_distance
    return total


def showresult(clusterdictionary, centroidlist):
    """Plot each cluster and its centroid with matplotlib and show the figure.

    Only the first two components of every sample and centroid are drawn.
    Four marker styles are defined; cluster indices beyond that reuse the
    styles cyclically.

    Args:
        clusterdictionary: dict mapping a centroid index to the list of
            samples assigned to that centroid.
        centroidlist: collection of centroids indexed by the dict's keys.
    """
    import matplotlib.pyplot as plt

    colormark = ['or', 'ob', 'og', 'ok']
    centroidmark = ['dr', 'db', 'dg', 'dk']
    for key in clusterdictionary:
        style = key % len(colormark)
        plt.plot(centroidlist[key][0], centroidlist[key][1],
                 centroidmark[style], markersize=12)
        for sample in clusterdictionary[key]:
            plt.plot(sample[0], sample[1], colormark[style])
    # show must be called; a bare reference never opens the window.
    plt.show()


path = 'C:\\Users\\jyjh\\Desktop\\data.txt'
import re  # not referenced in this section; kept in case other code relies on it

# Each non-blank line of the file is one sample: tab-separated numbers.
temp = list()
with open(path, 'r') as handle:
    for line in handle:
        line = line.strip()
        if not line:
            continue
        temp.append([float(field) for field in line.split('\t')])
data = np.array(temp)

# Initial assignment with four randomly chosen centroids.
centroidlist = initialize_centroids(data, 4)
clusterdictionary = minimun_distance(data, centroidlist)
new_msd = getmsd(clusterdictionary, centroidlist)
old_msd = -0.000001
k = 2  # iteration counter; incremented below but not read anywhere here

# Recompute centroids and reassign samples until the total distance changes
# by less than the threshold between two consecutive iterations.
while abs(new_msd - old_msd) >= 0.00001:
    centroidlist = getcentroids(clusterdictionary)
    clusterdictionary = minimun_distance(data, centroidlist)
    old_msd = new_msd
    new_msd = getmsd(clusterdictionary, centroidlist)
    k += 1

print(new_msd - old_msd)
showresult(clusterdictionary, centroidlist)