# Kỹ thuật đọc file Excel, phân cụm dữ liệu và ghi kết quả phân cụm vào file dữ liệu
import xlrd
import xlwt
import numpy as np
import matplotlib.pyplot as plt
from openpyxl import load_workbook
# Workbook that holds the input points and later receives the cluster labels.
# Raw string: "\V" is not a valid escape sequence, so the non-raw literal
# triggers an invalid-escape warning on recent Python versions.
file_location = r"D:\VDKM.xlsx"
wb = load_workbook(file_location)
sheet = wb.active

# Number of clusters.
K = 2

# Read every cell of the used range, one list per worksheet row.
# iter_rows() covers columns 1..max_column inclusive; the previous
# range(1, sheet.max_column) silently dropped the last column.
data = [list(row) for row in sheet.iter_rows(values_only=True)]

# Skip the first row (presumably a header -- confirm against the workbook) and
# keep worksheet columns 2 and 3 as the two numeric features.
X = np.array(data)[1:, 1:3].astype(np.float32)
# K-means
def init_centers(X, k):
    """Pick ``k`` distinct rows of ``X`` at random as the starting centres.

    Args:
        X: 2-D array with one data point per row.
        k: Number of centres to draw.

    Returns:
        Array holding the ``k`` selected rows of ``X``.
    """
    row_count = X.shape[0]
    # replace=False guarantees that no row index is drawn twice.
    chosen_rows = np.random.choice(row_count, k, replace=False)
    return X[chosen_rows]
# 2. Assign every point to a cluster
def group_data(X, centers):
    """Label each row of ``X`` with the index of its nearest centre.

    Args:
        X: 2-D array with one data point per row.
        centers: 2-D array with one centre per row.

    Returns:
        1-D float64 array with one label per row of ``X``. When several
        centres are equally close, the lowest index is chosen.
    """
    nearest = [
        # Euclidean distance from this point to every centre, then the closest.
        np.argmin(np.linalg.norm(point - centers, axis=1))
        for point in X
    ]
    return np.array(nearest, dtype=np.float64)
# Plotting helpers
def plot_result_K(X, y, centers, k, title):
    """Show the raw data points in a single scatter plot.

    The points are drawn once. The earlier version looped over the
    module-level ``K`` and redrew the same points on every pass.

    Args:
        X: 2-D array; column 0 is plotted on the x-axis, column 1 on the y-axis.
        y: Unused; accepted so the signature matches ``plot_result``.
        centers: Unused; accepted for the same reason.
        k: Unused; accepted for the same reason.
        title: Title of the figure.
    """
    # The label gives plt.legend() an entry to show; without any labelled
    # artist matplotlib warns that there is nothing to put in the legend.
    plt.scatter(X[:, 0], X[:, 1], s=250, label='data points')
    plt.title(title)
    plt.legend()
    plt.grid()
    plt.show()
def plot_result(X, y, centers, k, title):
    """Scatter-plot the points of each cluster together with the centres.

    Args:
        X: 2-D array; column 0 is plotted on the x-axis, column 1 on the y-axis.
        y: 1-D array with the cluster label of every row of ``X``.
        centers: 2-D array with one centre per row.
        k: Number of clusters to draw (labels ``0 .. k-1``).
        title: Title of the figure.
    """
    # Iterate over the ``k`` argument, not the module-level ``K``, so the
    # function draws the number of clusters the caller asked for.
    for i in range(k):
        members = y == i
        plt.scatter(X[members, 0],
                    X[members, 1],
                    s=250,
                    label='cluster ' + str(i + 1))
    plt.scatter(centers[:, 0],
                centers[:, 1],
                s=250,
                marker='*',
                c='red',
                label='centroids')
    plt.title(title)
    plt.legend()
    plt.grid()
    plt.show()
# 3. Update the cluster centres
def update_centers(X, y, k):
    """Recompute each centre as the mean of the points assigned to it.

    A cluster with no assigned points would make ``np.mean`` return NaN (with
    a RuntimeWarning). Such a cluster is instead re-seeded at the data point
    that lies farthest from its own newly computed centre; several empty
    clusters take the farthest, second farthest, ... points in turn.

    Args:
        X: 2-D array with one data point per row.
        y: Sequence with the cluster label of every row of ``X``.
        k: Number of clusters (labels ``0 .. k-1``).

    Returns:
        float64 array of shape ``(k, X.shape[1])``. If there are more empty
        clusters than data points, the surplus centres stay at zero.
    """
    y = np.asarray(y)
    centers = np.zeros((k, X.shape[1]))
    # Distance of every point to the centre of the cluster it belongs to.
    spread = np.zeros(X.shape[0])
    empty_clusters = []
    for i in range(k):
        members = y == i
        if members.any():
            centers[i] = X[members].mean(axis=0)
            spread[members] = np.linalg.norm(X[members] - centers[i], axis=1)
        else:
            empty_clusters.append(i)
    if empty_clusters:
        # Stable sort keeps the choice deterministic when distances tie.
        farthest_first = np.argsort(-spread, kind="stable")
        for cluster, row in zip(empty_clusters, farthest_first):
            centers[cluster] = X[row]
    return centers
# K-means algorithm
def kmeans(X, k, initial_centers=None, max_iter=100):
    """Cluster the rows of ``X`` into ``k`` groups, plotting each iteration.

    The earlier version always started from the hard-coded centres
    ``[[1, 1], [1, 1]]``. That ignored ``k``, and because both centres were
    identical every point fell into cluster 0, cluster 1 became NaN, and the
    labels then alternated between all-0 and all-1 without ever converging.

    Args:
        X: 2-D array with one data point per row.
        k: Number of clusters.
        initial_centers: Optional ``(k, X.shape[1])`` array of starting
            centres. When omitted, ``k`` distinct rows of ``X`` are drawn at
            random with ``init_centers``.
        max_iter: Upper bound on the number of assignment/update rounds, so
            the loop ends even if the labels never stabilise.

    Returns:
        Tuple ``(centers, y)``: the final centres and the label of every row
        of ``X`` as produced by ``group_data``.

    Raises:
        ValueError: If the starting centres do not have shape
            ``(k, X.shape[1])``.
    """
    if initial_centers is None:
        initial_centers = init_centers(X, k)
    # Work on a float64 copy so the caller's array (or X itself) is not touched.
    centers = np.array(initial_centers, dtype=float)
    if centers.shape != (k, X.shape[1]):
        raise ValueError(
            "initial centers must have shape (%d, %d), got %s"
            % (k, X.shape[1], centers.shape)
        )

    y = []
    plot_result_K(X, y, centers, k, 'iter: ' + str(0))
    for iteration in range(max_iter):
        # Remember the previous assignment to detect convergence.
        y_old = y
        y = group_data(X, centers)
        # Converged: no point changed cluster.
        if np.array_equal(y, y_old):
            break
        plot_result(X, y, centers, k, 'iter: ' + str(iteration))
        new_centers = update_centers(X, y, k)
        # A cluster that lost all of its points may come back as NaN; keep its
        # previous centre so it can still attract points in the next round.
        lost = np.isnan(new_centers).any(axis=1)
        new_centers[lost] = centers[lost]
        centers = new_centers
    return (centers, y)
# Run k-means on the loaded data
centers, y = kmeans(X, K)

# Store the labels in worksheet column 5, starting at row 2, one per data point
for row_number, label in enumerate(y, start=2):
    sheet.cell(row=row_number, column=5, value=label)
wb.save(file_location)

print(y)
plot_result(X, y, centers, K, 'Final')