#!/usr/bin/env python
# coding: utf-8
"""K-medoids clustering of the iris data with a small Bokeh dashboard.

The numeric iris columns are projected onto three principal components,
clustered with a k-medoids loop that uses Manhattan (L1) distance, and the
resulting cost is shown next to two scatter plots.
"""

import numpy as np
from bokeh.models import ColumnDataSource, Button, Select, Div
from bokeh.sampledata.iris import flowers
from bokeh.plotting import figure, curdoc, show
from bokeh.layouts import column, row
from sklearn.decomposition import PCA

# read and store the dataset (the species label is not used for clustering)
data = flowers.copy(deep=True)
data = data.drop(['species'], axis=1)

# k-medoid algorithm using given medoids
m = len(data)

# Dimension Reduction
pca_components = PCA(n_components=3)
pca_data = pca_components.fit_transform(data)

# initialize the given medoids: row indices into the data set and the
# coordinates of those rows in PCA space (one row per medoid)
medoids = [24, 74, 124]
initial_medoids = pca_data[medoids]


def random_medoids(k):
    """Return the PCA coordinates of ``k`` distinct, randomly chosen rows.

    A locally seeded generator (seed 1) keeps the choice reproducible
    without touching NumPy's global random state.

    Returns:
        Array of shape ``(k, n_components)``; a copy, safe to modify.
    """
    rng = np.random.RandomState(1)
    chosen_rows = rng.choice(m, size=k, replace=False)
    return pca_data[chosen_rows]


def get_cost(k, meds):
    """Compute L1 distances from every point to every medoid.

    Args:
        k: Number of clusters (columns of the distance matrix).
        meds: Medoid coordinates, shape ``(k, n_components)``. A single
            medoid (1-D, or one row) is also accepted; its distances are
            repeated across all ``k`` columns.

    Returns:
        ``(dist_matrix, label)`` where ``dist_matrix`` has shape ``(m, k)``
        and ``label[i]`` is the column index of the medoid closest to
        point ``i``.
    """
    meds = np.atleast_2d(np.asarray(meds, dtype=float))
    # calculate distance matrix: broadcast (m, 1, d) against (1, k, d) and
    # sum absolute differences over the coordinate axis
    dist_matrix = np.empty((m, k))
    dist_matrix[:] = np.abs(
        pca_data[:, np.newaxis, :] - meds[np.newaxis, :, :]
    ).sum(axis=2)
    # assign points to clusters
    label = np.argmin(dist_matrix, axis=1)
    return (dist_matrix, label)


def update_medoids(k, meds):
    """Move each medoid to the cluster member with the lowest total L1 cost.

    Points are assigned to their nearest medoid; within each non-empty
    cluster the member with the smallest summed distance to the other
    members replaces the medoid, but only if it is strictly better than
    the current one. The input array is not modified.

    Returns:
        New array of medoid coordinates with the same shape as ``meds``.
    """
    meds = np.atleast_2d(np.asarray(meds, dtype=float))
    _, label = get_cost(k, meds)
    result_medoids = meds.copy()
    for i in np.unique(label):
        members = pca_data[label == i]
        current_cost = np.abs(members - meds[i]).sum()
        # total distance from each candidate member to all cluster members
        candidate_costs = np.abs(
            members[:, np.newaxis, :] - members[np.newaxis, :, :]
        ).sum(axis=(1, 2))
        best = np.argmin(candidate_costs)
        if candidate_costs[best] < current_cost:
            result_medoids[i] = members[best]
    return result_medoids


def has_converged(old_medoids, medoids):
    """Return True when both collections hold the same set of medoids."""
    return {tuple(x) for x in old_medoids} == {tuple(x) for x in medoids}


def k_medoids(k, starting_medoids=None, max_steps=np.inf):
    """Run k-medoids on ``pca_data``.

    Args:
        k: Number of clusters.
        starting_medoids: Initial medoid coordinates, shape
            ``(k, n_components)``. When None, ``random_medoids(k)`` is used.
        max_steps: Upper bound on update iterations.

    Returns:
        ``(medoids, label, cost)``: final medoid coordinates, the cluster
        index of every point, and the summed L1 distance of each point to
        its assigned medoid.
    """
    if starting_medoids is None:
        medoids = random_medoids(k)
    else:
        # copy so the caller's array is never modified
        medoids = np.atleast_2d(np.array(starting_medoids, dtype=float))

    converged = False
    i = 1
    while (not converged) and (i <= max_steps):
        old_medoids = medoids.copy()
        medoids = update_medoids(k, medoids)
        converged = has_converged(old_medoids, medoids)
        i += 1

    # Assign once more against the final medoids so labels and cost are
    # consistent even when the loop stopped on max_steps (or never ran).
    dist_matrix, label = get_cost(k, medoids)
    return (medoids, label, dist_matrix.min(axis=1).sum())


results = k_medoids(3)
final_medoids = results[0]
data['clusters'] = results[1]
final_cost = results[2]
print(results)

# set up dashboard
# create scatter plots for start up
# scatter plot of "Petal Length" vs "Sepal Length"
p1 = figure(title="Scatterplot of flower distribution by petal length and sepal length")
p1.xaxis.axis_label = 'Petal length'
p1.yaxis.axis_label = 'Sepal length'
p1.circle(data["petal_length"], data["sepal_length"],
          color='gray', fill_alpha=0.2, size=8)

# scatter plot of "Petal Width" vs "Petal Length"
p2 = figure(title="Scatterplot of flower distribution by petal width and petal length")
p2.xaxis.axis_label = 'Petal width'
p2.yaxis.axis_label = 'Petal length'
p2.circle(data["petal_width"], data["petal_length"],
          color='gray', fill_alpha=0.2, size=8)


# Random Medoids Select
def select_medoids(attr, old, new):
    """Placeholder callback for the medoid selector; currently does nothing."""
    pass


medoids_select = Select(options=['True', 'False'], value='False',
                        title='Random Medoids', width=250)
#medoids_select.on_change('value', select_medoids)

# Cluster Data Button
cluster_button = Button(label='Cluster Data', width=241)
#cluster_button.on_click(k_medoids)

# add text
div = Div(text="""The final cost is: """ + str(round(final_cost, 2)),
          width=250, height=100)

# create layout
show(row(column(medoids_select, cluster_button, div, width=260), p1, p2))
#curdoc().add_root(layout)