Week 3 (2/14-2/20)

Notebook

Weekly digest

Jupyter Notebook extensions

ipywidgets and interact

k-means

  • supervised vs unsupervised machine learning

  • k-means clustering algorithm

Resources

1. k-means

[1]:
%config InlineBackend.figure_format = 'retina'
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap


def get_distances(X, Y):
    """
    Computes squared Euclidean distances between all pairs of rows.

    X, Y:
        2-dimensional numpy arrays whose rows are coordinates of points.

    Returns:
        A 2-dimensional array whose (i, j)-entry is the square of the
        Euclidean distance between the i-th row of X and the j-th row of Y.
    """

    # Insert a length-1 axis into each array so that broadcasting pairs
    # every row of X with every row of Y.
    diff = X[:, None, :] - Y[None, :, :]
    # Sum the squared coordinate differences over the last (coordinate) axis.
    return np.square(diff).sum(axis=-1)


def kmeans(X, n):
    """
    Implements the k-means algorithm.

    X:
        2-dimensional numpy array whose rows are coordinates of data points.
        Non-floating-point data (e.g. integer pixel values) is converted
        to floats before clustering.
    n:
        Integer, the number of clusters.

    Returns:
        A list of tuples (labels, centers), one tuple for each iteration step.
        labels is a 1-dimensional array with labels of points in X, centers is
        a 2-dimensional array with coordinates of centroids of clusters.
        The last tuple on the list is the final clustering result.
    """

    # Work on a copy so the caller's array is never shared with the result.
    X = np.array(X)
    # Centroids are means, so they must be stored as floats: with integer
    # data the centroid array would inherit the integer dtype and every
    # computed mean would be truncated when stored.
    if not np.issubdtype(X.dtype, np.floating):
        X = X.astype(float)

    h = X.shape[0]
    # Initial centroids: n distinct rows of X selected at random.
    centers = X[np.random.choice(h, size=n, replace=False)]

    steps = []
    while True:
        d = get_distances(X, centers)
        # Each point gets the label of its nearest centroid.
        labels = np.argmin(d, axis=1)
        steps.append((labels, centers))

        # Start from the current centroids so that a cluster which received
        # no points keeps its previous centroid. Taking the mean of an empty
        # cluster would give NaN, and NaN never compares equal, so the
        # convergence test below could never succeed.
        new_centers = centers.copy()
        for j in range(n):
            members = X[labels == j]
            if len(members) > 0:
                new_centers[j] = np.mean(members, axis=0)

        # Converged once an update step leaves every centroid unchanged.
        if np.array_equal(centers, new_centers):
            break
        centers = new_centers
    return steps


def plot_clusters(X, labels=None, centers=None):
    """
    Plots clusters and their centroids.

    X:
        2-dimensional numpy array whose rows are coordinates of data points.
        Only the first two columns are plotted.
    labels:
        1-dimensional numpy array with labels of points in X. If omitted,
        all points are drawn without cluster colors.
    centers:
        2-dimensional numpy array whose rows are coordinates of cluster
        centroids. If omitted, no centroids are drawn.
    """

    palette = ['tab:blue', 'tab:red', 'tab:green', 'tab:orange', 'tab:purple']

    # Use one palette entry per distinct label (the palette has five entries).
    n_colors = 1 if labels is None else len(set(labels))
    colors = ListedColormap(palette[:n_colors])

    plt.figure(figsize=(6, 6))

    # Data points are drawn as "+" markers colored by their label.
    xs, ys = X[:, 0], X[:, 1]
    plt.scatter(xs, ys, c=labels, marker="+", cmap=colors)

    # Centroids are drawn as large outlined markers, colored by row index.
    if centers is not None:
        center_style = dict(s=250, linewidths=3, edgecolors="k", cmap=colors)
        plt.scatter(centers[:, 0],
                    centers[:, 1],
                    c=range(centers.shape[0]),
                    **center_style)
    plt.show()


def plot_iteration(X, steps, k=0):
    """
    Plots the clustering recorded at one iteration step of k-means.

    X:
        2-dimensional numpy array whose rows are coordinates of data points.
    steps:
        A list with cluster data produced by the kmeans function; each
        entry holds the labels first and the centroids second.
    k:
        An index into the steps list selecting the iteration to plot.
    """

    step = steps[k]
    plot_clusters(X, step[0], step[1])

2. Plotting nearest neighbors

[2]:
%config InlineBackend.figure_format = 'retina'
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap, Normalize


def plot_clusters(X, y, p=None, p_label=None, neighbors=None):
    """
    Plots labeled points and, optionally, the nearest neighbors of a
    given point.

    X:
        A 2-dimensional numpy array with coordinates
        of points in clusters. Only the first two columns are plotted.
    y:
        A 1-dimensional numpy array with labels of points
    p:
        An array with coordinates of the point whose
        neighbors will be plotted. If omitted, no such point is drawn.
    p_label:
        The predicted label of the point p. If omitted, p is drawn
        in black.
    neighbors:
        A list of row numbers of X which are the nearest
        neighbors of the point p. These points are circled in black.
    """

    # build a custom colormap
    col_list = ['dodgerblue', 'limegreen', 'red', 'orange', 'fuchsia']
    colors = ListedColormap(col_list)
    # Fixed normalization so that a label always maps to the same color,
    # regardless of which labels are present in y.
    norm = Normalize(0, len(col_list))

    # Matplotlib 3.6 renamed its bundled seaborn styles to 'seaborn-v0_8*'
    # and later removed the old names; use whichever name is installed.
    if 'seaborn-v0_8' in plt.style.available:
        style_name = 'seaborn-v0_8'
    else:
        style_name = 'seaborn'

    plt.figure(figsize=(8, 8))
    with plt.style.context(style_name):
        scatter = plt.scatter(X[:, 0],
                              X[:, 1],
                              c=y,
                              s=90,
                              cmap=colors,
                              norm=norm,
                              label=y)
        if p is not None:
            # The star marking p is filled with the color of its predicted
            # label, or with black when no label is given.
            p_col = "k" if p_label is None else colors(norm(p_label))
            plt.plot(p[0], p[1], marker="*", mfc=p_col, mec="k", ms=30, mew=2)
        if neighbors is not None:
            # Unfilled black circles drawn around the neighbor points.
            plt.scatter(X[neighbors, 0],
                        X[neighbors, 1],
                        edgecolors='black',
                        linewidth=3,
                        facecolors="None",
                        s=300)
        # One legend entry per distinct label present in y.
        plt.legend(*scatter.legend_elements(),
                   markerscale=1.5,
                   prop={
                       "size": 12,
                       "weight": "normal"
                   })
    plt.show()

Exercises

Exercise 1

Here are some JPEG image files (you can also use other images for this exercise):

  • Use the k-means algorithm to organize colors of pixels in these images into \(k\) clusters (for different values of \(k\)).

  • Make a plot showing colors of cluster centroids.

  • Display the images, but replace the color of each pixel with the color of the centroid of the cluster to which the pixel belongs.