In this post I describe how to implement the DBSCAN clustering algorithm using Jaccard distance as its metric, in a way that can handle sparse data.

Overview

The DBSCAN clustering algorithm will be implemented in Python as described in this Wikipedia article. The algorithm uses Jaccard distance (1 minus the Jaccard index) to measure the distance between points. In this implementation, two points that are exactly a distance epsilon apart are taken to be in the same cluster, so the neighborhood test uses less-than-or-equal rather than strictly-less-than.
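
As a quick illustration of the metric (the points and values here are made up for the example), take two points represented by the index sets of their non-zero coordinates:

a = {1, 4, 7}  # indexes of the non-zero coordinates of point a
b = {1, 4, 9}  # indexes of the non-zero coordinates of point b

intersection = len(a & b)  # 2
union = len(a | b)         # 4

# Jaccard index = 2/4 = 0.5, so Jaccard distance = 1 - 0.5 = 0.5
jaccard_distance = 1.0 - float(intersection) / union

With eps = 0.5 these two points count as neighbors, precisely because the comparison is less-than-or-equal; with any smaller eps they do not.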

This method should support a large number of sparse points in very high-dimensional space (~10,000 dimensions). This means that the points, even though they lie in a very high-dimensional space, have only a few non-zero coordinates each.
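
The implementation below relies on matrix operations (.dot, .sum(axis=1) and .A), so it expects the points as a binary matrix, e.g. a scipy.sparse one. A minimal sketch of building such a matrix (the coordinates are made up for illustration):

import numpy as np
from scipy.sparse import csr_matrix

# Two binary points in a 10,000-dimensional space, with only
# three non-zero coordinates each
rows = np.array([0, 0, 0, 1, 1, 1])
cols = np.array([1, 4, 7, 1, 4, 9])
vals = np.ones(6)
points = csr_matrix((vals, (rows, cols)), shape=(2, 10000))

print points.shape  # (2, 10000), but only 6 values are actually stored
print points.nnz    # 6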

In order to test the algorithm I will use this data folder, which contains four files of increasing size and dimensionality:

  • data_10points_10dims.dat
  • data_100points_100dims.dat
  • data_1000points_1000dims.dat
  • data_10000points_10000dims.dat

Let’s have a look at the first data file, just to get an idea of what the input looks like.
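
To inspect the file yourself, something along these lines should work (assuming, as the script below does, that each file is simply a pickled points matrix):

import cPickle as pickle

with open("test_files/data_10points_10dims.dat", "rb") as f:
    points = pickle.load(f)

# Assuming a scipy.sparse matrix, this prints the full 10x10 matrix
print points.todense()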

Now we are ready to walk through the Python implementation.

import cPickle as pickle
import numpy as np

# Simple class used as an enum to track point statuses
class Status:
    New, Visited, Noise = range(3)

# Create variable to track the status of each point
_status = None

# Create variable to track membership of each point
_is_member = None

# Create variable to store distance between points
_distance = None

# Create variable to store the indexes of each point's neighbors, ordered by distance
_distance_idx = None


# Create the DBSCAN algorithm as described in the Wikipedia article
def dbscan(points, eps=0.15, min_pts=2):
    
    clusters = []
    n_pts = points.shape[0]

    # Pre-compute the pairwise distances between the points to be clustered
    _pre_compute_distance(points)

    for k in range(n_pts):
        if _status[k] == Status.Visited:
            continue
        _status[k] = Status.Visited
        neighbors = _region_query(k, eps)
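        # A point with fewer than min_pts neighbors (itself included) is
        # provisionally noise; it may still be absorbed later as a border
        # point of another cluster in _expand_cluster.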
        if len(neighbors) < min_pts:
            _status[k] = Status.Noise
        else:
            cluster = _expand_cluster(k, neighbors, eps, min_pts)
            clusters.append(cluster)

    return clusters


def _expand_cluster(point, neighbors, eps, min_pts):
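    # Grow a new cluster from the given core point, as in the expandCluster
    # step of the Wikipedia pseudocode: every neighbor that turns out to be
    # a core point itself contributes its own neighborhood to the seed set.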
    global _status
    global _is_member

    cluster = []
    _add_to_cluster(cluster, point)

    while neighbors:
        k = neighbors.pop()
        if _status[k] != Status.Visited:
            _status[k] = Status.Visited
            extended_neighbors = _region_query(k, eps)
            if len(extended_neighbors) >= min_pts:
                neighbors.update(extended_neighbors)
        if not _is_member[k]:
            _add_to_cluster(cluster, k)

    return cluster


def _region_query(center, eps):
    global _distance
    # Use a set rather than a list so that _expand_cluster can merge
    # neighborhoods with update() without creating duplicates
    neighbors = set()
    n_pts = _distance.shape[0]
    i = 0
    # Use the presorted/precomputed distances: walk the neighbors in order
    # of increasing distance and stop at the first one farther than eps
    while i < n_pts and _distance[center, _distance_idx[center, i]] <= eps:
        neighbors.add(_distance_idx[center, i])
        i += 1
    return neighbors


def _pre_compute_distance(points):
    global _is_member
    global _status
    global _distance
    global _distance_idx

    n_pts = points.shape[0]

    _is_member = [False] * n_pts
    _status = [Status.New] * n_pts

    # n[i, j] holds the number of non-zero coordinates of point i,
    # broadcast across all columns j
    n = points.sum(axis=1).dot(np.ones((1, n_pts))).A

    # Compute the Jaccard distance: 1 - |A & B| / (|A| + |B| - |A & B|),
    # where |A| + |B| is given by n + n.T
    n_intersect = points.dot(points.transpose())
    _distance = 1.0 - (n_intersect / (n + n.T - n_intersect))
    # The neighbors/column indexes are sorted by distance to decrease the
    # running time of the _region_query method
    _distance_idx = np.argsort(_distance, axis=1)


def _add_to_cluster(cluster, k):
    cluster.append(k)
    _is_member[k] = True


if __name__ == "__main__":

    with open("test_files/data_10points_10dims.dat", "rb") as file_descriptor:
        points_to_cluster = pickle.load(file_descriptor)
    # dbscan() pre-computes the distances itself, so no extra setup is needed
    clusters = dbscan(points_to_cluster, eps=0.4, min_pts=2)
    if len(clusters) > 0:
        print "FOR data_10points_10dims.dat file the output is the following: \n"
        print "The following clusters were found:"
        cluster_lengths = []
        for cluster in clusters:
            print cluster
            cluster_lengths.append(len(cluster))
        noise = [point for point, status in enumerate(_status) if status == Status.Noise]
        print "\nThe noise points are:\n", noise
        print "\nThe largest cluster contains: \n%d points" % max(cluster_lengths)
    else:
        print "No clusters were found!"
