In this post I describe how to implement the DBSCAN clustering algorithm to work with Jaccard-distance as its metric. It should be able to handle sparse data.
The DBSCAN clustering algorithm will be implemented in Python as described in this Wikipedia article. The algorithm will use Jaccard-distance (1 minus Jaccard index) when measuring distance between points. In this implementation, two points that are exactly a distance epsilon apart are taken to be in the same cluster, thus greater-than-or-equal and not should be used.
This method could support a large number of sparse points in very high-dimensional space (~10,000 dimensions). This means that the points – even though they lie in very high dimensional space – only have few non-zero coordinates.
In order to test the algorithm I will use this data folder including 4 files of increasing dimensions.
Let’s have a look at the first data file, just to have in mind how it looks like.
Now we are ready to run the python class.
import cPickle as pickle import numpy as np # Create simple class used as Enum class Status: def __init__(self): pass New, Visited, Noise = range(3) # Create variable to track the status of each point _status = None # Create variable to track membership of each point _is_member = None # Create variable to store distance between points _distance = None # Create variable to store indexes of ordered neighbors based on distance _distance_idx = None; # Create the DBSCAN algorithm as dicribed in Wikipedia def dbscan(points, eps=0.15, min_pts=2): clusters =  n_pts = points.shape # Compute the distances of the points to be clustered _pre_compute_distance(points) for k in range(n_pts): if _status[k] == Status.Visited: continue _status[k] = Status.Visited neighbors = _region_query(k, eps) if len(neighbors) < min_pts: _status[k] = Status.Noise else: cluster = _expand_cluster(k, neighbors, eps, min_pts) clusters.append(cluster) return clusters def _expand_cluster(point, neighbors, eps, min_pts): global _status global _is_member cluster =  _add_to_cluster(cluster, point) while neighbors: k = neighbors.pop() if _status[k] != Status.Visited: _status[k] = Status.Visited extended_neighbors = _region_query(k, eps) if len(extended_neighbors) >= min_pts: neighbors.update(extended_neighbors) if not _is_member[k]: _add_to_cluster(cluster, k) return cluster def _region_query(center, eps): global _distance # Create set instead of a list because in this case it would be necessary to iterate throught. neighbors = set() i = 0 # Use presorted/precomputed distances. When the closest accepted neighbors will be # found, iteration will stops while _distance[center, _distance_idx[center, i]] <= eps: neighbors.add(_distance_idx[center, i]) i += 1 return neighbors def _pre_compute_distance(points): global _is_member global _status global _distance global _distance_idx n_pts = points.shape _distance = np.zeros((n_pts, n_pts)) _is_member = [False] * n_pts _status = [Status.New] * n_pts # Give the number of non-zero elements n = points.sum(axis=1).dot(np.ones((1, n_pts))).A # Compute the distance (the full array can be constructed using A + A.T) n_intersect = points.dot(points.transpose()) _distance = 1.0 - (n_intersect / (n + n.T - n_intersect)) # The neighbors/column-indexes are sorted based on distance, to decrease the running-time # of _range_query method _distance_idx = np.argsort(_distance, axis=1) def _add_to_cluster(cluster, k): cluster.append(k) _is_member[k] = True if __name__ == "__main__": file_descriptor = open("test_files/data_10points_10dims.dat", "r") points_to_cluster = pickle.load(file_descriptor) _pre_compute_distance(points_to_cluster) clusters = dbscan(points_to_cluster, eps=0.4, min_pts=2) if len(clusters) > 0: print "FOR data_10points_10dims.dat file the output is the following: \n" print "The following clusters were found:" cluster_lengths =  for cluster in clusters: print cluster cluster_lengths.append(len(cluster)) noise = [point for point, status in enumerate(_status) if status == Status.Noise] print "\nThe noise points are:\n", noise print "\nThe largest cluster contains: \n%d points" % max(cluster_lengths) else: print "No clusters were found!"