In this post I describe how to implement the DBSCAN clustering algorithm to work with Jaccard-distance as its metric. It should be able to handle sparse data.
Overview
The DBSCAN clustering algorithm will be implemented in Python as described in this Wikipedia article. The algorithm will use Jaccard-distance (1 minus Jaccard index) when measuring distance between points. In this implementation, two points that are exactly a distance epsilon apart are taken to be in the same cluster, thus greater-than-or-equal and not should be used.
This method could support a large number of sparse points in very high-dimensional space (~10,000 dimensions). This means that the points – even though they lie in very high dimensional space – only have few non-zero coordinates.
In order to test the algorithm I will use this data folder including 4 files of increasing dimensions.
- data_10points_10dims.dat
- data_100points_100dims.dat
- data_1000points_1000dims.dat
- data_10000points_10000dims.dat
Let’s have a look at the first data file, just to have in mind how it looks like.
Now we are ready to run the python class.
import cPickle as pickle
import numpy as np
# Create simple class used as Enum
class Status:
def __init__(self):
pass
New, Visited, Noise = range(3)
# Create variable to track the status of each point
_status = None
# Create variable to track membership of each point
_is_member = None
# Create variable to store distance between points
_distance = None
# Create variable to store indexes of ordered neighbors based on distance
_distance_idx = None;
# Create the DBSCAN algorithm as dicribed in Wikipedia
def dbscan(points, eps=0.15, min_pts=2):
clusters = []
n_pts = points.shape[0]
# Compute the distances of the points to be clustered
_pre_compute_distance(points)
for k in range(n_pts):
if _status[k] == Status.Visited:
continue
_status[k] = Status.Visited
neighbors = _region_query(k, eps)
if len(neighbors) < min_pts:
_status[k] = Status.Noise
else:
cluster = _expand_cluster(k, neighbors, eps, min_pts)
clusters.append(cluster)
return clusters
def _expand_cluster(point, neighbors, eps, min_pts):
global _status
global _is_member
cluster = []
_add_to_cluster(cluster, point)
while neighbors:
k = neighbors.pop()
if _status[k] != Status.Visited:
_status[k] = Status.Visited
extended_neighbors = _region_query(k, eps)
if len(extended_neighbors) >= min_pts:
neighbors.update(extended_neighbors)
if not _is_member[k]:
_add_to_cluster(cluster, k)
return cluster
def _region_query(center, eps):
global _distance
# Create set instead of a list because in this case it would be necessary to iterate throught.
neighbors = set()
i = 0
# Use presorted/precomputed distances. When the closest accepted neighbors will be
# found, iteration will stops
while _distance[center, _distance_idx[center, i]] <= eps:
neighbors.add(_distance_idx[center, i])
i += 1
return neighbors
def _pre_compute_distance(points):
global _is_member
global _status
global _distance
global _distance_idx
n_pts = points.shape[0]
_distance = np.zeros((n_pts, n_pts))
_is_member = [False] * n_pts
_status = [Status.New] * n_pts
# Give the number of non-zero elements
n = points.sum(axis=1).dot(np.ones((1, n_pts))).A
# Compute the distance (the full array can be constructed using A + A.T)
n_intersect = points.dot(points.transpose())
_distance = 1.0 - (n_intersect / (n + n.T - n_intersect))
# The neighbors/column-indexes are sorted based on distance, to decrease the running-time
# of _range_query method
_distance_idx = np.argsort(_distance, axis=1)
def _add_to_cluster(cluster, k):
cluster.append(k)
_is_member[k] = True
if __name__ == "__main__":
file_descriptor = open("test_files/data_10points_10dims.dat", "r")
points_to_cluster = pickle.load(file_descriptor)
_pre_compute_distance(points_to_cluster)
clusters = dbscan(points_to_cluster, eps=0.4, min_pts=2)
if len(clusters) > 0:
print "FOR data_10points_10dims.dat file the output is the following: \n"
print "The following clusters were found:"
cluster_lengths = []
for cluster in clusters:
print cluster
cluster_lengths.append(len(cluster))
noise = [point for point, status in enumerate(_status) if status == Status.Noise]
print "\nThe noise points are:\n", noise
print "\nThe largest cluster contains: \n%d points" % max(cluster_lengths)
else:
print "No clusters were found!"