Set and Cluster Similarity in R

2015-10-06

The following set similarity functions are written in C++ for easy integration with R using the sourceCpp command.

The first function computes the Jaccard index assuming inputs are unique, sorted integers where each integer denotes an element of a set. L1 and L2 are both sets.

/**
 * The Jaccard Similarity Coefficient or Jaccard Index is used to compare the
 * similarity/diversity of sample sets. It is defined as the size of the
 * intersection of the sets divided by the size of the union of the sets.
 *
 * INPUTS:
 *    L1: A list of unique, sorted integers denoting set elements.
 *    L2: A list of unique, sorted integers denoting set elements.
 *
 * RETURNS:
 *    Size(Intersection(L1,L2))/Size(Union(L1,L2))
 *
 * SIDE-EFFECTS:
 *    None: this is the 21st-century.
 *
 * COMPLEXITY:
 *    Time:  O(N1+N2)
 *    Space: O(1)
 */
// [[Rcpp::export]]
NumericVector JaccardSimilarity(const NumericVector L1, const NumericVector L2){
  int n1                = L1.size();   //Number of elmeents in set 1
  int n2                = L2.size();   //Number of elmeents in set 2
  int i1                = 0;           //Current element of interest in set 1
  int i2                = 0;           //Current element of interest in set 2
  int size_intersection = 0;           //Result: to be incremented in algorithm
  int size_union        = 0;           //Result: to be incremented in algorithm
  while(true){
    if(i1==n1){
      size_union += n2-i2;
      break;
    } else if (i2==n2){
      size_union += n1-i1;
      break;
    }
    if(L1[i1]==L2[i2]){        //Element is in both sets.
      i1++;                    //Therefore, we discard both the current element
      i2++;                    //from both sets and mark the current element as 
      size_union++;            //being part of both the union and the
      size_intersection++;     //intersection
    } else if(L1[i1]<L2[i2]){  //Element is only in the first set.
      i1++;                    //Throw it out
      size_union++;            //and mark it as a member of only the union
    } else if(L1[i1]>L2[i2]){  //Element is only in the second set.
      i2++;                    //Throw it out
      size_union++;            //and mark it as a member of only the union
    }
  }
  return NumericVector::create((double)size_intersection/(double)size_union);
}

The second function computes the Jaccard index assuming only that L1 and L2 are both sets whose integers indicate set members.

/**
 * The Jaccard Similarity Coefficient or Jaccard Index is used to compare the
 * similarity/diversity of sample sets. It is defined as the size of the
 * intersection of the sets divided by the size of the union of the sets.
 *
 * INPUTS:
 *    L1: A list of integers denoting set elements.
 *    L2: A list of integers denoting set elements.
 *
 * RETURNS:
 *    Size(Intersection(L1,L2))/Size(Union(L1,L2))
 *
 * SIDE-EFFECTS:
 *    None: this is the 21st-century.
 *
 * COMPLEXITY:
 *    Time:  O(N1+N2)
 *    Space: O(N1+N2) worst-case
 */
#include <unordered_map>
// [[Rcpp::export]]
NumericVector JaccardSimilarityUnsorted(const NumericVector L1, const NumericVector L2){
  int n1                = L1.size();   //Number of elmeents in set 1
  int n2                = L2.size();   //Number of elmeents in set 2
  int size_intersection = 0;           //Result: to be incremented in algorithm
  int size_union        = 0;           //Result: to be incremented in algorithm
  std::unordered_map<int, unsigned char> seen;
  for(int i1=0;i1<n1;i1++){            //Loop through all elements in Set 1
    if(!seen.count(L1[i1])){           //If we haven't seen the element before
      seen[L1[i1]] = 1;                //Mark it as having been seen in Set 1
      size_union++;                    //And as being part of the union
    }                
  }
  for(int i2=0;i2<n2;i2++){            //Loop through all elements in Set 2
    if(!seen.count(L2[i2])){           //If we have not seen the element before
      size_union++;                    //then it is part of the union.
      seen[L2[i2]] = 2;                //Mark it as being done.
    } else if(seen[L2[i2]]==1){        //Elsewise, we've seen the element in the
      size_intersection++;             //Set 1, it is part of the intersection
                                       //But not the union, because that would
                                       //be double counting.
      seen[L2[i2]] = 2;                //Mark it as being done.
    }
  }
  return NumericVector::create((double)size_intersection/(double)size_union);
}

This function, due to Jamie Murdoch, quickly computes the Fowlkes-Mallows index of a cluster assignment.

/**
 * Third iteration of the Fowlkes and Mallows similarity in C++.
 * For each cluster i, track the number of elements in that cluster.
 * For each pair of clusters (x, y), track the number of elements assigned to x
 *   in L1 and to y in L2. This is the "overlap" between x and y.
 * <C1, C1> is the sum of squares of cluster sizes. To see this, note that
 *   for all pairs of elements i, j in cluster x, c_{ij} = 1. For all pairs
 *   of elements in different clusters, c_{ij} = 0. Then, for each cluster x,
 *   the number of elements with value 1 in C_{ij} is |x|^2, the square of the
 *   cluster size. 
 * Another way to see this is that C_{ij} is an adjacency matrix for a graph, in
 *   which the clusters represent separate cliques. We want to count the number
 *   of edges for each clique, counting self-edges once and undirected edges
 *   between elements twice.
 * <C1, C2> is the sum of squares of overlap sizes. To see this, consider the
 *   overlap O_{xy} between clusters x and y. This is the number of elements that 
 *   belong to x in L1 and to y in L2. For any pair (i,j) of these elements,
 *   C_{ij} = 1 in both C1 and C2. This ordered pair contributes 1 to <C1, C2>.
 *   We sum these contributions across all pairs in the overlap (of which there
 *   are |O_{xy}|^2) and all overlaps.
 *
 * INPUTS:
 *    L1: Each item of the list is a point. The number stored at the element
 *        is the cluster the point is assigned to.
 *    L2: The same as L1. Must be the same length as L1.
 *
 * RETURNS:
 *    The Fowlkes and Mallows index
 *
 * SIDE-EFFECTS:
 *    None
 *
 * COMPLEXITY:
 *    Time:  O(K^2+n), where K = number of clusters
 *    Space: O(K^2)
 *
 * SOURCES:
 *    Asa Ben-Hur, Andre Elisseeff, and Isabelle Guyon (2001) A stability based
 *    method for discovering structure in clustered data. Biocomputing 2002: pp.
 *    6-17. 
 */
// [[Rcpp::export]]
NumericVector FowlkesMallows(const NumericVector L1, const NumericVector L2){
  int n = L1.size();
  int K = max(L1);
  
  ind overlaps[K][K];
  ind cluster_sizes1[K], cluster_sizes2[K];
  for(int i = 0; i < K; i++){
    cluster_sizes1[i] = 0;
    cluster_sizes2[i] = 0;
    for(int j = 0; j < K; j++)
      overlaps[i][j] = 0;
  }
  
  //O(n) time. O(K^2) space. Determine the size of each cluster as well as the
  //size of the overlaps between the clusters.
  for(int i = 0; i < n; i++){
    cluster_sizes1[(int)L1[i] - 1]++; // -1's account for zero-based indexing
    cluster_sizes2[(int)L2[i] - 1]++;
    overlaps[(int)L1[i] - 1][(int)L2[i] - 1]++;
  }
  
  // O(K^2) time. O(1) space. Square the overlap values.
  int C1dotC2 = 0;
  for(int j = 0; j < K; j++){
    for(int k = 0; k < K; k++){
      C1dotC2 += pow(overlaps[j][k], 2);
    }
  }
  
  // O(K) time. O(1) space. Square the cluster sizes
  int C1dotC1 = 0, C2dotC2 = 0;
  for(int i = 0; i < K; i++){
    C1dotC1 += pow(cluster_sizes1[i], 2);
    C2dotC2 += pow(cluster_sizes2[i], 2);
  }
  
  return NumericVector::create((double)C1dotC2 / (sqrt(C1dotC1) * sqrt(C2dotC2)));
}

We can modify the above to quickly compute the cluster assignment similarity using the Jaccard index, as suggested by Ben-Hur, Elisseeff, and Guyon (2002).

/**
 * The Jaccard Similarity Coefficient or Jaccard Index is used to compare the
 * similarity/diversity of sample sets. It is defined as the size of the
 * intersection of the sets divided by the size of the union of the sets. Here,
 * it is used to determine how similar to clustering assignments are.
 *
 * INPUTS:
 *    L1: A list. Each element of the list is a number indicating the cluster
 *        assignment of that number.
 *    L2: The same as L1. Must be the same length as L1.
 *
 * RETURNS:
 *    The Jaccard Similarity Index
 *
 * SIDE-EFFECTS:
 *    None
 *
 * COMPLEXITY:
 *    Time:  O(K^2+n), where K = number of clusters
 *    Space: O(K^2)
 *
 * SOURCES:
 *    Asa Ben-Hur, Andre Elisseeff, and Isabelle Guyon (2001) A stability based
 *    method for discovering structure in clustered data. Biocomputing 2002: pp.
 *    6-17. 
 */
// [[Rcpp::export]]
NumericVector JaccardIndex(const NumericVector L1, const NumericVector L2){
  int n = L1.size();
  int K = max(L1);
  
  ind overlaps[K][K];
  ind cluster_sizes1[K], cluster_sizes2[K];
  for(int i = 0; i < K; i++){
    cluster_sizes1[i] = 0;
    cluster_sizes2[i] = 0;
    for(int j = 0; j < K; j++)
      overlaps[i][j] = 0;
  }
  
  //O(n) time. O(K^2) space. Determine the size of each cluster as well as the
  //size of the overlaps between the clusters.
  for(int i = 0; i < n; i++){
    cluster_sizes1[(int)L1[i] - 1]++; // -1's account for zero-based indexing
    cluster_sizes2[(int)L2[i] - 1]++;
    overlaps[(int)L1[i] - 1][(int)L2[i] - 1]++;
  }
  
  // O(K^2) time. O(1) space. Square the overlap values.
  int C1dotC2 = 0;
  for(int j = 0; j < K; j++){
    for(int k = 0; k < K; k++){
      C1dotC2 += pow(overlaps[j][k], 2);
    }
  }
  
  // O(K) time. O(1) space. Square the cluster sizes
  int C1dotC1 = 0, C2dotC2 = 0;
  for(int i = 0; i < K; i++){
    C1dotC1 += pow(cluster_sizes1[i], 2);
    C2dotC2 += pow(cluster_sizes2[i], 2);
  }
  
  return NumericVector::create((double)C1dotC2/(double)(C1dotC1+C2dotC2-C1dotC2));
}