Sunday, October 6, 2013

C++ - Bitap algorithm for Levenshtein distance calculation

In information theory and computer science, the Levenshtein distance is a string metric for measuring the difference between two sequences. Informally, the Levenshtein distance between two words is the minimum number of single-character edits (insertion, deletion, substitution) required to change one word into the other.

The bitap algorithm (also known as the shift-orshift-and or Baeza-Yates–Gonnet algorithm) is an approximate string matching algorithm. The algorithm tells whether a given text contains a substring which is "approximately equal" to a given pattern, where approximate equality is defined in terms of Levenshtein distance — if the substring and pattern are within a given distance k of each other, then the algorithm considers them equal.


C++ implementation of bitap algorithm is given below:



#include 
#include 

using namespace std;

#define Diff_Timeout    1.0f
#define Diff_EditCost   4
#define Match_Threshold   0.5f
#define Match_Distance   100
#define Patch_DeleteThreshold 0.5f
#define Patch_Margin    4
#define Match_MaxBits   32

int match_bitap(const wstring &text, const wstring &pattern, int loc) 
{
 if (!(Match_MaxBits == 0 || pattern.length() <= Match_MaxBits)) 
 {
  throw _T("Pattern too long for this application.");
 }

 // Initialize the alphabet.
 map  s = match_alphabet(pattern);

 // Highest score beyond which we give up.
 double score_threshold = Match_Threshold;
 // Is there a nearby exact match? (speedup)
 int best_loc = text.find(pattern, loc);
 
 if (best_loc != -1) 
 {
  score_threshold = min(match_bitapScore(0, best_loc, loc, pattern), score_threshold);
  // What about in the other direction? (speedup)
  best_loc = text.rfind(pattern, loc + pattern.length());
  if (best_loc != -1) {
   score_threshold = min(match_bitapScore(0, best_loc, loc, pattern), score_threshold);
  }
 }

 // Initialize the bit arrays.
 int matchmask = 1 << (pattern.length() - 1);
 best_loc = -1;

 int bin_min, bin_mid;
 int bin_max = pattern.length() + text.length();
 int *rd;
 int *last_rd = NULL;
 
 for (int d = 0; d < pattern.length(); d++) 
 {
  // Scan for the best match; each iteration allows for one more error.
  // Run a binary search to determine how far from 'loc' we can stray at
  // this error level.
  bin_min = 0;
  bin_mid = bin_max;
  
  while (bin_min < bin_mid) 
  {
   if (match_bitapScore(d, loc + bin_mid, loc, pattern) <= score_threshold) 
   {
     bin_min = bin_mid;
   } 
   else 
   {
    bin_max = bin_mid;
   }
   
   bin_mid = (bin_max - bin_min) / 2 + bin_min;
  }
  
  // Use the result from this iteration as the maximum for the next.
  bin_max = bin_mid;
  int start = max(1, loc - bin_mid + 1);
  int finish = min(loc + bin_mid, text.length()) + pattern.length();

  rd = new int[finish + 2];
  rd[finish + 1] = (1 << d) - 1;
  
  for (int j = finish; j >= start; j--) 
  {
   int charMatch;
   
   if (text.length() <= j - 1) 
   {
    // Out of range.
    charMatch = 0;
   } 
   else 
   {
    charMatch = s[text[j - 1]];
   }

   if (d == 0) 
   {
    // First pass: exact match.
    rd[j] = ((rd[j + 1] << 1) | 1) & charMatch;
   } 
   else 
   {
    // Subsequent passes: fuzzy match.
    rd[j] = ((rd[j + 1] << 1) | 1) & charMatch
     | (((last_rd[j + 1] | last_rd[j]) << 1) | 1)
     | last_rd[j + 1];
   }
   
   if ((rd[j] & matchmask) != 0) 
   {
    double score = match_bitapScore(d, j - 1, loc, pattern);
    // This match will almost certainly be better than any existing
    // match.  But check anyway.
    if (score <= score_threshold) 
    {
     // Told you so.
     score_threshold = score;
     best_loc = j - 1;
     if (best_loc > loc) 
     {
      // When passing loc, don't exceed our current distance from loc.
      start = max(1, 2 * loc - best_loc);
     } 
     else 
     {
      // Already passed loc, downhill from here on in.
      break;
     }
    }
   }
  }

  if (match_bitapScore(d + 1, loc, loc, pattern) > score_threshold) 
  {
   // No hope for a (better) match at greater error levels.
   break;
  }
  
  delete [] last_rd;
  last_rd = rd;
 }

 delete [] last_rd;
 delete [] rd;
 return best_loc;
}


double match_bitapScore(int e, int x, int loc, const wstring &pattern) 
{
 const float accuracy = static_cast (e) / pattern.length();
 const int proximity = abs(loc - x);
 
 if (Match_Distance == 0) 
 {
  // Dodge divide by zero error.
  return proximity == 0 ? accuracy : 1.0;
 }
 
 return accuracy + (proximity / static_cast (Match_Distance));
}


map  match_alphabet(const wstring &pattern) 
{
 map  s;
 int i;
 
 for (i = 0; i < pattern.length(); i++) 
 {
  TCHAR c = pattern[i];
  s[c] = 0;
 }
 
 for (i = 0; i < pattern.length(); i++) 
 {
  TCHAR c = pattern[i];
  s[c] = (s[c] | (1 << (pattern.length() - i - 1)));
 }

 return s;
}

The call to procedure is as follows:

int res = match_bitap(stringOnestringTwo, 0);

where stringOne and stringTwo are strings whose match (Levenshtein distance) is to be determined, and 0 is location in stringOne where matching procedure should start. Return value is Levenshtein distance, as described above.