Sunday, October 6, 2013

C++ - Bitap algorithm for Levenshtein distance calculation

In information theory and computer science, the Levenshtein distance is a string metric for measuring the difference between two sequences. Informally, the Levenshtein distance between two words is the minimum number of single-character edits (insertion, deletion, substitution) required to change one word into the other.

The bitap algorithm (also known as the shift-orshift-and or Baeza-Yates–Gonnet algorithm) is an approximate string matching algorithm. The algorithm tells whether a given text contains a substring which is "approximately equal" to a given pattern, where approximate equality is defined in terms of Levenshtein distance — if the substring and pattern are within a given distance k of each other, then the algorithm considers them equal.


C++ implementation of bitap algorithm is given below:



#include 
#include 

using namespace std;

#define Diff_Timeout    1.0f
#define Diff_EditCost   4
#define Match_Threshold   0.5f
#define Match_Distance   100
#define Patch_DeleteThreshold 0.5f
#define Patch_Margin    4
#define Match_MaxBits   32

int match_bitap(const wstring &text, const wstring &pattern, int loc) 
{
 if (!(Match_MaxBits == 0 || pattern.length() <= Match_MaxBits)) 
 {
  throw _T("Pattern too long for this application.");
 }

 // Initialize the alphabet.
 map  s = match_alphabet(pattern);

 // Highest score beyond which we give up.
 double score_threshold = Match_Threshold;
 // Is there a nearby exact match? (speedup)
 int best_loc = text.find(pattern, loc);
 
 if (best_loc != -1) 
 {
  score_threshold = min(match_bitapScore(0, best_loc, loc, pattern), score_threshold);
  // What about in the other direction? (speedup)
  best_loc = text.rfind(pattern, loc + pattern.length());
  if (best_loc != -1) {
   score_threshold = min(match_bitapScore(0, best_loc, loc, pattern), score_threshold);
  }
 }

 // Initialize the bit arrays.
 int matchmask = 1 << (pattern.length() - 1);
 best_loc = -1;

 int bin_min, bin_mid;
 int bin_max = pattern.length() + text.length();
 int *rd;
 int *last_rd = NULL;
 
 for (int d = 0; d < pattern.length(); d++) 
 {
  // Scan for the best match; each iteration allows for one more error.
  // Run a binary search to determine how far from 'loc' we can stray at
  // this error level.
  bin_min = 0;
  bin_mid = bin_max;
  
  while (bin_min < bin_mid) 
  {
   if (match_bitapScore(d, loc + bin_mid, loc, pattern) <= score_threshold) 
   {
     bin_min = bin_mid;
   } 
   else 
   {
    bin_max = bin_mid;
   }
   
   bin_mid = (bin_max - bin_min) / 2 + bin_min;
  }
  
  // Use the result from this iteration as the maximum for the next.
  bin_max = bin_mid;
  int start = max(1, loc - bin_mid + 1);
  int finish = min(loc + bin_mid, text.length()) + pattern.length();

  rd = new int[finish + 2];
  rd[finish + 1] = (1 << d) - 1;
  
  for (int j = finish; j >= start; j--) 
  {
   int charMatch;
   
   if (text.length() <= j - 1) 
   {
    // Out of range.
    charMatch = 0;
   } 
   else 
   {
    charMatch = s[text[j - 1]];
   }

   if (d == 0) 
   {
    // First pass: exact match.
    rd[j] = ((rd[j + 1] << 1) | 1) & charMatch;
   } 
   else 
   {
    // Subsequent passes: fuzzy match.
    rd[j] = ((rd[j + 1] << 1) | 1) & charMatch
     | (((last_rd[j + 1] | last_rd[j]) << 1) | 1)
     | last_rd[j + 1];
   }
   
   if ((rd[j] & matchmask) != 0) 
   {
    double score = match_bitapScore(d, j - 1, loc, pattern);
    // This match will almost certainly be better than any existing
    // match.  But check anyway.
    if (score <= score_threshold) 
    {
     // Told you so.
     score_threshold = score;
     best_loc = j - 1;
     if (best_loc > loc) 
     {
      // When passing loc, don't exceed our current distance from loc.
      start = max(1, 2 * loc - best_loc);
     } 
     else 
     {
      // Already passed loc, downhill from here on in.
      break;
     }
    }
   }
  }

  if (match_bitapScore(d + 1, loc, loc, pattern) > score_threshold) 
  {
   // No hope for a (better) match at greater error levels.
   break;
  }
  
  delete [] last_rd;
  last_rd = rd;
 }

 delete [] last_rd;
 delete [] rd;
 return best_loc;
}


double match_bitapScore(int e, int x, int loc, const wstring &pattern) 
{
 const float accuracy = static_cast (e) / pattern.length();
 const int proximity = abs(loc - x);
 
 if (Match_Distance == 0) 
 {
  // Dodge divide by zero error.
  return proximity == 0 ? accuracy : 1.0;
 }
 
 return accuracy + (proximity / static_cast (Match_Distance));
}


map  match_alphabet(const wstring &pattern) 
{
 map  s;
 int i;
 
 for (i = 0; i < pattern.length(); i++) 
 {
  TCHAR c = pattern[i];
  s[c] = 0;
 }
 
 for (i = 0; i < pattern.length(); i++) 
 {
  TCHAR c = pattern[i];
  s[c] = (s[c] | (1 << (pattern.length() - i - 1)));
 }

 return s;
}

The call to procedure is as follows:

int res = match_bitap(stringOnestringTwo, 0);

where stringOne and stringTwo are strings whose match (Levenshtein distance) is to be determined, and 0 is location in stringOne where matching procedure should start. Return value is Levenshtein distance, as described above.

3 comments:

Unknown said...

What is license of this code? Do you allow to use it in GPL projects?

Zoromatic Software said...

Original licence for C code (which was basis for this C++ implementation) is GPL, so feel free to use it in GPL projects.

Castor said...

Hi there, nice to see a full implementation of Bitap in C++. However, I have one correction and one question to make. Correction: the function does not seem to be returning the edit distance, as stated in the post. Rather, it returns the position of the best match within the text being searched. Question: how exactly can one change the tolerated edit distance? Say, for example, one wants the tolerated to be 2 edits (where inserts, substitutions and deletions all have cost of 1)?