Tag Archives: Levenshtein Distance

Levenshtein distance in C

The Levenshtein distance is a measure of the similarity of two strings. It is defined as the minimum number of insertions, deletions, and substitutions necessary to transform the first string into the second.

For example, the Levenshtein distance between “CHALK” and “CHEESE” is 4, as follows:

  1. Substitute E for A
  2. Substitute E for L
  3. Substitute S for K
  4. Insert E

The Levenshtein distance can be calculated using a dynamic programming algorithm due to Wagner and Fischer. The algorithm fills a table with the distance between all of the prefixes of the two strings, the final entry being the distance between the two entire strings.

Below is an implementation in C. I have recorded in the matrix cells information about the edits and the previous cell from which each cell was derived to enable tracing back the sequence of edits – the edit script. The function levenshtein_distance() takes two strings, and the address of a pointer to which to assign an array containing the edit script. It returns the Levenshtein distance.

#include <string.h>
#include <stdlib.h>

typedef enum {
    INSERTION,
    DELETION,
    SUBSTITUTION,
    NONE
} edit_type;

struct edit {
    unsigned int score;
    edit_type type;
    char arg1;
    char arg2;
    unsigned int pos;
    struct edit *prev;
};
typedef struct edit edit;

static int min3(int a, int b, int c)
{
    if (a < b && a < c) {
        return a;
    }
    if (b < a && b < c) {
        return b;
    }
    return c;
}

static unsigned int levenshtein_matrix_calculate(edit **mat, const char *str1, size_t len1,
        const char *str2, size_t len2)
{
    unsigned int i, j;
    for (j = 1; j <= len2; j++) {
        for (i = 1; i <= len1; i++) {
            unsigned int substitution_cost;
            unsigned int del = 0, ins = 0, subst = 0;
            unsigned int best;
            if (str1[i - 1] == str2[j - 1]) {
                substitution_cost = 0;
            }
            else {
                substitution_cost = 1;
            }
            del = mat[i - 1][j].score + 1; /* deletion */
            ins = mat[i][j - 1].score + 1; /* insertion */
            subst = mat[i - 1][j - 1].score + substitution_cost; /* substitution */
            best = min3(del, ins, subst);
            mat[i][j].score = best;                  
            mat[i][j].arg1 = str1[i - 1];
            mat[i][j].arg2 = str2[j - 1];
            mat[i][j].pos = i - 1;
            if (best == del) {
                mat[i][j].type = DELETION;
                mat[i][j].prev = &mat[i - 1][j];
            }
            else if (best == ins) {
                mat[i][j].type = INSERTION;
                mat[i][j].prev = &mat[i][j - 1];
            }
            else {
                if (substitution_cost > 0) {
                    mat[i][j].type = SUBSTITUTION;
                }
                else {
                    mat[i][j].type = NONE;
                }
                mat[i][j].prev = &mat[i - 1][j - 1];
            }
        }
    }
    return mat[len1][len2].score;
}

static edit **levenshtein_matrix_create(const char *str1, size_t len1, const char *str2,
        size_t len2)
{
    unsigned int i, j;
    edit **mat = malloc((len1 + 1) * sizeof(edit *));
    if (mat == NULL) {
        return NULL;
    }
    for (i = 0; i <= len1; i++) {
        mat[i] = malloc((len2 + 1) * sizeof(edit));
        if (mat[i] == NULL) {
            for (j = 0; j < i; j++) {
                free(mat[j]);
            }
            free(mat);
            return NULL;
        }
    }
    for (i = 0; i <= len1; i++) {
        mat[i][0].score = i;
        mat[i][0].prev = NULL;
        mat[i][0].arg1 = 0;
        mat[i][0].arg2 = 0;
    }

    for (j = 0; j <= len2; j++) {
        mat[0][j].score = j;
        mat[0][j].prev = NULL;
        mat[0][j].arg1 = 0;
        mat[0][j].arg2 = 0;
    }
    return mat; 
}

unsigned int levenshtein_distance(const char *str1, const char *str2, edit **script)
{
    const size_t len1 = strlen(str1), len2 = strlen(str2);
    unsigned int i, distance;
    edit **mat, *head;

    /* If either string is empty, the distance is the other string's length */
    if (len1 == 0) {
        return len2;
    }
    if (len2 == 0) {
        return len1;
    }
    /* Initialise the matrix */
    mat = levenshtein_matrix_create(str1, len1, str2, len2);
    if (!mat) {
        *script = NULL;
        return 0;
    }
    /* Main algorithm */
    distance = levenshtein_matrix_calculate(mat, str1, len1, str2, len2);
    /* Read back the edit script */
    *script = malloc(distance * sizeof(edit));
    if (*script) {
        i = distance - 1;
        for (head = &mat[len1][len2];
                head->prev != NULL;
                head = head->prev) {
            if (head->type != NONE) {
                memcpy(*script + i, head, sizeof(edit));
                i--;
            }
        }
    }
    else {
        distance = 0;
    }
    /* Clean up */
    for (i = 0; i <= len1; i++) {
        free(mat[i]);
    }
    free(mat);

    return distance;

Here is an example program that finds the distance from CHALK to CHEESE and prints the edit script:

#include <stdlib.h>
#include <stdio.h>

void print(const edit *e)
{
    if (e->type == INSERTION) {
        printf("Insert %c", e->arg2);
    }
    else if (e->type == DELETION) {
        printf("Delete %c", e->arg1);
    }
    else {
        printf("Substitute %c for %c", e->arg2, e->arg1);
    }
    printf(" at %u\n", e->pos);
}

int main(void)
{
    edit *script;
    unsigned int distance;
    unsigned int i;
    const char str1[] = "CHALK";
    const char str2[] = "CHEESE";
    distance = levenshtein_distance(str1, str2, &script);
    printf("Distance is %d:\n", distance);
    for (i = 0; i < distance; i++) {
        print(&script[i]);
    }
    free(script);
    return 0;
}

The output:

Distance is 4:
Substitute E for A at 2
Substitute E for L at 3
Substitute S for K at 4
Insert E at 4

BK-Tree in C++

A BK-tree is an efficient data structure for storing items for which there is some concept of distance between them. A common use is to store words for approximate string matching, using the Levenshtein distance metric. Another use is to store phrases using some measure of semantic distance in order to implement an AI system like a chatterbot.

This is a simple implementation of a BK-tree in C++.

#ifndef BK_TREE_HPP
#define BK_TREE_HPP

#include <map>
#include <memory>
#include <type_traits>

namespace MB
{

template <typename T, typename Unit, typename Metric> class bktree;

namespace detail
{

template <typename T, typename Unit, typename Metric>
class bktree_node
{
    friend class bktree<T, Unit, Metric>;
	typedef bktree_node<T, Unit, Metric> node_type;
    typedef std::unique_ptr<node_type> node_ptr_type;
	
	T value_;
	std::map<Unit, node_ptr_type> children_;

	bktree_node(const T &value)
		: value_(value)
    {
    }

	bool insert(const T& value, const Metric& distance)
    {
        bool inserted = false;
		Unit dist = distance(value, this->value_);
		if (dist > 0) {
            // Not already here 
            auto it = children_.find(dist);
            if (it == children_.end()) {
                // Attach a new edge here
                children_.insert(std::make_pair(dist, node_ptr_type(new node_type(value))));
                inserted = true;
            }
            else {
                // Follow existing edge
                inserted = it->second->insert(value, distance);
            }
            return inserted;
        }
	}

    template <class OutputIt>
	OutputIt find(const T &value, const Unit& limit, const Metric& distance, OutputIt it) const
    {
		Unit dist = distance(value, this->value_);
		if (dist <= limit) {
            // Found one
			*it++ = std::make_pair(this->value_, dist);
        }
		for (auto iter = children_.begin(); iter != children_.end(); ++iter) {
            // Follow edges between dist + limit and dist - limit
			if (dist - limit <= iter->first && dist + limit >= iter->first) {
				iter->second->find(value, limit, distance, it);
            }
		}
        return it;
	}
};

}; // namespace detail

template <typename T, typename Unit, typename Metric>
class bktree
{
private:
	typedef typename detail::bktree_node<T, Unit, Metric>::node_type node_type;
    typedef typename detail::bktree_node<T, Unit, Metric>::node_ptr_type node_ptr_type;

    node_ptr_type head_;
    const Metric distance_;
	size_t size_;

public:
	bktree(const Metric& distance = Metric())
        : head_(nullptr), distance_(distance), size_(0L)
    {
        static_assert(std::is_integral<Unit>::value, "Integral unit type required.");
    }

	bool insert(const T &value)
    {
        bool inserted = false;
		if (head_ == nullptr) {
            // Inserting the first value
			head_ = node_ptr_type(new node_type(value));
			size_ = 1;
            inserted = true;
		}
        else if (head_->insert(value, distance_)) {
            ++size_;
            inserted = true;
        }
        return inserted;
	}

    template <class OutputIt>
	OutputIt find(const T& value, const Unit& limit, OutputIt it) const
    {
		return head_->find(value, limit, distance_, it);
	}

	size_t size() const
    {
		return size_;
	}
};

}; // namespace MB

#endif // BK_TREE_HPP

Here is an example program. It loads a dictionary from a file, and then prompts the user to enter words and a limit on distance, and it will return all words within that Levenshtein distance of the word entered. I used this dictionary with it:

#include <fstream>
#include <iostream>
#include <string>

#include "bk_tree.hpp"
#include "levenshtein_distance.hpp"

int main()
{
    const char filename[] = "dictionary.txt";
    std::ifstream ifs(filename);
    if (!ifs) {
        std::cerr << "Couldn't open " << filename << " for reading\n";
        return 1;
    }
    MB::bktree<std::string, int, levenshtein_distance> tree;
    std::string word;
    while (ifs >> word) {
        tree.insert(word);
    }
    std::cout << "Inserted " << tree.size() << " words\n";
    std::vector<std::pair<std::string, int> > results;
    do {
        std::cout << "Enter a word (\"quit!\" to quit)> ";
        std::cin >> word;
        if (word != "quit!") {
            int limit;
            std::cout << "Enter a limit> ";
            std::cin >> limit;
            tree.find(word, limit, std::back_inserter(results));
            for (auto it = results.begin(); it != results.end(); ++it) {
                std::cout << it->first << "(distance " << it->second << ")\n";
            }
            results.clear();
        }
    }  while (word != "quit!");
}

Example run:

$ ./dictionary
Inserted 127142 words
Enter a word ("quit!" to quit)> mispelling
Enter a limit> 2
spelling(distance 2)
misdealing(distance 2)
impelling(distance 2)
miscalling(distance 2)
mispenning(distance 2)
misrelying(distance 2)
respelling(distance 2)
dispelling(distance 1)
misbilling(distance 2)
misspelling(distance 1)
misspellings(distance 2)
Enter a word ("quit!" to quit)> quit!
$

Here is an implementation of the Levenshtein distance:

#include <string>
#include <vector>
#include <algorithm>

namespace MB
{

namespace detail
{

template <typename T>
T min3(const T& a, const T& b, const T& c)
{
   return std::min(std::min(a, b), c);
}

}; // namespace detail

class levenshtein_distance 
{
    mutable std::vector<std::vector<unsigned int> > matrix_;

public:
    explicit levenshtein_distance(size_t initial_size = 8)
        : matrix_(initial_size, std::vector<unsigned int>(initial_size))
    {
    }

    unsigned int operator()(const std::string& s, const std::string& t) const
    {
        const size_t m = s.size();
        const size_t n = t.size();
        // The distance between a string and the empty string is the string's length
        if (m == 0) {
            return n;
        }
        if (n == 0) {
            return m;
        }
        // Size the matrix as necessary
        if (matrix_.size() < m + 1) {
            matrix_.resize(m + 1, matrix_[0]);
        }
        if (matrix_[0].size() < n + 1) {
            for (auto& mat : matrix_) {
                mat.resize(n + 1);
            }
        }
        // The top row and left column are prefixes that can be reached by
        // insertions and deletions alone
        unsigned int i, j;
        for (i = 1;  i <= m; ++i) {
            matrix_[i][0] = i;
        }
        for (j = 1; j <= n; ++j) {
            matrix_[0][j] = j;
        }
        // Fill in the rest of the matrix
        for (j = 1; j <= n; ++j) {
            for (i = 1; i <= m; ++i) {
                unsigned int substitution_cost = s[i - 1] == t[j - 1] ? 0 : 1;
                matrix_[i][j] =
                    detail::min3(matrix_[i - 1][j] + 1,         // Deletion
                    matrix_[i][j - 1] + 1,                      // Insertion
                    matrix_[i - 1][j - 1] + substitution_cost); // Substitution
            }
        }
        return matrix_[m][n];
    }
};

}; // namespace MB