Tag Archives: Wagner-Fischer

Levenshtein distance in C

The Levenshtein distance is a measure of the similarity of two strings. It is defined as the minimum number of insertions, deletions, and substitutions necessary to transform the first string into the second.

For example, the Levenshtein distance between “CHALK” and “CHEESE” is 4, as follows:

  1. Substitute E for A
  2. Substitute E for L
  3. Substitute S for K
  4. Insert E

The Levenshtein distance can be calculated using a dynamic programming algorithm due to Wagner and Fischer. The algorithm fills a table with the distance between all of the prefixes of the two strings, the final entry being the distance between the two entire strings.

Below is an implementation in C. I have recorded in the matrix cells information about the edits and the previous cell from which each cell was derived to enable tracing back the sequence of edits – the edit script. The function levenshtein_distance() takes two strings, and the address of a pointer to which to assign an array containing the edit script. It returns the Levenshtein distance.

#include <string.h>
#include <stdlib.h>

typedef enum {
    INSERTION,
    DELETION,
    SUBSTITUTION,
    NONE
} edit_type;

struct edit {
    unsigned int score;
    edit_type type;
    char arg1;
    char arg2;
    unsigned int pos;
    struct edit *prev;
};
typedef struct edit edit;

static int min3(int a, int b, int c)
{
    if (a < b && a < c) {
        return a;
    }
    if (b < a && b < c) {
        return b;
    }
    return c;
}

static unsigned int levenshtein_matrix_calculate(edit **mat, const char *str1, size_t len1,
        const char *str2, size_t len2)
{
    unsigned int i, j;
    for (j = 1; j <= len2; j++) {
        for (i = 1; i <= len1; i++) {
            unsigned int substitution_cost;
            unsigned int del = 0, ins = 0, subst = 0;
            unsigned int best;
            if (str1[i - 1] == str2[j - 1]) {
                substitution_cost = 0;
            }
            else {
                substitution_cost = 1;
            }
            del = mat[i - 1][j].score + 1; /* deletion */
            ins = mat[i][j - 1].score + 1; /* insertion */
            subst = mat[i - 1][j - 1].score + substitution_cost; /* substitution */
            best = min3(del, ins, subst);
            mat[i][j].score = best;                  
            mat[i][j].arg1 = str1[i - 1];
            mat[i][j].arg2 = str2[j - 1];
            mat[i][j].pos = i - 1;
            if (best == del) {
                mat[i][j].type = DELETION;
                mat[i][j].prev = &mat[i - 1][j];
            }
            else if (best == ins) {
                mat[i][j].type = INSERTION;
                mat[i][j].prev = &mat[i][j - 1];
            }
            else {
                if (substitution_cost > 0) {
                    mat[i][j].type = SUBSTITUTION;
                }
                else {
                    mat[i][j].type = NONE;
                }
                mat[i][j].prev = &mat[i - 1][j - 1];
            }
        }
    }
    return mat[len1][len2].score;
}

static edit **levenshtein_matrix_create(const char *str1, size_t len1, const char *str2,
        size_t len2)
{
    unsigned int i, j;
    edit **mat = malloc((len1 + 1) * sizeof(edit *));
    if (mat == NULL) {
        return NULL;
    }
    for (i = 0; i <= len1; i++) {
        mat[i] = malloc((len2 + 1) * sizeof(edit));
        if (mat[i] == NULL) {
            for (j = 0; j < i; j++) {
                free(mat[j]);
            }
            free(mat);
            return NULL;
        }
    }
    for (i = 0; i <= len1; i++) {
        mat[i][0].score = i;
        mat[i][0].prev = NULL;
        mat[i][0].arg1 = 0;
        mat[i][0].arg2 = 0;
    }

    for (j = 0; j <= len2; j++) {
        mat[0][j].score = j;
        mat[0][j].prev = NULL;
        mat[0][j].arg1 = 0;
        mat[0][j].arg2 = 0;
    }
    return mat; 
}

unsigned int levenshtein_distance(const char *str1, const char *str2, edit **script)
{
    const size_t len1 = strlen(str1), len2 = strlen(str2);
    unsigned int i, distance;
    edit **mat, *head;

    /* If either string is empty, the distance is the other string's length */
    if (len1 == 0) {
        return len2;
    }
    if (len2 == 0) {
        return len1;
    }
    /* Initialise the matrix */
    mat = levenshtein_matrix_create(str1, len1, str2, len2);
    if (!mat) {
        *script = NULL;
        return 0;
    }
    /* Main algorithm */
    distance = levenshtein_matrix_calculate(mat, str1, len1, str2, len2);
    /* Read back the edit script */
    *script = malloc(distance * sizeof(edit));
    if (*script) {
        i = distance - 1;
        for (head = &mat[len1][len2];
                head->prev != NULL;
                head = head->prev) {
            if (head->type != NONE) {
                memcpy(*script + i, head, sizeof(edit));
                i--;
            }
        }
    }
    else {
        distance = 0;
    }
    /* Clean up */
    for (i = 0; i <= len1; i++) {
        free(mat[i]);
    }
    free(mat);

    return distance;

Here is an example program that finds the distance from CHALK to CHEESE and prints the edit script:

#include <stdlib.h>
#include <stdio.h>

void print(const edit *e)
{
    if (e->type == INSERTION) {
        printf("Insert %c", e->arg2);
    }
    else if (e->type == DELETION) {
        printf("Delete %c", e->arg1);
    }
    else {
        printf("Substitute %c for %c", e->arg2, e->arg1);
    }
    printf(" at %u\n", e->pos);
}

int main(void)
{
    edit *script;
    unsigned int distance;
    unsigned int i;
    const char str1[] = "CHALK";
    const char str2[] = "CHEESE";
    distance = levenshtein_distance(str1, str2, &script);
    printf("Distance is %d:\n", distance);
    for (i = 0; i < distance; i++) {
        print(&script[i]);
    }
    free(script);
    return 0;
}

The output:

Distance is 4:
Substitute E for A at 2
Substitute E for L at 3
Substitute S for K at 4
Insert E at 4