Tag Archives: Algorithms

Levenshtein distance in C

The Levenshtein distance is a measure of the similarity of two strings. It is defined as the minimum number of insertions, deletions, and substitutions necessary to transform the first string into the second.

For example, the Levenshtein distance between “CHALK” and “CHEESE” is 4, as follows:

  1. Substitute E for A
  2. Substitute E for L
  3. Substitute S for K
  4. Insert E

The Levenshtein distance can be calculated using a dynamic programming algorithm due to Wagner and Fischer. The algorithm fills a table with the distance between all of the prefixes of the two strings, the final entry being the distance between the two entire strings.

Below is an implementation in C. I have recorded in the matrix cells information about the edits and the previous cell from which each cell was derived to enable tracing back the sequence of edits – the edit script. The function levenshtein_distance() takes two strings, and the address of a pointer to which to assign an array containing the edit script. It returns the Levenshtein distance.

#include <string.h>
#include <stdlib.h>

typedef enum {
    INSERTION,
    DELETION,
    SUBSTITUTION,
    NONE
} edit_type;

struct edit {
    unsigned int score;
    edit_type type;
    char arg1;
    char arg2;
    unsigned int pos;
    struct edit *prev;
};
typedef struct edit edit;

static int min3(int a, int b, int c)
{
    if (a < b && a < c) {
        return a;
    }
    if (b < a && b < c) {
        return b;
    }
    return c;
}

static unsigned int levenshtein_matrix_calculate(edit **mat, const char *str1, size_t len1,
        const char *str2, size_t len2)
{
    unsigned int i, j;
    for (j = 1; j <= len2; j++) {
        for (i = 1; i <= len1; i++) {
            unsigned int substitution_cost;
            unsigned int del = 0, ins = 0, subst = 0;
            unsigned int best;
            if (str1[i - 1] == str2[j - 1]) {
                substitution_cost = 0;
            }
            else {
                substitution_cost = 1;
            }
            del = mat[i - 1][j].score + 1; /* deletion */
            ins = mat[i][j - 1].score + 1; /* insertion */
            subst = mat[i - 1][j - 1].score + substitution_cost; /* substitution */
            best = min3(del, ins, subst);
            mat[i][j].score = best;                  
            mat[i][j].arg1 = str1[i - 1];
            mat[i][j].arg2 = str2[j - 1];
            mat[i][j].pos = i - 1;
            if (best == del) {
                mat[i][j].type = DELETION;
                mat[i][j].prev = &mat[i - 1][j];
            }
            else if (best == ins) {
                mat[i][j].type = INSERTION;
                mat[i][j].prev = &mat[i][j - 1];
            }
            else {
                if (substitution_cost > 0) {
                    mat[i][j].type = SUBSTITUTION;
                }
                else {
                    mat[i][j].type = NONE;
                }
                mat[i][j].prev = &mat[i - 1][j - 1];
            }
        }
    }
    return mat[len1][len2].score;
}

static edit **levenshtein_matrix_create(const char *str1, size_t len1, const char *str2,
        size_t len2)
{
    unsigned int i, j;
    edit **mat = malloc((len1 + 1) * sizeof(edit *));
    if (mat == NULL) {
        return NULL;
    }
    for (i = 0; i <= len1; i++) {
        mat[i] = malloc((len2 + 1) * sizeof(edit));
        if (mat[i] == NULL) {
            for (j = 0; j < i; j++) {
                free(mat[j]);
            }
            free(mat);
            return NULL;
        }
    }
    for (i = 0; i <= len1; i++) {
        mat[i][0].score = i;
        mat[i][0].prev = NULL;
        mat[i][0].arg1 = 0;
        mat[i][0].arg2 = 0;
    }

    for (j = 0; j <= len2; j++) {
        mat[0][j].score = j;
        mat[0][j].prev = NULL;
        mat[0][j].arg1 = 0;
        mat[0][j].arg2 = 0;
    }
    return mat; 
}

unsigned int levenshtein_distance(const char *str1, const char *str2, edit **script)
{
    const size_t len1 = strlen(str1), len2 = strlen(str2);
    unsigned int i, distance;
    edit **mat, *head;

    /* If either string is empty, the distance is the other string's length */
    if (len1 == 0) {
        return len2;
    }
    if (len2 == 0) {
        return len1;
    }
    /* Initialise the matrix */
    mat = levenshtein_matrix_create(str1, len1, str2, len2);
    if (!mat) {
        *script = NULL;
        return 0;
    }
    /* Main algorithm */
    distance = levenshtein_matrix_calculate(mat, str1, len1, str2, len2);
    /* Read back the edit script */
    *script = malloc(distance * sizeof(edit));
    if (*script) {
        i = distance - 1;
        for (head = &mat[len1][len2];
                head->prev != NULL;
                head = head->prev) {
            if (head->type != NONE) {
                memcpy(*script + i, head, sizeof(edit));
                i--;
            }
        }
    }
    else {
        distance = 0;
    }
    /* Clean up */
    for (i = 0; i <= len1; i++) {
        free(mat[i]);
    }
    free(mat);

    return distance;

Here is an example program that finds the distance from CHALK to CHEESE and prints the edit script:

#include <stdlib.h>
#include <stdio.h>

void print(const edit *e)
{
    if (e->type == INSERTION) {
        printf("Insert %c", e->arg2);
    }
    else if (e->type == DELETION) {
        printf("Delete %c", e->arg1);
    }
    else {
        printf("Substitute %c for %c", e->arg2, e->arg1);
    }
    printf(" at %u\n", e->pos);
}

int main(void)
{
    edit *script;
    unsigned int distance;
    unsigned int i;
    const char str1[] = "CHALK";
    const char str2[] = "CHEESE";
    distance = levenshtein_distance(str1, str2, &script);
    printf("Distance is %d:\n", distance);
    for (i = 0; i < distance; i++) {
        print(&script[i]);
    }
    free(script);
    return 0;
}

The output:

Distance is 4:
Substitute E for A at 2
Substitute E for L at 3
Substitute S for K at 4
Insert E at 4

Permutation cycles in C

A cycle of a permutation is a subset of the elements that replace one another in sequence, until the last element is replaced by the first. For example, consider the permutation below:

\(\sigma=\begin{pmatrix}
0 & 1 & 2 & 3 & 4 & 5 & 6 & 7 & 8 \\
1 & 3 & 7 & 0 & 4 & 8 & 2 & 6 & 5\end{pmatrix}\)

We can find the cycles:
\(0 \rightarrow 1, 1 \rightarrow 3, 3 \rightarrow 0\)
\(2 \rightarrow 7, 7 \rightarrow 6, 6 \rightarrow 2\)
\(4 \rightarrow 4\)
\(5 \rightarrow 8, 8 \rightarrow 5\)

These can be written as:
\((0, 1, 3)(2, 7, 6)(4)(5, 8)\)

It’s customary to omit cycles of length 1, so this would usually be written as
\((0, 1, 3)(2, 7, 6)(5, 8).\)

To find the cycle decomposition of a permutation, we can use an algorithm that is very similar to depth-first search (DFS) on a graph. We begin a new search for each unvisited vertex (number), and visit its neighbour (image in the permutation) until we get back to the first vertex again.

Below is an implementation in C. The function permutation_cycles() takes a permutation in the form of integers starting from 0, its length, and a callback function that is called for each cycle found.

#include <stdlib.h>

typedef void(*cyclefn)(const unsigned int *, size_t);

void permutation_cycles_recursive(const unsigned int *permutation, unsigned int *visited, 
        unsigned int start, unsigned int current, unsigned int *path,
        size_t length, cyclefn fun)
{
    visited[current] = 1;
    path[length] = current;
    if (start == current && length > 0) {
        fun(path, length);
    }
    else {
        permutation_cycles_recursive(permutation, visited, start, permutation[current],
                path, length + 1, fun);
    }
}

void permutation_cycles(const unsigned int *permutation, size_t n, cyclefn fun)
{
    unsigned int i;
    unsigned int *visited = calloc(n, sizeof(unsigned int));
    unsigned int *path = malloc(n * sizeof(unsigned int));
    if (!(visited && path)) {
        free(visited);
        free(path);
        return;
    }
    for (i = 0; i < n; i++) {
        if (!visited[i]) {
            permutation_cycles_recursive(permutation, visited, i, i, 
                    path, 0, fun);
        }
    }
    free(visited);
    free(path);
}

Here is an example program that finds the cycle decomposition of the permutation shown above:

#include <stdio.h>

void print(const unsigned int *cycle, size_t length)
{
    if (length > 1) {
        unsigned int i;
        putchar('(');
        for (i = 0; i < length; i++) {
            printf("%u", cycle[i]);
            if (i < length - 1) {
                printf(", ");
            }
        }
        putchar(')');
    }
}

int main(void)
{
    unsigned int permutation[] = {1, 3, 7, 0, 4, 8, 2, 6, 5};
    const size_t n = sizeof(permutation) / sizeof(permutation[0]);
    permutation_cycles(permutation, n, print);
    putchar('\n');
    return 0;
}

The output:

(0, 1, 3)(2, 7, 6)(5, 8)

Permutations in lexicographic order in C

This recursive algorithm produces the permutations in the most natural order, and is also the easiest to understand. It uses two buffers, one containing the permutation being built, and another for the remaining, unused letters.

The algorithm is:

if (there are no more characters to arrange) {
    print (the current permutation);
}
else {
    choose a remaining character;
    remove it from the remaining characters;
    add it to the permutation;
    use recursion to add the remaining characters to the permutation in the same way;
}

Here it is in C:

#include <stdlib.h>
#include <string.h>

typedef void(*permutationfn)(const char *);

static void permutations_recursive(char *perm, size_t used, char *remaining, size_t len,
        permutationfn fun)
{
    if (used == len) {
        perm[used] = '\0';
        fun(perm);
    } 
    else {
        const size_t unused = len - used;
        unsigned int i;
        for (i = 0; i < unused; i++) {
            /* Remove character at i from remaining */
            char c = remaining[i];
            memmove(remaining + i, remaining + i + 1, unused - i);
            /* Append it to perm */
            perm[used] = c;
            permutations_recursive(perm, used + 1, remaining, len, fun);
            /* Put character back */
            memmove(remaining + i + 1, remaining + i, unused - i);
            remaining[i] = c;
        }
    }
}

void permutations(const char *str, permutationfn fun)
{
    const size_t len = strlen(str);
    char *perm = malloc(len + 1);
    char *remaining = malloc(len + 1);
    if (perm && remaining) {
        strcpy(remaining, str);
        permutations_recursive(perm, 0, remaining, len, fun);
    }
    free(perm);
    free(remaining);

Example program:

#include <stdio.h>

int main(void)
{
    char str[] = "ABCD";
    permutations(str, (permutationfn)puts);
    return 0;
}

The output:

ABCD
ABDC
ACBD
ACDB
ADBC
ADCB
BACD
BADC
BCAD
BCDA
BDAC
BDCA
CABD
CADB
CBAD
CBDA
CDAB
CDBA
DABC
DACB
DBAC
DBCA
DCAB
DCBA

Reference: Exhaustive recursion and backtracking

Permutations in transposition order in C

This algorithm is similar to the direct insertion order permutation algorithm, except that instead of inserting a new element into the spaces between existing elements, the new element is inserted into a space already occupied by an existing element or at the end. If the space is already occupied, the existing element is bumped to the end of the permutation.

#include <string.h>
#include <stdlib.h>

typedef void(*permutationfn)(const char *);

static void permutations_recursive(const char *str, char *perm, unsigned int level, size_t len, 
        permutationfn fun)
{
    if (level == len) {
        perm[level] = '\0';
        fun(perm);
    }
    else {
        /* Insert character str[level] in every position */
        unsigned int i;
        for (i = 0; i <= level; i++) {
            if (level > 0 && i < level) {
                /* Bump the existing element to the end */
                perm[level] = perm[i];
            }
            perm[i] = str[level];
            permutations_recursive(str, perm, level + 1, len, fun);
            if (level > 0 && i < level) {
                /* Move the existing element back */
                perm[i] = perm[level];
            }
        } 
    }
}

void permutations(const char *str, permutationfn fun)
{
    char *perm = malloc(strlen(str) + 1);
    permutations_recursive(str, perm, 0, strlen(str), fun);
    free(perm);
}

Example program:

#include <stdio.h>

int main(void)
{
    char str[] = "ABCD";
    permutations(str, (permutationfn)puts);
    return 0;
}

The output:

DABC
CDBA
CADB
CABD
DCAB
BDAC
BCDA
BCAD
DACB
BDCA
BADC
BACD
DBAC
CDAB
CBDA
CBAD
DCBA
ADBC
ACDB
ACBD
DBCA
ADCB
ABDC
ABCD

Reference: Decision Trees

Permutations in direct insertion order in C

To form a permutation in direct insertion order, we take an existing partial permutation containing \(k\) elements, and then insert the next element into each of the \(k+1\) possible places immediately before an element or at the end. So for example if we start with the permutation “A”, we can insert “B” before the “A” or at the end, producing “BA” and “AB”. Taking “BA”, we can then insert “C” at the beginning, between the “B” and the “A”, or at the end, producing “CBA”, “BCA”, “BAC”. The process continues until every element has been added.

Below is an implementation in C. I used a recursive function that takes a callback to process each permutation as it is found. Notice the use of memmove() to move existing elements out of the way, and then to move them back after the recursive call. We can’t use memcpy() for this, because the source and destination ranges overlap.

typedef void(*permutationfn)(const char *);

static void permutations_recursive(const char *str, char *perm, unsigned int level, size_t len, 
        permutationfn fun)
{
    if (level == len) {
        perm[level] = '\0';
        fun(perm);
    }
    else {
        /* Insert character str[level] in every position */
        unsigned int i;
        for (i = 0; i <= level; i++) {
            if (level > 0 && i < level) {
                /* Move the existing elements out of the way */
                memmove(perm + i + 1, perm + i, level - i);
            }
            perm[i] = str[level];
            permutations_recursive(str, perm, level + 1, len, fun);
            if (level > 0 && i < level) {
                /* Move the existing elements back */
                memmove(perm + i, perm + i + 1, len - i);
            }
        } 
    }
}

void permutations(const char *str, permutationfn fun)
{
    char *perm = malloc(strlen(str) + 1);
    permutations_recursive(str, perm, 0, strlen(str), fun);
    free(perm);
}

An example program:

int main(void)
{
    char str[] = "ABCD";
    permutations(str, (permutationfn)puts);
    return 0;
}

The output:

DCBA
CDBA
CBDA
CBAD
DBCA
BDCA
BCDA
BCAD
DBAC
BDAC
BADC
BACD
DCAB
CDAB
CADB
CABD
DACB
ADCB
ACDB
ACBD
DABC
ADBC
ABDC
ABCD

Reference: Decision Trees

Greedy partition in C

In a previous post, I showed a dynamic programming algorithm that gives an exact solution to the partition problem. It’s also possible to get an approximate solution using a greedy algorithm.

The algorithm is:

  1. Sort the elements of the set in decreasing order
  2. For each element in the set:
    1. If the sum of the first subset is greater, put the element in the second set
    2. Else, put it in the first set

You can see intuitively that this is going to make the sets as close as possible to having the same sum without using an exact algorithm. Below is an implementation in C. The function partition() takes an array of unsigned integers and its size, and returns an array of unsigned integers which are 0 if the corresponding element is in the first set of the partition, and 1 if it is in the second set. Note that the indices of this set correspond to the array of numbers after having been sorted.

#include <stdlib.h>

int compare_unsigned_ints_decreasing(const unsigned int *p1, const unsigned int *p2)
{
    if (*p1 > *p2) {
        return -1;
    }
    if (*p1 < *p2) {
        return 1;
    }
    return 0;
}

unsigned int *partition(unsigned int *numbers, size_t n)
{
    unsigned int i, sum0 = 0, sum1 = 0;
    unsigned int *solution = calloc(n, sizeof(unsigned int));
    if (solution == NULL) {
        return NULL;
    }
    qsort(numbers, n, sizeof(unsigned int),
            (int(*)(const void *, const void *))compare_unsigned_ints_decreasing);
    for (i = 0; i < n; i++) {
        if (sum1 < sum0) {
            solution[i] = 1;
            sum1 += numbers[i];
        }
        else {
            sum0 += numbers[i];
        }
    }
    return solution;
}

Here is an example program that partitions the set of numbers from 1 to 7:

#include <stdio.h>
#include <stdlib.h>

int main(void)
{
    unsigned int numbers[] = {1, 2, 3, 4, 5, 6, 7};
    const size_t len = sizeof(numbers) / sizeof(numbers[0]);
    unsigned int i;
    unsigned int *solution = partition(numbers, len);
    printf("First set:\n");
    for (i = 0; i < len; i++) {
        if (solution[i] == 0) {
            printf("%u\n", numbers[i]);
        }
    }
    printf("Second set:\n");
    for (i = 0; i < len; i++) {
        if (solution[i] == 1) {
            printf("%u\n", numbers[i]);
        }
    }
    putchar('\n');
    free(solution);
    return 0;
}

Here is the output. In this case, the partition divides the set perfectly into two sets of sum 14, but that won’t generally be the case for this algorithm as it is an approximate one.

First set:
7
4
3
Second set:
6
5
2
1

Prüfer codes and labeled trees in C

A tree and its Prüfer Code

Arthur Cayley first proved that there are \(n^{n-2}\) labeled trees on \(n\) vertices, and in another proof of the same theorem, Heinz Prüfer showed a bijection between trees on \(n\) vertices and \((n-2)\)-tuples of integers from 1 to \(n – 2\), called Prüfer codes. The proof provides an algorithm that allows us to construct an arbitrary labeled tree from its code.

The algorithm proceeds as follows:

  1. Initialise an array of node degrees for the nodes, which are numbered from 1 to \(n + 2\)
  2. For each occurrence of a node number in the code, increment its degree
  3. Add 1 to all degrees
  4. For each occurrence of a node number in the code:
    1. Find the lowest-numbered node with degree 1
    2. Create an edge between that node and the node in the code
    3. Decrement the degrees of both nodes by 1
  5. There are now 2 nodes left with degree 1, so join them with an edge to complete the tree

Below is an implementation of the algorithm in C. The function prufer_tree() takes a code in the form of an array of unsigned integers and its length, and returns the tree as an array of edges.

#include <stdlib.h>

typedef struct {
    unsigned int first;
    unsigned int second;
} edge;

edge *prufer_tree(const unsigned int *code, size_t len)
{
    unsigned int i, j, e = 0;
    const unsigned int n = len + 2; /* Nodes */
    edge *tree = malloc((n - 1) * sizeof(edge));
    unsigned int *degrees = calloc(n + 1, sizeof(unsigned int)); /* 1-based */
    if (tree == NULL || degrees == NULL) {
        free(tree);
        free(degrees);
        return NULL;
    }
    /* Set the degrees from the occurrences in the code */
    for (i = 0; i < len; i++) {
        degrees[ code[i]]++;
    }
    /* Add 1 to them all */
    for (i = 1; i <= n; i++) {
        degrees[i]++;
    }
    /* Add edges to nodes in the code */
    for (i = 0; i < len; i++) {
        /* Find the lowest-numbered node with degree 1 */
        for (j = 1; degrees[j] != 1; j++);
        /* Add the edge */
        tree[e].first = j;
        tree[e].second = code[i];
        degrees[j]--;
        degrees[ code[i]]--;
        e++;
    }
    /* Find the last 2 degree-1 nodes */
    for (i = 1; degrees[i] != 1; i++);
    for (j = i + 1; degrees[j] != 1; j++);
    /* Add the last edge */
    tree[e].first = i;
    tree[e].second = j;
    free(degrees);

    return tree;
}

Here is an example program that constructs the tree in the image at the top:

#include <stdio.h>
#include <stdlib.h>

int main(void)
{
    unsigned int code[] = {1, 1, 1, 1, 6, 5};
    const size_t len = sizeof(code) / sizeof(unsigned int);
    const unsigned int n = len + 1; /* Edges */
    edge *tree = prufer_tree(code, len);
    unsigned int e;
    for (e = 0; e < n; e++) {
        printf("(%u, %u)\n", tree[e].first, tree[e].second);
    }
    free(tree);
    return 0;
}

The output:

(2, 1)
(3, 1)
(4, 1)
(7, 1)
(1, 6)
(6, 5)
(5, 8)

CYK algorithm in C++

The Cocke-Younger-Kasami (CYK) algorithm is a dynamic programming algorithm for generalised parsing – i.e., parsing with ambiguous grammars where there may be more than one possible parse. The algorithm works by populating a table with the possible ways of parsing successively larger portions of the input. If the grammar’s start symbol is in the top cell of the table when it has been completed, the parse is successful.

Below is an implementation in C++. The class MB::cyk_parser is instantiated with a grammar (see Context-free grammar in C++). The parse() method attempts to recognise a sentence and displays the parsing table if it is called with a std::ostream.

The grammar must be in Chomsky Normal Form (CNF), which means that every left-hand side must be a single non-terminal symbol, and every right-hand side alternative must either be a single terminal symbol or a pair of non-terminal symbols. This is a requirement of the classic CYK algorithm, but some variants of the algorithm can handle grammars that are not in CNF.

The parsing table is an upper-right triangular matrix internally, but is printed as a lower-left triangular matrix, as this is easier to read.

#ifndef CYK_HPP
#define CYK_HPP

#include <vector>
#include <set>
#include <iostream>
#include <iterator>

#include <grammar.hpp>

namespace MB
{

class cyk_parser
{
public:
    cyk_parser(const MB::grammar& grammar)
        : grammar_(grammar)
    {
    }

    template <class InputIt>
    bool parse(InputIt begin, InputIt end)
    {
        return parse(begin, end, nullptr);
    }

    template <class InputIt>
    bool parse(InputIt begin, InputIt end, std::ostream& os)
    {
        return parse(begin, end, &os);
    }

private:
    template <class InputIt>
    bool parse(InputIt begin, InputIt end, std::ostream *os)
    {
        std::vector<std::string> str;
        std::copy(begin, end, std::back_inserter(str));
        const size_t len = str.size();
        std::vector<std::vector<std::set<std::string> > > table(len,
                std::vector<std::set<std::string> >(len));
        unsigned int i, j, k;

        // Populate main diagonal
        i = 0;
        for (const std::string& c : str) {
            std::vector<MB::rule::const_ptr> rules;
            grammar_.get_rules_for_symbol(c, std::back_inserter(rules));
            for (MB::rule::const_ptr r : rules) {
                table[i][i].insert(r->left());
            }
            i++;
        }
        // Populate upper-right triangle
        for (i = 1; i < len; i++) {
            for (j = i; j < len; j++) {
                for (k = j - i; k < j; k++) {
                    std::vector<std::pair<std::string, std::string> > product;
                    MB::detail::cartesian_product(table[j - i][k].begin(), table[j - i][k].end(), 
                            table[k + 1][j].begin(),
                            table[k + 1][j].end(), std::back_inserter(product));
                    std::vector<MB::rule::const_ptr> rules;
                    grammar_.get_rules_for_symbols(product.begin(), product.end(), 
                            std::back_inserter(rules));
                    for (MB::rule::const_ptr r : rules) {
                        table[j - i][j].insert(r->left());
                    }
                }
            }
        }
        if (os) {
            // Print the table 
            for (i = 0; i < len; i++) {
                k = 0;
                for (j = len - i - 1; j < len; j++) {
                    std::copy(table[k][j].begin(), table[k][j].end(), 
                            std::ostream_iterator<std::string>(*os, " "));
                    ++k;
                    *os << '\t';
                }
                *os << '\n';
            }
            for (const std::string& c : str) {
                *os << c << '\t';
            }
            *os << '\n';
        }
             
        // Successful if the start symbol is in the top-right cell    
        return table[0][len - 1].find(grammar_.get_start_left()) != table[0][len - 1].end();
    }

private:
    const MB::grammar& grammar_;
};

} // namespace MB

#endif // CYK_HPP

Here is an example program:

#include <fstream>
#include <iostream>

#include <grammar.hpp>
#include <cyk.hpp>

int main()
{
    std::string filename = "cyk.dat";
    std::ifstream ifs(filename);
    if (ifs) {
        MB::grammar grammar(ifs);
        std::string sentence[] = {"b", "a", "a", "b", "a"};
        const size_t len = sizeof(sentence) / sizeof(sentence[0]);
        bool success = MB::cyk_parser(grammar).parse(sentence, sentence + len, std::cout);
        std::cout << "Success: " << std::boolalpha << success << '\n';
    }
    else {
        std::cerr << "Couldn't open " << filename << " for reading\n";
    }
}

This is the grammar file:

S -> A B | B C
A -> B A | a
B -> C C | b
C -> A B | a

Here is the output. Notice the start symbol S in the top-left cell, which indicates a successful parse:

A C S
        A C S
        B       B
A S     B       C S     A S
B       A C     A C     B       A C
b       a       a       b       a
Success: true

Reference: The CYK Algorithm

Boyer-Moore search of a list for a sub-list in Python

I had the need to search a list for a sub-list in Python, rather like std::search() in C++, but there isn’t a built-in equivalent in Python. A naive way of implementing it would be to try to match the sub-list against every position in the list, but that wouldn’t be very efficient. An efficient algorithm for string searching is the Boyer-Moore algorithm, and this can be adapted to search a list.

Below is an implementation of Boyer-Moore for searching a list in Python. It’s based on the Wikipedia article. The function search() takes a list to search (haystack), and a sub-list to search for (needle), and returns the starting index of needle, or -1 if it isn’t found.

def search(haystack, needle):
    """
    Search list `haystack` for sub-list `needle`.
    """
    if len(needle) == 0:
        return 0
    char_table = make_char_table(needle)
    offset_table = make_offset_table(needle)
    i = len(needle) - 1
    while i < len(haystack):
        j = len(needle) - 1
        while needle[j] == haystack[i]:
            if j == 0:
                return i
            i -= 1
            j -= 1
        i += max(offset_table[len(needle) - 1 - j], char_table.get(haystack[i]));
    return -1

    
def make_char_table(needle):
    """
    Makes the jump table based on the mismatched character information.
    """
    table = {}
    for i in range(len(needle) - 1):
        table[needle[i]] = len(needle) - 1 - i
    return table
    
def make_offset_table(needle):
    """
    Makes the jump table based on the scan offset in which mismatch occurs.
    """
    table = []
    last_prefix_position = len(needle)
    for i in reversed(range(len(needle))):
        if is_prefix(needle, i + 1):
            last_prefix_position = i + 1
        table.append(last_prefix_position - i + len(needle) - 1)
    for i in range(len(needle) - 1):
        slen = suffix_length(needle, i)
        table[slen] = len(needle) - 1 - i + slen
    return table
    
def is_prefix(needle, p):
    """
    Is needle[p:end] a prefix of needle?
    """
    j = 0
    for i in range(p, len(needle)):
        if needle[i] != needle[j]:
            return 0
        j += 1    
    return 1
    
def suffix_length(needle, p):
    """
    Returns the maximum length of the substring ending at p that is a suffix.
    """
    length = 0;
    j = len(needle) - 1
    for i in reversed(range(p + 1)):
        if needle[i] == needle[j]:
            length += 1
        else:
            break
        j -= 1
    return length

An example program:

def main():
    haystack = [0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1]
    needle = [0, 0, 1]
    index = search(haystack, needle)
    print(index)

if __name__ == '__main__':
    main()

Output:

4

Aho-Corasick algorithm in C++

The Aho-Corasick algorithm constructs a finite-state automaton (FSA) that can match multiple strings simultaneously. It begins by constructing a conventional prefix tree and then adds transition edges that allow the automaton to recover efficiently from failures.

Below is an implementation in C++. The constructor takes a pair of iterators to a sequence of strings, which the automaton will then be able to match. Once the automaton has been constructed, its search() method can be used to search a string of text. It takes an output iterator, to which it writes pairs containing the strings found and their starting positions.

#ifndef AHO_CORASICK_HPP
#define AHO_CORASICK_HPP

#include <queue>
#include <string>
#include <vector>
#include <set>

namespace MB
{

namespace detail
{

// Maximum number of characters in alphabet
static constexpr int MAXCHARS = 256;

struct state
{
    std::set<int> output_function;
    int failure_function;
    std::vector<int> goto_function;
    state()
        : failure_function(-1),
          goto_function(detail::MAXCHARS, -1)
    {
    }
};

} // namespace detail

class ac_automaton
{
public:
    template <class InputIt>
    ac_automaton(InputIt first, InputIt last)
        : states(1)
    {
        // Build the prefix tree
        for (InputIt it = first; it != last; ++it) {
            add_word(*it);
        }
        // Turn it into an automaton
        construct_automaton();
    }

    template <class OutputIt>
    OutputIt search(std::string text, OutputIt it) const
    {
        int current_state = 0;
     
        for (int i = 0; i < text.size(); ++i) {
            current_state = find_next_state(current_state, text[i]);
            if (states[current_state].output_function.size()) {
                for (auto length : states[current_state].output_function) {
                    *it++ = std::make_pair(std::string(text, i - length + 1, length),
                            i - length + 1);
                }
            }
        }
        return it;
    }

private:
    std::vector<detail::state> states;

private:
    void add_word(const std::string &word)
    {
        int current_state = 0;
        // Add to prefix tree
        for (int c : word) {
            if (states[current_state].goto_function == -1) {
                states[current_state].goto_function = states.size();
                states.push_back(detail::state());
            }
            current_state = states[current_state].goto_function;
        }
        // Add to output function for this state
        states[current_state].output_function.insert(word.size());
    }

    void construct_automaton()
    {
        // Complete the goto function by setting it to 0 for each
        // letter that doesn't have an edge from the root
        for (int c = 0; c < detail::MAXCHARS; ++c) {
            if (states[0].goto_function == -1) {
                states[0].goto_function = 0;
            }
        }
     
        // Compute failure and output functions
        std::queue<int> state_queue;
        for (int c = 0; c < detail::MAXCHARS; ++c) {
            if (states[0].goto_function != 0) {
                states[states[0].goto_function].failure_function = 0;
                state_queue.push(states[0].goto_function);
            }
        }
        while (state_queue.size()) {
            int s = state_queue.front();
            state_queue.pop();
     
            for (int c = 0; c < detail::MAXCHARS; ++c) {
                if (states[s].goto_function != -1) {
                    int failure = states[s].failure_function;
                    while (states[failure].goto_function == -1) {
                         failure = states[failure].failure_function;
                    }
                    failure = states[failure].goto_function;
                    states[states[s].goto_function].failure_function = failure;
                    for (auto length : states[failure].output_function) {
                        states[states[s].goto_function].output_function.insert(length);
                    }
                    state_queue.push(states[s].goto_function);
                }
            }
        }
    }
 
    int find_next_state(int current_state, char c) const
    {
        int next_state = current_state;
     
        while (states[next_state].goto_function == -1) {
            next_state = states[next_state].failure_function;
        }
     
        return states[next_state].goto_function;
    }
 
};

} // namespace MB

#endif // AHO_CORASICK_HPP

Here is an example program:

#include <iostream>

#include "aho_corasick.hpp"

int main()
{
    std::string words[] = {"brown", "dog", "fox", "jumps", "lazy", "over", "quick", "the"};
    const size_t len = sizeof(words) / sizeof(words[0]);
    std::string text = "the quick brown fox jumps over the lazy dog";
 
    MB::ac_automaton automaton(words, words + len);
    std::vector<std::pair<std::string, int> > results;

    automaton.search(text, std::back_inserter(results));
    for (auto it = results.begin(); it != results.end(); ++it) {
        std::cout << it->first << " starting at " << it->second << '\n';
    }
 
    return 0;
}

Output:

the starting at 0
quick starting at 4
brown starting at 10
fox starting at 16
jumps starting at 20
over starting at 26
the starting at 31
lazy starting at 35
dog starting at 40

Reference: Biosequence Algorithms, Spring 2005 Lecture 4: Set Matching and Aho-Corasick Algorithm