Tag Archives: Algorithms

Parsing by generation

By far the simplest method of parsing is to use the grammar to generate sentences, and then wait for the input sentence to show up…

Needless to say, you could be in for a long wait, so this is not a feasible method, but every top-down parsing method is just a refinement of this, so it is interesting theoretically.

As I have already written the algorithm to generate sentences from a grammar, the implementation of the parser is trivial. The only difficulty is that it’s hard to know how many sentences you need to generate to be sure of encountering the input sentence if it is valid, so I’ve added a max_gen parameter to specify the maximum number of sentences to generate.

#ifndef GENERATION_PARSER_HPP
#define GENERATION_PARSER_HPP

#include <vector>
#include <string>
#include <algorithm>

#include <grammar.hpp>
#include <generator.hpp>

namespace MB
{

class generation_parser
{
public:
    generation_parser(const MB::grammar &grammar)
        : grammar_(grammar)
    {
    }
    template <class InputIt>
    bool parse(InputIt begin, InputIt end, unsigned int max_gen = 100)
    {
        std::vector<std::string> sentence(begin, end);
        std::vector<std::vector<std::string> > sentences;
        generator(grammar_).generate(std::back_inserter(sentences), max_gen);
        return std::find(sentences.begin(), sentences.end(), sentence) != sentences.end();
    }
private:
    const grammar& grammar_;
};

} // namespace MB


#endif // GENERATION_PARSER_HPP

The algorithm to generate production trees from a grammar can be used in the same way. The only difference is that the sentence needs to be extracted from the trees for comparison.

#ifndef TREE_GENERATION_PARSER_HPP
#define TREE_GENERATION_PARSER_HPP

#include <string>
#include <vector>
#include <algorithm>

#include <grammar.hpp>
#include <tree_generator.hpp>

namespace MB
{

class tree_generation_parser
{
public:
    tree_generation_parser(const MB::grammar &grammar)
        : grammar_(grammar)
    {
    }
    template <class InputIt>
    bool parse(InputIt begin, InputIt end, unsigned int max_gen = 100)
    {
        std::vector<std::string> sentence(begin, end);
                std::vector<tree> trees;
        tree_generator(grammar_).generate(std::back_inserter(trees), max_gen);
        std::vector<std::vector<std::string> > sentences(trees.size());
        std::transform(trees.begin(), trees.end(), sentences.begin(),
                std::mem_fun_ref(&tree::get_leaves));
        return std::find(sentences.begin(), sentences.end(), sentence) != sentences.end();
    }
private:
    const grammar& grammar_;
};

} // namespace MB

#endif // TREE_GENERATION_PARSER_HPP

Generating production trees from a grammar in C++

A production tree for the sentence 'the man fed the dog'

In a previous post, I showed how to generate sentences from a grammar. We can go further and generate production trees. A production tree is just like a parse tree – it is a recursive decomposition of the grammatical structure of the sentence like the picture above.

The main algorithm for generating production trees is just the same as for generating sentences – using a queue and breadth-first search – but instead of substituting right-hand side symbols in place of the first non-terminal in the sentential form, we append child nodes to the first non-terminal leaf node in the production tree. A depth-first search is used to find the first non-terminal leaf node. Once there are no non-terminal leaf nodes, the production tree is complete.

The complete tree can be printed in the dot language and converted to a graphviz graph, like the picture above, or the sentence can be retrieved from it by using an in-order traversal to retrieve the leaf nodes, which in a completed tree are all of the terminals.

Below is the code for the production tree, which is a Composite made of three classes, the abstract symbol class, and concrete subclasses nonterminal and terminal. The structure is pointer-based, so smart pointers are used throughout.

#ifndef TREE_HPP
#define TREE_HPP

#include <vector>
#include <string>
#include <memory>
#include <iostream>

namespace MB
{

// Abstract base class for grammar symbols
class symbol
{
public:
    typedef std::shared_ptr<symbol> ptr;
    typedef std::shared_ptr<const symbol> const_ptr;
    symbol(const symbol&) = delete;
    symbol& operator=(const symbol&) = delete;
    virtual ptr clone() const = 0;
    virtual ptr get_first_nonterminal_leaf() = 0;
    virtual void append_child(ptr child) = 0;
    virtual void get_leaves(std::vector<std::string>&) const = 0;
    const std::string& name() const
    {
        return name_;
    }
    virtual void print_dot(std::ostream &os, unsigned int id, unsigned int& maxid) const
    {
        os << "\tNode" << id << "[label=\"";
        os << name_;
        os << "\"];\n";
    }
protected:
    symbol(const std::string& name)
        : name_(name)
    {
    }
    std::string name_;
};

// A non-terminal, which can have children
class nonterminal : public symbol, public std::enable_shared_from_this<nonterminal>
{
public:
    static symbol::ptr create(const std::string& name)
    {
        return symbol::ptr(new nonterminal(name));
    }
    symbol::ptr clone() const override
    {
        symbol::ptr copy = create(name_);
        for (auto child : children_) {
            copy->append_child(child->clone());
        }
        return copy;
    }
    symbol::ptr get_first_nonterminal_leaf() override
    {
        if (children_.empty()) {
            // If we have no children, we are the first non-terminal leaf
            return shared_from_this();
        }
        else {
            // Search in children
            for (auto child : children_) {
                symbol::ptr p = child->get_first_nonterminal_leaf();
                if (p) {
                    return p;
                }
            }
        }
        // Not us or one of our children
        return nullptr;
    }
    void append_child(symbol::ptr child) override
    {
        children_.push_back(child);
    }
    void get_leaves(std::vector<std::string>& sentence) const override
    {
        for (auto child : children_) {
            child->get_leaves(sentence);
        }
    }
    void print_dot(std::ostream &os, unsigned int id, unsigned int& maxid) const override
    {
        symbol::print_dot(os, id, maxid);
        for (auto child : children_) {
            unsigned int child_id = ++maxid;
            child->print_dot(os, child_id, maxid);
            os << "\tNode" << id << " -> Node" << child_id << "[dir=none];\n";
        }
    }
private:
    nonterminal(const std::string& name)
        : symbol(name)
    {
    }
    std::vector<symbol::ptr> children_;
};

// A terminal, which is a leaf
class terminal : public symbol
{
public:
    terminal(const std::string& name)
        : symbol(name)
    {
    }
    static symbol::ptr create(const std::string& name)
    {
        return symbol::ptr(new terminal(name));
    }
    symbol::ptr clone() const override
    {
        return create(name_);
    }
    virtual symbol::ptr get_first_nonterminal_leaf() override
    {
        return nullptr;
    }
    virtual void append_child(symbol::ptr child) override
    {
    }
    void get_leaves(std::vector<std::string>& sentence) const override
    {
        sentence.push_back(name_);
    }
};

// A production tree
class tree
{
public:
    tree(symbol::ptr root)
        : root_(root)
    {
    }
    tree clone() const
    {
        symbol::ptr root = root_ ? root_->clone() : nullptr;
        return tree(root);
    }
    // Get the first non-terminal for substitution
    symbol::ptr get_first_nonterminal_leaf() const
    {
        if (root_) {
            return root_->get_first_nonterminal_leaf();
        }
        return nullptr;
    }
    // Get the sentence
    std::vector<std::string> get_leaves() const
    {
        std::vector<std::string> sentence;
        if (root_) {
            root_->get_leaves(sentence);
        }
        return sentence;
    }
    // Print in dot format
    void print_dot(std::ostream& os) const
    {
        os << "digraph G {\n";
        os << "\tnode[shape=plaintext];\n";
        unsigned int id = 0;
        unsigned int maxid = 0;
        if (root_) {
            root_->print_dot(os, id, maxid);
        }
        os << "}\n";
    }
private:
    symbol::ptr root_;
};

} // namespace MB

#endif // TREE_HPP

The production tree generator is very similar to the sentence generator described previously, except production trees are copied and modified rather than sentential forms.

#ifndef TREE_GENERATOR_HPP
#define TREE_GENERATOR_HPP

#include <vector>
#include <list>
#include <queue>
#include <algorithm>

#include <grammar.hpp>
#include <tree.hpp>

namespace MB
{

class tree_generator
{
public:
    tree_generator(const grammar& grammar)
        : grammar_(grammar)
    {
        // Store the left-hand sides
        std::transform(grammar_.rules().begin(), grammar_.rules().end(),
                std::back_inserter(lefts_),
                [](rule::const_ptr r)->const std::string& { return r->left(); }); 
    }
    template <class InputIt>
    InputIt generate(InputIt init, unsigned int n) const
    {
        std::queue<tree> queue;
        unsigned int sentences = 0;
        // Add the start symbol to the queue
        queue.push(tree(nonterminal::create(grammar_.get_start_left())));
        while (sentences < n && queue.size()) {
            tree sf = queue.front();
            queue.pop();
            // Find the first non-terminal
            symbol::ptr nt = sf.get_first_nonterminal_leaf();
            if (nt) {
                // Append each right-hand side alternative
                std::vector<rule::const_ptr> rules;
                grammar_.get_rules_for_left(nt->name(), std::back_inserter(rules));
                for (rule::const_ptr rule : rules) {
                    for (const std::vector<std::string>& alternative : rule->right()) {
                        tree new_sf = sf.clone();
                        nt = new_sf.get_first_nonterminal_leaf(); 
                        for (auto symbol : alternative) {
                            if (std::find(lefts_.begin(), lefts_.end(), symbol) != lefts_.end()) {
                                nt->append_child(nonterminal::create(symbol));
                            } 
                            else {
                                nt->append_child(terminal::create(symbol));
                            }
                        }
                        queue.push(new_sf);
                    }
                }
            }
            else {
                // Finished sentence
                *init++ = sf;
                ++sentences;
            }
        }
        return init;
    }
private:
    const grammar& grammar_;
    std::vector<std::string> lefts_;
};

} // namespace MB

#endif // TREE_GENERATOR_HPP

Here is an example program that generates the first 50 production trees and prints them in dot format after their associated sentence:

#include <fstream>
#include <iostream>
#include <vector>
#include <algorithm>

#include <grammar.hpp>
#include <tree_generator.hpp>

int main()
{
    std::string filename = "generator.dat";
    std::ifstream ifs(filename);
    if (ifs) {
        MB::grammar grammar(ifs);
        MB::tree_generator gen(grammar);
        std::vector<MB::tree > trees;
        gen.generate(std::back_inserter(trees), 50);
        for (const auto& t : trees) {
            std::vector<std::string> sentence = t.get_leaves();
            std::copy(sentence.begin(), sentence.end(), 
                    std::ostream_iterator<std::string>(std::cout, " "));
            std::cout << '\n';
            t.print_dot(std::cout);
        }
    }
    else {
        std::cerr << "Couldn't open " << filename << " for reading\n";
    }
}

Here is an example sentence and tree from the output

a dog barked
digraph G {
    node[shape=plaintext];
    Node0[label="S"];
    Node1[label="NP"];
    Node2[label="DET"];
    Node3[label="a"];
    Node2 -> Node3[dir=none];
    Node1 -> Node2[dir=none];
    Node4[label="N"];
    Node5[label="dog"];
    Node4 -> Node5[dir=none];
    Node1 -> Node4[dir=none];
    Node0 -> Node1[dir=none];
    Node6[label="VP"];
    Node7[label="V"];
    Node8[label="barked"];
    Node7 -> Node8[dir=none];
    Node6 -> Node7[dir=none];
    Node0 -> Node6[dir=none];
}

If you have graphviz installed, the dot language part can be converted to a png image by pasting it into a file (say “dog.dot”) and running the command:

dot -Tpng -O dog.dot 

This will generate a file called “dog.dot.png” containing the picture of the production tree like this:
Production tree for the phrase 'a dog barked'

Generating sentences from a grammar in C++

Although phrase structure grammars are used in the development of parsers, they really describe how to generate sentences in a language. If a grammar is context-free, there is a simple algorithm to generate sentences from it. The algorithm uses a queue to perform a breadth-first search as follows:

  1. Put the start symbol in the queue
  2. While the queue is not empty and more sentences are wanted:
    1. Pop the first sentential form from the queue
    2. Find the first non-terminal in it
    3. For every right-hand side of that non-terminal
      1. Make a copy of the sentential form
      2. Substitute the right-hand side for the non-terminal
      3. Put this new sentential form into the queue
    4. If there are no non-terminals left in the sentential form, it is a complete sentence, so return it

Note that the queue, and consequent breadth-first search, are required because if the rule’s right-hand side is left-recursive, the algorithm would go into an infinite recursion without producing anything if it performed a depth-first search.

Here is the code in C++, using the class I described earlier in Context-free grammar in C++. The generator class is instantiated with a grammar, and its generate() method will write n sentences to the supplied iterator, if n sentences can be generated.

#ifndef GENERATOR_HPP
#define GENERATOR_HPP

#include <vector>
#include <list>
#include <queue>
#include <algorithm>

#include <grammar.hpp>

namespace MB
{

class generator
{
public:
    generator(const grammar& grammar)
        : grammar_(grammar)
    {
        // Store the left-hand sides
        std::transform(grammar_.rules().begin(), grammar_.rules().end(),
                std::back_inserter(lefts_),
                [](rule::const_ptr r)->const std::string& { return r->left(); }); 
    }
    template <class InputIt>
    InputIt generate(InputIt init, unsigned int n) const
    {
        std::queue<sentential_form> queue;
        unsigned int sentences = 0;
        // Add the start symbol to the queue
        queue.push(sentential_form(1, grammar_.get_start_left()));
        while (sentences < n && queue.size()) {
            sentential_form sf = queue.front();
            queue.pop();
            // Find the first non-terminal
            sentential_form::iterator it = std::find_first_of(sf.begin(), sf.end(), 
                    lefts_.begin(), lefts_.end());
            if (it != sf.end()) {
                // Replace with each right-hand side alternative
                std::string left = *it;
                std::vector<rule::const_ptr> rules;
                grammar_.get_rules_for_left(left, std::back_inserter(rules));
                for (rule::const_ptr rule : rules) {
                    for (const std::vector<std::string>& alternative : rule->right()) {
                        sentential_form new_sf(sf);
                        it = std::find(new_sf.begin(), new_sf.end(), left);
                        new_sf.insert(it, alternative.begin(), alternative.end());
                        new_sf.erase(it);
                        queue.push(new_sf);
                    }
                }
            }
            else {
                // Finished sentence
                *init++ = std::vector<std::string>(sf.begin(), sf.end());
                ++sentences;
            }
        }
        return init;
    }
private:
    typedef std::list<std::string> sentential_form;
    const grammar& grammar_;
    std::vector<std::string> lefts_;
};

} // namespace MB

#endif // GENERATOR_HPP

Here is an example grammar:

S -> NP VP
NP -> NP PP
NP -> DET N
VP -> V NP
VP -> V
PP -> P NP
DET -> a | the
N -> man | dog | house | telescope | spoon
V -> saw | fed | barked
P -> with

And an example program that generates the first 50 sentences:

#include <fstream>
#include <iostream>
#include <vector>
#include <algorithm>

#include <grammar.hpp>
#include <generator.hpp>

int main()
{
    std::string filename = "generator.dat";
    std::ifstream ifs(filename);
    if (ifs) {
        MB::grammar grammar(ifs);
        MB::generator gen(grammar);
        std::vector<std::vector<std::string> > sentences;
        gen.generate(std::back_inserter(sentences), 50);
        for (auto sentence : sentences) {
            std::copy(sentence.begin(), sentence.end(),
                    std::ostream_iterator<const std::string&>(std::cout, " "));
            std::cout << '\n';
        }
    }
    else {
        std::cerr << "Couldn't open " << filename << " for reading\n";
    }
}

This is the somewhat surreal output:

a man saw
a man fed
a man barked
a dog saw
a dog fed
a dog barked
a house saw
a house fed
a house barked
a telescope saw
a telescope fed
a telescope barked
a spoon saw
a spoon fed
a spoon barked
the man saw
the man fed
the man barked
the dog saw
the dog fed
the dog barked
the house saw
the house fed
the house barked
the telescope saw
the telescope fed
the telescope barked
the spoon saw
the spoon fed
the spoon barked
a man saw a man
a man saw a dog
a man saw a house
a man saw a telescope
a man saw a spoon
a man saw the man
a man saw the dog
a man saw the house
a man saw the telescope
a man saw the spoon
a man fed a man
a man fed a dog
a man fed a house
a man fed a telescope
a man fed a spoon
a man fed the man
a man fed the dog
a man fed the house
a man fed the telescope
a man fed the spoon

Unger’s method in C++

Unger’s method is a top down, non-directional, generalised parsing algorithm. Top-down means that the parsing process begins with the grammar’s start symbol and attempts to derive the input string by applying the production rules from the start rule downwards. Non-directional means that the input is not processed from left to right (or right to left), but instead is processed in arbitrary order. This requires that the entire input is in memory from the beginning. Generalised means that the parser can handle ambiguous grammars, which can produce sentences that have multiple correct parses.

To choose between production rules to apply, Unger’s method works by evaluating all possible partitions of the input string that can correspond to the symbols on the right-hand-side of each grammar rule. It proceeds as follows:

  1. Find all of the right-hand side alternatives of rules applicable to the input (these are the right-hand sides of the start rules at first)
  2. For each right-hand side, count how many symbols it contains, and form all of the partitions of the input into that number of parts – each cell in the partition is a possible product of the symbol in the corresponding place in the alternative
  3. Reject any partitions that can’t possibly be correct because they don’t have terminals in cells that require them, i.e., where the symbol in the right-hand side is a terminal, the corresponding cell of the partition must contain only that terminal
  4. For the partitions that remain, repeat the whole process for each cell of the partition, using the rules that can produce the symbol corresponding to that cell, and the cell’s content as the input
  5. The process terminates when there are no more rules to try, or we have successfully broken the input down to terminals

For example, given the grammar:

E -> E + T
E -> T
T -> T * a
T -> a

And the sentence:

a * a + a

We find all of the right hand side alternatives of rules applicable to the input -these are the two start rules first

E + T
T

For each right-hand side, we count how many symbols it contains, and form of all of the partitions of the input into that number of parts.
The first start rule alternative “E + T” contains 3 symbols, so we form all of the partitions of the input into 3 parts as follows:

# E + T
0 a * a + a
1 a * a + a
2 a * a + a
3 a * a + a
4 a * a + a
5 a * a + a

We reject any partitions that can’t possibly be correct because they don’t have terminals in cells that require them. So we reject all of the above except row #5, because it is the only one with the “+” in the right place.

For the partitions that remain, we repeat the process for each cell of the partition, using the rules that can produce the symbol corresponding to that cell (the column header). So, in the example above, we repeat the process with “a * a” and all of the rules that can produce E, which is just “E -> E + T”.

Eventually, we will get down to the terminals if the parse succeeds, or run out of rules to try. Note that:

  • A table (rule) succeeds if any one of its rows succeeds
  • A row succeeds only if all of its cells succeed
  • We can check terminals for validity first in order to prevent unnecessary recursion

Here is the code in C++. The class unger_parser is instantiated with a grammar (see Context-free grammar in C++), and its parse method takes an input string which it will attempt to recognise, and will print the rules matched if given a std::ostream.

#ifndef UNGER_HPP
#define UNGER_HPP

#include <vector>
#include <iterator>
#include <iostream>
#include <iterator>

#include <grammar.hpp>


namespace MB
{

namespace detail
{
// First partition of sequence into k parts
void first_sequence_partition(unsigned int *arr, unsigned int k)
{
    unsigned int p;
    for (p = 0; p < k - 1; p++) {
        arr[p] = p + 1;
    }
}

// Next partition of sequence into k parts
bool next_sequence_partition(unsigned int *arr, size_t len, unsigned int k)
{
    int p; // Separator number
    unsigned int s; // Its highest place
    bool changed = 0;
    for (p = k - 2, s = len - 1; p >= 0 && !changed; p--, s--) {
        if (arr[p] < s) {
            arr[p]++;
            changed = true;
            if (static_cast<unsigned int>(p) < k - 2) {
                p++;
                while (static_cast<unsigned int>(p) <= k - 2) {
                    arr[p] = arr[p - 1] + 1;
                    p++;
                }
            }
        }
    }
    if (!changed) {
        first_sequence_partition(arr, k);
    }
    return changed;
}

// Fill a vector with len elements starting at start
void fill(const std::vector<std::string>& input, unsigned int start, unsigned int len,
        std::vector<std::string>& output)
{
    unsigned int i, j;
    for (i = start, j = 0; j < len; ++i, ++j) {
        output.push_back(input[i]);
    }
}

} // namespace detail

class unger_parser
{
public:
    unger_parser(const MB::grammar& grammar)
        : grammar_(grammar), os_(nullptr)
    {
    }

    template <class InputIt>
    bool parse(InputIt begin, InputIt end)
    {
        return parse(begin, end, nullptr);
    }

    template <class InputIt>
    bool parse(InputIt begin, InputIt end, std::ostream& os)
    {
        os_ = &os;
        bool success = parse(begin, end, &os);
        os_ = nullptr;
        return success;
    }

private:
    template <class InputIt>
    bool parse(InputIt begin, InputIt end, std::ostream *os)
    {
        std::vector<std::string> input;
        std::copy(begin, end, std::back_inserter(input));
        std::vector<MB::rule::const_ptr> rules;
        grammar_.get_start_rules(std::back_inserter(rules));
        return valid_rules(input, rules);
    }

    // If symbol is a single terminal, it needs to match cell
    bool valid_cell_terminal(const std::vector<std::string>& cell,
            const std::string& symbol)
    {
        return cell.size() == 1 && cell[0] == symbol;
    }

    // If symbol is a non-terminal, find its rules and try to match one of them
    bool valid_cell_nonterminal(const std::vector<std::string>& cell,
            const std::string& symbol)
    {
        // Try all of the rules where symbol is the left hand side
        std::vector<MB::rule::const_ptr> rules;
        grammar_.get_rules_for_left(symbol, std::back_inserter(rules));
        return valid_rules(cell, rules);
    }

    // A cell is valid if it matches a terminal or a rule for a non-terminal succeeds
    bool valid_cell(const std::vector<std::string>& cell,
            const std::string& symbol)
    {
        bool terminal = grammar_.symbol_is_terminal(symbol);
        return (terminal && valid_cell_terminal(cell, symbol))
            || (!terminal && valid_cell_nonterminal(cell, symbol));
    }

    // All of the cells in a row need to succeed
    bool valid_row(const std::vector<std::string>& input, 
            const std::vector<std::string>& right, const unsigned int *arr, unsigned int k)
    {
        const size_t len = input.size();
        bool success = true;
        std::vector<std::vector<std::string> > cells;
        for (unsigned int p = 0; p < k && success; p++) {
            std::vector<std::string> cell;
            if (p == 0) {
                // First partition goes from the beginning to arr[0]
                detail::fill(input, 0, arr[0], cell);
            }
            else if (p == k - 1) {
                // Last partition goes from arr[k - 2] to the end
                detail::fill(input, arr[k - 2], len - arr[k - 2], cell);
            }
            else {
                // Partition goes from separator p - 1 to separator p
                detail::fill(input, arr[p - 1], arr[p] - arr[p - 1], cell);
            }
            cells.push_back(cell);
            std::string symbol = right[p];
            if (grammar_.symbol_is_terminal(symbol)) {
                success = valid_cell_terminal(cell, symbol);
            }
        }
        if (success) {
            for (unsigned int p = 0; p < k && success; p++) {
                std::string symbol = right[p];
                success = valid_cell(cells[p], symbol);
            }
        }
        return success;
    }

    // One of the rows in a table needs to succeed
    bool valid_table(const std::vector<std::string>& input,
            const std::vector<std::string>& right)
    {
        const unsigned int k = right.size(); // Number of parts
        const size_t len = input.size(); // Number of tokens
        bool success = false;
        if (k <= len) {
            if (k > 1) {
                std::vector<unsigned int> arr(k);
                detail::first_sequence_partition(arr.data(), k);
                do {
                    success = valid_row(input, right, arr.data(), k);
                } while (detail::next_sequence_partition(arr.data(), len, k) && !success);
            }
            else {
                success = valid_cell(input, right[0]);
            }
        }
        return success;
    }
    // One of the right hand side alternatives of one of the rules needs to succeed 
    bool valid_rules(const std::vector<std::string>& input,
            const std::vector<MB::rule::const_ptr>& rules)
    {
        for (MB::rule::const_ptr r : rules) {
            for (const std::vector<std::string>& alternative : r->right()) {
                if (valid_table(input, alternative)) {
                    if (os_) {
                        *os_ << "Succeeded in matching rule ";
                        *os_ << *r;
                        *os_ << " with input ";
                        std::copy(input.begin(), input.end(), 
                                std::ostream_iterator<std::string>(*os_, " "));
                        *os_ << '\n';
                    }
                    return true;
                }
            }
        }
        return false;
    }

private:
    const MB::grammar& grammar_;
    std::ostream* os_;
};

} // namespace MB

#endif // UNGER_HPP

Here is a program for the example above:

#include <fstream>
#include <iostream>
#include <string>

#include <grammar.hpp>
#include <unger.hpp>

int main()
{
    std::string filename = "unger.dat";
    std::ifstream ifs(filename);
    if (ifs) {
        MB::grammar grammar(ifs);
        std::string sentence[] = {"a", "*", "a", "+", "a"};
        const size_t len = sizeof(sentence) / sizeof(sentence[0]);
        bool success = MB::unger_parser(grammar).parse(sentence, sentence + len, std::cout);
        std::cout << "Success: " << std::boolalpha << success << '\n';
    }
    else {
        std::cerr << "Couldn't open " << filename << " for reading\n";
    }
}

Here is the output:

Succeeded in matching rule T -> a  with input a
Succeeded in matching rule T -> T * a  with input a * a
Succeeded in matching rule E -> T  with input a * a
Succeeded in matching rule T -> a  with input a
Succeeded in matching rule E -> E + T  with input a * a + a
Success: true

Reference:

Earley’s algorithm in C++

Introduction

Earley’s algorithm is a dynamic programming algorithm for generalised parsing, i.e., parsing with grammars that might be ambiguous. For example, the Earley algorithm could parse sentences generated by the following toy English grammar, which I am going to use in the explanation and code:

S -> NP VP
NP -> NP PP
NP -> Noun
VP -> Verb NP
VP -> VP PP
PP -> Prep NP
Noun -> John
Noun -> Mary
Noun -> Denver
Verb -> called
Prep -> from

As an example of the ambiguity of this grammar, consider the sentence “John called Mary from Denver”. Was John calling from Denver, or was Mary from Denver?

Data structures

The Earley algorithm uses three data structures, states, statelists, and the chart.

States

A state represents a state in the process of evaluating a grammar rule. In particular, it represents how far along the right hand side of the rule the parse has progressed. This is usually represented as a dot between symbols of the right hand side showing the position, hence a state is sometimes known as a “dotted rule”.

For example, the state below is for the grammar rule “PP -> Prep NP” from the grammar above:

(PP -> Prep @ NP, [3 , 4])

The position of the dot (represented by a “@”) indicates that the “Prep” has been processed and the “NP” should be next. The “3” and “4” are the positions within the input that the processed part of this state refers to (i.e., the span of the “Prep”).

Statelists

A statelist is an ordered list of states. The states in a statelist are unique, so it is sometimes called a state set.
Once a state has been added to a statelist, it is never removed or modified – instead new states are created in this or the next statelist based on the state by the procedures described below.

The chart

The chart is a set of statelists. There is one statelist for every point in the input, from before the first word to after the last. The statelists are generated one at a time as the parse progresses. There are only two statelists in action at any time, the current one being processed and the next one being generated.

The algorithm

The algorithm processes the chart from the first statelist to the last, constructing the statelists ahead of itself as it goes.
Rather than looping over the input, it loops over the statelists, the processing of which adds new statelists ahead of the iteration. The processing of states within the statelists consumes the input.

There are three procedures, the Predictor, the Scanner, and the Completer.

The Predictor

The Predictor processes states in which the dot is before a symbol that is the left-hand side of a rule whose right-hand side is a non-terminal. For example, in the state below,

(S -> @ NP VP, [0 , 0]) 

the dot is before “NP”, and the two rules having “NP” on the left are “NP -> NP PP” and “NP -> Noun”, which both have non-terminals on their right, so the Predictor would process this state.
The Predictor inserts into the current statelist a state for each rule where that symbol after the dot (“NP”) is on the left (“NP -> NP PP” and “NP -> Noun”), and places the dot before the first symbol of these new rules’ right-hand sides. So the state above gives rise to the following two states:

(NP -> @ NP PP, [0 , 0])
(NP -> @ Noun, [0 , 0])

The Scanner

The Scanner identifies states in the statelist where the dot is before a symbol that is the left-hand side of a rule whose right-hand side is a terminal (in other words, those states where the dot is to the left a symbol not processed by the Predictor). It reads the next word of input, and inserts a state into the next statelist containing a rule where the symbol that followed the dot is on the left, and the word read is the terminal on the right.
For example, consider this state in which the Predictor has predicted a “Prep”, whose right hand side is a terminal (“Prep -> from”):

(PP -> @ Prep NP, [3 , 3])

This causes the Scanner to read the next word of input, which, if it is “from”, results in the following state being added to the next statelist:

(Prep -> from @, [3 , 4])

The Completer

The Completer processes those states not processed by the Predictor or Scanner, in other words, those in which the dot is at the right-hand end, after every symbol in the rule. These states are now complete. It finds states that have the left-hand symbol of those completed states on their right-hand side, and inserts states that are identical, except that the dot is moved to the other side of that particular symbol, as the completion of the original state represents the completion of this symbol. So the Completer’s job is to drive the dots forward in rules that are being processed.
Continuing the example above, as soon as the Scanner has added the state for “Prep -> from”, that state has the dot on the extreme right, so the Completer adds a state marking the completion of the “Prep” part of the rule “PP -> Prep NP”:

(PP -> Prep @ NP, [3 , 4])

Summary

The Predictor and Completer insert states into the current statelist. The Scanner inserts states into the next statelist (because it advances the reading position). The Predictor and Completer are duals of each other and book-end the work of the Scanner. The Predictor moves down the grammar hierarchy to the bottom in order to introduce states with lower level rules. It feeds the Scanner by working its way down to the terminals and inserting states for their rules. The Completer moves up the grammar hierarchy to the top to advance the dots. It is propagating the work of the Scanner upwards, all the way to the start symbol. The parse is successful if the start state is in the last statelist, with the dot on the right of its right hand side.

The code

Below is the code in C++. The class earley_parser is instantiated with a grammar (see Context-free grammar in C++), and its parse() method attempts to recognise a sentence, and prints the statelists if it is passed a std::ostream. The Predictor, Scanner, and Completer are implemented as methods with those names.

#ifndef EARLEY_HPP
#define EARLEY_HPP

#include <string>
#include <vector>
#include <iostream>
#include <queue>
#include <iterator>
#include <algorithm>

#include <grammar.hpp>

namespace MB
{

namespace detail
{
// An element of a statelist, also known as a "dotted rule" or "Earley item"
struct state
{
    MB::rule::const_ptr rule_; // The grammar rule
    const unsigned int right_; // Index of right hand side alternative
    unsigned int dot_; // Position of dot within symbols on right hand side
    unsigned int i_, j_; // Positions within the input
    char added_by_; // Which function added this state
    state(MB::rule::const_ptr rule, unsigned int right, unsigned int i, unsigned int j)
        : rule_(rule), right_(right), dot_(0), i_(i), j_(j), added_by_(0)
    {
    }

    // Is dot all the way to the right?
    bool completed() const
    {
        return dot_ == rule_->right()[right_].size();
    }

    // Get symbol to the right of dot
    std::string next_symbol() const
    {
        return rule_->right()[right_][dot_];
    }
};

// Pretty-print state
std::ostream& operator <<(std::ostream& os, const state& st)
{
    unsigned int s;
    const std::vector<std::string>& right = st.rule_->right()[st.right_];
    size_t rlen = right.size();
    os << '(';
    os << st.rule_->left() << " -> ";
    for (s = 0; s < rlen; s++) {
        if (s == st.dot_) {
            os << "@ ";
        }
        os << right[s];
        if (s < rlen - 1) {
            os << ' ';
        }
    }
    if (s == st.dot_) {
        os << " @";
    }
    os << ", [" << st.i_ << " , " << st.j_ << "]) ";
    switch (st.added_by_) {
        case 'P':
            os << "predictor";
            break;
        case 'S':
            os << "scanner";
            break;
        case 'C':
            os << "completer";
            break;
        default:
            os << "start state";
    }
    return os;
}

// Needed to check for duplicate states
bool operator ==(const state& state1, const state& state2)
{
    return state1.rule_->left() == state2.rule_->left()
        && state1.rule_->right() == state2.rule_->right()
        && state1.right_ == state2.right_
        && state1.dot_ == state2.dot_
        && state1.i_ == state2.i_
        && state1.j_ == state2.j_;
} 

// A statelist is a vector of states
typedef std::vector<state> statelist;

// A chart is a vector of statelists
struct chart
{
    const MB::grammar& grammar_;
    std::vector<statelist> chart_;

    chart(const MB::grammar& grammar)
        : grammar_(grammar),
        // Chart begins with 1 statelist containing 1 dummy state used to track
        // successful parse
        chart_(1, statelist(1, state(MB::rule::create("$", grammar.get_start_left()),
                        0, 0, 0)))
    {
    }

    // Add state st to statelist s
    void add_state(const state& st, unsigned int s)
    {
        if (s < chart_.size()) {
            // Adding to the last statelist
            if (std::find(chart_[s].begin(), chart_[s].end(), st) == chart_[s].end()) {
                chart_[s].push_back(st);
            }
        }
        else {
            // Adding to a new statelist
            chart_.push_back(statelist(1, st));
        }
    }

    // Add predictions for the next symbol in this state
    void predictor(const state& st)
    {
        std::vector<MB::rule::const_ptr> rules;
        grammar_.get_rules_for_left(st.next_symbol(), std::back_inserter(rules));
        const unsigned int j = st.j_; // Save this as st can be invalidated by add_state()
        for (MB::rule::const_ptr r : rules) {
            for (unsigned int a = 0; a < r->right().size(); ++a) {
                state prediction = state(r, a, j, j);
                prediction.added_by_ = 'P';
                add_state(prediction, j);
            }
        }
    }

    // Scan input for next symbol 
    bool scanner(const state& st, const std::vector<std::string>& input)
    {
        const std::string& word = input[st.j_];
        if (grammar_.symbol_is_terminal(word)) {
            std::vector<MB::rule::const_ptr> rules;
            grammar_.get_rules_for_symbol(word, std::back_inserter(rules));
            state scanned = state(rules[0], 0, st.j_, st.j_ + 1);
            scanned.dot_ = st.dot_ + 1;
            scanned.added_by_ = 'S';
            add_state(scanned, st.j_ + 1);
            return true;
        }
        else {
            // Unrecognised terminal symbol
            return false;
        }
    }

    // Complete states
    void completer(const state& st)
    {
        const statelist& list = chart_[st.i_];
        const unsigned int i = st.i_;
        const unsigned int j = st.j_;
        std::queue<state> completed_queue;
        for (const state& candidate : list) {
            if (candidate.j_ == i
                    && !candidate.completed() 
                    && candidate.next_symbol() == st.rule_->left())
            {
                state completed = state(candidate.rule_, candidate.right_, candidate.i_, j);
                completed.dot_ = candidate.dot_ + 1;
                completed.added_by_ = 'C';
                // Queue the new states as list can be invalidated by add_state()
                completed_queue.push(completed);
            }
        }
        while (!completed_queue.empty()) {
            add_state(completed_queue.front(), j);
            completed_queue.pop();
        }
    }

    // We have succeeded if the completed dummy state is in the final statelist
    bool succeeded() const
    {
        const statelist& list = chart_[chart_.size() - 1];
        return std::find_if(list.begin(), list.end(),
                [](const state &st)->bool { 
                return st.rule_->left() == "$" && st.completed(); })
            != list.end();
    }

    // The main algorithm
    bool parse(const std::vector<std::string>& input, std::ostream *os)
    {
        for (unsigned int i = 0; i <= input.size(); ++i) {
            for (unsigned int s = 0; s < chart_[i].size(); ++s) {
                const state& st = chart_[i][s];
                if (!st.completed()
                        && !grammar_.symbol_is_left_of_terminal(st.next_symbol())) {
                    predictor(st);
                }
                else if (!st.completed()) {
                    if (i < input.size()) {
                        if (!scanner(st, input)) {
                            if (os) {
                                *os << "Unrecognised symbol: " << input[st.j_] << '\n';
                                return false;
                            }
                        }
                    }
                }
                else {
                    completer(st);
                }
            }
            if (os) {
                *os << *this;
                *os << '\n';
            }
        }
        return succeeded();
    }

    // Pretty-print chart
    friend std::ostream& operator <<(std::ostream& os, const chart &ch)
    {
        for (unsigned int i = 0; i < ch.chart_.size(); ++i) {
            os << "S" << i << ": ";
            os << '[';
            for (unsigned int j = 0; j < ch.chart_[i].size(); ++j) {
                os << ch.chart_[i][j];
                if (j < ch.chart_[i].size() - 1) {
                    os << ",\n";
                }
            } 
            os << ']';
            os << "\n\n";
        }
        return os;
    }
};

} // namespace detail

class earley_parser
{
public:
    earley_parser(const MB::grammar& grammar)
        : grammar_(grammar)
    {
    }
    template <class InputIt>
    bool parse(InputIt begin, InputIt end) const
    {
        std::vector<std::string> input;
        std::copy(begin, end, std::back_inserter(input));
        return detail::chart(grammar_).parse(input, nullptr);
    }
    template <class InputIt>
    bool parse(InputIt begin, InputIt end, std::ostream& os) const
    {
        std::vector<std::string> input;
        std::copy(begin, end, std::back_inserter(input));
        return detail::chart(grammar_).parse(input, &os);
    }
private:
    const MB::grammar& grammar_;
};

} // namespace MB

#endif // EARLEY_HPP

Here is an example program that attempts to recognise the ambiguous sentence “John called Mary from Denver” mentioned in the introduction:

#include <fstream>
#include <iostream>
#include <string>

#include <grammar.hpp>
#include <earley.hpp>

int main()
{
    std::string filename = "earley.dat";
    std::ifstream ifs(filename);
    if (ifs) {
        MB::grammar grammar(ifs);
        std::string sentence[] = {"John", "called", "Mary", "from", "Denver"};
        const size_t len = sizeof(sentence) / sizeof(sentence[0]);
        bool success = MB::earley_parser(grammar).parse(sentence, sentence + len, std::cout);
        std::cout << "Success: " << std::boolalpha << success << '\n';
    }
    else {
        std::cerr << "Couldn't open " << filename << " for reading\n";
    }
}

Here is the output. The entire chart is printed for each iteration. Notice how each iteration adds another statelist. The parsing process is a process of driving the dots forward. At the end of the parse, the presence of the completed start rule in the last statelist indicates success.

S0: [($ -> @ S, [0 , 0]) start state,
(S -> @ NP VP, [0 , 0]) predictor,
(NP -> @ NP PP, [0 , 0]) predictor,
(NP -> @ Noun, [0 , 0]) predictor]

S1: [(Noun -> John @, [0 , 1]) scanner]


S0: [($ -> @ S, [0 , 0]) start state,
(S -> @ NP VP, [0 , 0]) predictor,
(NP -> @ NP PP, [0 , 0]) predictor,
(NP -> @ Noun, [0 , 0]) predictor]

S1: [(Noun -> John @, [0 , 1]) scanner,
(NP -> Noun @, [0 , 1]) completer,
(S -> NP @ VP, [0 , 1]) completer,
(NP -> NP @ PP, [0 , 1]) completer,
(VP -> @ Verb NP, [1 , 1]) predictor,
(VP -> @ VP PP, [1 , 1]) predictor,
(PP -> @ Prep NP, [1 , 1]) predictor]

S2: [(Verb -> called @, [1 , 2]) scanner]


S0: [($ -> @ S, [0 , 0]) start state,
(S -> @ NP VP, [0 , 0]) predictor,
(NP -> @ NP PP, [0 , 0]) predictor,
(NP -> @ Noun, [0 , 0]) predictor]

S1: [(Noun -> John @, [0 , 1]) scanner,
(NP -> Noun @, [0 , 1]) completer,
(S -> NP @ VP, [0 , 1]) completer,
(NP -> NP @ PP, [0 , 1]) completer,
(VP -> @ Verb NP, [1 , 1]) predictor,
(VP -> @ VP PP, [1 , 1]) predictor,
(PP -> @ Prep NP, [1 , 1]) predictor]

S2: [(Verb -> called @, [1 , 2]) scanner,
(VP -> Verb @ NP, [1 , 2]) completer,
(NP -> @ NP PP, [2 , 2]) predictor,
(NP -> @ Noun, [2 , 2]) predictor]

S3: [(Noun -> Mary @, [2 , 3]) scanner]


S0: [($ -> @ S, [0 , 0]) start state,
(S -> @ NP VP, [0 , 0]) predictor,
(NP -> @ NP PP, [0 , 0]) predictor,
(NP -> @ Noun, [0 , 0]) predictor]

S1: [(Noun -> John @, [0 , 1]) scanner,
(NP -> Noun @, [0 , 1]) completer,
(S -> NP @ VP, [0 , 1]) completer,
(NP -> NP @ PP, [0 , 1]) completer,
(VP -> @ Verb NP, [1 , 1]) predictor,
(VP -> @ VP PP, [1 , 1]) predictor,
(PP -> @ Prep NP, [1 , 1]) predictor]

S2: [(Verb -> called @, [1 , 2]) scanner,
(VP -> Verb @ NP, [1 , 2]) completer,
(NP -> @ NP PP, [2 , 2]) predictor,
(NP -> @ Noun, [2 , 2]) predictor]

S3: [(Noun -> Mary @, [2 , 3]) scanner,
(NP -> Noun @, [2 , 3]) completer,
(VP -> Verb NP @, [1 , 3]) completer,
(NP -> NP @ PP, [2 , 3]) completer,
(S -> NP VP @, [0 , 3]) completer,
(VP -> VP @ PP, [1 , 3]) completer,
(PP -> @ Prep NP, [3 , 3]) predictor,
($ -> S @, [0 , 3]) completer]

S4: [(Prep -> from @, [3 , 4]) scanner]


S0: [($ -> @ S, [0 , 0]) start state,
(S -> @ NP VP, [0 , 0]) predictor,
(NP -> @ NP PP, [0 , 0]) predictor,
(NP -> @ Noun, [0 , 0]) predictor]

S1: [(Noun -> John @, [0 , 1]) scanner,
(NP -> Noun @, [0 , 1]) completer,
(S -> NP @ VP, [0 , 1]) completer,
(NP -> NP @ PP, [0 , 1]) completer,
(VP -> @ Verb NP, [1 , 1]) predictor,
(VP -> @ VP PP, [1 , 1]) predictor,
(PP -> @ Prep NP, [1 , 1]) predictor]

S2: [(Verb -> called @, [1 , 2]) scanner,
(VP -> Verb @ NP, [1 , 2]) completer,
(NP -> @ NP PP, [2 , 2]) predictor,
(NP -> @ Noun, [2 , 2]) predictor]

S3: [(Noun -> Mary @, [2 , 3]) scanner,
(NP -> Noun @, [2 , 3]) completer,
(VP -> Verb NP @, [1 , 3]) completer,
(NP -> NP @ PP, [2 , 3]) completer,
(S -> NP VP @, [0 , 3]) completer,
(VP -> VP @ PP, [1 , 3]) completer,
(PP -> @ Prep NP, [3 , 3]) predictor,
($ -> S @, [0 , 3]) completer]

S4: [(Prep -> from @, [3 , 4]) scanner,
(PP -> Prep @ NP, [3 , 4]) completer,
(NP -> @ NP PP, [4 , 4]) predictor,
(NP -> @ Noun, [4 , 4]) predictor]

S5: [(Noun -> Denver @, [4 , 5]) scanner]


S0: [($ -> @ S, [0 , 0]) start state,
(S -> @ NP VP, [0 , 0]) predictor,
(NP -> @ NP PP, [0 , 0]) predictor,
(NP -> @ Noun, [0 , 0]) predictor]

S1: [(Noun -> John @, [0 , 1]) scanner,
(NP -> Noun @, [0 , 1]) completer,
(S -> NP @ VP, [0 , 1]) completer,
(NP -> NP @ PP, [0 , 1]) completer,
(VP -> @ Verb NP, [1 , 1]) predictor,
(VP -> @ VP PP, [1 , 1]) predictor,
(PP -> @ Prep NP, [1 , 1]) predictor]

S2: [(Verb -> called @, [1 , 2]) scanner,
(VP -> Verb @ NP, [1 , 2]) completer,
(NP -> @ NP PP, [2 , 2]) predictor,
(NP -> @ Noun, [2 , 2]) predictor]

S3: [(Noun -> Mary @, [2 , 3]) scanner,
(NP -> Noun @, [2 , 3]) completer,
(VP -> Verb NP @, [1 , 3]) completer,
(NP -> NP @ PP, [2 , 3]) completer,
(S -> NP VP @, [0 , 3]) completer,
(VP -> VP @ PP, [1 , 3]) completer,
(PP -> @ Prep NP, [3 , 3]) predictor,
($ -> S @, [0 , 3]) completer]

S4: [(Prep -> from @, [3 , 4]) scanner,
(PP -> Prep @ NP, [3 , 4]) completer,
(NP -> @ NP PP, [4 , 4]) predictor,
(NP -> @ Noun, [4 , 4]) predictor]

S5: [(Noun -> Denver @, [4 , 5]) scanner,
(NP -> Noun @, [4 , 5]) completer,
(PP -> Prep NP @, [3 , 5]) completer,
(NP -> NP @ PP, [4 , 5]) completer,
(NP -> NP PP @, [2 , 5]) completer,
(VP -> VP PP @, [1 , 5]) completer,
(PP -> @ Prep NP, [5 , 5]) predictor,
(VP -> Verb NP @, [1 , 5]) completer,
(NP -> NP @ PP, [2 , 5]) completer,
(S -> NP VP @, [0 , 5]) completer,
(VP -> VP @ PP, [1 , 5]) completer,
($ -> S @, [0 , 5]) completer]


Success: true

Reference: Dynamic Programming and the Earley Parser

Binary search in Python

I needed to do binary search of a sorted list with a custom comparator, something that isn’t supported by the Python standard library, so I’ve written a suite of functions for the purpose. They’re modeled on the C++ standard library functions lower_bound(), upper_bound(), equal_range(), and binary_search().

lower_bound()

lower_bound() returns the index of the first element greater than or equal to the sought value, or the length of the list if it isn’t found:

def lower_bound(sequence, value, compare=cmp):
    """Find the index of the first element in sequence >= value"""
    elements = len(sequence)
    offset = 0
    middle = 0
    found = len(sequence)

    while elements > 0:
        middle = elements / 2
        if compare(value, sequence[offset + middle]) > 0:
            offset = offset + middle + 1
            elements = elements - (middle + 1)
        else:
            found = offset + middle
            elements = middle
    return found

upper_bound()

upper_bound() returns the index of the first element greater than the sought value, or the length of the list if there is no such value:

def upper_bound(sequence, value, compare=cmp):
    """Find the index of the first element in sequence > value"""
    elements = len(sequence)
    offset = 0
    middle = 0
    found = 0

    while elements > 0:
        middle = elements / 2
        if compare(value, sequence[offset + middle]) < 0:
            elements = middle
        else:
            offset = offset + middle + 1
            found = offset
            elements = elements - (middle + 1)
    return found

equal_range()

equal_range() returns a pair containing the results of lower_bound() and upper_bound(), so it defines the bounds of a slice whose elements are equal to the value:

def equal_range(sequence, value, compare=cmp):
    """Find the range of sequence equal to value"""
    return lower_bound(sequence, value, compare), upper_bound(sequence, value, compare)

binary_search()

binary_search() just returns a Boolean value indicating whether or not a value is present:

def binary_search(sequence, value, compare=cmp):
    """Find out if value is in sequence"""
    index = lower_bound(sequence, value, compare)
    return index < len(sequence) and compare(value, sequence[index]) == 0

Spanning forest of a graph in C

A graph showing a spanning forest

If a graph isn’t connected, there isn’t a spanning tree that covers all of the vertices. We can, however, construct a spanning forest, which is a set of spanning trees, one for each connected component in the graph. This is rather similar to finding connected components, except that in a spanning forest the components are represented by a set of edges, rather than a set of vertices. Any vertices in the graph that are entirely unconnected will not appear in the spanning forest.

To find the spanning forest, all we need to do is to use the depth-first search algorithm for finding a spanning tree repeatedly, starting at each unvisited vertex in turn. Once all vertices that appear in edges have been visited, the spanning forest is complete.

Below is an implementation in C. The function spanning_forest() takes a graph in edge list format, the number of edges (size), the number of vertices (order), and a callback function that is called with each spanning tree found. It reuses the spanning_tree_recursive() function from the spanning tree algorithm to find each spanning tree.

#include <stdlib.h>

typedef struct {
    unsigned int first;
    unsigned int second;
} edge;

typedef void (*treefn)(const unsigned int *, size_t, const edge *, size_t);

void spanning_tree_recursive(const edge *edges, unsigned int size, 
        unsigned int order, unsigned int *visited, unsigned int *tree,
        unsigned int vertex, int edge, unsigned int *len)
{
    unsigned int e;
    visited[vertex] = 1;
    if (edge != -1) {
        tree[(*len)++] = edge;
    }
    for (e = 0; e < size; e++) {
        if (edges[e].first == vertex || edges[e].second == vertex) {
            unsigned int neighbour = edges[e].first == vertex ?
                edges[e].second : edges[e].first;
            if (!visited[neighbour]) {
                spanning_tree_recursive(edges, size, order, visited, tree, 
                        neighbour, e, len);
            }
        }
    }
}

void spanning_forest(const edge *edges, unsigned int size, unsigned int order,
        treefn fun)
{
    unsigned int *visited = calloc(order, sizeof(unsigned int));
    unsigned int *tree = malloc((order - 1) * sizeof(unsigned int));
    unsigned int len, v;
    if (visited == NULL || tree == NULL) {
        free(visited);
        free(tree);
        return;
    }
    for (v = 0; v < order; v++) {
        if (!visited[v]) {
            len = 0;
            spanning_tree_recursive(edges, size, order, visited, tree, v, -1, &len);
            if (len > 0) {
                fun(tree, len, edges, size);
            }
        }
    }
    free(visited);
    free(tree);
}

Here is an example program that finds the spanning forest of the graph shown at the top.

#include <stdio.h>
#include <stdlib.h>

/* Connect two edges */
void edge_connect(edge *edges, unsigned int first, unsigned int second, 
        unsigned int *pos)
{
    edges[*pos].first = first;
    edges[*pos].second = second;
    (*pos)++;
}

void print(const unsigned int *tree, size_t tree_size, const edge *edges, size_t size)
{
    unsigned int e;
    for (e = 0; e < tree_size; e++) {
        printf("(%u, %u) ", edges[tree[e]].first, edges[tree[e]].second);
    }
    putchar('\n');
}

int main(void)
{
    const unsigned int order = 9; /* Vertices */
    const unsigned int size = 8; /* Edges */
    edge *edges;
    
    edges = malloc(size * sizeof(edge));
    if (edges == NULL) {
        return 1;
    }
 
    /* Square */
    edges[0].first = 0;
    edges[0].second = 1;
    edges[1].first = 1;
    edges[1].second = 2;
    edges[2].first = 2;
    edges[2].second = 3;
    edges[3].first = 3;
    edges[3].second = 0;
 
    /* Triangle */
    edges[4].first = 4;
    edges[4].second = 5;
    edges[5].first = 5;
    edges[5].second = 6;
    edges[6].first = 6;
    edges[6].second = 4;
 
    /* Line */
    edges[7].first = 7;
    edges[7].second = 8;

    spanning_forest(edges, size, order, print);

    free(edges);
    return 0;
}

The output:

(0, 1) (1, 2) (2, 3)
(4, 5) (5, 6)
(7, 8)

Spanning tree of a graph in C

A complete graph on 5 vertices showing a spanning tree

In a previous post I showed an algorithm to find all spanning trees in a graph. A simpler problem is just to find a single spanning tree. This can be solved using a depth-first search. We simply need to record, for each vertex we visit, the edge by which we reached it.

Below is an implementation in C. The function spanning_tree() takes a graph in edge list format, the number of edges (size), the number of vertices (order), and the address of a pointer to which to assign the spanning tree. The spanning tree is in the form of an array of edge indices. The function returns the number of edges in this array, which will be order – 1 if the graph is connected. If the graph is not connected, the function will return a spanning tree of the component containing vertex 0, and the returned size will be correspondingly smaller.

#include <stdlib.h>

typedef struct {
    unsigned int first;
    unsigned int second;
} edge;

void spanning_tree_recursive(const edge *edges, unsigned int size, 
        unsigned int order, unsigned int *visited, unsigned int *tree,
        unsigned int vertex, int edge, unsigned int *len)
{
    unsigned int e;
    visited[vertex] = 1;
    if (edge != -1) {
        tree[(*len)++] = edge;
    }
    for (e = 0; e < size; e++) {
        if (edges[e].first == vertex || edges[e].second == vertex) {
            unsigned int neighbour = edges[e].first == vertex ?
                edges[e].second : edges[e].first;
            if (!visited[neighbour]) {
                spanning_tree_recursive(edges, size, order, visited, tree, 
                        neighbour, e, len);
            }
        }
    }
}

unsigned int spanning_tree(const edge *edges, unsigned int size, unsigned int order,
        unsigned int **tree)
{
    unsigned int *visited = calloc(order, sizeof(unsigned int));
    *tree = malloc((order - 1) * sizeof(unsigned int));
    unsigned int len = 0;
    if (visited == NULL || *tree == NULL) {
        free(visited);
        free(*tree);
        *tree = NULL;
        return 0;
    }
    spanning_tree_recursive(edges, size, order, visited, *tree, 0, -1, &len);
    free(visited);
    return len;
}

Here is an example program that finds a spanning tree of the complete graph on 5 vertices:

/* Calculate the nth triangular number T(n) */
unsigned int triangular_number(unsigned int n)
{
    return (n * (n + 1)) / 2;
}

/* Construct a complete graph on v vertices */
unsigned int complete_graph(unsigned int v, edge **edges)
{
    unsigned int n = triangular_number(v - 1);
    unsigned int i, j, k;
    *edges = malloc(n * sizeof(edge));
    if (edges != NULL) {
        for (i = 0, k = 0; i < v - 1; i++) {
            for (j = i + 1; j < v; j++) {
                (*edges)[k].first = i;
                (*edges)[k].second = j;
                k++;
            }
        }
    }
    else {
        n = 0;
    }
    return n;
}

int main(void)
{
    edge *edges;
    const unsigned int order = 5; /* Vertices */
    const unsigned int size = complete_graph(5, &edges); /* Edges */
    unsigned int *tree;
    unsigned int tree_size = spanning_tree(edges, size, order, &tree);
    unsigned int e;
    for (e = 0; e < tree_size; e++) {
        printf("(%u, %u) ", edges[tree[e]].first, edges[tree[e]].second);
    }
    putchar('\n');
    free(edges);
    free(tree);
    return 0;
}

The output:

(0, 1) (1, 2) (2, 3) (3, 4)

Recursive integer partitions in C

Here is a recursive algorithm to generate integer partitions in antilexicographic order. The function partitions() takes an integer to partition, and a callback function to call for each partition found.

#include <stdlib.h>

unsigned int min(int a, int b)
{
    return a < b ? a : b;
}

typedef void (*partitionfn)(const unsigned int *, size_t);

static void partitions_recursive(unsigned int n, unsigned int maximum, unsigned int *partition,
        size_t length, partitionfn fun)
{
    unsigned int i;
    if (n == 0) {
        fun(partition, length);
    }
    for (i = min(maximum, n); i >= 1; i--) {
        partition[length] = i;
        partitions_recursive(n - i, i, partition, length + 1, fun);
    }
}

void partitions(unsigned int n, partitionfn fun)
{
    unsigned int *partition = malloc(n * sizeof(unsigned int));
    if (partition) {
        partitions_recursive(n, n, partition, 0, fun);
        free(partition);
    }
}

An example program:

#include <stdio.h>

void print(const unsigned int *partition, size_t length)
{
    unsigned int i;
    for (i = 0; i < length; i++) {
        printf("%u ", partition[i]);
    }
    putchar('\n');
}

int main(void)
{
    partitions(6, print);
    return 0;
}

The output:

6
5 1
4 2
4 1 1
3 3
3 2 1
3 1 1 1
2 2 2
2 2 1 1
2 1 1 1 1
1 1 1 1 1 1

Reference: Partition.java

Levenshtein distance in C

The Levenshtein distance is a measure of the similarity of two strings. It is defined as the minimum number of insertions, deletions, and substitutions necessary to transform the first string into the second.

For example, the Levenshtein distance between “CHALK” and “CHEESE” is 4, as follows:

  1. Substitute E for A
  2. Substitute E for L
  3. Substitute S for K
  4. Insert E

The Levenshtein distance can be calculated using a dynamic programming algorithm due to Wagner and Fischer. The algorithm fills a table with the distance between all of the prefixes of the two strings, the final entry being the distance between the two entire strings.

Below is an implementation in C. I have recorded in the matrix cells information about the edits and the previous cell from which each cell was derived to enable tracing back the sequence of edits – the edit script. The function levenshtein_distance() takes two strings, and the address of a pointer to which to assign an array containing the edit script. It returns the Levenshtein distance.

#include <string.h>
#include <stdlib.h>

typedef enum {
    INSERTION,
    DELETION,
    SUBSTITUTION,
    NONE
} edit_type;

struct edit {
    unsigned int score;
    edit_type type;
    char arg1;
    char arg2;
    unsigned int pos;
    struct edit *prev;
};
typedef struct edit edit;

static int min3(int a, int b, int c)
{
    if (a < b && a < c) {
        return a;
    }
    if (b < a && b < c) {
        return b;
    }
    return c;
}

static unsigned int levenshtein_matrix_calculate(edit **mat, const char *str1, size_t len1,
        const char *str2, size_t len2)
{
    unsigned int i, j;
    for (j = 1; j <= len2; j++) {
        for (i = 1; i <= len1; i++) {
            unsigned int substitution_cost;
            unsigned int del = 0, ins = 0, subst = 0;
            unsigned int best;
            if (str1[i - 1] == str2[j - 1]) {
                substitution_cost = 0;
            }
            else {
                substitution_cost = 1;
            }
            del = mat[i - 1][j].score + 1; /* deletion */
            ins = mat[i][j - 1].score + 1; /* insertion */
            subst = mat[i - 1][j - 1].score + substitution_cost; /* substitution */
            best = min3(del, ins, subst);
            mat[i][j].score = best;                  
            mat[i][j].arg1 = str1[i - 1];
            mat[i][j].arg2 = str2[j - 1];
            mat[i][j].pos = i - 1;
            if (best == del) {
                mat[i][j].type = DELETION;
                mat[i][j].prev = &mat[i - 1][j];
            }
            else if (best == ins) {
                mat[i][j].type = INSERTION;
                mat[i][j].prev = &mat[i][j - 1];
            }
            else {
                if (substitution_cost > 0) {
                    mat[i][j].type = SUBSTITUTION;
                }
                else {
                    mat[i][j].type = NONE;
                }
                mat[i][j].prev = &mat[i - 1][j - 1];
            }
        }
    }
    return mat[len1][len2].score;
}

static edit **levenshtein_matrix_create(const char *str1, size_t len1, const char *str2,
        size_t len2)
{
    unsigned int i, j;
    edit **mat = malloc((len1 + 1) * sizeof(edit *));
    if (mat == NULL) {
        return NULL;
    }
    for (i = 0; i <= len1; i++) {
        mat[i] = malloc((len2 + 1) * sizeof(edit));
        if (mat[i] == NULL) {
            for (j = 0; j < i; j++) {
                free(mat[j]);
            }
            free(mat);
            return NULL;
        }
    }
    for (i = 0; i <= len1; i++) {
        mat[i][0].score = i;
        mat[i][0].prev = NULL;
        mat[i][0].arg1 = 0;
        mat[i][0].arg2 = 0;
    }

    for (j = 0; j <= len2; j++) {
        mat[0][j].score = j;
        mat[0][j].prev = NULL;
        mat[0][j].arg1 = 0;
        mat[0][j].arg2 = 0;
    }
    return mat; 
}

unsigned int levenshtein_distance(const char *str1, const char *str2, edit **script)
{
    const size_t len1 = strlen(str1), len2 = strlen(str2);
    unsigned int i, distance;
    edit **mat, *head;

    /* If either string is empty, the distance is the other string's length */
    if (len1 == 0) {
        return len2;
    }
    if (len2 == 0) {
        return len1;
    }
    /* Initialise the matrix */
    mat = levenshtein_matrix_create(str1, len1, str2, len2);
    if (!mat) {
        *script = NULL;
        return 0;
    }
    /* Main algorithm */
    distance = levenshtein_matrix_calculate(mat, str1, len1, str2, len2);
    /* Read back the edit script */
    *script = malloc(distance * sizeof(edit));
    if (*script) {
        i = distance - 1;
        for (head = &mat[len1][len2];
                head->prev != NULL;
                head = head->prev) {
            if (head->type != NONE) {
                memcpy(*script + i, head, sizeof(edit));
                i--;
            }
        }
    }
    else {
        distance = 0;
    }
    /* Clean up */
    for (i = 0; i <= len1; i++) {
        free(mat[i]);
    }
    free(mat);

    return distance;

Here is an example program that finds the distance from CHALK to CHEESE and prints the edit script:

#include <stdlib.h>
#include <stdio.h>

void print(const edit *e)
{
    if (e->type == INSERTION) {
        printf("Insert %c", e->arg2);
    }
    else if (e->type == DELETION) {
        printf("Delete %c", e->arg1);
    }
    else {
        printf("Substitute %c for %c", e->arg2, e->arg1);
    }
    printf(" at %u\n", e->pos);
}

int main(void)
{
    edit *script;
    unsigned int distance;
    unsigned int i;
    const char str1[] = "CHALK";
    const char str2[] = "CHEESE";
    distance = levenshtein_distance(str1, str2, &script);
    printf("Distance is %d:\n", distance);
    for (i = 0; i < distance; i++) {
        print(&script[i]);
    }
    free(script);
    return 0;
}

The output:

Distance is 4:
Substitute E for A at 2
Substitute E for L at 3
Substitute S for K at 4
Insert E at 4