Tag Archives: Context-Free Grammar

Generating production trees from a grammar in C++

A production tree for the sentence 'the man fed the dog'

In a previous post, I showed how to generate sentences from a grammar. We can go further and generate production trees. A production tree is just like a parse tree – it is a recursive decomposition of the grammatical structure of the sentence like the picture above.

The main algorithm for generating production trees is just the same as for generating sentences – using a queue and breadth-first search – but instead of substituting right-hand side symbols in place of the first non-terminal in the sentential form, we append child nodes to the first non-terminal leaf node in the production tree. A depth-first search is used to find the first non-terminal leaf node. Once there are no non-terminal leaf nodes, the production tree is complete.

The complete tree can be printed in the dot language and converted to a graphviz graph, like the picture above, or the sentence can be retrieved from it by using an in-order traversal to retrieve the leaf nodes, which in a completed tree are all of the terminals.

Below is the code for the production tree, which is a Composite made of three classes, the abstract symbol class, and concrete subclasses nonterminal and terminal. The structure is pointer-based, so smart pointers are used throughout.

#ifndef TREE_HPP
#define TREE_HPP

#include <vector>
#include <string>
#include <memory>
#include <iostream>

namespace MB
{

// Abstract base class for grammar symbols
class symbol
{
public:
    typedef std::shared_ptr<symbol> ptr;
    typedef std::shared_ptr<const symbol> const_ptr;
    symbol(const symbol&) = delete;
    symbol& operator=(const symbol&) = delete;
    virtual ptr clone() const = 0;
    virtual ptr get_first_nonterminal_leaf() = 0;
    virtual void append_child(ptr child) = 0;
    virtual void get_leaves(std::vector<std::string>&) const = 0;
    const std::string& name() const
    {
        return name_;
    }
    virtual void print_dot(std::ostream &os, unsigned int id, unsigned int& maxid) const
    {
        os << "\tNode" << id << "[label=\"";
        os << name_;
        os << "\"];\n";
    }
protected:
    symbol(const std::string& name)
        : name_(name)
    {
    }
    std::string name_;
};

// A non-terminal, which can have children
class nonterminal : public symbol, public std::enable_shared_from_this<nonterminal>
{
public:
    static symbol::ptr create(const std::string& name)
    {
        return symbol::ptr(new nonterminal(name));
    }
    symbol::ptr clone() const override
    {
        symbol::ptr copy = create(name_);
        for (auto child : children_) {
            copy->append_child(child->clone());
        }
        return copy;
    }
    symbol::ptr get_first_nonterminal_leaf() override
    {
        if (children_.empty()) {
            // If we have no children, we are the first non-terminal leaf
            return shared_from_this();
        }
        else {
            // Search in children
            for (auto child : children_) {
                symbol::ptr p = child->get_first_nonterminal_leaf();
                if (p) {
                    return p;
                }
            }
        }
        // Not us or one of our children
        return nullptr;
    }
    void append_child(symbol::ptr child) override
    {
        children_.push_back(child);
    }
    void get_leaves(std::vector<std::string>& sentence) const override
    {
        for (auto child : children_) {
            child->get_leaves(sentence);
        }
    }
    void print_dot(std::ostream &os, unsigned int id, unsigned int& maxid) const override
    {
        symbol::print_dot(os, id, maxid);
        for (auto child : children_) {
            unsigned int child_id = ++maxid;
            child->print_dot(os, child_id, maxid);
            os << "\tNode" << id << " -> Node" << child_id << "[dir=none];\n";
        }
    }
private:
    nonterminal(const std::string& name)
        : symbol(name)
    {
    }
    std::vector<symbol::ptr> children_;
};

// A terminal, which is a leaf
class terminal : public symbol
{
public:
    terminal(const std::string& name)
        : symbol(name)
    {
    }
    static symbol::ptr create(const std::string& name)
    {
        return symbol::ptr(new terminal(name));
    }
    symbol::ptr clone() const override
    {
        return create(name_);
    }
    virtual symbol::ptr get_first_nonterminal_leaf() override
    {
        return nullptr;
    }
    virtual void append_child(symbol::ptr child) override
    {
    }
    void get_leaves(std::vector<std::string>& sentence) const override
    {
        sentence.push_back(name_);
    }
};

// A production tree
class tree
{
public:
    tree(symbol::ptr root)
        : root_(root)
    {
    }
    tree clone() const
    {
        symbol::ptr root = root_ ? root_->clone() : nullptr;
        return tree(root);
    }
    // Get the first non-terminal for substitution
    symbol::ptr get_first_nonterminal_leaf() const
    {
        if (root_) {
            return root_->get_first_nonterminal_leaf();
        }
        return nullptr;
    }
    // Get the sentence
    std::vector<std::string> get_leaves() const
    {
        std::vector<std::string> sentence;
        if (root_) {
            root_->get_leaves(sentence);
        }
        return sentence;
    }
    // Print in dot format
    void print_dot(std::ostream& os) const
    {
        os << "digraph G {\n";
        os << "\tnode[shape=plaintext];\n";
        unsigned int id = 0;
        unsigned int maxid = 0;
        if (root_) {
            root_->print_dot(os, id, maxid);
        }
        os << "}\n";
    }
private:
    symbol::ptr root_;
};

} // namespace MB

#endif // TREE_HPP

The production tree generator is very similar to the sentence generator described previously, except production trees are copied and modified rather than sentential forms.

#ifndef TREE_GENERATOR_HPP
#define TREE_GENERATOR_HPP

#include <vector>
#include <list>
#include <queue>
#include <algorithm>

#include <grammar.hpp>
#include <tree.hpp>

namespace MB
{

class tree_generator
{
public:
    tree_generator(const grammar& grammar)
        : grammar_(grammar)
    {
        // Store the left-hand sides
        std::transform(grammar_.rules().begin(), grammar_.rules().end(),
                std::back_inserter(lefts_),
                [](rule::const_ptr r)->const std::string& { return r->left(); }); 
    }
    template <class InputIt>
    InputIt generate(InputIt init, unsigned int n) const
    {
        std::queue<tree> queue;
        unsigned int sentences = 0;
        // Add the start symbol to the queue
        queue.push(tree(nonterminal::create(grammar_.get_start_left())));
        while (sentences < n && queue.size()) {
            tree sf = queue.front();
            queue.pop();
            // Find the first non-terminal
            symbol::ptr nt = sf.get_first_nonterminal_leaf();
            if (nt) {
                // Append each right-hand side alternative
                std::vector<rule::const_ptr> rules;
                grammar_.get_rules_for_left(nt->name(), std::back_inserter(rules));
                for (rule::const_ptr rule : rules) {
                    for (const std::vector<std::string>& alternative : rule->right()) {
                        tree new_sf = sf.clone();
                        nt = new_sf.get_first_nonterminal_leaf(); 
                        for (auto symbol : alternative) {
                            if (std::find(lefts_.begin(), lefts_.end(), symbol) != lefts_.end()) {
                                nt->append_child(nonterminal::create(symbol));
                            } 
                            else {
                                nt->append_child(terminal::create(symbol));
                            }
                        }
                        queue.push(new_sf);
                    }
                }
            }
            else {
                // Finished sentence
                *init++ = sf;
                ++sentences;
            }
        }
        return init;
    }
private:
    const grammar& grammar_;
    std::vector<std::string> lefts_;
};

} // namespace MB

#endif // TREE_GENERATOR_HPP

Here is an example program that generates the first 50 production trees and prints them in dot format after their associated sentence:

#include <fstream>
#include <iostream>
#include <vector>
#include <algorithm>

#include <grammar.hpp>
#include <tree_generator.hpp>

int main()
{
    std::string filename = "generator.dat";
    std::ifstream ifs(filename);
    if (ifs) {
        MB::grammar grammar(ifs);
        MB::tree_generator gen(grammar);
        std::vector<MB::tree > trees;
        gen.generate(std::back_inserter(trees), 50);
        for (const auto& t : trees) {
            std::vector<std::string> sentence = t.get_leaves();
            std::copy(sentence.begin(), sentence.end(), 
                    std::ostream_iterator<std::string>(std::cout, " "));
            std::cout << '\n';
            t.print_dot(std::cout);
        }
    }
    else {
        std::cerr << "Couldn't open " << filename << " for reading\n";
    }
}

Here is an example sentence and tree from the output

a dog barked
digraph G {
    node[shape=plaintext];
    Node0[label="S"];
    Node1[label="NP"];
    Node2[label="DET"];
    Node3[label="a"];
    Node2 -> Node3[dir=none];
    Node1 -> Node2[dir=none];
    Node4[label="N"];
    Node5[label="dog"];
    Node4 -> Node5[dir=none];
    Node1 -> Node4[dir=none];
    Node0 -> Node1[dir=none];
    Node6[label="VP"];
    Node7[label="V"];
    Node8[label="barked"];
    Node7 -> Node8[dir=none];
    Node6 -> Node7[dir=none];
    Node0 -> Node6[dir=none];
}

If you have graphviz installed, the dot language part can be converted to a png image by pasting it into a file (say “dog.dot”) and running the command:

dot -Tpng -O dog.dot 

This will generate a file called “dog.dot.png” containing the picture of the production tree like this:
Production tree for the phrase 'a dog barked'

Generating sentences from a grammar in C++

Although phrase structure grammars are used in the development of parsers, they really describe how to generate sentences in a language. If a grammar is context-free, there is a simple algorithm to generate sentences from it. The algorithm uses a queue to perform a breadth-first search as follows:

  1. Put the start symbol in the queue
  2. While the queue is not empty and more sentences are wanted:
    1. Pop the first sentential form from the queue
    2. Find the first non-terminal in it
    3. For every right-hand side of that non-terminal
      1. Make a copy of the sentential form
      2. Substitute the right-hand side for the non-terminal
      3. Put this new sentential form into the queue
    4. If there are no non-terminals left in the sentential form, it is a complete sentence, so return it

Note that the queue, and consequent breadth-first search, are required because if the rule’s right-hand side is left-recursive, the algorithm would go into an infinite recursion without producing anything if it performed a depth-first search.

Here is the code in C++, using the class I described earlier in Context-free grammar in C++. The generator class is instantiated with a grammar, and its generate() method will write n sentences to the supplied iterator, if n sentences can be generated.

#ifndef GENERATOR_HPP
#define GENERATOR_HPP

#include <vector>
#include <list>
#include <queue>
#include <algorithm>

#include <grammar.hpp>

namespace MB
{

class generator
{
public:
    generator(const grammar& grammar)
        : grammar_(grammar)
    {
        // Store the left-hand sides
        std::transform(grammar_.rules().begin(), grammar_.rules().end(),
                std::back_inserter(lefts_),
                [](rule::const_ptr r)->const std::string& { return r->left(); }); 
    }
    template <class InputIt>
    InputIt generate(InputIt init, unsigned int n) const
    {
        std::queue<sentential_form> queue;
        unsigned int sentences = 0;
        // Add the start symbol to the queue
        queue.push(sentential_form(1, grammar_.get_start_left()));
        while (sentences < n && queue.size()) {
            sentential_form sf = queue.front();
            queue.pop();
            // Find the first non-terminal
            sentential_form::iterator it = std::find_first_of(sf.begin(), sf.end(), 
                    lefts_.begin(), lefts_.end());
            if (it != sf.end()) {
                // Replace with each right-hand side alternative
                std::string left = *it;
                std::vector<rule::const_ptr> rules;
                grammar_.get_rules_for_left(left, std::back_inserter(rules));
                for (rule::const_ptr rule : rules) {
                    for (const std::vector<std::string>& alternative : rule->right()) {
                        sentential_form new_sf(sf);
                        it = std::find(new_sf.begin(), new_sf.end(), left);
                        new_sf.insert(it, alternative.begin(), alternative.end());
                        new_sf.erase(it);
                        queue.push(new_sf);
                    }
                }
            }
            else {
                // Finished sentence
                *init++ = std::vector<std::string>(sf.begin(), sf.end());
                ++sentences;
            }
        }
        return init;
    }
private:
    typedef std::list<std::string> sentential_form;
    const grammar& grammar_;
    std::vector<std::string> lefts_;
};

} // namespace MB

#endif // GENERATOR_HPP

Here is an example grammar:

S -> NP VP
NP -> NP PP
NP -> DET N
VP -> V NP
VP -> V
PP -> P NP
DET -> a | the
N -> man | dog | house | telescope | spoon
V -> saw | fed | barked
P -> with

And an example program that generates the first 50 sentences:

#include <fstream>
#include <iostream>
#include <vector>
#include <algorithm>

#include <grammar.hpp>
#include <generator.hpp>

int main()
{
    std::string filename = "generator.dat";
    std::ifstream ifs(filename);
    if (ifs) {
        MB::grammar grammar(ifs);
        MB::generator gen(grammar);
        std::vector<std::vector<std::string> > sentences;
        gen.generate(std::back_inserter(sentences), 50);
        for (auto sentence : sentences) {
            std::copy(sentence.begin(), sentence.end(),
                    std::ostream_iterator<const std::string&>(std::cout, " "));
            std::cout << '\n';
        }
    }
    else {
        std::cerr << "Couldn't open " << filename << " for reading\n";
    }
}

This is the somewhat surreal output:

a man saw
a man fed
a man barked
a dog saw
a dog fed
a dog barked
a house saw
a house fed
a house barked
a telescope saw
a telescope fed
a telescope barked
a spoon saw
a spoon fed
a spoon barked
the man saw
the man fed
the man barked
the dog saw
the dog fed
the dog barked
the house saw
the house fed
the house barked
the telescope saw
the telescope fed
the telescope barked
the spoon saw
the spoon fed
the spoon barked
a man saw a man
a man saw a dog
a man saw a house
a man saw a telescope
a man saw a spoon
a man saw the man
a man saw the dog
a man saw the house
a man saw the telescope
a man saw the spoon
a man fed a man
a man fed a dog
a man fed a house
a man fed a telescope
a man fed a spoon
a man fed the man
a man fed the dog
a man fed the house
a man fed the telescope
a man fed the spoon

Context-free grammar in C++

This is the class I used for managing the grammar for the Unger, CYK, and Earley parsers.

It will read a grammar from a file, and exposes the look-up methods needed by the various parsers, thus freeing them from needing to implement them themselves and allowing them to be used by different parsers.

A typical grammar file looks like this:

S -> A B | B C
A -> B A | a
B -> C C | b
C -> A B | a

Symbols can be any length, not just single letters. Right-hand sides can contain any number of alternatives. Rather than using pipe separators, multiple rules can have the same left-hand side (subject). The subject of the first rule is considered to be the start symbol. Terminals are identified by virtue of not appearing on the left hand side of any rule.

The look-up methods are:

get_rules_for_left()
Get all of the rules where symbol is the subject
symbol_is_terminal()
Is this symbol a terminal (doesn’t occur as the subject of a rule)?
get_start_rules()
Get the start rule(s)
get_rules_for_symbols()
Get the rules where these pairs of symbols occur together in a right hand side alternative
get_rules_for_symbol()
Get the rules where this symbol occurs in a right hand side alternative
get_start_left()
Get the subject of the start symbol
symbol_is_left_of_terminal()
Is this symbol the left (subject) of a terminal

Notice how there is a look-up method for looking up rules given the left-hand side (get_rules_for_left()), and a couple of methods for looking up rules given the right-hand side (get_rules_for_symbols(), get_rules_for_symbol()). Which of these methods a parser uses is very telling about its manner of operation. The Unger parser, which is a top-down parser, looks up rules by their left-hand sides. The CYK parser, which is a bottom-up parser, looks up rules by their right-hand sides. The Earley parser, which is variously described as top-down, bottom-up, both, or neither, looks up rules both by their left- and right-hand sides.

Below is the code in C++. A grammar object is instantiated from a std::ifstream. Rules are implemented using smart pointers to prevent unnecessary copies.

#ifndef GRAMMAR_HPP
#define GRAMMAR_HPP

#include <string>
#include <vector>
#include <fstream>
#include <iterator>
#include <list>
#include <sstream>
#include <set>
#include <algorithm>
#include <utility>
#include <memory>

namespace MB
{

namespace detail
{

// Form the cartesian product of two sequences
template <class InputIt1, class InputIt2, class OutputIt>
OutputIt cartesian_product(InputIt1 begin1, InputIt1 end1, InputIt2 begin2, InputIt2 end2, OutputIt it)
{
    for (InputIt1 it1 = begin1; it1 != end1; ++it1) {
        for (InputIt2 it2 = begin2; it2 != end2; ++it2) {
            *it++ = std::make_pair(*it1, *it2);
        }
    }
    return it;
}

// Find out if container contains all of range of items
template <class AssociativeContainer, class InputIt>
bool find_all(const AssociativeContainer& container, InputIt begin, InputIt end)
{
    bool all = true;
    for (InputIt it = begin; it != end && all; ++it) {
        all = container.find(*it) != container.end();
    }
    return all;
}

} // namespace detail

class rule
{
public:
    typedef std::shared_ptr<rule> ptr;
    typedef std::shared_ptr<const rule> const_ptr;
    rule(const rule&) = delete;
    rule& operator=(const rule&) = delete;
    operator bool()
    {
        return succeeded_;
    }
    size_t size(unsigned int right) const
    {
        return right_[right].size();
    }
private:
    std::string left_;
    std::vector<std::vector<std::string> > right_;
    bool succeeded_;
    rule(const std::string& left, const std::string& right)
        : left_(left)
    {
        right_.push_back(std::vector<std::string>(1, right));
    }
    rule(const std::string &str)
    {
        unsigned int i = 0;
        while (i < str.size() && std::isspace(str[i])) {
            ++i;
        }
        // Get left-hand side
        while (i < str.size() && std::isalnum(str[i])) {
            left_.push_back(str[i]);
            ++i;
        }
        if (!(succeeded_ = !left_.empty())) {
            return;
        }
        while (i < str.size() && std::isspace(str[i])) {
            ++i;
        }
        // Get arrow
        if (!(succeeded_ = str.substr(i, 2) == "->")) {
            return;
        }
        i += 2;
        while (i < str.size() && std::isspace(str[i])) {
            ++i;
        }
        // Get right-hand side
        while (i < str.size()) {
            bool added = false;
            while (i < str.size() && str[i] != '|') {
                std::string symbol;
                while (i < str.size() && !std::isspace(str[i]) && str[i] != '|') {
                    symbol.push_back(str[i]);
                    ++i;
                }
                if (!symbol.empty()) {
                    if (!added) {
                        right_.push_back(std::vector<std::string>());
                        added = true;
                    }
                    right_.back().push_back(symbol);
                }
                while (i < str.size() && std::isspace(str[i])) {
                    ++i;
                }
            }
            if (i < str.size()) {
                ++i;
            }
        }  
    }
public:
    static ptr create(const std::string& left, const std::string& right)
    {
        return ptr(new rule(left, right));
    }
    static ptr create(const std::string& line)
    {
        return ptr(new rule(line));
    }
    const std::string& left() const
    {
        return left_;
    }
    const std::vector<std::vector<std::string> >& right() const
    {
        return right_;
    }
    friend std::ostream& operator <<(std::ostream& os, const rule& r)
    {
        os << r.left_ << " -> ";
        unsigned int i = 0;
        for (const std::vector<std::string>& alternative : r.right_) {
            for (const std::string& symbol: alternative) {
                os << symbol << " ";
            }
            if (i++ < r.right_.size() - 1) {
                os << "| ";
            }
        }
        return os;
    }
};

class grammar
{
public:
    // Create from a stream
    grammar(std::istream& is)
    {
        std::string line;
        while (std::getline(is, line)) {
            line.erase(line.find_last_not_of("\r\n") + 1);
            if (line.size()) {
                rule::ptr r = rule::create(line);
                if (*r) {
                    rules_.push_back(r);
                }
            }
        }
        // Get the terminals
        std::set<std::string> nonterminals;
        std::set<std::string> symbols;
        for (rule::const_ptr r : rules_) {
            nonterminals.insert(r->left());
            for (const std::vector<std::string>& alternative : r->right()) {
                for (const std::string& symbol : alternative) {
                    symbols.insert(symbol);
                }
            }
        }
        for (const std::string& symbol : symbols) {
            if (nonterminals.find(symbol) == nonterminals.end()) {
                terminals_.push_back(symbol);
            }
        }
    }

    const std::vector<rule::ptr>& rules() const
    {
        return rules_;
    }

    // Get all of the rules where symbol is the subject
    template <class OutputIt>
    OutputIt get_rules_for_left(const std::string& symbol, OutputIt it) const
    {
        for (rule::const_ptr r : rules_) {
            if (r->left() == symbol) {
                *it++ = r;
            }
        }
        return it;
    }

    // Is this symbol a terminal (doesn't occur as the subject of a rule)?
    bool symbol_is_terminal(const std::string& symbol) const
    {
        return std::find(terminals_.begin(), terminals_.end(), symbol) != terminals_.end();
    }

    // Get the rule(s) whose left-hand side is the start symbol
    template <class OutputIt>
    OutputIt get_start_rules(OutputIt it) const
    {
        std::string start_symbol;
        bool started = false;
        for (rule::const_ptr r : rules_) {
            if (!started || r->left() == start_symbol) {
                *it++ = r;
            }
            if (!started) {
                started = true;
                start_symbol = r->left();
            }
        }
        return it;
    }

    // Get the rules where any of these pairs of symbols occur together as a right hand side alternative
    template <class InputIt, class OutputIt>
    OutputIt get_rules_for_symbols(InputIt begin, InputIt end, OutputIt it) const
    {
        for (InputIt init = begin; init != end; ++init) {
            for (rule::const_ptr r : rules_) {
                for (const std::vector<std::string>& alternative : r->right()) {
                    if (alternative.size() == 2
                            && alternative[0] == init->first
                            && alternative[1] == init->second) {
                        *it++ = r;
                    }
                }
            }
        }
        return it;
    }

    // Get the rules where this symbol occurs in a right hand side alternative
    template <class OutputIt>
    OutputIt get_rules_for_symbol(const std::string& symbol, OutputIt it) const
    {
        for (rule::const_ptr r : rules_) {
            for (const std::vector<std::string>& alternative : r->right()) {
                if (std::find(alternative.begin(), alternative.end(), symbol) != alternative.end()) {
                    *it++ = r;
                }
            }
        }
        return it;
    }

    // Get the subject of the start symbol
    const std::string& get_start_left() const
    {
        return rules_.front()->left();
    }

    // Is this symbol the left (subject) of a terminal?
    bool symbol_is_left_of_terminal(const std::string& symbol) const
    {
        for (rule::const_ptr r : rules_) {
            if (r->left() == symbol) {
                for (const std::vector<std::string>& alternative : r->right()) {
                    if (alternative.size() == 1 && symbol_is_terminal(alternative[0])) {
                        return true;
                    }
                }
            }
        }
        return false;
    }

    // Pretty-print
    friend std::ostream& operator <<(std::ostream& os, const grammar& gram)
    {
        for (rule::const_ptr r : gram.rules_) {
            os << *r;
            os << '\n';
        }
        return os;
    }

private:
    std::vector<rule::ptr> rules_;
    std::vector<rule::ptr> start_rules_;
    std::vector<std::string> terminals_;
};

} // namespace MB

#endif // GRAMMAR_HPP