Generating production trees from a grammar in C++

A production tree for the sentence 'the man fed the dog'

In a previous post, I showed how to generate sentences from a grammar. We can go further and generate production trees. A production tree is just like a parse tree – it is a recursive decomposition of the grammatical structure of the sentence like the picture above.

The main algorithm for generating production trees is just the same as for generating sentences – using a queue and breadth-first search – but instead of substituting right-hand side symbols in place of the first non-terminal in the sentential form, we append child nodes to the first non-terminal leaf node in the production tree. A depth-first search is used to find the first non-terminal leaf node. Once there are no non-terminal leaf nodes, the production tree is complete.

The complete tree can be printed in the dot language and converted to a graphviz graph, like the picture above, or the sentence can be retrieved from it by using an in-order traversal to retrieve the leaf nodes, which in a completed tree are all of the terminals.

Below is the code for the production tree, which is a Composite made of three classes, the abstract symbol class, and concrete subclasses nonterminal and terminal. The structure is pointer-based, so smart pointers are used throughout.

#ifndef TREE_HPP
#define TREE_HPP

#include <vector>
#include <string>
#include <memory>
#include <iostream>

namespace MB
{

// Abstract base class for grammar symbols
class symbol
{
public:
    typedef std::shared_ptr<symbol> ptr;
    typedef std::shared_ptr<const symbol> const_ptr;
    symbol(const symbol&) = delete;
    symbol& operator=(const symbol&) = delete;
    virtual ptr clone() const = 0;
    virtual ptr get_first_nonterminal_leaf() = 0;
    virtual void append_child(ptr child) = 0;
    virtual void get_leaves(std::vector<std::string>&) const = 0;
    const std::string& name() const
    {
        return name_;
    }
    virtual void print_dot(std::ostream &os, unsigned int id, unsigned int& maxid) const
    {
        os << "\tNode" << id << "[label=\"";
        os << name_;
        os << "\"];\n";
    }
protected:
    symbol(const std::string& name)
        : name_(name)
    {
    }
    std::string name_;
};

// A non-terminal, which can have children
class nonterminal : public symbol, public std::enable_shared_from_this<nonterminal>
{
public:
    static symbol::ptr create(const std::string& name)
    {
        return symbol::ptr(new nonterminal(name));
    }
    symbol::ptr clone() const override
    {
        symbol::ptr copy = create(name_);
        for (auto child : children_) {
            copy->append_child(child->clone());
        }
        return copy;
    }
    symbol::ptr get_first_nonterminal_leaf() override
    {
        if (children_.empty()) {
            // If we have no children, we are the first non-terminal leaf
            return shared_from_this();
        }
        else {
            // Search in children
            for (auto child : children_) {
                symbol::ptr p = child->get_first_nonterminal_leaf();
                if (p) {
                    return p;
                }
            }
        }
        // Not us or one of our children
        return nullptr;
    }
    void append_child(symbol::ptr child) override
    {
        children_.push_back(child);
    }
    void get_leaves(std::vector<std::string>& sentence) const override
    {
        for (auto child : children_) {
            child->get_leaves(sentence);
        }
    }
    void print_dot(std::ostream &os, unsigned int id, unsigned int& maxid) const override
    {
        symbol::print_dot(os, id, maxid);
        for (auto child : children_) {
            unsigned int child_id = ++maxid;
            child->print_dot(os, child_id, maxid);
            os << "\tNode" << id << " -> Node" << child_id << "[dir=none];\n";
        }
    }
private:
    nonterminal(const std::string& name)
        : symbol(name)
    {
    }
    std::vector<symbol::ptr> children_;
};

// A terminal, which is a leaf
class terminal : public symbol
{
public:
    terminal(const std::string& name)
        : symbol(name)
    {
    }
    static symbol::ptr create(const std::string& name)
    {
        return symbol::ptr(new terminal(name));
    }
    symbol::ptr clone() const override
    {
        return create(name_);
    }
    virtual symbol::ptr get_first_nonterminal_leaf() override
    {
        return nullptr;
    }
    virtual void append_child(symbol::ptr child) override
    {
    }
    void get_leaves(std::vector<std::string>& sentence) const override
    {
        sentence.push_back(name_);
    }
};

// A production tree
class tree
{
public:
    tree(symbol::ptr root)
        : root_(root)
    {
    }
    tree clone() const
    {
        symbol::ptr root = root_ ? root_->clone() : nullptr;
        return tree(root);
    }
    // Get the first non-terminal for substitution
    symbol::ptr get_first_nonterminal_leaf() const
    {
        if (root_) {
            return root_->get_first_nonterminal_leaf();
        }
        return nullptr;
    }
    // Get the sentence
    std::vector<std::string> get_leaves() const
    {
        std::vector<std::string> sentence;
        if (root_) {
            root_->get_leaves(sentence);
        }
        return sentence;
    }
    // Print in dot format
    void print_dot(std::ostream& os) const
    {
        os << "digraph G {\n";
        os << "\tnode[shape=plaintext];\n";
        unsigned int id = 0;
        unsigned int maxid = 0;
        if (root_) {
            root_->print_dot(os, id, maxid);
        }
        os << "}\n";
    }
private:
    symbol::ptr root_;
};

} // namespace MB

#endif // TREE_HPP

The production tree generator is very similar to the sentence generator described previously, except production trees are copied and modified rather than sentential forms.

#ifndef TREE_GENERATOR_HPP
#define TREE_GENERATOR_HPP

#include <vector>
#include <list>
#include <queue>
#include <algorithm>

#include <grammar.hpp>
#include <tree.hpp>

namespace MB
{

class tree_generator
{
public:
    tree_generator(const grammar& grammar)
        : grammar_(grammar)
    {
        // Store the left-hand sides
        std::transform(grammar_.rules().begin(), grammar_.rules().end(),
                std::back_inserter(lefts_),
                [](rule::const_ptr r)->const std::string& { return r->left(); }); 
    }
    template <class InputIt>
    InputIt generate(InputIt init, unsigned int n) const
    {
        std::queue<tree> queue;
        unsigned int sentences = 0;
        // Add the start symbol to the queue
        queue.push(tree(nonterminal::create(grammar_.get_start_left())));
        while (sentences < n && queue.size()) {
            tree sf = queue.front();
            queue.pop();
            // Find the first non-terminal
            symbol::ptr nt = sf.get_first_nonterminal_leaf();
            if (nt) {
                // Append each right-hand side alternative
                std::vector<rule::const_ptr> rules;
                grammar_.get_rules_for_left(nt->name(), std::back_inserter(rules));
                for (rule::const_ptr rule : rules) {
                    for (const std::vector<std::string>& alternative : rule->right()) {
                        tree new_sf = sf.clone();
                        nt = new_sf.get_first_nonterminal_leaf(); 
                        for (auto symbol : alternative) {
                            if (std::find(lefts_.begin(), lefts_.end(), symbol) != lefts_.end()) {
                                nt->append_child(nonterminal::create(symbol));
                            } 
                            else {
                                nt->append_child(terminal::create(symbol));
                            }
                        }
                        queue.push(new_sf);
                    }
                }
            }
            else {
                // Finished sentence
                *init++ = sf;
                ++sentences;
            }
        }
        return init;
    }
private:
    const grammar& grammar_;
    std::vector<std::string> lefts_;
};

} // namespace MB

#endif // TREE_GENERATOR_HPP

Here is an example program that generates the first 50 production trees and prints them in dot format after their associated sentence:

#include <fstream>
#include <iostream>
#include <vector>
#include <algorithm>

#include <grammar.hpp>
#include <tree_generator.hpp>

int main()
{
    std::string filename = "generator.dat";
    std::ifstream ifs(filename);
    if (ifs) {
        MB::grammar grammar(ifs);
        MB::tree_generator gen(grammar);
        std::vector<MB::tree > trees;
        gen.generate(std::back_inserter(trees), 50);
        for (const auto& t : trees) {
            std::vector<std::string> sentence = t.get_leaves();
            std::copy(sentence.begin(), sentence.end(), 
                    std::ostream_iterator<std::string>(std::cout, " "));
            std::cout << '\n';
            t.print_dot(std::cout);
        }
    }
    else {
        std::cerr << "Couldn't open " << filename << " for reading\n";
    }
}

Here is an example sentence and tree from the output

a dog barked
digraph G {
    node[shape=plaintext];
    Node0[label="S"];
    Node1[label="NP"];
    Node2[label="DET"];
    Node3[label="a"];
    Node2 -> Node3[dir=none];
    Node1 -> Node2[dir=none];
    Node4[label="N"];
    Node5[label="dog"];
    Node4 -> Node5[dir=none];
    Node1 -> Node4[dir=none];
    Node0 -> Node1[dir=none];
    Node6[label="VP"];
    Node7[label="V"];
    Node8[label="barked"];
    Node7 -> Node8[dir=none];
    Node6 -> Node7[dir=none];
    Node0 -> Node6[dir=none];
}

If you have graphviz installed, the dot language part can be converted to a png image by pasting it into a file (say “dog.dot”) and running the command:

dot -Tpng -O dog.dot 

This will generate a file called “dog.dot.png” containing the picture of the production tree like this:
Production tree for the phrase 'a dog barked'