In a previous post, I showed how to generate sentences from a grammar. We can go further and generate production trees. A production tree is just like a parse tree – it is a recursive decomposition of the grammatical structure of the sentence like the picture above.
The main algorithm for generating production trees is just the same as for generating sentences – using a queue and breadth-first search – but instead of substituting right-hand side symbols in place of the first non-terminal in the sentential form, we append child nodes to the first non-terminal leaf node in the production tree. A depth-first search is used to find the first non-terminal leaf node. Once there are no non-terminal leaf nodes, the production tree is complete.
The complete tree can be printed in the dot language and converted to a graphviz graph, like the picture above, or the sentence can be retrieved from it by using an in-order traversal to retrieve the leaf nodes, which in a completed tree are all of the terminals.
Below is the code for the production tree, which is a Composite made of three classes, the abstract symbol
class, and concrete subclasses nonterminal
and terminal
. The structure is pointer-based, so smart pointers are used throughout.
#ifndef TREE_HPP #define TREE_HPP #include <vector> #include <string> #include <memory> #include <iostream> namespace MB { // Abstract base class for grammar symbols class symbol { public: typedef std::shared_ptr<symbol> ptr; typedef std::shared_ptr<const symbol> const_ptr; symbol(const symbol&) = delete; symbol& operator=(const symbol&) = delete; virtual ptr clone() const = 0; virtual ptr get_first_nonterminal_leaf() = 0; virtual void append_child(ptr child) = 0; virtual void get_leaves(std::vector<std::string>&) const = 0; const std::string& name() const { return name_; } virtual void print_dot(std::ostream &os, unsigned int id, unsigned int& maxid) const { os << "\tNode" << id << "[label=\""; os << name_; os << "\"];\n"; } protected: symbol(const std::string& name) : name_(name) { } std::string name_; }; // A non-terminal, which can have children class nonterminal : public symbol, public std::enable_shared_from_this<nonterminal> { public: static symbol::ptr create(const std::string& name) { return symbol::ptr(new nonterminal(name)); } symbol::ptr clone() const override { symbol::ptr copy = create(name_); for (auto child : children_) { copy->append_child(child->clone()); } return copy; } symbol::ptr get_first_nonterminal_leaf() override { if (children_.empty()) { // If we have no children, we are the first non-terminal leaf return shared_from_this(); } else { // Search in children for (auto child : children_) { symbol::ptr p = child->get_first_nonterminal_leaf(); if (p) { return p; } } } // Not us or one of our children return nullptr; } void append_child(symbol::ptr child) override { children_.push_back(child); } void get_leaves(std::vector<std::string>& sentence) const override { for (auto child : children_) { child->get_leaves(sentence); } } void print_dot(std::ostream &os, unsigned int id, unsigned int& maxid) const override { symbol::print_dot(os, id, maxid); for (auto child : children_) { unsigned int child_id = ++maxid; child->print_dot(os, child_id, maxid); os << "\tNode" << id << " -> Node" << child_id << "[dir=none];\n"; } } private: nonterminal(const std::string& name) : symbol(name) { } std::vector<symbol::ptr> children_; }; // A terminal, which is a leaf class terminal : public symbol { public: terminal(const std::string& name) : symbol(name) { } static symbol::ptr create(const std::string& name) { return symbol::ptr(new terminal(name)); } symbol::ptr clone() const override { return create(name_); } virtual symbol::ptr get_first_nonterminal_leaf() override { return nullptr; } virtual void append_child(symbol::ptr child) override { } void get_leaves(std::vector<std::string>& sentence) const override { sentence.push_back(name_); } }; // A production tree class tree { public: tree(symbol::ptr root) : root_(root) { } tree clone() const { symbol::ptr root = root_ ? root_->clone() : nullptr; return tree(root); } // Get the first non-terminal for substitution symbol::ptr get_first_nonterminal_leaf() const { if (root_) { return root_->get_first_nonterminal_leaf(); } return nullptr; } // Get the sentence std::vector<std::string> get_leaves() const { std::vector<std::string> sentence; if (root_) { root_->get_leaves(sentence); } return sentence; } // Print in dot format void print_dot(std::ostream& os) const { os << "digraph G {\n"; os << "\tnode[shape=plaintext];\n"; unsigned int id = 0; unsigned int maxid = 0; if (root_) { root_->print_dot(os, id, maxid); } os << "}\n"; } private: symbol::ptr root_; }; } // namespace MB #endif // TREE_HPP
The production tree generator is very similar to the sentence generator described previously, except production trees are copied and modified rather than sentential forms.
#ifndef TREE_GENERATOR_HPP #define TREE_GENERATOR_HPP #include <vector> #include <list> #include <queue> #include <algorithm> #include <grammar.hpp> #include <tree.hpp> namespace MB { class tree_generator { public: tree_generator(const grammar& grammar) : grammar_(grammar) { // Store the left-hand sides std::transform(grammar_.rules().begin(), grammar_.rules().end(), std::back_inserter(lefts_), [](rule::const_ptr r)->const std::string& { return r->left(); }); } template <class InputIt> InputIt generate(InputIt init, unsigned int n) const { std::queue<tree> queue; unsigned int sentences = 0; // Add the start symbol to the queue queue.push(tree(nonterminal::create(grammar_.get_start_left()))); while (sentences < n && queue.size()) { tree sf = queue.front(); queue.pop(); // Find the first non-terminal symbol::ptr nt = sf.get_first_nonterminal_leaf(); if (nt) { // Append each right-hand side alternative std::vector<rule::const_ptr> rules; grammar_.get_rules_for_left(nt->name(), std::back_inserter(rules)); for (rule::const_ptr rule : rules) { for (const std::vector<std::string>& alternative : rule->right()) { tree new_sf = sf.clone(); nt = new_sf.get_first_nonterminal_leaf(); for (auto symbol : alternative) { if (std::find(lefts_.begin(), lefts_.end(), symbol) != lefts_.end()) { nt->append_child(nonterminal::create(symbol)); } else { nt->append_child(terminal::create(symbol)); } } queue.push(new_sf); } } } else { // Finished sentence *init++ = sf; ++sentences; } } return init; } private: const grammar& grammar_; std::vector<std::string> lefts_; }; } // namespace MB #endif // TREE_GENERATOR_HPP
Here is an example program that generates the first 50 production trees and prints them in dot format after their associated sentence:
#include <fstream> #include <iostream> #include <vector> #include <algorithm> #include <grammar.hpp> #include <tree_generator.hpp> int main() { std::string filename = "generator.dat"; std::ifstream ifs(filename); if (ifs) { MB::grammar grammar(ifs); MB::tree_generator gen(grammar); std::vector<MB::tree > trees; gen.generate(std::back_inserter(trees), 50); for (const auto& t : trees) { std::vector<std::string> sentence = t.get_leaves(); std::copy(sentence.begin(), sentence.end(), std::ostream_iterator<std::string>(std::cout, " ")); std::cout << '\n'; t.print_dot(std::cout); } } else { std::cerr << "Couldn't open " << filename << " for reading\n"; } }
Here is an example sentence and tree from the output
a dog barked digraph G { node[shape=plaintext]; Node0[label="S"]; Node1[label="NP"]; Node2[label="DET"]; Node3[label="a"]; Node2 -> Node3[dir=none]; Node1 -> Node2[dir=none]; Node4[label="N"]; Node5[label="dog"]; Node4 -> Node5[dir=none]; Node1 -> Node4[dir=none]; Node0 -> Node1[dir=none]; Node6[label="VP"]; Node7[label="V"]; Node8[label="barked"]; Node7 -> Node8[dir=none]; Node6 -> Node7[dir=none]; Node0 -> Node6[dir=none]; }
If you have graphviz installed, the dot language part can be converted to a png image by pasting it into a file (say “dog.dot”) and running the command:
dot -Tpng -O dog.dot
This will generate a file called “dog.dot.png” containing the picture of the production tree like this: