Tag Archives: CYK

CYK algorithm in C++

The Cocke-Younger-Kasami (CYK) algorithm is a dynamic programming algorithm for generalised parsing – i.e., parsing with ambiguous grammars where there may be more than one possible parse. The algorithm works by populating a table with the possible ways of parsing successively larger portions of the input. If the grammar’s start symbol is in the top cell of the table when it has been completed, the parse is successful.

Below is an implementation in C++. The class MB::cyk_parser is instantiated with a grammar (see Context-free grammar in C++). The parse() method attempts to recognise a sentence and displays the parsing table if it is called with a std::ostream.

The grammar must be in Chomsky Normal Form (CNF), which means that every left-hand side must be a single non-terminal symbol, and every right-hand side alternative must either be a single terminal symbol or a pair of non-terminal symbols. This is a requirement of the classic CYK algorithm, but some variants of the algorithm can handle grammars that are not in CNF.

The parsing table is an upper-right triangular matrix internally, but is printed as a lower-left triangular matrix, as this is easier to read.

#ifndef CYK_HPP
#define CYK_HPP

#include <vector>
#include <set>
#include <iostream>
#include <iterator>

#include <grammar.hpp>

namespace MB
{

class cyk_parser
{
public:
    cyk_parser(const MB::grammar& grammar)
        : grammar_(grammar)
    {
    }

    template <class InputIt>
    bool parse(InputIt begin, InputIt end)
    {
        return parse(begin, end, nullptr);
    }

    template <class InputIt>
    bool parse(InputIt begin, InputIt end, std::ostream& os)
    {
        return parse(begin, end, &os);
    }

private:
    template <class InputIt>
    bool parse(InputIt begin, InputIt end, std::ostream *os)
    {
        std::vector<std::string> str;
        std::copy(begin, end, std::back_inserter(str));
        const size_t len = str.size();
        std::vector<std::vector<std::set<std::string> > > table(len,
                std::vector<std::set<std::string> >(len));
        unsigned int i, j, k;

        // Populate main diagonal
        i = 0;
        for (const std::string& c : str) {
            std::vector<MB::rule::const_ptr> rules;
            grammar_.get_rules_for_symbol(c, std::back_inserter(rules));
            for (MB::rule::const_ptr r : rules) {
                table[i][i].insert(r->left());
            }
            i++;
        }
        // Populate upper-right triangle
        for (i = 1; i < len; i++) {
            for (j = i; j < len; j++) {
                for (k = j - i; k < j; k++) {
                    std::vector<std::pair<std::string, std::string> > product;
                    MB::detail::cartesian_product(table[j - i][k].begin(), table[j - i][k].end(), 
                            table[k + 1][j].begin(),
                            table[k + 1][j].end(), std::back_inserter(product));
                    std::vector<MB::rule::const_ptr> rules;
                    grammar_.get_rules_for_symbols(product.begin(), product.end(), 
                            std::back_inserter(rules));
                    for (MB::rule::const_ptr r : rules) {
                        table[j - i][j].insert(r->left());
                    }
                }
            }
        }
        if (os) {
            // Print the table 
            for (i = 0; i < len; i++) {
                k = 0;
                for (j = len - i - 1; j < len; j++) {
                    std::copy(table[k][j].begin(), table[k][j].end(), 
                            std::ostream_iterator<std::string>(*os, " "));
                    ++k;
                    *os << '\t';
                }
                *os << '\n';
            }
            for (const std::string& c : str) {
                *os << c << '\t';
            }
            *os << '\n';
        }
             
        // Successful if the start symbol is in the top-right cell    
        return table[0][len - 1].find(grammar_.get_start_left()) != table[0][len - 1].end();
    }

private:
    const MB::grammar& grammar_;
};

} // namespace MB

#endif // CYK_HPP

Here is an example program:

#include <fstream>
#include <iostream>

#include <grammar.hpp>
#include <cyk.hpp>

int main()
{
    std::string filename = "cyk.dat";
    std::ifstream ifs(filename);
    if (ifs) {
        MB::grammar grammar(ifs);
        std::string sentence[] = {"b", "a", "a", "b", "a"};
        const size_t len = sizeof(sentence) / sizeof(sentence[0]);
        bool success = MB::cyk_parser(grammar).parse(sentence, sentence + len, std::cout);
        std::cout << "Success: " << std::boolalpha << success << '\n';
    }
    else {
        std::cerr << "Couldn't open " << filename << " for reading\n";
    }
}

This is the grammar file:

S -> A B | B C
A -> B A | a
B -> C C | b
C -> A B | a

Here is the output. Notice the start symbol S in the top-left cell, which indicates a successful parse:

A C S
        A C S
        B       B
A S     B       C S     A S
B       A C     A C     B       A C
b       a       a       b       a
Success: true

Reference: The CYK Algorithm