The Aho-Corasick algorithm constructs a finite-state automaton (FSA) that can match multiple strings simultaneously. It begins by constructing a conventional prefix tree and then adds transition edges that allow the automaton to recover efficiently from failures.
Below is an implementation in C++. The constructor takes a pair of iterators to a sequence of strings, which the automaton will then be able to match. Once the automaton has been constructed, its search()
method can be used to search a string of text. It takes an output iterator, to which it writes pairs containing the strings found and their starting positions.
#ifndef AHO_CORASICK_HPP #define AHO_CORASICK_HPP #include <queue> #include <string> #include <vector> #include <set> namespace MB { namespace detail { // Maximum number of characters in alphabet static constexpr int MAXCHARS = 256; struct state { std::set<int> output_function; int failure_function; std::vector<int> goto_function; state() : failure_function(-1), goto_function(detail::MAXCHARS, -1) { } }; } // namespace detail class ac_automaton { public: template <class InputIt> ac_automaton(InputIt first, InputIt last) : states(1) { // Build the prefix tree for (InputIt it = first; it != last; ++it) { add_word(*it); } // Turn it into an automaton construct_automaton(); } template <class OutputIt> OutputIt search(std::string text, OutputIt it) const { int current_state = 0; for (int i = 0; i < text.size(); ++i) { current_state = find_next_state(current_state, text[i]); if (states[current_state].output_function.size()) { for (auto length : states[current_state].output_function) { *it++ = std::make_pair(std::string(text, i - length + 1, length), i - length + 1); } } } return it; } private: std::vector<detail::state> states; private: void add_word(const std::string &word) { int current_state = 0; // Add to prefix tree for (int c : word) { if (states[current_state].goto_function == -1) { states[current_state].goto_function = states.size(); states.push_back(detail::state()); } current_state = states[current_state].goto_function; } // Add to output function for this state states[current_state].output_function.insert(word.size()); } void construct_automaton() { // Complete the goto function by setting it to 0 for each // letter that doesn't have an edge from the root for (int c = 0; c < detail::MAXCHARS; ++c) { if (states[0].goto_function == -1) { states[0].goto_function = 0; } } // Compute failure and output functions std::queue<int> state_queue; for (int c = 0; c < detail::MAXCHARS; ++c) { if (states[0].goto_function != 0) { states[states[0].goto_function].failure_function = 0; state_queue.push(states[0].goto_function); } } while (state_queue.size()) { int s = state_queue.front(); state_queue.pop(); for (int c = 0; c < detail::MAXCHARS; ++c) { if (states[s].goto_function != -1) { int failure = states[s].failure_function; while (states[failure].goto_function == -1) { failure = states[failure].failure_function; } failure = states[failure].goto_function; states[states[s].goto_function].failure_function = failure; for (auto length : states[failure].output_function) { states[states[s].goto_function].output_function.insert(length); } state_queue.push(states[s].goto_function); } } } } int find_next_state(int current_state, char c) const { int next_state = current_state; while (states[next_state].goto_function == -1) { next_state = states[next_state].failure_function; } return states[next_state].goto_function; } }; } // namespace MB #endif // AHO_CORASICK_HPP
Here is an example program:
#include <iostream> #include "aho_corasick.hpp" int main() { std::string words[] = {"brown", "dog", "fox", "jumps", "lazy", "over", "quick", "the"}; const size_t len = sizeof(words) / sizeof(words[0]); std::string text = "the quick brown fox jumps over the lazy dog"; MB::ac_automaton automaton(words, words + len); std::vector<std::pair<std::string, int> > results; automaton.search(text, std::back_inserter(results)); for (auto it = results.begin(); it != results.end(); ++it) { std::cout << it->first << " starting at " << it->second << '\n'; } return 0; }
Output:
the starting at 0 quick starting at 4 brown starting at 10 fox starting at 16 jumps starting at 20 over starting at 26 the starting at 31 lazy starting at 35 dog starting at 40
Reference: Biosequence Algorithms, Spring 2005 Lecture 4: Set Matching and Aho-Corasick Algorithm