# Boyer-Moore search of a list for a sub-list in Python

I had the need to search a list for a sub-list in Python, rather like std::search() in C++, but there isn’t a built-in equivalent in Python. A naive way of implementing it would be to try to match the sub-list against every position in the list, but that wouldn’t be very efficient. An efficient algorithm for string searching is the Boyer-Moore algorithm, and this can be adapted to search a list.

Below is an implementation of Boyer-Moore for searching a list in Python. It’s based on the Wikipedia article. The function search() takes a list to search (haystack), and a sub-list to search for (needle), and returns the starting index of needle, or -1 if it isn’t found.

def search(haystack, needle):
"""
Search list haystack for sub-list needle.
"""
if len(needle) == 0:
return 0
char_table = make_char_table(needle)
offset_table = make_offset_table(needle)
i = len(needle) - 1
while i < len(haystack):
j = len(needle) - 1
while needle[j] == haystack[i]:
if j == 0:
return i
i -= 1
j -= 1
i += max(offset_table[len(needle) - 1 - j], char_table.get(haystack[i]));
return -1

def make_char_table(needle):
"""
Makes the jump table based on the mismatched character information.
"""
table = {}
for i in range(len(needle) - 1):
table[needle[i]] = len(needle) - 1 - i
return table

def make_offset_table(needle):
"""
Makes the jump table based on the scan offset in which mismatch occurs.
"""
table = []
last_prefix_position = len(needle)
for i in reversed(range(len(needle))):
if is_prefix(needle, i + 1):
last_prefix_position = i + 1
table.append(last_prefix_position - i + len(needle) - 1)
for i in range(len(needle) - 1):
slen = suffix_length(needle, i)
table[slen] = len(needle) - 1 - i + slen
return table

def is_prefix(needle, p):
"""
Is needle[p:end] a prefix of needle?
"""
j = 0
for i in range(p, len(needle)):
if needle[i] != needle[j]:
return 0
j += 1
return 1

def suffix_length(needle, p):
"""
Returns the maximum length of the substring ending at p that is a suffix.
"""
length = 0;
j = len(needle) - 1
for i in reversed(range(p + 1)):
if needle[i] == needle[j]:
length += 1
else:
break
j -= 1
return length



An example program:

def main():
haystack = [0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1]
needle = [0, 0, 1]
index = search(haystack, needle)
print(index)

if __name__ == '__main__':
main()


Output:

4


# Aho-Corasick algorithm in C++

The Aho-Corasick algorithm constructs a finite-state automaton (FSA) that can match multiple strings simultaneously. It begins by constructing a conventional prefix tree and then adds transition edges that allow the automaton to recover efficiently from failures.

Below is an implementation in C++. The constructor takes a pair of iterators to a sequence of strings, which the automaton will then be able to match. Once the automaton has been constructed, its search() method can be used to search a string of text. It takes an output iterator, to which it writes pairs containing the strings found and their starting positions.

#ifndef AHO_CORASICK_HPP
#define AHO_CORASICK_HPP

#include <queue>
#include <string>
#include <vector>
#include <set>

namespace MB
{

namespace detail
{

// Maximum number of characters in alphabet
static constexpr int MAXCHARS = 256;

struct state
{
std::set<int> output_function;
int failure_function;
std::vector<int> goto_function;
state()
: failure_function(-1),
goto_function(detail::MAXCHARS, -1)
{
}
};

} // namespace detail

class ac_automaton
{
public:
template <class InputIt>
ac_automaton(InputIt first, InputIt last)
: states(1)
{
// Build the prefix tree
for (InputIt it = first; it != last; ++it) {
}
// Turn it into an automaton
construct_automaton();
}

template <class OutputIt>
OutputIt search(std::string text, OutputIt it) const
{
int current_state = 0;

for (int i = 0; i < text.size(); ++i) {
current_state = find_next_state(current_state, text[i]);
if (states[current_state].output_function.size()) {
for (auto length : states[current_state].output_function) {
*it++ = std::make_pair(std::string(text, i - length + 1, length),
i - length + 1);
}
}
}
return it;
}

private:
std::vector<detail::state> states;

private:
{
int current_state = 0;
for (int c : word) {
if (states[current_state].goto_function == -1) {
states[current_state].goto_function = states.size();
states.push_back(detail::state());
}
current_state = states[current_state].goto_function;
}
// Add to output function for this state
states[current_state].output_function.insert(word.size());
}

void construct_automaton()
{
// Complete the goto function by setting it to 0 for each
// letter that doesn't have an edge from the root
for (int c = 0; c < detail::MAXCHARS; ++c) {
if (states.goto_function == -1) {
states.goto_function = 0;
}
}

// Compute failure and output functions
std::queue<int> state_queue;
for (int c = 0; c < detail::MAXCHARS; ++c) {
if (states.goto_function != 0) {
states[states.goto_function].failure_function = 0;
state_queue.push(states.goto_function);
}
}
while (state_queue.size()) {
int s = state_queue.front();
state_queue.pop();

for (int c = 0; c < detail::MAXCHARS; ++c) {
if (states[s].goto_function != -1) {
int failure = states[s].failure_function;
while (states[failure].goto_function == -1) {
failure = states[failure].failure_function;
}
failure = states[failure].goto_function;
states[states[s].goto_function].failure_function = failure;
for (auto length : states[failure].output_function) {
states[states[s].goto_function].output_function.insert(length);
}
state_queue.push(states[s].goto_function);
}
}
}
}

int find_next_state(int current_state, char c) const
{
int next_state = current_state;

while (states[next_state].goto_function == -1) {
next_state = states[next_state].failure_function;
}

return states[next_state].goto_function;
}

};

} // namespace MB

#endif // AHO_CORASICK_HPP



Here is an example program:

#include <iostream>

#include "aho_corasick.hpp"

int main()
{
std::string words[] = {"brown", "dog", "fox", "jumps", "lazy", "over", "quick", "the"};
const size_t len = sizeof(words) / sizeof(words);
std::string text = "the quick brown fox jumps over the lazy dog";

MB::ac_automaton automaton(words, words + len);
std::vector<std::pair<std::string, int> > results;

automaton.search(text, std::back_inserter(results));
for (auto it = results.begin(); it != results.end(); ++it) {
std::cout << it->first << " starting at " << it->second << '\n';
}

return 0;
}


Output:

the starting at 0
quick starting at 4
brown starting at 10
fox starting at 16
jumps starting at 20
over starting at 26
the starting at 31
lazy starting at 35
dog starting at 40


# Huffman coding in C

Huffman coding is a compression method which generates variable-length codes for data – the more frequent the data item, the shorter the code generated. This allows more efficient compression than fixed-length codes. This is an implementation of the algorithm in C. The function huffman() takes arrays of letters and their frequencies, the length of the arrays, and a callback which is called for each code generated. The algorithm requires a priority queue, and I used a min-heap for the purpose.

#include <stdlib.h>

#include <minheap.h>

struct huffman_node {
char data;
unsigned int frequency;
struct huffman_node *left;
struct huffman_node *right;
};
typedef struct huffman_node huffman_node;

huffman_node *huffman_node_create(char data, unsigned int frequency)
{
huffman_node *node = malloc(sizeof(huffman_node));
if (node) {
node->data = data;
node->frequency = frequency;
node->left = NULL;
node->right = NULL;
}
return node;
}

void huffman_node_delete(huffman_node *node)
{
if (node) {
huffman_node_delete(node->left);
huffman_node_delete(node->right);
free(node);
}
}

unsigned int max(unsigned int a, unsigned int b)
{
return a > b ? a : b;
}

unsigned int huffman_node_height(const huffman_node *node)
{
unsigned int height = 0;
if (node->left || node->right) {
height = max(node->left ? huffman_node_height(node->left) : 0,
node->right ? huffman_node_height(node->right) : 0) + 1;
}
return height;
}

void huffman_node_print(const huffman_node *node, unsigned int indent)
{
unsigned int i;
for (i = 0; i < indent; i++) {
printf("  ");
}
printf("%c %u\n", node->data, node->frequency);
if (node->left != NULL) {
huffman_node_print(node->left, indent + 1);
}
if (node->right != NULL) {
huffman_node_print(node->right, indent + 1);
}
}

typedef void huffmanfn(char, const unsigned int *, size_t);

void huffman_node_encodings(const huffman_node *node, unsigned int *arr,
unsigned int pos, huffmanfn fun)
{
if (node->left) {
arr[pos] = 0;
huffman_node_encodings(node->left, arr, pos + 1, fun);
}
if (node->right) {
arr[pos] = 1;
huffman_node_encodings(node->right, arr, pos + 1, fun);
}
if (!(node->left || node->right)) {
fun(node->data, arr, pos);
}
}

void huffman(const char *letters, const int *frequencies, size_t size, huffmanfn fun)
{
minheap *heap = minheap_create();
unsigned int i;
huffman_node *top;
unsigned int *arr;
/* Populate the heap */
for (i = 0; i < size; i++) {
}
/* Build the tree */
while (minheap_get_count(heap) != 1) {
huffman_node *left = minheap_remove_min(heap);
huffman_node *right = minheap_remove_min(heap);
top = huffman_node_create('\$', left->frequency + right->frequency);
top->left = left;
top->right = right;
}
top = minheap_remove_min(heap);
/* Generate the encodings */
arr = malloc(huffman_node_height(top) * sizeof(unsigned int));
huffman_node_encodings(top, arr, 0, fun);
/* Clean up */
huffman_node_delete(top);
free(arr);
minheap_delete(heap);
}


Example program:

void print(char letter, const unsigned int *arr, size_t len)
{
unsigned int i;
printf("%c: ", letter);
for (i = 0; i < len; i++) {
printf("%u", arr[i]);
}
putchar('\n');
}

int main(void)
{
char letters[] = {'a', 'b', 'c', 'd', 'e', 'f'};
int frequencies[] = {45, 13, 12, 16, 9, 5};
const size_t size = sizeof(letters) / sizeof(letters);
huffman(letters, frequencies, size, print);
return 0;
}


The output:

a: 0
c: 100
b: 101
f: 1100
e: 1101
d: 111


# Min-Heap in C

This is a binary min-heap using a dynamic array for storage.

#ifndef MINHEAP_H
#define MINHEAP_H

#include <dynarray.h>

struct entry
{
void *item;
unsigned int value;
};
typedef struct entry entry;

struct minheap
{
dynarray *entries;
};
typedef struct minheap minheap;

typedef void(*minheap_forfn)(void*);

minheap *minheap_create(void);
void minheap_delete(minheap *heap);
void minheap_add(minheap *heap, void *item, unsigned int value);
void *minheap_remove_min(minheap *heap);
void minheap_for_each(const minheap *heap, minheap_forfn fun);
unsigned int minheap_get_count(const minheap *heap);

#endif /* MINHEAP_H */


Implementation:

#include <stdlib.h>

#include <minheap.h>

static entry *entry_create(void *item, unsigned int value)
{
entry *e = malloc(sizeof(entry));
if (e) {
e->item = item;
e->value = value;
}
return e;
}

minheap *minheap_create(void)
{
minheap *heap = malloc(sizeof(minheap));
if (heap) {
heap->entries = dynarray_create();
}
return heap;
}

void minheap_delete(minheap *heap)
{
if (heap) {
dynarray_for_each(heap->entries, free);
dynarray_delete(heap->entries);
free(heap);
}
}

static void minheap_swap(minheap *heap, unsigned int index1, unsigned int index2)
{
void *temp = dynarray_get(heap->entries, index1);
dynarray_set(heap->entries, index1, dynarray_get(heap->entries, index2));
dynarray_set(heap->entries, index2, temp);
}

static void minheap_bubble_up(minheap *heap, unsigned int index)
{
entry *e = dynarray_get(heap->entries, index);
unsigned int parent_index = (index - 1) / 2;
entry *parent = dynarray_get(heap->entries, parent_index);
if (e->value < parent->value) {
minheap_swap(heap, index, parent_index);
if (parent_index > 0) {
minheap_bubble_up(heap, parent_index);
}
}
}

static void minheap_bubble_down(minheap *heap, unsigned int index)
{
entry *e = dynarray_get(heap->entries, index);
unsigned int left_child_index = (index * 2) + 1;
unsigned int right_child_index = left_child_index + 1;
unsigned int swapped = 0;
unsigned int swapped_index;
if (right_child_index < dynarray_get_count(heap->entries) /* There is a right child */
&& ((entry*)dynarray_get(heap->entries, right_child_index))->value
< ((entry*)dynarray_get(heap->entries, left_child_index))->value
/* And it's less than left child */
&& e->value > ((entry*)dynarray_get(heap->entries, right_child_index))->value) {
minheap_swap(heap, index, right_child_index);
swapped = 1;
swapped_index = right_child_index;
}
else if (e->value > ((entry*)dynarray_get(heap->entries, left_child_index))->value) {
minheap_swap(heap, index, left_child_index);
swapped = 1;
swapped_index = left_child_index;
}
if (swapped && (swapped_index * 2) + 1 < dynarray_get_count(heap->entries) - 1) {
minheap_bubble_down(heap, swapped_index);
}
}

void minheap_add(minheap *heap, void *item, unsigned int value)
{
entry *e = entry_create(item, value);
if (e) {
unsigned int count = dynarray_get_count(heap->entries);
if (count > 1) {
minheap_bubble_up(heap, count - 1);
}
}
}

void *minheap_remove_min(minheap *heap)
{
void *item = NULL;
unsigned int count = dynarray_get_count(heap->entries);
if (count > 1) {
minheap_swap(heap, 0, count - 1);
}
if (count > 0) {
entry *e = dynarray_remove_tail(heap->entries);
item = e->item;
free(e);
}
if (dynarray_get_count(heap->entries) > 1) {
minheap_bubble_down(heap, 0);
}
return item;
}

void minheap_for_each(const minheap *heap, minheap_forfn fun)
{
dynarray_for_each(heap->entries, (dynarray_forfn)fun);
}

unsigned int minheap_get_count(const minheap *heap)
{
return dynarray_get_count(heap->entries);
}


Example program:

#include <stdio.h>

#include <minheap.h>

int main(void)
{
minheap *heap = minheap_create();
unsigned int numbers;
unsigned int i;
for (i = 0; i < 10; i++) {
numbers[i] = i;
}
printf("Count is %u\n", minheap_get_count(heap));
for (i = 0; i < 10; i++) {
const int *e = minheap_remove_min(heap);
if (e) {
printf("%d\n", *e);
}
}
printf("Count is now %u\n", minheap_get_count(heap));
minheap_delete(heap);
return 0;
}


Output:

Count is 10
0
1
2
3
4
5
6
7
8
9
Count is now 0


# Subset-Sum using dynamic programming in C

The Subset-Sum problem is to determine, given a set of integers, whether there is a subset that sums to a given value. The problem is NP-complete, but can be solved in pseudo-polynomial time using dynamic programming.

Below is an implementation in C. The algorithm works by filling in a table. The rows of the table correspond to values from 0 to the target value, while the columns correspond to elements of the set. Each cell in the table repesents a solution for that row’s total using the elements chosen from the columns to the left. When the table has been filled, the truth value in the bottom-right-hand cell will indicate whether there is a successful solution. I have used chains of pointers within the table to allow the elements in the solution to be read back.

struct entry {
unsigned int truth;
int element;
unsigned int count;
struct entry *prev;
};
typedef struct entry entry;

unsigned int subset_sum(const unsigned int *weights, size_t len, unsigned int target,
unsigned int **solution)
{
entry **table;
unsigned int i, j;
unsigned int count = 0;

table = malloc((target + 1) * sizeof(entry *));
for (i = 0; i <= target; i++) {
table[i] = malloc((len + 1 ) * sizeof(entry));
}

for (i = 0; i <= target; i++) {
for (j = 0; j <= len; j++) {
/* First row */
if (i == 0) {
table[i][j].truth = 1;
table[i][j].element = -1;
table[i][j].count = 0;
table[i][j].prev = NULL;
}
else if (j == 0) {
/* First column */
table[i][j].truth = 0;
table[i][j].element = -1;
table[i][j].count = 0;
table[i][j].prev = NULL;
}
else {
/* Initialise for don't add element case */
table[i][j].truth = table[i][j-1].truth;
table[i][j].prev = &table[i][j - 1];
table[i][j].element = -1;
table[i][j].count = table[i][j - 1].count;
if (i >= weights[j-1]) {
if (!table[i][j].truth) {
table[i][j].truth = table[i - weights[j-1]][j-1].truth;
table[i][j].element = j - 1;
table[i][j].count = table[i - weights[j-1]][j-1].count + 1;
table[i][j].prev = &table[i - weights[j-1]][j-1];
}
}
}
}
}

if (!table[target][len].truth) {
/* No solution */
*solution = NULL;
}
else {
/* Solution */
*solution = calloc(len, sizeof(unsigned int));
}
if (*solution) {
count = table[target][len].count;
}
}
}

for (i = 0; i <= target; i++) {
free(table[i]);
}
free(table);

return count;
}


Here is an example program:

int main(void)
{
unsigned int weights[] = {5, 7, 1, 3, 11, 9};
unsigned int target = 14;
const size_t len = sizeof(weights) / sizeof(weights);
unsigned int *solution;
const unsigned int count = subset_sum(weights, len, target, &solution);
if (count) {
unsigned int i;
printf("Found a solution\n");
printf("Elements:\n");
for (i = 0; i < len; i++) {
if (solution[i]) {
printf("%u ", weights[i]);
}
}
putchar('\n');
}
else {
printf("No solution\n");
}
free(solution);

return 0;
}


The output:

Found a solution
Elements:
3 11


# Making change using dynamic programming in C

A coin system is canonical if the greedy algorithm for making change is optimal for all values. All coin systems in the world are canonical for obvious reasons. If a coin system is not canonical, the change-making problem becomes NP-hard. We can use dynamic programming to solve the change-making problem for abitrary coin systems.

This is another table-filling algorithm. The table has a column for every value from 0 to the target value, and a column for every coin. Each cell will be filled with the minimum number of the coins from that row upwards that are needed to make that value. When the table has been filled, the minimum number of coins for the target value can be read from the bottom-right-hand cell.

Below is an implemention in C. As with other dynamic programming algorithms, I have used pointers within the table to link each cell to the cell from which its value was derived. These pointers can then be used to read back how many of each coin was used in producing the solution. The function make_change() takes an array of coin values, its length, a target value, and the address of a pointer to which to assign the solution array. The solution array has an entry for each coin and indicates how many of each coin were used in the solution.

#include <stdlib.h>

struct change_entry {
unsigned int count;
int coin;
struct change_entry *prev;
};
typedef struct change_entry change_entry;

unsigned int make_change(const unsigned int *coins, size_t len, unsigned int value,
unsigned int **solution)
{
unsigned int i, j;
change_entry **table;
unsigned int result;

/* Allocate the table */
table = malloc((len + 1) * sizeof(change_entry *));
for (i = 0; i <= len; i++) {
table[i] = malloc((value + 1) * sizeof(change_entry));
}

/* Calculate the values and build chains */
for (i = 0; i <= len; i++) {
for (j = 0; j <= value; j++) {
if (i == 0) {
/* Initialising the first row */
table[i][j].count = j;
table[i][j].coin = -1;
table[i][j].prev = NULL;
}
else if (j == 0) {
/* Initialising the first column */
table[i][j].count = 0;
table[i][j].coin = -1;
table[i][j].prev = NULL;
}
else if (coins[i - 1] == j) {
/* Can use coin */
table[i][j].count = 1;
table[i][j].coin = i - 1;
table[i][j].prev = NULL;
}
else if (coins[i - 1] > j) {
/* Can't use coin */
table[i][j].count = table[i - 1][j].count;
table[i][j].coin = -1;
table[i][j].prev = &table[i - 1][j];
}
else {
/* Can use coin - choose best solution */
if (table[i - 1][j].count < table[i][j - coins[i - 1]].count + 1) {
/* Don't use coin */
table[i][j].count = table[i - 1][j].count;
table[i][j].coin = -1;
table[i][j].prev = &table[i - 1][j];
}
else {
/* Use coin */
table[i][j].count = table[i][j - coins[i - 1]].count + 1;
table[i][j].coin = i - 1;
table[i][j].prev = &table[i][j - coins[i - 1]];
}
}
}
}
result = table[len][value].count;
/* Read back the solution */
*solution = calloc(len, sizeof(unsigned int));
if (*solution) {
}
}
}
else {
result = 0;
}

for (i = 0; i <= len; i++) {
free(table[i]);
}
free(table);

return result;
}


Here is an example program:

int main(void)
{
unsigned int coins[] = {1, 2, 5, 10, 20, 50, 100};
const size_t len = sizeof(coins) / sizeof(coins);
const unsigned int value = 252;
unsigned int *solution;
unsigned int result = make_change(coins, len, value, &solution);
unsigned int i;
printf("Number of coins: %u\n", result);
printf("Coins used:\n");
for (i = 0; i < len; i++) {
if (solution[i] > 0) {
printf("%u x %u\n", solution[i], coins[i]);
}
}
free(solution);
return 0;
}


The output:

Number of coins: 4
Coins used:
1 x 2
1 x 50
2 x 100