Tag Archives: Hash Table

Prefixes, suffixes, and substrings in C

Prefixes

To find all prefixes of a string, just find all substrings starting at the beginning of the string of length from 1 to the length of the string. A good way of returning these to the client is to allow the client to pass in a function pointer, which is called for each prefix found:

typedef void(*substring_fn)(const char *, size_t, void *);

void prefixes(const char *str, size_t len, substring_fn fun, void *data)
{
    unsigned int length;
    for (length = 1; length <= len; length++) {
        fun(str, length, data);
    }
}

The client function is given the beginning of the string and a length, rather than a nul-terminated string. This means that the original string does not need to be modified or copied, which is much more efficient. The client can use the length to do something with the string, like printing it using printf with the * format specifier to specify the length:

#include <stdio.h>
#include <string.h>

#include <substring.h>

void print(const char *str, size_t len, void *data)
{
    printf("%.*s\n", (int)len, str);
}

int main(void)
{
    char str[] = "abracadabra";
    prefixes(str, strlen(str), print, NULL);
    return 0;
}

This produces the following output:

a
ab
abr
abra
abrac
abraca
abracad
abracada
abracadab
abracadabr
abracadabra

The third argument to the client function is a void * pointer that the client can use to pass anything it needs into the function. For example, it might pass in a dynamic array and add the prefixes to that:

#include <stdio.h>
#include <string.h>

#include <substring.h>
#include <dynarray.h>

void add_to_dynarray(const char *str, size_t len, void *data)
{
    dynarray *array = data;
    char *token = calloc(len + 1, 1);
    memcpy(token, str, len);
    dynarray_add_tail(array, token);
}

int main(void)
{
    char str[] = "abracadabra";
    dynarray *array = dynarray_create(0);
    prefixes(str, strlen(str), add_to_dynarray, array);
    dynarray_for_each(array, (dynarray_forfn)puts);
    dynarray_for_each(array, free);
    dynarray_delete(array);
    return 0;
}

Suffixes

To find all suffixes of a string, walk a pointer from the beginning of the string to 1 before the end, passing the client the pointer and the length remaining to the end of the string.

void suffixes(const char *str, size_t len, substring_fn fun, void *data)
{
    unsigned int start;
    for (start = 0; start < len; start++) {
        fun(str + start, len - start, data);
    }
}
abracadabra
bracadabra
racadabra
acadabra
cadabra
adabra
dabra
abra
bra
ra
a

Substrings

A substring is simply a prefix of a suffix, or a suffix of a prefix, so you can find all substrings by finding all suffixes and then finding all of their prefixes, or vice-versa. We can use the function pointer interface above for this by wrapping the client function pointer and data in our own structure and passing that from prefixes to suffixes as the third argument as follows:

typedef struct {
    substring_fn fun;
    void *data;
} substrings_data;

void call_suffixes(const char *str, size_t len, void *data)
{
    substrings_data *sd = data;
    suffixes(str, len, sd->fun, sd->data);
}

void substrings(const char *str, size_t len, substring_fn fun, void *data)
{
    substrings_data sd = {fun, data};
    prefixes(str, len, call_suffixes, &sd);
}

This produces the following output:

a
ab
b
abr
br
r
abra
bra
ra
a
abrac
brac
rac
ac
c
abraca
braca
raca
aca
ca
a
abracad
bracad
racad
acad
cad
ad
d
abracada
bracada
racada
acada
cada
ada
da
a
abracadab
bracadab
racadab
acadab
cadab
adab
dab
ab
b
abracadabr
bracadabr
racadabr
acadabr
cadabr
adabr
dabr
abr
br
r
abracadabra
bracadabra
racadabra
acadabra
cadabra
adabra
dabra
abra
bra
ra
a

Unique substrings

The substrings function will produce all substrings, which can include duplicates, as you can see in the output above. You can use a set data structure like a hash table to find just the unique substrings. Again, we can make good use of the function pointer interface by wrapping the client function and the hashtable in a structure, and then passing that to hashtable‘s own iteration function hashtable_for_each2.

void add_to_hashtable(const char *str, size_t len, void *data)
{
    hashtable *table = data;
    char *buf = malloc(len + 1);
    strncpy(buf, str, len);
    buf[len] = '\0';
    buf = hashtable_add(table, buf);
    if (buf) {
        free(buf);
    }
}

void call_substring_fun(const char *substring, void *data)
{
    substrings_data *sd = data;
    sd->fun(substring, strlen(substring), data);
}

void unique_substrings(const char *str, size_t len, substring_fn fun, void *data)
{
    hashtable *table = hashtable_create(7, (hashtable_cmpfn)strcmp);
    substrings(str, len, add_to_hashtable, table);
    substrings_data sd = {fun, data};
    hashtable_for_each2(table, (hashtable_forfn2)call_substring_fun, &sd);
    hashtable_for_each(table, free);
    hashtable_delete(table);
}

The substrings produced are now unique:

abrac
abraca
abracad
acada
ad
adabr
b
brac
bracada
bracadab
cada
dabr
rac
acadabra
c
ca
abra
abracadabra
br
bracadabra
d
dabra
r
racada
racadabra
acad
adabra
bra
cadabra
da
ra
racad
racadabr
aca
acadabr
bracad
cad
ab
abr
abracada
abracadabr
adab
bracadabr
cadab
racadab
a
abracadab
ac
acadab
ada
braca
cadabr
dab
raca

Related

Hash table in C

This is a hash table in C using singly-linked lists for overflow chains.

Here is an example program:

#include <stdio.h>
#include <string.h>

#include <hashtable.h>

int main(void)
{
    hashtable * table;
    const char * result;
    unsigned int e;
    char * elements[] = {"A", "B", "C", "D", "E", "F"};
    const unsigned int n = sizeof(elements) / sizeof(const char*);

    table = hashtable_create(7, (hashtable_cmpfn)strcmp);
    for (e = 0; e < n; e++) {
        hashtable_add(table, elements[e]);
    }
    hashtable_for_each(table, (hashtable_forfn)puts);
    for (e = 0; e < n; e++) {
        result = hashtable_find(table, elements[e]);
        if (result) {
            printf("Found: %s\n", result);
        }
        else {
            printf("Couldn't find %s\n", elements[e]);
        }
    }
    printf("The load factor is %.2f\n", hashtable_get_load_factor(table));
    for (e = 0; e < n; e++) {
        result = hashtable_remove(table, elements[e]);
        if (result) {
            printf("Removed: %s\n", result);
        }
        else {
            printf("Couldn't remove %s\n", elements[e]);
        }
    }
    hashtable_delete(table);

    return 0;
}

The header file:

#ifndef HASHTABLE_H
#define HASHTABLE_H

struct hashnode {
	void * data;
	struct hashnode * next;
};
typedef struct hashnode hashnode;

typedef int (*hashtable_cmpfn)(const void*, const void*);
typedef unsigned long (*hashtable_hashfn)(const void*);
typedef void (*hashtable_forfn)(void *);
typedef void (*hashtable_forfn2)(void *, void *);

struct hashtable {
	unsigned int size;
	hashtable_hashfn hash;
	hashtable_cmpfn compare;
	hashnode **buckets;
	size_t count;
};
typedef struct hashtable hashtable;

hashtable *hashtable_create(size_t size, hashtable_cmpfn compare);
void hashtable_empty(hashtable * table);
void hashtable_delete(hashtable * table);
void *hashtable_add(hashtable * table, void * data);
void *hashtable_find(const hashtable * table, const void * data);
void *hashtable_remove(hashtable * table, const void * data);
float hashtable_get_load_factor(const hashtable * table);
unsigned int hashtable_get_count(const hashtable * table);
unsigned int hashtable_find_count(const hashtable *table);
void hashtable_for_each(const hashtable * table, hashtable_forfn fun);
void hashtable_for_each2(const hashtable * table, hashtable_forfn2 fun, void *data);
void hashtable_set_hashfn(hashtable * table, hashtable_hashfn hash);

#endif /* HASHTABLE_H */

The implementation:

#include <stdlib.h>

#include <hashtable.h>

hashnode * hashnode_create(void * data)
{
	hashnode * node = malloc(sizeof(hashnode));
	if (node) {
		node->data = data;
		node->next = NULL;
	}
	return node;
}

void hashnode_delete(hashnode * node)
{
	free(node);
}

static unsigned long sdbm(const char *str)
{
    unsigned long hash = 0;
    int c;

    while ((c = *str++))
        hash = c + (hash << 6) + (hash << 16) - hash;

    return hash;
}

hashtable * hashtable_create(size_t size, hashtable_cmpfn compare)
{
	hashtable * table = malloc(sizeof (hashtable));
	if (table) {
		table->size = size;
		table->hash = (hashtable_hashfn)sdbm;
		table->compare = compare;
		table->count = 0;
		table->buckets = malloc(size * sizeof(hashnode *));
		if (table->buckets) {
			unsigned int b;
			for (b = 0; b < size; b++) {
				table->buckets[b] = NULL;
			}
		}
		else {
			free(table);
			table = NULL;
		}
	}
	return table;
}

void hashtable_empty(hashtable * table)
{
	unsigned int i;
	hashnode * temp;
	for (i = 0; i < table->size; i++) {
		hashnode * current = table->buckets[i];
		while (current != NULL) {
			temp = current->next;
			hashnode_delete(current);
			current = temp;
		}
        table->buckets[i] = NULL;
	}
	table->count = 0;
}

void hashtable_delete(hashtable * table)
{
	if (table) {
		hashtable_empty(table);
		free(table->buckets);
		free(table);
	}
}

void * hashtable_add(hashtable * table, void * data)
{
	const unsigned int bucket = table->hash(data) % table->size;
	void * found = NULL;

	if (table->buckets[bucket] == NULL) {
		/* An empty bucket */
		table->buckets[bucket] = hashnode_create(data);
	}
	else {
        unsigned int added = 0;
		hashnode * current, * previous = NULL;
		for (current = table->buckets[bucket]; current != NULL && !found && !added; current = current->next) {
			const int result = table->compare(current->data, data);
			if (result == 0) {
				/* Changing an existing entry */
				found = current->data;
				current->data = data;
			}
			else if (result > 0) {
				/* Add before current */
				hashnode * node = hashnode_create(data);
				node->next = current;
				if (previous == NULL) {
					/* Adding at the front */
					table->buckets[bucket] = node;
				}
				else {
					previous->next = node;
				}
                added = 1;
			}
            previous = current;
		}
		if (!found && !added && current == NULL) {
			/* Adding at the end */
			previous->next = hashnode_create(data);
		}
	}
	if (found == NULL) {
		table->count++;
	}

	return found;
}

void * hashtable_find(const hashtable * table, const void * data)
{
	hashnode * current;
	const unsigned int bucket = table->hash(data) % table->size;
	void * found = NULL;
	unsigned int passed = 0;
	for (current = table->buckets[bucket]; current != NULL && !found && !passed; current = current->next) {
		const int result = table->compare(current->data, data);
		if (result == 0) {
			found = current->data;
		}
		else if (result > 0) {
			passed = 1;
		}
	}
	return found;
}

void * hashtable_remove(hashtable * table, const void * data)
{
	hashnode * current, * previous = NULL;
	const unsigned int bucket = table->hash(data) % table->size;
	void * found = NULL;
	unsigned int passed = 0;

	current = table->buckets[bucket];
	while (current != NULL && !found && !passed) {
		const int result = table->compare(current->data, data);
		if (result == 0) {
			found = current->data;
			if (previous == NULL) {
				/* Removing the first entry */
				table->buckets[bucket] = current->next;
			}
			else {
				previous->next = current->next;
			}
			hashnode_delete(current);
			table->count--;
		}
		else if (result > 0) {
			passed = 1;
		}
		else {
			previous = current;
	 		current = current->next;
		}
	}
	return found;
}


float hashtable_get_load_factor(const hashtable * table)
{
	unsigned int touched = 0;
	float loadfactor;
	unsigned int b;
	for (b = 0; b < table->size; b++) {
		if (table->buckets[b] != NULL) {
			touched++;
		}
	}
	loadfactor = (float)touched / (float)table->size;
	return loadfactor;
}

unsigned int hashtable_get_count(const hashtable * table)
{
	return table->count;
}

unsigned int hashtable_find_count(const hashtable *table)
{
	unsigned int b;
	const hashnode *node;
	unsigned int count = 0;
	for (b = 0; b < table->size; b++) {
		for (node = table->buckets[b]; node != NULL; node = node->next) {
			count++;
		}
	}
	return count;
}

void hashtable_for_each(const hashtable * table, hashtable_forfn fun)
{
	unsigned int b;

	for (b = 0; b < table->size; b++) {
		const hashnode *node;
		for (node = table->buckets[b]; node != NULL; node = node->next) {
			fun(node->data);
		}
	}
}


void hashtable_for_each2(const hashtable * table, hashtable_forfn2 fun, void *data)
{
	unsigned int b;

	for (b = 0; b < table->size; b++) {
		const hashnode *node;
		for (node = table->buckets[b]; node != NULL; node = node->next) {
			fun(node->data, data);
		}
	}
}

void hashtable_set_hashfn(hashtable * table, hashtable_hashfn hash)
{
    table->hash = hash;
}