Binary search in Python

I needed to do binary search of a sorted list with a custom comparator, something that isn’t supported by the Python standard library, so I’ve written a suite of functions for the purpose. They’re modeled on the C++ standard library functions lower_bound(), upper_bound(), equal_range(), and binary_search().

lower_bound()

lower_bound() returns the index of the first element greater than or equal to the sought value, or the length of the list if it isn’t found:

def lower_bound(sequence, value, compare=cmp):
"""Find the index of the first element in sequence >= value"""
elements = len(sequence)
offset = 0
middle = 0
found = len(sequence)

while elements > 0:
middle = elements / 2
if compare(value, sequence[offset + middle]) > 0:
offset = offset + middle + 1
elements = elements - (middle + 1)
else:
found = offset + middle
elements = middle
return found

upper_bound()

upper_bound() returns the index of the first element greater than the sought value, or the length of the list if there is no such value:

def upper_bound(sequence, value, compare=cmp):
"""Find the index of the first element in sequence > value"""
elements = len(sequence)
offset = 0
middle = 0
found = 0

while elements > 0:
middle = elements / 2
if compare(value, sequence[offset + middle]) < 0:
elements = middle
else:
offset = offset + middle + 1
found = offset
elements = elements - (middle + 1)
return found

equal_range()

equal_range() returns a pair containing the results of lower_bound() and upper_bound(), so it defines the bounds of a slice whose elements are equal to the value:

def equal_range(sequence, value, compare=cmp):
"""Find the range of sequence equal to value"""
return lower_bound(sequence, value, compare), upper_bound(sequence, value, compare)

binary_search()

binary_search() just returns a Boolean value indicating whether or not a value is present:

def binary_search(sequence, value, compare=cmp):
"""Find out if value is in sequence"""
index = lower_bound(sequence, value, compare)
return index < len(sequence) and compare(value, sequence[index]) == 0

Shellsort in C

This is Robert Sedgewick’s implementation of shellsort, generalised to have the same interface as the standard qsort() function.

void shellsort_internal(void *arr, int left, int right, size_t element_size,
int (*compare)(const void *, const void *))
{
int i, j, k, h;
int incs[] = { 1391376, 463792, 198768, 86961, 33936,
13776, 4592, 1968, 861, 336,
112, 48, 21, 7, 3, 1 };
void *v = malloc(element_size);
if (!v) {
return;
}
for (k = 0; k < 16; k++) {
for (h = incs[k], i = left + h; i <= right; i++) {
memcpy(v, arr + i * element_size, element_size);
j = i;
while (j >= h && compare(arr + (j - h) * element_size, v) > 0) {
memcpy(arr + j * element_size, arr + (j - h) * element_size, element_size);
j -= h;
}
memcpy(arr + j * element_size, v, element_size);
}
}
free(v);
}

void shellsort(void *arr, size_t len, size_t element_size,
int (*compare)(const void *, const void *))
{
shellsort_internal(arr, 0, len - 1, element_size, compare);
}

Example program:

static int compare_chars(const void *v1, const void *v2)
{
const char *p1 = v1;
const char *p2 = v2;
if (*p1 < *p2) {
return -1;
}
if (*p1 > *p2) {
return 1;
}
return 0;
}

int main(void)
{
char str1[] = "QWERTYUIOP";
const size_t len = strlen(str1);
char *str2 = malloc(len + 1);
strcpy(str2, str1);
shellsort(str2, len, 1, compare_chars);
printf("%s\n", str2);
free(str2);
return 0;
}

Radix sort of strings in C

You can use radix sort to sort any kind of data that can be seen as a sequence of symbols. This includes strings, where the symbols are the individual letters, and numbers, where the symbols are the digits. Radix sort works by having a bucket for each value that a symbol can have, and putting data items into buckets according to the value of each symbol in the item in turn, starting with the rightmost. At the end of the sort, the items will be in order of length, and then in lexicographic order within each length class. This is called shortlex order.

An example will make this clearer. Supposing we have the set of words in the string “The quick brown fox jumps over the lazy dog”. Once sorted by radix sort they will be in the following order:

The
dog
fox
the
lazy
over
brown
jumps
quick

Notice that it is by length first, and then lexicographic.

The algorithm requires a bucket for each value that a character will have, so we have 256 buckets in order to accommodate all of ASCII characters.

We proceed by taking each character in the strings in turn, starting with the rightmost, so the first iteration of the algorithm will process the last letters of each word, which are [e, g, x, e, y, r, n, s, k]. We put the words into the buckets that correspond to these letters, so the buckets look like this:

 The -> the
 dog
 quick
 brown
 over
 jumps
 fox
 lazy

The bucket numbers are the ASCII codes of the characters. I have omitted empty buckets. Notice that “The” and “the” go in the same bucket because they both end in “e”. The buckets are implemented as linked lists so they can hold multiple entries.

Now, we collect the words together in bucket order, so we have a single list like: this:

The -> the -> dog -> quick -> brown -> over -> jumps -> fox -> lazy

We now consider the second letters of each word from the right, which are [h, h, o, c, w, e, p, o, z], and repeat the process of putting the words in buckets:

 quick
 over
 The -> the
 dog -> fox
 jumps
 brown
 lazy

Collecting the words together, we get:

quick -> over -> The -> the -> dog -> fox -> jumps -> brown -> lazy

The third letters from the right are [i, v, T, t, d, f, m, o, a], so the buckets are:

 The
 lazy
 dog
 fox
 quick
 jumps
 brown
 the
 over

The collected words are now:

The -> lazy -> dog -> fox -> quick -> jumps -> brown -> the -> over

“The”, “dog”, “fox”, and “the” only have three letters, so they have no fourth letter from the right. That’s fine, because we consider this letter to be 0, and we have a 0 bucket, so in the fourth iteration those four words end up in the 0 bucket, while the remaining words are bucketed according to their fourth letter from the right:

 The -> dog -> fox -> the
 lazy
 over
 brown
 quick -> jumps

The collected list is now:

The -> dog -> fox -> the -> lazy -> over -> brown -> quick -> jumps

Now we can see that we are making progress. The four three-letter words “The”, “dog”, “fox”, and “the” are now at the head of the list, and what’s more, they are in lexicographic order.

On the fifth iteration, “lazy” and “over” end up in the 0 bucket, and once again, they are in lexicographic order, and because the items were added to the buckets in order of the result of the previous iteration, they go after the four three letter words.

 The -> dog -> fox -> the -> lazy -> over
 brown
 jumps
 quick

Because the longest word is 5 letters, we have finished after 5 iterations. There is no need to do a sixth iteration to put the two five-letter words “jumps” and “quick” into the 0 bucket, because they are already sorted by the fifth letter, so we just collect the items from the buckets in the normal way and we are done:
Here is the collected list:

The -> dog -> fox -> the -> lazy -> over -> brown -> jumps -> quick

So you can see that the number of iterations of the algorithm is the number of characters in the longest word.

Below is the implementation:

/* A linked list node containing a string and length */
struct bucket_entry
{
char *str;
size_t len;
struct bucket_entry *next;
};
typedef struct bucket_entry bucket_entry;

struct bucket
{
bucket_entry *tail;
};
typedef struct bucket bucket;

/* Initialise buckets */
static void init_buckets(bucket *buckets)
{
unsigned int b;
for (b = 0; b < 256; b++) {
buckets[b].tail = NULL;
}
}

/*
* Turn entries into a linked list of words with their lengths
* Return the length of the longest word
*/
static size_t init_entries(char **strings, size_t len, bucket_entry *entries)
{
unsigned int s;
size_t maxlen = 0;
for (s = 0; s < len; s++) {
entries[s].str = strings[s];
entries[s].len = strlen(strings[s]);
if (entries[s].len > maxlen) {
maxlen = entries[s].len;
}
if (s < len - 1) {
entries[s].next = &entries[s + 1];
}
else {
entries[s].next = NULL;
}
}
return maxlen;
}

/* Put strings into buckets according to the character c from the right */
void bucket_strings(bucket_entry *head, bucket *buckets, unsigned int c)
{
while (current != NULL) {
bucket_entry * next = current->next;
current->next = NULL;
int pos = current->len - 1 - c;
unsigned char ch;
if (pos < 0) {
ch = 0;
}
else {
ch = current->str[pos];
}
buckets[ch].tail = current;
}
else {
buckets[ch].tail->next = current;
buckets[ch].tail = buckets[ch].tail->next;
}
current = next;
}
}

/* Merge buckets into a single linked list */
bucket_entry *merge_buckets(bucket *buckets)
{
bucket_entry *tail;
unsigned int b;
for (b = 0; b < 256; b++) {
tail = buckets[b].tail;
}
else {
tail = buckets[b].tail;
}
}
}
}

void print_buckets(const bucket *buckets)
{
unsigned int b;
for (b = 0; b < 256; b++) {
const bucket_entry *entry;
printf("[%d] ", b);
for (entry = buckets[b].head; entry != NULL; entry = entry->next) {
printf("%s", entry->str);
if (entry->next) {
printf(" -> ");
}
}
putchar('\n');
}
}
putchar('\n');
}

{
const bucket_entry *current;
for (current = head; current != NULL; current = current->next) {
printf("%s", current->str);
if (current->next != NULL) {
printf(" -> ");
}
}
printf("\n");
}

void copy_list(const bucket_entry *head, char **strings)
{
const bucket_entry *current;
unsigned int s;
for (current = head, s = 0; current != NULL; current = current->next, s++) {
strings[s] = current->str;
}
}

{
size_t maxlen;
unsigned int c;
bucket_entry *entries = malloc(len * sizeof(bucket_entry));
bucket *buckets = malloc(256 * sizeof(bucket));
if (!entries || !buckets) {
free(entries);
free(buckets);
return;
}
init_buckets(buckets);
maxlen = init_entries(strings, len, entries);
/* Main loop */
for (c = 0; c < maxlen; c++) {
printf("Bucketing on char %d from the right\n", c);
print_buckets(buckets);
init_buckets(buckets);
}
/* Copy back into array */
free(buckets);
free(entries);

And an example program:

int main(void)
{
char *strings[] = {"The", "quick", "brown", "fox", "jumps", "over", "the", "lazy", "dog"};
const size_t len = sizeof(strings) / sizeof(char*);
unsigned int s;
for (s = 0; s < len; s++) {
printf("%s\n", strings[s]);
}
return 0;
}

Quicksort in C

Quicksort was invented by Tony Hoare and is one of the fastest known sorting algorithms. This is an implementation in C using the Lomuto partition scheme described in the Cormen book.

void memswap(void *array, size_t element_size, unsigned int first, unsigned int second, void *temp)
{
if (first == second) {
return;
}
memcpy(temp, array + first * element_size, element_size);
memcpy(array + first * element_size, array + second * element_size, element_size);
memcpy(array + second * element_size, temp, element_size);
}

static unsigned int partition(char *arr, unsigned int low, unsigned int high, size_t element_size,
int (*compare)(const void *, const void *), void *temp)
{
void *pivot = arr + high * element_size;
unsigned int i = low;
unsigned int j;
for (j = low; j < high; j++) {
if (compare(arr + j * element_size, pivot) <= 0) {
memswap(arr, element_size, i, j, temp);
i++;
}
}
memswap(arr, element_size, i, high, temp);
return i;
}

static void quicksort_recursive(void *arr, unsigned int low, unsigned int high, size_t element_size,
int (*compare)(const void *, const void *), void *temp)
{
if (low < high) {
unsigned int p = partition(arr, low, high, element_size, compare, temp);
quicksort_recursive(arr, low, p - 1, element_size, compare, temp);
quicksort_recursive(arr, p + 1, high, element_size, compare, temp);
}
}

void quicksort(void *arr, size_t len, size_t element_size,
int (*compare)(const void *, const void *))
{
void *temp = malloc(element_size);
if (!temp) {
return;
}
quicksort_recursive(arr, 0, len - 1, element_size, compare, temp);
free(temp);
}

Example program:

static int compare_chars(const void *v1, const void *v2)
{
const char *p1 = v1;
const char *p2 = v2;
if (*p1 < *p2) {
return -1;
}
if (*p1 > *p2) {
return 1;
}
return 0;
}

int main(void)
{
char str1[] = "QWERTYUIOP";
const size_t len = strlen(str1);
char *str2 = malloc(len + 1);
strcpy(str2, str1);
quicksort(str2, len, 1, compare_chars);
printf("%s\n", str2);
free(str2);
return 0;
}

Mergesort in C

Mergesort was invented by John Von Neumann in 1945, and is the archetypal divide-and-conquer algorithm. This is an implementation in C, with the same interface as the standard qsort function.

void *memdup(const void *mem, size_t size)
{
void *buf = malloc(size);
if (buf) {
memcpy(buf, mem, size);
}
return buf;
}

void merge(void *arr, unsigned int left, unsigned int middle, unsigned int right,
size_t element_size, int (*compare)(const void *, const void *))
{
unsigned int i, j, k;
unsigned int nl = middle - left + 1;
unsigned int nr = right - middle;

void *larr = memdup(arr + left * element_size, nl * element_size);
void *rarr = memdup(arr + (middle + 1) * element_size, nr * element_size);

for (i = 0, j = 0, k = left; i < nl && j < nr; k++) {
if (compare(larr + i * element_size, rarr + j * element_size) <= 0) {
memcpy(arr + k * element_size, larr + i * element_size, element_size);
i++;
}
else {
memcpy(arr + k * element_size, rarr + j * element_size, element_size);
j++;
}
}

memcpy(arr + k * element_size, larr + i * element_size, (nl - i) * element_size);
k += nl - i;
memcpy(arr + k * element_size, rarr + j * element_size, (nr - j) * element_size);

free(larr);
free(rarr);
}

void mergesort_recursive(void *arr, unsigned int left, unsigned int right, size_t element_size,
int (*compare)(const void *, const void *))
{
if (left < right) {
unsigned int middle = (left + right) / 2;

mergesort_recursive(arr, left, middle, element_size, compare);
mergesort_recursive(arr, middle + 1, right, element_size, compare);

merge(arr, left, middle, right, element_size, compare);
}
}

void mergesort(void *arr, size_t len, size_t element_size,
int (*compare)(const void *, const void *))
{
mergesort_recursive(arr, 0, len - 1, element_size, compare);
}

An example program:

static int compare_chars(const void *v1, const void *v2)
{
const char *p1 = v1;
const char *p2 = v2;
if (*p1 < *p2) {
return - 1;
}
if (*p1 > *p2) {
return 1;
}
return 0;
}

int main(void)
{
char str1[] = "QWERTYUIOP";
const size_t len = strlen(str1);
char *str2 = malloc(len + 1);
strcpy(str2, str1);
mergesort(str2, len, 1, compare_chars);
printf("%s\n", str2);
free(str2);
return 0;
}