Tag Archives: Longest Common Subsequence

Shortest Common Supersequence using dynamic programming in C

The Shortest Common Supersequence (SCS) problem is to find the shortest sequence that is a supersequence of two or more sequences. It is closely related to the Longest Common Subsequence problem, in that the SCS of two strings is precisely their LCS, with those letters from each that are not in the LCS spliced into the string in their proper place relative to the SCS letters. For example, given the strings “pAqBrCs” and “wAxByCz”, their LCS is “ABC”, and an SCS is “wpAxqByrCzs”. Note that the order in which the letters occurring between the LCS letters is chosen from the two strings doesn’t matter, as long as the letters from each string are in the same order as they occur in it, so “pwAqxByrCsz” is also an SCS, and in general the SCS is not unique.

One way of computing an SCS is to use a dynamic programming algorithm similar to the Longest Common Subsequence one, but in which the letters occurring between the LCS letters are added to the solution. This is what I have implemented in C here. Rather than using pointers in the table to retrieve the solution as I have done in other DP algorithms, this time I have used enumerated values that indicate in which direction the next cell in the sequence is to be found – DOWN, RIGHT, or DIAG (diagonally down and to the right).

typedef enum  {
    DOWN,
    RIGHT,
    DIAG,
    STOP
} direction;
 
typedef struct {
	size_t len;
	char letter;
	direction dir;
} cell;
 
char *shortest_common_supersequence(const char *str1, const char *str2)
{
	size_t lstr1 = strlen(str1);
    size_t lstr2 = strlen(str2);
	cell **matrix;
    int i;
    int j;
    char *result;
    unsigned int r;

    matrix = malloc((lstr2 + 1) * sizeof(cell*));
    for (i = 0; i <= lstr2; i++) {
        matrix[i] = malloc((lstr1 + 1) * sizeof(cell));
    }
 
	for (i = 0; i < lstr2; i++) {
		matrix[i][lstr1].len = lstr2 - i;
        matrix[i][lstr1].letter = str2[i];
        matrix[i][lstr1].dir = DOWN;
    }
 
	for (j = 0; j < lstr1; j++) {
		matrix[lstr2][j].len = lstr1 - j;
        matrix[lstr2][j].letter = str1[j];
        matrix[lstr2][j].dir = RIGHT;
    }

	matrix[lstr2][lstr1].len = 0;
	matrix[lstr2][lstr1].letter = 0;
	matrix[lstr2][lstr1].dir = STOP;
 
	for (i = lstr2 - 1; i >= 0; i--) {
		for (j = lstr1 - 1; j >= 0; j--) {
			cell *pcell = &matrix[i][j];
			if (str2[i] == str1[j]) {
				pcell->dir = DIAG;
				pcell->letter = str1[j];
                pcell->len = matrix[i + 1][j + 1].len + 1;
			} 
            else if (matrix[i][j + 1].len < matrix[i + 1][j].len) {
				pcell->dir = RIGHT;
				pcell->letter = str1[j];
                pcell->len = matrix[i][j + 1].len + 1;
			}
            else {
				pcell->dir = DOWN;
				pcell->letter = str2[i];
                pcell->len = matrix[i + 1][j].len + 1;
			}
		}
	}
 
    result = malloc(matrix[0][0].len + 1);
    i = 0;
    j = 0;
    r = 0;
	while (i <= lstr2 && j <= lstr1) {
        result[r] = matrix[i][j].letter;
        r++;
        if (matrix[i][j].dir == DOWN) {
            i += 1;
        }
        else if (matrix[i][j].dir == RIGHT) {
            j += 1;
        }
        else {
            i += 1;
            j += 1;
        }
    }

    for (i = 0; i <= lstr2; i++) {
        free(matrix[i]);
    }
    free(matrix);
 
	return result;
}

Example program:

int main(void)
{
    char a[] = "pAqBrCs";
    char b[] = "wAxByCz";
    char *scs = shortest_common_supersequence(a, b);
    printf("%s\n", scs);
    free(scs);
    return 0;
}

Output:

wpAxqByrCzs

Longest Common Subsequence in C

This Longest Common Subsequence (LCS) algorithm is a classic dynamic programming table-filling algorithm.

static size_t longest_common_subsequence_internal(const char *str1, const char *str2, matrix **mat)
{
    int i, j;
    const size_t alen = strlen(str1);
    const size_t blen = strlen(str2);
    *mat = matrix_create(alen, blen);
    if (*mat == NULL) {
        return 0;
    }
    for (i = alen - 1; i >= 0; i--) {
	    for (j = blen - 1; j >= 0; j--) {
            if (str1[i] == str2[j]) {
                matrix_set(*mat, i, j, 1 + matrix_get(*mat, i + 1, j + 1));
            }
            else {
                matrix_set(*mat, i, j, max(matrix_get(*mat, i + 1, j), matrix_get(*mat, i, j + 1)));
            }
	    }
    }
    return matrix_get(*mat, 0, 0);
}

size_t longest_common_subsequence_length(const char *str1, const char *str2)
{
    matrix *mat;
    size_t result = longest_common_subsequence_internal(str1, str2, &mat);
    matrix_delete(mat);
	return result;
}

char *longest_common_subsequence(const char *str1, const char *str2)
{
    matrix *mat;
    size_t lcs_len = longest_common_subsequence_internal(str1, str2, &mat);
    unsigned int i = 0;
    unsigned int j = 0;
    unsigned int k = 0;
    const unsigned int alen = strlen(str1);
    const unsigned int blen = strlen(str2);
    char *lcs = malloc(lcs_len + 1);
    while (i < alen && j < blen) {
        if (str1[i] == str2[j]) {
            lcs[k] = str1[i];
            i++;
            j++;
            k++;
        }
        else if (matrix_get(mat, i + 1, j) >= matrix_get(mat, i ,j + 1)) {
            i++;
        }
        else {
            j++;
        }
    }
    lcs[k] = '\0';
    matrix_delete(mat);
    return lcs;
}

Example program:

int main(void)
{
    char str1[] = "pAqBrCs";
    char str2[] = "wAxByCz";
    char *lcs;
    lcs = longest_common_subsequence(str1, str2);
    printf("The longest common subsequence is \"%s\"\n", lcs);
    free(lcs);
    return 0;
}
The longest common subsequence is "ABC"