practice · medium · from Dynamic Programming

Longest Common Subsequence

time
O(m · n)
space
O(min(m, n))
tags
dynamic-programming · strings

in the wild Every diff tool you've ever used (git diff, diff -u) computes LCS internally; bioinformatics sequence alignment (Needleman-Wunsch) is the weighted version.

Problem

Given two strings s1 and s2, return the length of their longest common subsequence. A subsequence is what you get by deleting zero or more characters (without changing order) — "ace" is a subsequence of "abcde".

Examples

Why this is the canonical 2D DP

This is the textbook first 2D DP problem because the recurrence is simple but the state is fundamentally 2-dimensional.

Let f(i, j) = LCS length of s1[0..i] and s2[0..j] (prefixes of length i and j respectively). The recurrence:

f(i, j) = 0                                          if i == 0 or j == 0
        = 1 + f(i-1, j-1)                             if s1[i-1] == s2[j-1]    (match → both advance)
        = max(f(i-1, j), f(i, j-1))                   otherwise                (skip one char from one side)

Read English: at any state (i, j), either the last characters match (in which case the optimum includes that pair, plus the optimum for the shorter prefixes) or they don't (in which case you must drop at least one — try dropping from each side and take the bigger).

The DP table is (m+1) × (n+1); each cell does O(1) work; total O(m·n). Space can collapse to O(min(m, n)) because each row only needs the previous row.

This is the engine inside every diff tool (git, hg, Mercurial, IDEs). Patch files are reconstructed from LCS by recording "kept" characters vs "deletions/insertions". Same algorithm, dressed in a different output format.

Hints

Hint 1 — find the state

You're matching two strings. What two indices do you need to describe "the subproblem"? At any point you've consumed a prefix of s1 and a prefix of s2 — the LCS so far depends only on those two prefix lengths.

Hint 2 — the two cases at (i, j)

Either s1[i-1] == s2[j-1] — in which case the optimum extends by 1, recursing on shorter prefixes of both. Or they're not equal — in which case at least one of them isn't in the LCS, so try dropping from each side and take the max.

Hint 3 — space optimization

The recurrence at row i only uses values from row i-1 (above) and row i (left). So keep only the previous row + the current one. Two arrays of size n+1. Or — trickier — one array plus a single scalar for the diagonal value.

Solution

def lcs(s1: str, s2: str) -> int:
    m, n = len(s1), len(s2)
    # Two-row rolling DP. prev[j] = f(i-1, j); cur[j] = f(i, j).
    prev = [0] * (n + 1)
    cur  = [0] * (n + 1)
    for i in range(1, m + 1):
        for j in range(1, n + 1):
            if s1[i - 1] == s2[j - 1]:
                cur[j] = prev[j - 1] + 1
            else:
                cur[j] = max(prev[j], cur[j - 1])
        prev, cur = cur, prev                   # swap row buffers
        # `cur` is now the old prev row, which will be overwritten next i.
    return prev[n]                              # after the final swap, prev holds row m

cases = [
    ("abcde",  "ace",     3),
    ("abc",    "abc",     3),
    ("abc",    "def",     0),
    ("AGGTAB", "GXTXAYB", 4),
    ("",       "abc",     0),
    ("abc",    "",        0),
]
for s1, s2, expected in cases:
    got  = lcs(s1, s2)
    mark = "✓" if got == expected else "✗"
    print(f"{mark} lcs({s1!r}, {s2!r}) = {got}  (expected {expected})")
function lcs(s1, s2) {
  const m = s1.length, n = s2.length;
  let prev = new Array(n + 1).fill(0);
  let cur  = new Array(n + 1).fill(0);
  for (let i = 1; i <= m; i++) {
    for (let j = 1; j <= n; j++) {
      if (s1[i - 1] === s2[j - 1]) cur[j] = prev[j - 1] + 1;
      else                          cur[j] = Math.max(prev[j], cur[j - 1]);
    }
    [prev, cur] = [cur, prev];
  }
  return prev[n];
}

const cases = [
  ["abcde",  "ace",     3],
  ["abc",    "abc",     3],
  ["abc",    "def",     0],
  ["AGGTAB", "GXTXAYB", 4],
  ["",       "abc",     0],
  ["abc",    "",        0],
];
for (const [s1, s2, expected] of cases) {
  const got = lcs(s1, s2);
  console.log(`${got === expected ? "✓" : "✗"} lcs(${JSON.stringify(s1)}, ${JSON.stringify(s2)}) = ${got}  (expected ${expected})`);
}
pub fn lcs(s1: &str, s2: &str) -> usize {
    let a: Vec<char> = s1.chars().collect();
    let b: Vec<char> = s2.chars().collect();
    let (m, n) = (a.len(), b.len());
    let mut prev = vec![0usize; n + 1];
    let mut cur  = vec![0usize; n + 1];
    for i in 1..=m {
        for j in 1..=n {
            cur[j] = if a[i - 1] == b[j - 1] {
                prev[j - 1] + 1
            } else {
                prev[j].max(cur[j - 1])
            };
        }
        std::mem::swap(&mut prev, &mut cur);
    }
    prev[n]
}

fn main() {
    let cases: &[(&str, &str, usize)] = &[
        ("abcde",  "ace",     3),
        ("abc",    "abc",     3),
        ("abc",    "def",     0),
        ("AGGTAB", "GXTXAYB", 4),
        ("",       "abc",     0),
        ("abc",    "",        0),
    ];
    for (s1, s2, expected) in cases {
        let got  = lcs(s1, s2);
        let mark = if got == *expected { "✓" } else { "✗" };
        println!("{mark} lcs({:?}, {:?}) = {}  (expected {})", s1, s2, got, expected);
    }
}
#include <algorithm>
#include <iostream>
#include <string>
#include <vector>

int lcs(const std::string& s1, const std::string& s2) {
    int m = (int)s1.size(), n = (int)s2.size();
    std::vector<int> prev(n + 1, 0), cur(n + 1, 0);
    for (int i = 1; i <= m; ++i) {
        for (int j = 1; j <= n; ++j) {
            if (s1[i - 1] == s2[j - 1]) cur[j] = prev[j - 1] + 1;
            else                          cur[j] = std::max(prev[j], cur[j - 1]);
        }
        std::swap(prev, cur);
    }
    return prev[n];
}

int main() {
    struct Case { std::string s1, s2; int expected; };
    std::vector<Case> cases = {
        {"abcde",  "ace",     3},
        {"abc",    "abc",     3},
        {"abc",    "def",     0},
        {"AGGTAB", "GXTXAYB", 4},
        {"",       "abc",     0},
        {"abc",    "",        0},
    };
    for (auto& c : cases) {
        int got = lcs(c.s1, c.s2);
        std::cout << (got == c.expected ? "✓" : "✗")
                  << " lcs(\"" << c.s1 << "\", \"" << c.s2 << "\") = " << got
                  << "  (expected " << c.expected << ")\n";
    }
}
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

static int imax(int a, int b) { return a > b ? a : b; }

int lcs(const char* s1, const char* s2) {
    int m = (int)strlen(s1), n = (int)strlen(s2);
    int* prev = calloc(n + 1, sizeof(int));
    int* cur  = calloc(n + 1, sizeof(int));
    for (int i = 1; i <= m; i++) {
        for (int j = 1; j <= n; j++) {
            if (s1[i - 1] == s2[j - 1]) cur[j] = prev[j - 1] + 1;
            else                         cur[j] = imax(prev[j], cur[j - 1]);
        }
        int* tmp = prev; prev = cur; cur = tmp;     // swap row buffers
        memset(cur, 0, (n + 1) * sizeof(int));
    }
    int ans = prev[n];
    free(prev); free(cur);
    return ans;
}

static void show(const char* s1, const char* s2, int expected) {
    int got = lcs(s1, s2);
    printf("%s lcs(\"%s\", \"%s\") = %d  (expected %d)\n",
           got == expected ? "✓" : "✗", s1, s2, got, expected);
}

int main(void) {
    show("abcde",  "ace",     3);
    show("abc",    "abc",     3);
    show("abc",    "def",     0);
    show("AGGTAB", "GXTXAYB", 4);
    show("",       "abc",     0);
    show("abc",    "",        0);
    return 0;
}

The rolling-row trick is what makes the space optimization work. A naïve two-dimensional dp[m+1][n+1] array uses O(m·n) space; the two-row rolling version uses O(n) — a savings that matters for million-character inputs (e.g. comparing two source files).

Complexity

Time
O(m · n) — fill an (m+1) × (n+1) table; each cell does O(1) work.
Space (full table)
O(m · n)
Space (rolling rows)
O(min(m, n)) — keep two rows of the shorter dimension.
Lower bound
The "no asymptotic improvement under SETH" result (Bringmann–Künnemann 2015) suggests O(m·n) is roughly optimal for arbitrary alphabets. For practical inputs, Hunt-Szymanski gives O((r + n) log n) where r is the # of matching pairs — fast when strings differ a lot.

In the wild

Variations