practice · hard · from Heaps & Priority Queues

Median of a Data Stream

time
O(log n) per insert, O(1) per query
space
O(n)
tags
heaps · two-heaps · streaming

in the wild Latency percentiles in a streaming monitoring system, online sensor median (robust to outlier spikes), real-time score median for fairness.

Two heaps · running median

step 0 / 0

Problem

Implement a MedianFinder class with two methods:

The median of a sorted list is the middle element if the count is odd, or the average of the two middle elements if even.

Examples

add_num(1)        find_median() → 1.0
add_num(2)        find_median() → 1.5     ((1 + 2) / 2)
add_num(3)        find_median() → 2.0
add_num(0)        find_median() → 1.5     ((1 + 2) / 2)
add_num(5)        find_median() → 2.0

Why this is the canonical "two-heap trick" problem

Storing the stream sorted is O(n) per insert. A balanced BST gives O(log n) insert + O(log n) lookup, but the two-heap trick is cleaner and faster:

Split the stream into two halves of (nearly) equal size:

Maintain two invariants on every insert:

  1. Order: every value in lo ≤ every value in hi. (Equivalent to: lo.top() ≤ hi.top().)
  2. Size balance: |lo.size − hi.size| ≤ 1. Convention: lo may have one more than hi, never the other way.

Then the median is lo.top() (odd total) or the average of the two tops (even total). Each insert does one push + zero-or-one cross-heap transfer — O(log n). The query is O(1).

This is one of the most elegant "use two data structures together to get something neither could give alone" patterns. Once you see it, you spot it everywhere — load balancers maintaining "lightly-loaded" and "heavily-loaded" pools, sliding-window medians, dual-priority job queues.

Hints

Hint 1 — what to maintain instead of "sorted"

Storing every element sorted is overkill — to read the median you only need one or two specific positions (the middle). What if you kept just enough structure to access those positions in O(log n)?

Hint 2 — two halves

If you split your stream into a "smaller half" and a "larger half", and keep both halves at nearly the same size, the median is at the boundary between them. What data structures give you fast access to the largest of the smaller half and the smallest of the larger half?

Hint 3 — the dance
add(x):
    if lo is empty or x <= lo.top(): lo.push(x)
    else:                              hi.push(x)
    # Rebalance — keep |lo| in {|hi|, |hi|+1}
    if   lo.size > hi.size + 1: hi.push(lo.pop())
    elif hi.size > lo.size:     lo.push(hi.pop())

median():
    if lo.size > hi.size: return lo.top()
    else:                  return (lo.top() + hi.top()) / 2

Solution

import heapq

class MedianFinder:
    def __init__(self) -> None:
        # heapq is a min-heap; lo holds negated values to act as a max-heap.
        self.lo: list[int] = []   # max-heap (smaller half, negated)
        self.hi: list[int] = []   # min-heap (larger half)

    def add_num(self, x: int) -> None:
        # Always push to lo first (negated); spill smallest of lo to hi.
        heapq.heappush(self.lo, -x)
        heapq.heappush(self.hi, -heapq.heappop(self.lo))
        # Rebalance — convention: lo may have one more than hi.
        if len(self.hi) > len(self.lo):
            heapq.heappush(self.lo, -heapq.heappop(self.hi))

    def find_median(self) -> float:
        if len(self.lo) > len(self.hi):
            return float(-self.lo[0])
        return (-self.lo[0] + self.hi[0]) / 2.0

# Walk the example
mf = MedianFinder()
expected = [1.0, 1.5, 2.0, 1.5, 2.0]
for x, exp in zip([1, 2, 3, 0, 5], expected):
    mf.add_num(x)
    got  = mf.find_median()
    mark = "✓" if got == exp else "✗"
    print(f"{mark} add_num({x}); find_median() = {got}  (expected {exp})")
// Generic binary heap with a custom comparator.
class Heap {
  constructor(cmp) { this.h = []; this.cmp = cmp; }
  size() { return this.h.length; }
  top()  { return this.h[0]; }
  push(v) {
    this.h.push(v);
    let i = this.h.length - 1;
    while (i > 0) {
      const p = (i - 1) >> 1;
      if (this.cmp(this.h[p], this.h[i]) <= 0) break;
      [this.h[p], this.h[i]] = [this.h[i], this.h[p]]; i = p;
    }
  }
  pop() {
    const top = this.h[0], last = this.h.pop();
    if (this.h.length === 0) return top;
    this.h[0] = last;
    const n = this.h.length;
    for (let i = 0;;) {
      let l = 2*i+1, r = 2*i+2, m = i;
      if (l < n && this.cmp(this.h[l], this.h[m]) < 0) m = l;
      if (r < n && this.cmp(this.h[r], this.h[m]) < 0) m = r;
      if (m === i) break;
      [this.h[m], this.h[i]] = [this.h[i], this.h[m]]; i = m;
    }
    return top;
  }
}

class MedianFinder {
  constructor() {
    this.lo = new Heap((a, b) => b - a);   // max-heap
    this.hi = new Heap((a, b) => a - b);   // min-heap
  }
  addNum(x) {
    this.lo.push(x);
    this.hi.push(this.lo.pop());
    if (this.hi.size() > this.lo.size()) this.lo.push(this.hi.pop());
  }
  findMedian() {
    if (this.lo.size() > this.hi.size()) return this.lo.top();
    return (this.lo.top() + this.hi.top()) / 2;
  }
}

const mf       = new MedianFinder();
const expected = [1.0, 1.5, 2.0, 1.5, 2.0];
for (const [x, exp] of [[1, 1.0], [2, 1.5], [3, 2.0], [0, 1.5], [5, 2.0]]) {
  mf.addNum(x);
  const got = mf.findMedian();
  console.log(`${got === exp ? "✓" : "✗"} addNum(${x}); findMedian() = ${got}  (expected ${exp})`);
}
use std::cmp::Reverse;
use std::collections::BinaryHeap;

pub struct MedianFinder {
    lo: BinaryHeap<i32>,             // max-heap by default
    hi: BinaryHeap<Reverse<i32>>,    // min-heap via Reverse wrapper
}

impl MedianFinder {
    pub fn new() -> Self {
        Self { lo: BinaryHeap::new(), hi: BinaryHeap::new() }
    }
    pub fn add_num(&mut self, x: i32) {
        self.lo.push(x);
        let to_hi = self.lo.pop().unwrap();
        self.hi.push(Reverse(to_hi));
        if self.hi.len() > self.lo.len() {
            let Reverse(back) = self.hi.pop().unwrap();
            self.lo.push(back);
        }
    }
    pub fn find_median(&self) -> f64 {
        if self.lo.len() > self.hi.len() {
            *self.lo.peek().unwrap() as f64
        } else {
            let lo = *self.lo.peek().unwrap() as f64;
            let Reverse(hi) = *self.hi.peek().unwrap();
            (lo + hi as f64) / 2.0
        }
    }
}

fn main() {
    let mut mf = MedianFinder::new();
    let cases = [(1, 1.0), (2, 1.5), (3, 2.0), (0, 1.5), (5, 2.0)];
    for (x, exp) in cases {
        mf.add_num(x);
        let got  = mf.find_median();
        let mark = if (got - exp).abs() < 1e-9 { "✓" } else { "✗" };
        println!("{mark} add_num({}); find_median() = {}  (expected {})", x, got, exp);
    }
}
#include <iostream>
#include <queue>
#include <vector>

class MedianFinder {
    std::priority_queue<int>                                          lo;   // max-heap
    std::priority_queue<int, std::vector<int>, std::greater<int>>     hi;   // min-heap
public:
    void add_num(int x) {
        lo.push(x);
        hi.push(lo.top()); lo.pop();
        if (hi.size() > lo.size()) { lo.push(hi.top()); hi.pop(); }
    }
    double find_median() const {
        if (lo.size() > hi.size()) return (double)lo.top();
        return (lo.top() + hi.top()) / 2.0;
    }
};

int main() {
    MedianFinder mf;
    std::pair<int, double> cases[] = { {1, 1.0}, {2, 1.5}, {3, 2.0}, {0, 1.5}, {5, 2.0} };
    for (auto& [x, exp] : cases) {
        mf.add_num(x);
        double got = mf.find_median();
        std::cout << (got == exp ? "✓" : "✗")
                  << " add_num(" << x << "); find_median() = " << got
                  << "  (expected " << exp << ")\n";
    }
}
// Two hand-rolled heaps: lo is a max-heap, hi is a min-heap.
#include <stdio.h>
#include <stdlib.h>

#define CAP 1024
static int lo[CAP], hi[CAP];
static int lo_n = 0, hi_n = 0;

// --- max-heap on lo ---
static void lo_up(int i) {
    while (i > 0) {
        int p = (i - 1) / 2;
        if (lo[p] >= lo[i]) break;
        int t = lo[p]; lo[p] = lo[i]; lo[i] = t; i = p;
    }
}
static void lo_down(int i) {
    while (1) {
        int l = 2*i+1, r = 2*i+2, m = i;
        if (l < lo_n && lo[l] > lo[m]) m = l;
        if (r < lo_n && lo[r] > lo[m]) m = r;
        if (m == i) break;
        int t = lo[m]; lo[m] = lo[i]; lo[i] = t; i = m;
    }
}
static void lo_push(int v) { lo[lo_n++] = v; lo_up(lo_n - 1); }
static int  lo_pop(void)   { int t = lo[0]; lo[0] = lo[--lo_n]; if (lo_n) lo_down(0); return t; }

// --- min-heap on hi ---
static void hi_up(int i) {
    while (i > 0) {
        int p = (i - 1) / 2;
        if (hi[p] <= hi[i]) break;
        int t = hi[p]; hi[p] = hi[i]; hi[i] = t; i = p;
    }
}
static void hi_down(int i) {
    while (1) {
        int l = 2*i+1, r = 2*i+2, m = i;
        if (l < hi_n && hi[l] < hi[m]) m = l;
        if (r < hi_n && hi[r] < hi[m]) m = r;
        if (m == i) break;
        int t = hi[m]; hi[m] = hi[i]; hi[i] = t; i = m;
    }
}
static void hi_push(int v) { hi[hi_n++] = v; hi_up(hi_n - 1); }
static int  hi_pop(void)   { int t = hi[0]; hi[0] = hi[--hi_n]; if (hi_n) hi_down(0); return t; }

void add_num(int x) {
    lo_push(x);
    hi_push(lo_pop());
    if (hi_n > lo_n) lo_push(hi_pop());
}
double find_median(void) {
    if (lo_n > hi_n) return (double)lo[0];
    return ((double)lo[0] + (double)hi[0]) / 2.0;
}

int main(void) {
    int    xs []  = {1, 2, 3, 0, 5};
    double exp[] = {1.0, 1.5, 2.0, 1.5, 2.0};
    for (int i = 0; i < 5; i++) {
        add_num(xs[i]);
        double got = find_median();
        printf("%s add_num(%d); find_median() = %.1f  (expected %.1f)\n",
               got == exp[i] ? "✓" : "✗", xs[i], got, exp[i]);
    }
    return 0;
}

In every language the structure is identical: push into lo, spill its top to hi (this guarantees the order invariant), then rebalance by size. Rust and C++ both have native priority queues; Python uses negation to fake a max-heap; JS and C hand-roll. The algorithm is exactly five lines no matter which.

Complexity

Insert
O(log n) — at most three heap operations per insert.
Query
O(1) — just read one or two heap tops.
Space
O(n) — every value seen so far is stored across the two heaps.
vs balanced BST
An order-statistic balanced BST gives the same big-O, but with worse constant factors and a much larger implementation. Use the two-heap trick whenever you only need the median (or one fixed percentile).

In the wild

Variations