H3 Parent Rollup: Multi-Scale Aggregation Without Re-aggregating

H3 hexagons have a property that makes them uniquely suited to multi-scale spatial analytics: a strict containment hierarchy. Every resolution-9 cell has exactly one parent at resolution 8, which has exactly one parent at resolution 7, and so on up to resolution 0. Seven children per parent at each level.

This means that if you aggregate a dataset at resolution 9 — city-block scale — you can roll up to resolution 8 (neighbourhood), resolution 7 (suburb), or resolution 5 (district) by simply summing the child values. No re-reading source data. No re-scanning records. Just a group-by-parent aggregation on the already-aggregated values.

For datasets that need to be queryable at multiple zoom levels — a traffic heatmap, a disease incidence map, a property value surface — this hierarchy allows you to build a complete multi-resolution data cube in a single pass.

# H3 Resolution Reference

Resolution	Avg Cell Area	Typical Use
3	825 km²	State/region level
4	118 km²	Large area analytics
5	16.8 km²	Sub-regional
6	2.4 km²	District/suburb
7	0.34 km²	Neighbourhood
8	0.049 km²	~7 city blocks
9	0.007 km²	~1 city block
10	~900 m²	Individual property

# The Rollup Pattern

import h3
import pandas as pd
import numpy as np

# ── Simulate: 5 million delivery events with location and value ───────────────
rng = np.random.default_rng(42)
n = 5_000_000

# Random points within Sydney metro bounds
lats = rng.uniform(-34.1, -33.6, n)
lons = rng.uniform(150.7, 151.4, n)
delivery_value = rng.exponential(scale=80, size=n)  # $ value

# ── Step 1: Assign each event to resolution-10 H3 cell ───────────────────────
import time
t0 = time.perf_counter()

# h3.latlng_to_cell is vectorised via list comprehension or pandas apply
# For large datasets, use the numpy-vectorised form:
h3_res10 = pd.array([
    h3.latlng_to_cell(lat, lon, 10)
    for lat, lon in zip(lats, lons)
])
print(f"H3 index assignment: {time.perf_counter() - t0:.1f}s")
# → ~18s (pure Python loop — see optimisation below)

df = pd.DataFrame({
    'h3_10': h3_res10,
    'value': delivery_value
})

# ── Step 2: Aggregate at resolution 10 ────────────────────────────────────────
t0 = time.perf_counter()
agg_10 = df.groupby('h3_10').agg(
    count=('value', 'count'),
    total_value=('value', 'sum'),
    mean_value=('value', 'mean')
).reset_index()
print(f"Resolution-10 aggregation: {time.perf_counter() - t0:.3f}s, {len(agg_10):,} cells")
# → 0.8s, 142,847 cells

# ── Step 3: Roll up to coarser resolutions ────────────────────────────────────
def rollup_to_resolution(df_fine: pd.DataFrame, h3_col: str, target_res: int) -> pd.DataFrame:
    """Roll up H3 aggregates from the current resolution to target_res."""
    df = df_fine.copy()
    df['h3_parent'] = df[h3_col].apply(lambda c: h3.cell_to_parent(c, target_res))
    return df.groupby('h3_parent').agg(
        count=('count', 'sum'),
        total_value=('total_value', 'sum'),
        mean_value=('total_value', 'sum')   # recompute mean from totals
    ).assign(
        mean_value=lambda x: x['total_value'] / x['count']
    ).reset_index().rename(columns={'h3_parent': f'h3_{target_res}'})

t0 = time.perf_counter()
agg_9 = rollup_to_resolution(agg_10, 'h3_10', 9)
agg_8 = rollup_to_resolution(agg_9,  'h3_9',  8)
agg_7 = rollup_to_resolution(agg_8,  'h3_8',  7)
agg_6 = rollup_to_resolution(agg_7,  'h3_7',  6)
agg_5 = rollup_to_resolution(agg_6,  'h3_6',  5)
print(f"Rollup to resolutions 9→5: {time.perf_counter() - t0:.3f}s")
# → 0.09s total

print(f"Cells at each resolution:")
for name, df in [('res10', agg_10), ('res9', agg_9), ('res8', agg_8),
                  ('res7', agg_7), ('res6', agg_6), ('res5', agg_5)]:
    print(f"  {name}: {len(df):,}")
# res10: 142,847
# res9:  21,036
# res8:  3,045
# res7:  446
# res6:  66
# res5:  10

The entire rollup chain — from 142,847 resolution-10 cells to 10 resolution-5 cells — completes in 90 milliseconds. No re-reading source data. No re-scanning 5 million records.

# Fast H3 Indexing with Vectorised Lookup

The bottleneck in the example above is h3.latlng_to_cell() inside a Python loop — 18 seconds for 5 million points. The h3 library (version 4+) provides a vectorised form:

import h3
import numpy as np

# h3 4.x: pass arrays directly
h3_cells = h3.latlng_to_cell(lats, lons, resolution=10)
# Returns a numpy array of H3 cell IDs

For even higher throughput with older h3 versions, use h3pandas or the h3ronpy Rust extension:

# Using h3pandas
import h3pandas
import pandas as pd

df = pd.DataFrame({'lat': lats, 'lng': lons, 'value': delivery_value})
df = df.h3.geo_to_h3(10, lat_col='lat', lng_col='lng')
# Adds 'h3_10' column, vectorised implementation

With h3 4.x native array API: 5 million points indexed in approximately 0.6 seconds vs 18 seconds for the Python loop — 30× faster.

# Building a Multi-Resolution Data Cube

For production use, store all resolution levels together as a dictionary of GeoParquet files or as a single Parquet file with a resolution column:

import geopandas as gpd
import shapely

def h3_cells_to_geometries(h3_series: pd.Series) -> gpd.GeoDataFrame:
    """Convert H3 cell IDs to polygon GeoDataFrame."""
    geometries = []
    for cell_id in h3_series:
        boundary = h3.cell_to_boundary(cell_id)
        # boundary: list of (lat, lon) pairs
        geom = shapely.polygons(
            [[lon, lat] for lat, lon in boundary]   # H3 returns lat/lon, Shapely wants lon/lat
        )
        geometries.append(geom)
    return gpd.GeoSeries(geometries, crs="EPSG:4326")

# Build GeoParquet for each resolution level
resolutions = {
    'res10': agg_10.rename(columns={'h3_10': 'h3_id'}),
    'res9':  agg_9.rename(columns={'h3_9': 'h3_id'}),
    'res8':  agg_8.rename(columns={'h3_8': 'h3_id'}),
    'res7':  agg_7.rename(columns={'h3_7': 'h3_id'}),
    'res6':  agg_6.rename(columns={'h3_6': 'h3_id'}),
    'res5':  agg_5.rename(columns={'h3_5': 'h3_id'}),
}

for res_name, agg_df in resolutions.items():
    gdf = gpd.GeoDataFrame(
        agg_df,
        geometry=h3_cells_to_geometries(agg_df['h3_id']),
        crs="EPSG:4326"
    )
    gdf.to_parquet(f"delivery_heatmap_{res_name}.parquet", compression="zstd")
    print(f"Written {res_name}: {len(gdf):,} cells")

# Serving Multi-Resolution Data in an API

With the data cube written to GeoParquet, a FastAPI endpoint can return the appropriate resolution level based on the map zoom:

from fastapi import FastAPI
from fastapi.responses import JSONResponse
import geopandas as gpd
from shapely.geometry import box

app = FastAPI()

# Load all resolution levels at startup
RESOLUTION_FILES = {
    3: "delivery_heatmap_res5.parquet",
    5: "delivery_heatmap_res6.parquet",
    7: "delivery_heatmap_res7.parquet",
    9: "delivery_heatmap_res8.parquet",
    11: "delivery_heatmap_res9.parquet",
    13: "delivery_heatmap_res10.parquet",
}

@app.get("/heatmap")
async def get_heatmap(
    xmin: float, ymin: float, xmax: float, ymax: float,
    zoom: int = 10
):
    # Select resolution based on zoom level
    res_file = min(RESOLUTION_FILES.items(), key=lambda kv: abs(kv[0] - zoom))[1]

    # Read only cells in the viewport
    gdf = gpd.read_parquet(res_file, bbox=(xmin, ymin, xmax, ymax))

    return JSONResponse(gdf.to_json())

This pattern delivers pre-aggregated data at the appropriate spatial scale for every zoom level without any server-side computation at request time.

# Incremental Updates

A key advantage of the parent-rollup approach: when new data arrives, you only need to recompute the affected resolution-10 cells, then re-roll them up:

def update_cells(new_events_df, existing_agg_10):
    """Incrementally update aggregates when new events arrive."""
    # Assign H3 cells to new events
    new_events_df['h3_10'] = h3.latlng_to_cell(
        new_events_df['lat'].values,
        new_events_df['lng'].values,
        10
    )

    # Aggregate new events at resolution 10
    new_agg = new_events_df.groupby('h3_10').agg(
        count=('value', 'count'),
        total_value=('value', 'sum')
    ).reset_index()

    # Merge with existing: add counts and totals for matching cells
    merged = existing_agg_10.set_index('h3_10').add(
        new_agg.set_index('h3_10'), fill_value=0
    ).reset_index()
    merged['mean_value'] = merged['total_value'] / merged['count']

    return merged

Incremental updates mean a live dashboard can maintain near-real-time aggregates without reprocessing the entire dataset on every event.