A single GeoParquet file of 50 million rows is readable — GeoPandas can load it in under two minutes — but a single file is fundamentally sequential. When you need to process a subset of the data (say, all records within a specific state or bounding box), you must at minimum scan all row groups to identify which ones overlap your region of interest, even if you ultimately only read a fraction of the data.
Spatial partitioning addresses this by splitting a large dataset into multiple files, each covering a distinct geographic region. Query patterns that access a small geographic area then read only the relevant partition files, potentially accessing 1–5% of the total data rather than scanning everything.
# Partitioning Strategies
# Strategy 1: Regular Grid Partitioning
The simplest approach: divide the bounding box into an N×M grid and write one file per cell.
import geopandas as gpd
import numpy as np
import os
def partition_by_grid(gdf: gpd.GeoDataFrame, output_dir: str, n_cols: int = 10, n_rows: int = 10):
"""
Partition a GeoDataFrame by a regular bounding-box grid.
Each feature goes into the partition containing its centroid.
"""
os.makedirs(output_dir, exist_ok=True)
bounds = gdf.total_bounds # [xmin, ymin, xmax, ymax]
x_edges = np.linspace(bounds[0], bounds[2], n_cols + 1)
y_edges = np.linspace(bounds[1], bounds[3], n_rows + 1)
centroids = gdf.geometry.centroid
cx = centroids.x.values
cy = centroids.y.values
# Assign each feature to a grid cell
col_idx = np.clip(np.searchsorted(x_edges, cx, side='right') - 1, 0, n_cols - 1)
row_idx = np.clip(np.searchsorted(y_edges, cy, side='right') - 1, 0, n_rows - 1)
partition_id = row_idx * n_cols + col_idx
# Write one file per non-empty partition
for pid in np.unique(partition_id):
mask = partition_id == pid
subset = gdf[mask]
r = pid // n_cols
c = pid % n_cols
subset.to_parquet(
f"{output_dir}/r{r:03d}_c{c:03d}.parquet",
compression="zstd"
)
print(f" Partition r{r:03d}_c{c:03d}: {mask.sum():,} features")
# Usage
gdf = gpd.read_parquet("australia_buildings.parquet")
partition_by_grid(gdf, "australia_buildings_partitioned/", n_cols=10, n_rows=10)
On a 50M row dataset covering Australia, this creates up to 100 partition files. A query over Queensland (roughly 5 cells) reads ~5% of the data.
# Strategy 2: H3 Cell Partitioning
H3 hexagonal cells provide natural spatial partitioning that respects geographic density more intelligently than a regular grid. High-density urban areas can use finer resolution; sparse rural areas can use coarser cells.
import geopandas as gpd
import h3
import numpy as np
import shapely
import os
def partition_by_h3(gdf: gpd.GeoDataFrame, output_dir: str, resolution: int = 3):
"""
Partition by H3 cell at the given resolution.
Resolution 3: ~825km² cells (manageable file count for large datasets)
Resolution 4: ~105km² cells (good balance for continent-scale)
"""
os.makedirs(output_dir, exist_ok=True)
# Assign H3 index to each feature's centroid
centroids = gdf.geometry.centroid
cx = shapely.get_x(centroids.values)
cy = shapely.get_y(centroids.values)
h3_indices = np.array([
h3.latlng_to_cell(lat, lon, resolution)
for lat, lon in zip(cy, cx)
])
gdf = gdf.copy()
gdf['_h3_partition'] = h3_indices
# Write one file per H3 cell
for cell_id in np.unique(h3_indices):
subset = gdf[gdf['_h3_partition'] == cell_id].drop(columns=['_h3_partition'])
subset.to_parquet(
f"{output_dir}/{cell_id}.parquet",
compression="zstd"
)
print(f"Written {len(np.unique(h3_indices))} partition files")
Why H3 partitioning is powerful: H3’s hierarchy means a query for a large region can be expressed as a set of H3 cell IDs at a coarser resolution. All partition files whose names correspond to children of those cells are the relevant partitions — no bounding box intersection check required.
def query_h3_region(query_cells_res3: list, data_dir: str, res_data: int = 4):
"""
Read all partitions that overlap a set of resolution-3 H3 cells.
Data is partitioned at resolution 4.
"""
# Get all resolution-4 children of the query cells
target_cells = set()
for cell in query_cells_res3:
# h3.cell_to_children returns all finer-resolution children
target_cells.update(h3.cell_to_children(cell, res_data))
# Read only the matching partition files
gdfs = []
for cell in target_cells:
path = f"{data_dir}/{cell}.parquet"
if os.path.exists(path):
gdfs.append(gpd.read_parquet(path))
if gdfs:
return gpd.pd.concat(gdfs, ignore_index=True)
return gpd.GeoDataFrame()
# Strategy 3: Administrative Region Partitioning
For datasets with natural administrative breaks (state, LGA, suburb), partitioning by those boundaries is the most query-friendly:
def partition_by_admin_region(gdf, admin_boundaries, region_col, output_dir):
"""Spatial join to assign features to admin regions, then partition."""
import os
os.makedirs(output_dir, exist_ok=True)
# Spatial join to get admin region for each feature
joined = gpd.sjoin(gdf, admin_boundaries[[region_col, 'geometry']], how='left', predicate='within')
for region_id in joined[region_col].unique():
subset = joined[joined[region_col] == region_id].drop(
columns=[region_col, 'index_right']
)
safe_name = str(region_id).replace('/', '_').replace(' ', '_')
subset.to_parquet(f"{output_dir}/{safe_name}.parquet", compression="zstd")
# Querying Partitioned Data with DuckDB
DuckDB’s glob support makes querying partitioned GeoParquet datasets straightforward:
import duckdb
con = duckdb.connect()
con.install_extension("spatial")
con.load_extension("spatial")
# Query all partitions in a directory
result = con.execute("""
SELECT
zone_type,
COUNT(*) AS feature_count,
SUM(land_area_m2) AS total_area
FROM read_parquet('australia_buildings_partitioned/*.parquet')
GROUP BY zone_type
ORDER BY feature_count DESC
""").df()
# Query a specific subset of partition files (only Queensland cells)
qld_files = ["australia_buildings_partitioned/r03_c07.parquet",
"australia_buildings_partitioned/r03_c08.parquet",
"australia_buildings_partitioned/r04_c07.parquet"]
result_qld = con.execute(f"""
SELECT COUNT(*), AVG(land_value)
FROM read_parquet({qld_files})
""").df()
DuckDB reads partition files in parallel automatically, using one thread per file up to the available core count.
# Dask-GeoPandas with Partitioned Files
Dask-GeoPandas can read partitioned GeoParquet datasets lazily, processing each partition in parallel:
import dask_geopandas
import dask
import glob
# Create a Dask GeoDataFrame from multiple partition files
partition_files = sorted(glob.glob("australia_buildings_partitioned/*.parquet"))
ddf = dask_geopandas.from_geopandas(
gpd.GeoDataFrame(), # empty template
npartitions=1
)
# More direct: read each file into a delayed GeoDataFrame
delayed_parts = [
dask.delayed(gpd.read_parquet)(f) for f in partition_files
]
ddf = dask_geopandas.from_delayed(delayed_parts)
# Operations execute in parallel across partition files
area_by_zone = ddf.groupby('zone_type')['land_area_m2'].sum().compute()
# Partition Size Guidelines
The right partition size depends on your access pattern:
| Use case | Target partition size | Recommended strategy |
|---|---|---|
| Single-machine analytics | 50–200MB compressed | Regular grid or H3 res 3–4 |
| Spark / distributed | 128–512MB compressed | H3 or admin region |
| Web tile serving | 1–10MB compressed | H3 res 5–6 |
| Point queries | <1MB (many files) | H3 res 6–7 |
Too many small files creates metadata overhead. Too few large files reduces parallelism benefit. The sweet spot for most single-machine Python workflows is 50–200 compressed MB per partition file.
# Metadata Sidecar Files
For large partitioned datasets, maintain a sidecar metadata file that maps partition identifiers to bounding boxes and row counts. This allows callers to identify relevant partition files without scanning the filesystem:
import json
import geopandas as gpd
import glob
def build_partition_index(partitioned_dir: str, output_path: str):
"""Build a JSON index of partition files with bbox and row count."""
index = []
for path in sorted(glob.glob(f"{partitioned_dir}/*.parquet")):
gdf = gpd.read_parquet(path, columns=["geometry"])
bounds = gdf.total_bounds.tolist()
index.append({
"file": os.path.basename(path),
"row_count": len(gdf),
"bbox": {"xmin": bounds[0], "ymin": bounds[1],
"xmax": bounds[2], "ymax": bounds[3]}
})
with open(output_path, "w") as f:
json.dump(index, f, indent=2)
print(f"Index written: {len(index)} partitions")
Callers load the index (a few KB), intersect their query bbox against partition bboxes, and read only the matching files — without any filesystem glob or directory scan.
Related reading: GeoParquet: The Analytical Format for Vector Geodata · DuckDB Spatial Analytics on GeoParquet · H3 as a Spatial Join Accelerator