|
7 | 7 | "# Embedding Similarity Search with Rasteret\n", |
8 | 8 | "\n", |
9 | 9 | "Find grain silos across Franklin County, Kansas using\n", |
10 | | - "[AlphaEarth Foundation (AEF)](https://source.coop/tge-labs/aef) satellite\n", |
11 | | - "embeddings — 64-band int8 COGs at 10 m resolution, derived from a\n", |
12 | | - "foundation model trained on Sentinel-2 imagery.\n", |
| 10 | + "[AlphaEarth Foundations Satellite Embedding dataset (AEF)](https://source.coop/tge-labs/aef)\n", |
| 11 | + "produced by Google and Google DeepMind — 64-band int8 COGs at 10 m resolution.\n", |
13 | 12 | "\n", |
14 | | - "This replicates the\n", |
15 | | - "[GeoPython Tutorials similarity search](https://www.geopythontutorials.com/notebooks/xarray_embeddings_similarity_search.html)\n", |
16 | | - "using Rasteret instead of aef-loader + Dask. Three approaches use\n", |
17 | | - "different Rasteret APIs — all reading from the same prebuilt collection.\n", |
| 13 | + "This workflow is inspired by:\n", |
| 14 | + "- Ujaval Gandhi’s GeoPython Tutorials post:\n", |
| 15 | + " [Large-Scale Embedding Similarity Search with xarray and Dask](https://www.geopythontutorials.com/notebooks/xarray_embeddings_similarity_search.html)\n", |
| 16 | + "- Google Earth Engine community tutorial:\n", |
| 17 | + " [Satellite Embedding Similarity Search](https://developers.google.com/earth-engine/tutorials/community/satellite-embedding-05-similarity-search)\n", |
| 18 | + "\n", |
| 19 | + "Data credits:\n", |
| 20 | + "- County boundary data: **US Census Bureau, 2021 Cartographic Boundary Files**.\n", |
| 21 | + "- Embeddings: **AlphaEarth Foundations Satellite Embedding dataset (produced by Google and Google DeepMind)**.\n", |
18 | 22 | "\n", |
19 | 23 | "| Step | Rasteret API | What it does |\n", |
20 | 24 | "|---|---|---|\n", |
21 | 25 | "| Reference vector | `sample_points` | Extract embeddings at known grain-silo locations |\n", |
22 | | - "| Approach A | `get_xarray` | Dense mosaic, band-wise cosine similarity |\n", |
23 | | - "| Approach B | `get_gdf` | Per-record arrays, vectorized matmul cosine |\n", |
24 | | - "| Approach C | `to_torchgeo_dataset` | Streaming 1024 px chips, bounded memory |" |
| 26 | + "| Approach A | `get_xarray` | Dense mosaic, cosine similarity |\n", |
| 27 | + "| Approach B | `get_gdf` | Per-record arrays, cosine similarity |\n", |
| 28 | + "| Approach C | `to_torchgeo_dataset` | Streaming 1024 px chips, bounded memory |\n", |
| 29 | + "\n", |
| 30 | + "\n", |
| 31 | + "Attribution:\n", |
| 32 | + "\n", |
| 33 | + "> \"The AlphaEarth Foundations Satellite Embedding dataset is produced by Google and Google DeepMind.\"\n" |
25 | 34 | ] |
26 | 35 | }, |
27 | 36 | { |
|
58 | 67 | "import duckdb\n", |
59 | 68 | "import geopandas as gpd\n", |
60 | 69 | "import numpy as np\n", |
61 | | - "import torch\n", |
62 | 70 | "from shapely.geometry import Point\n", |
| 71 | + "from sklearn.metrics.pairwise import cosine_similarity\n", |
63 | 72 | "\n", |
64 | 73 | "import rasteret\n", |
65 | 74 | "\n", |
|
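The rewritten `cosine_similarity_map` in the next hunk calls a `dequantize` helper that is defined elsewhere in the notebook and never appears in this diff. A minimal sketch of what that helper is assumed to do, reconstructed from the inline math the removed code used (`(v / 127.5) ** 2 * np.sign(v)`); the `NODATA` sentinel of `-128` is an assumption, not confirmed by this diff:

```python
import numpy as np

NODATA = -128  # assumed int8 nodata sentinel; the notebook defines its own NODATA constant


def dequantize(q: np.ndarray) -> np.ndarray:
    """Map raw AEF int8 embedding values to floats, NaN where nodata.

    Mirrors the squared-magnitude mapping the removed inline code used:
    (v / 127.5) ** 2 * sign(v), with nodata pixels set to NaN so that
    downstream np.isfinite masks can find them.
    """
    qf = q.astype(np.float32)
    out = (qf / 127.5) ** 2 * np.sign(qf)
    out[qf == NODATA] = np.nan
    return out
```

This keeps the sign of each band while compressing magnitudes into roughly [-1, 1], matching the dequantization the old in-place code performed before computing norms.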
85 | 94 | "\n", |
86 | 95 | "\n", |
87 | 96 | "def cosine_similarity_map(cube: np.ndarray, ref: np.ndarray) -> np.ndarray:\n", |
88 | | - " \"\"\"Cosine similarity between each pixel and a reference vector.\n", |
| 97 | + "    \"\"\"Cosine similarity between each pixel and a reference embedding, via sklearn.\n", |
89 | 98 | "\n", |
90 | 99 | " cube : (C, H, W) int8 array — raw AEF embeddings\n", |
91 | 100 | " ref : (C,) float32 reference embedding (already dequantized)\n", |
92 | 101 | "\n", |
93 | | - " Returns (H, W) float32 array, NaN where any band is nodata.\n", |
| 102 | + "    Returns (H, W) float32 array; NaN where any band is nodata or the pixel's embedding is all-zero.\n", |
94 | 103 | " \"\"\"\n", |
95 | 104 | " C, H, W = cube.shape\n", |
96 | | - " flat = cube.reshape(C, -1).astype(np.float32)\n", |
97 | | - "\n", |
98 | | - " nd = (flat == NODATA) | np.isnan(flat)\n", |
99 | | - " d = (flat / 127.5) ** 2 * np.sign(flat)\n", |
100 | | - " d[nd] = 0.0\n", |
101 | | - " valid = ~nd.any(axis=0)\n", |
102 | | - "\n", |
103 | | - " dot = ref @ d\n", |
104 | | - " norms = np.linalg.norm(d, axis=0)\n", |
105 | | - " ref_norm = np.linalg.norm(ref)\n", |
106 | | - "\n", |
107 | | - " sim = np.full(H * W, np.nan, dtype=np.float32)\n", |
108 | | - " ok = valid & (norms > 0)\n", |
109 | | - " sim[ok] = (dot[ok] / (norms[ok] * ref_norm)).astype(np.float32)\n", |
110 | | - " return sim.reshape(H, W)\n", |
111 | | - "\n", |
112 | | - "\n", |
113 | | - "def cosine_similarity_map_torch(img: torch.Tensor, ref: torch.Tensor) -> torch.Tensor:\n", |
114 | | - " \"\"\"Torch version — same math, for TorchGeo chips.\n", |
115 | | - "\n", |
116 | | - " img : (C, H, W) float tensor — raw int8 values as float\n", |
117 | | - " ref : (C,) float tensor — dequantized reference embedding\n", |
118 | | - "\n", |
119 | | - " Returns (H, W) float tensor, NaN where any band is nodata.\n", |
120 | | - " \"\"\"\n", |
121 | | - " nd = (img == NODATA) | img.isnan()\n", |
122 | | - " d = (img / 127.5) ** 2 * img.sign()\n", |
123 | | - " d[nd] = 0.0\n", |
124 | | - " valid = ~nd.any(dim=0)\n", |
| 105 | + " flat = dequantize(cube.reshape(C, -1).T) # (N, C)\n", |
125 | 106 | "\n", |
126 | | - " dot = (d * ref[:, None, None]).sum(dim=0)\n", |
127 | | - " norms = d.norm(dim=0)\n", |
128 | | - " ref_norm = ref.norm()\n", |
| 107 | + " sim_flat = np.full(flat.shape[0], np.nan, dtype=np.float32)\n", |
| 108 | + " valid = np.isfinite(flat).all(axis=1)\n", |
| 109 | + " if np.any(valid):\n", |
| 110 | + " rows = flat[valid]\n", |
| 111 | + " nonzero = np.linalg.norm(rows, axis=1) > 0\n", |
| 112 | + " if np.any(nonzero):\n", |
| 113 | + " sim = cosine_similarity(rows[nonzero], ref.reshape(1, -1)).ravel()\n", |
| 114 | + " valid_idx = np.flatnonzero(valid)\n", |
| 115 | + " sim_flat[valid_idx[nonzero]] = sim.astype(np.float32)\n", |
129 | 116 | "\n", |
130 | | - " sim = torch.full_like(dot, float(\"nan\"))\n", |
131 | | - " ok = valid & (norms > 0)\n", |
132 | | - " sim[ok] = dot[ok] / (norms[ok] * ref_norm)\n", |
133 | | - " return sim\n", |
| 117 | + " return sim_flat.reshape(H, W)\n", |
134 | 118 | "\n", |
135 | 119 | "\n", |
136 | 120 | "timings: dict[str, float] = {}" |
|
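The trickiest part of the hunk above is the double masking: similarities are computed only on the compacted `rows[nonzero]`, then scattered back to their original flat positions via `np.flatnonzero(valid)[nonzero]`. A self-contained sketch of the same pattern on a tiny `(N, C)` matrix — the function name here is illustrative, not from the notebook:

```python
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity


def masked_cosine(flat: np.ndarray, ref: np.ndarray) -> np.ndarray:
    """Cosine similarity of each row of flat (N, C) against ref (C,).

    Rows containing NaN, or whose vector is all-zero, come back as NaN,
    mirroring the valid/nonzero masking in cosine_similarity_map.
    """
    sim = np.full(flat.shape[0], np.nan, dtype=np.float32)
    valid = np.isfinite(flat).all(axis=1)          # drop rows with any NaN band
    if valid.any():
        rows = flat[valid]
        nonzero = np.linalg.norm(rows, axis=1) > 0  # drop zero-norm rows
        if nonzero.any():
            s = cosine_similarity(rows[nonzero], ref.reshape(1, -1)).ravel()
            # scatter compacted results back to original flat positions
            sim[np.flatnonzero(valid)[nonzero]] = s.astype(np.float32)
    return sim
```

The scatter step is why the hunk needs `np.flatnonzero(valid)` rather than boolean indexing alone: `nonzero` is aligned to the compacted `rows`, not to the full flat array.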
421 | 405 | "---\n", |
422 | 406 | "## Approach C — TorchGeo streaming\n", |
423 | 407 | "\n", |
424 | | - "`to_torchgeo_dataset` wraps the collection as a TorchGeo `GeoDataset`.\n", |
425 | | - "`GridGeoSampler` tiles the AOI into fixed-size chips — each chip is\n", |
426 | | - "fetched on the fly and discarded after use.\n", |
427 | | - "\n", |
428 | | - "Unlike A and B, the sampler works on a fixed grid that may extend\n", |
429 | | - "slightly beyond tile boundaries. Edge chips are zero-filled by\n", |
430 | | - "rasterio's merge semantics, matching TorchGeo's native behavior.\n", |
431 | | - "We apply a county polygon mask afterwards for the visualization." |
 | 408 | + "Uses `GridGeoSampler` for chip-wise streaming, then stitches chip predictions with\n", |
 | 409 | + "the `stitch_prediction_tiles` helper from `notebooks/utils_stitching.py`, avoiding manual row/col placement math." |
432 | 410 | ] |
433 | 411 | }, |
434 | 412 | { |
|
448 | 426 | "from rasterio.features import geometry_mask\n", |
449 | 427 | "from torchgeo.samplers import GridGeoSampler\n", |
450 | 428 | "\n", |
| 429 | + "try:\n", |
| 430 | + " from notebooks.utils_stitching import stitch_prediction_tiles\n", |
| 431 | + "except ImportError:\n", |
| 432 | + " from utils_stitching import stitch_prediction_tiles\n", |
| 433 | + "\n", |
451 | 434 | "CHIP_PX = 1024\n", |
452 | 435 | "\n", |
453 | 436 | "t0 = time.perf_counter()\n", |
|
459 | 442 | "roi_xmin, roi_ymin, roi_xmax, roi_ymax = county_geom_utm.bounds\n", |
460 | 443 | "out_w = round((roi_xmax - roi_xmin) / res_x)\n", |
461 | 444 | "out_h = round((roi_ymax - roi_ymin) / res_y)\n", |
462 | | - "sim_c = np.full((out_h, out_w), np.nan, dtype=np.float32)\n", |
463 | | - "\n", |
464 | | - "ref_t = torch.from_numpy(reference_vector).float()\n", |
465 | 445 | "n_chips = len(sampler)\n", |
466 | 446 | "\n", |
467 | 447 | "print(f\"{n_chips} chips ({CHIP_PX}x{CHIP_PX} px) output grid: ({out_h}, {out_w})\")\n", |
468 | 448 | "\n", |
| 449 | + "tiles = []\n", |
| 450 | + "skipped = 0\n", |
469 | 451 | "for i, query in enumerate(sampler):\n", |
470 | | - " sample = dataset[query]\n", |
471 | | - " chip_sim = cosine_similarity_map_torch(sample[\"image\"].float(), ref_t)\n", |
472 | | - "\n", |
473 | | - " tf = sample[\"transform\"].numpy()\n", |
474 | | - " col = round((float(tf[2]) - roi_xmin) / res_x)\n", |
475 | | - " row = round((roi_ymax - float(tf[5])) / res_y)\n", |
476 | | - " ch, cw = chip_sim.shape\n", |
477 | | - " r0, c0 = max(0, row), max(0, col)\n", |
478 | | - " r1, c1 = min(row + ch, out_h), min(col + cw, out_w)\n", |
479 | | - " if r1 > r0 and c1 > c0:\n", |
480 | | - " sim_c[r0:r1, c0:c1] = chip_sim[r0 - row : r1 - row, c0 - col : c1 - col].numpy()\n", |
| 452 | + " try:\n", |
| 453 | + " sample = dataset[query]\n", |
| 454 | + " except Exception:\n", |
| 455 | + " skipped += 1\n", |
| 456 | + " continue\n", |
| 457 | + "\n", |
| 458 | + " chip = sample[\"image\"].numpy().astype(np.float32)\n", |
| 459 | + " chip_sim = cosine_similarity_map(chip, reference_vector)\n", |
| 460 | + " tiles.append({\"prediction\": chip_sim, \"transform\": sample[\"transform\"].numpy()})\n", |
481 | 461 | "\n", |
482 | 462 | " elapsed = time.perf_counter() - t0\n", |
483 | 463 | " print(f\" chip {i + 1}/{n_chips} ({elapsed:.0f}s)\", end=\"\\r\")\n", |
484 | 464 | "\n", |
| 465 | + "sim_c = stitch_prediction_tiles(\n", |
| 466 | + " tiles,\n", |
| 467 | + " roi_bounds=(roi_xmin, roi_ymin, roi_xmax, roi_ymax),\n", |
| 468 | + " res=(res_x, res_y),\n", |
| 469 | + " reducer=\"overwrite\",\n", |
| 470 | + " out_shape=(out_h, out_w),\n", |
| 471 | + ")\n", |
| 472 | + "\n", |
485 | 473 | "# Mask to county polygon for apples-to-apples comparison\n", |
486 | 474 | "out_transform = Affine(res_x, 0, roi_xmin, 0, -res_y, roi_ymax)\n", |
487 | 475 | "county_mask = geometry_mask(\n", |
|
500 | 488 | " f\"\\nsimilarity min={fin_c.min():.4f} mean={fin_c.mean():.4f} \"\n", |
501 | 489 | " f\"max={fin_c.max():.4f} pixels={fin_c.size:,}\"\n", |
502 | 490 | ")\n", |
503 | | - "print(f\"timing total={timings['C_total']:.1f}s ({n_chips} chips)\")" |
| 491 | + "print(\n", |
| 492 | + " f\"timing total={timings['C_total']:.1f}s ({n_chips} chips, \"\n", |
| 493 | + " f\"used={len(tiles)}, skipped={skipped})\"\n", |
| 494 | + ")" |
504 | 495 | ] |
505 | 496 | }, |
506 | 497 | { |
|
662 | 653 | "metadata": {}, |
663 | 654 | "source": [ |
664 | 655 | "---\n", |
665 | | - "## Timing comparison" |
| 656 | + "## Timing comparison\n", |
| 657 | + "\n", |
 | 658 | + "Timings below were measured in this run only; treat them as local diagnostics,\n", |
 | 659 | + "not as cross-project benchmarks." |
666 | 660 | ] |
667 | 661 | }, |
668 | 662 | { |
|
680 | 674 | "source": [ |
681 | 675 | "print(f\"{'':32s} {'Load':>8s} {'Cosine':>8s} {'Total':>8s}\")\n", |
682 | 676 | "print(f\"{'---':32s} {'---':>8s} {'---':>8s} {'---':>8s}\")\n", |
683 | | - "print(f\"{'Blog (aef-loader + Dask)':32s} {'41s':>8s} {'281s':>8s} {'322s':>8s}\")\n", |
684 | 677 | "print(\n", |
685 | | - " f\"{'A get_xarray + band-wise':32s} \"\n", |
| 678 | + " f\"{'A get_xarray + sklearn':32s} \"\n", |
686 | 679 | " f\"{timings['A_load']:7.0f}s {timings['A_cosine']:7.0f}s \"\n", |
687 | 680 | " f\"{timings['A_total']:7.0f}s\"\n", |
688 | 681 | ")\n", |
689 | 682 | "print(\n", |
690 | | - " f\"{'B get_gdf + matmul':32s} \"\n", |
| 683 | + " f\"{'B get_gdf + sklearn':32s} \"\n", |
691 | 684 | " f\"{timings['B_load']:7.0f}s {timings['B_cosine']:7.0f}s \"\n", |
692 | 685 | " f\"{timings['B_total']:7.0f}s\"\n", |
693 | 686 | ")\n", |
|
705 | 698 | "---\n", |
706 | 699 | "## Summary\n", |
707 | 700 | "\n", |
708 | | - "| API | When to use | Memory profile |\n", |
709 | | - "|---|---|---|\n", |
710 | | - "| `sample_points` | Extract values at specific locations | Minimal — reads only the tiles that contain your points |\n", |
711 | | - "| `get_xarray` | Dense AOI reads with spatial coords | Full AOI in memory as a mosaicked Dataset |\n", |
712 | | - "| `get_gdf` | Per-record arrays, ragged shapes | Full AOI in memory, one array per record |\n", |
713 | | - "| `to_torchgeo_dataset` | Large AOIs, ML training loops | One chip at a time — bounded memory |\n", |
| 701 | + "| API | When to use | Memory | Speed |\n", |
| 702 | + "|---|---|---|---|\n", |
| 703 | + "| `sample_points` | Build reference vectors from known coordinates | Tiny | Fast |\n", |
| 704 | + "| `get_xarray` | AOI-wide map-style analysis (single mosaic) | Higher | Fastest for this county |\n", |
| 705 | + "| `get_gdf` | Record-wise analysis with explicit per-record arrays | Medium | Fast |\n", |
| 706 | + "| `to_torchgeo_dataset` | Streaming chips for training/inference loops | Lowest | Slower, bounded-memory |\n", |
714 | 707 | "\n", |
715 | | - "All four APIs read from the same prebuilt collection using Rasteret's\n", |
716 | | - "COG IO engine — no rasterio, no GDAL, no Dask." |
| 708 | + "This notebook uses `sklearn.metrics.pairwise.cosine_similarity` for readability,\n", |
| 709 | + "while showing three Rasteret access patterns (`sample_points`, `get_xarray`, `get_gdf`)\n", |
| 710 | + "and TorchGeo streaming with a reusable stitch helper.\n" |
717 | 711 | ] |
718 | 712 | } |
719 | 713 | ], |
|