diff --git a/.gitignore b/.gitignore
index 734d171fc8..bf3fd48238 100644
--- a/.gitignore
+++ b/.gitignore
@@ -242,3 +242,6 @@ docker-compose.override.yml
secrets/
/.playwright-mcp/
/screenshots/
+
+# Map page tuning screenshots (root-level only — must not match plots/*/preview.png etc.)
+/map-*.png
diff --git a/api/cache.py b/api/cache.py
index ccad8423c5..c7a67fa9c9 100644
--- a/api/cache.py
+++ b/api/cache.py
@@ -178,6 +178,7 @@ def clear_spec_cache(spec_id: str) -> int:
count += clear_cache_by_pattern(f"spec:{spec_id}")
count += clear_cache_by_pattern(f"spec_images:{spec_id}")
count += clear_cache_by_pattern("specs_list") # List might have changed
+ count += clear_cache_by_pattern("specs_map") # Map page payload might have changed
count += clear_cache_by_pattern("filter:") # Filters might be affected
count += clear_cache_by_pattern("stats") # Stats might have changed
count += clear_cache_by_pattern("sitemap") # Sitemap includes spec URLs
diff --git a/api/routers/seo.py b/api/routers/seo.py
index 4dd6b9f741..c331884e39 100644
--- a/api/routers/seo.py
+++ b/api/routers/seo.py
@@ -47,6 +47,7 @@ def _build_sitemap_xml(specs: list) -> str:
" https://anyplot.ai/plots",
" https://anyplot.ai/specs",
" https://anyplot.ai/libraries",
+ " https://anyplot.ai/map",
" https://anyplot.ai/palette",
" https://anyplot.ai/about",
" https://anyplot.ai/mcp",
diff --git a/api/routers/specs.py b/api/routers/specs.py
index a741e9b691..7d92bb13b8 100644
--- a/api/routers/specs.py
+++ b/api/routers/specs.py
@@ -6,7 +6,7 @@
from api.cache import cache_key, get_or_set_cache
from api.dependencies import require_db
from api.exceptions import raise_not_found
-from api.schemas import ImplementationResponse, SpecDetailResponse, SpecListItem
+from api.schemas import ImplementationResponse, SpecDetailResponse, SpecListItem, SpecMapItem
from core.config import settings
from core.database import ImplRepository, SpecRepository
from core.database.connection import get_db_context
@@ -28,6 +28,40 @@ async def _build_specs_list(db: AsyncSession) -> list[SpecListItem]:
]
+async def _build_specs_map(db: AsyncSession) -> list[SpecMapItem]:
+ """One row per spec with its best-rated impl image + spec/impl tag bag for the /map page.
+
+ Best-impl tiebreak: highest quality_score, then lexicographically *greatest* library_id
+ (since `max()` picks the largest tuple — e.g. seaborn over matplotlib on a tie).
+ Specs without any implementations are skipped (mirrors _build_specs_list).
+ """
+ repo = SpecRepository(db)
+ specs = await repo.get_all()
+ items: list[SpecMapItem] = []
+ for spec in specs:
+ if not spec.impls:
+ continue
+ # Prefer impls that actually have a preview URL — otherwise the map
+ # would render blank-bordered nodes for specs whose top-quality impl
+ # happens to have no thumbnail. Fall back to the full impl list only
+ # when *no* impl has a preview (very rare, but keeps the spec on the map).
+ with_preview = [i for i in spec.impls if i.preview_url_light or i.preview_url_dark]
+ candidates = with_preview or list(spec.impls)
+ best = max(candidates, key=lambda i: ((i.quality_score or 0.0), i.library_id))
+ items.append(
+ SpecMapItem(
+ id=spec.id,
+ title=spec.title,
+ preview_url_light=best.preview_url_light,
+ preview_url_dark=best.preview_url_dark,
+ quality_score=best.quality_score,
+ tags=spec.tags,
+ impl_tags=best.impl_tags,
+ )
+ )
+ return items
+
+
async def _build_spec_detail(db: AsyncSession, spec_id: str) -> SpecDetailResponse:
repo = SpecRepository(db)
spec = await repo.get_by_id(spec_id)
@@ -125,6 +159,25 @@ async def _refresh() -> list[SpecListItem]:
)
+@router.get("/specs/map", response_model=list[SpecMapItem])
+async def get_specs_map(db: AsyncSession = Depends(require_db)):
+ """Get one row per spec (best-impl image + tag bag) for the /map clustering page.
+
+ NOTE: must stay declared before /specs/{spec_id} so the path-parameter route doesn't capture "map".
+ """
+
+ async def _fetch() -> list[SpecMapItem]:
+ return await _build_specs_map(db)
+
+ async def _refresh() -> list[SpecMapItem]:
+ async with get_db_context() as fresh_db:
+ return await _build_specs_map(fresh_db)
+
+ return await get_or_set_cache(
+ cache_key("specs_map"), _fetch, refresh_after=settings.cache_refresh_after, refresh_factory=_refresh
+ )
+
+
@router.get("/specs/{spec_id}", response_model=SpecDetailResponse)
async def get_spec(spec_id: str, db: AsyncSession = Depends(require_db)):
"""Get detailed spec information including all implementations."""
diff --git a/api/schemas.py b/api/schemas.py
index d6c2d582cc..0d47a317e6 100644
--- a/api/schemas.py
+++ b/api/schemas.py
@@ -69,6 +69,22 @@ class SpecListItem(BaseModel):
library_count: int = 0
+class SpecMapItem(BaseModel):
+    """One row per spec for the /map page: best-impl preview + full tag bag for client-side similarity clustering."""
+
+    id: str
+    title: str
+    preview_url_light: str | None = None  # preview fields and score come from the impl picked in _build_specs_map
+    preview_url_dark: str | None = None
+    quality_score: float | None = None
+    # Tag bags: each category maps to a list of strings. Tightened from
+    # dict[str, Any] so the OpenAPI contract matches what the /map frontend
+    # expects (Record<string, string[]>) and so unexpected shapes get
+    # caught at validation time instead of breaking client-side similarity.
+    tags: dict[str, list[str]] | None = None  # spec-level tags
+    impl_tags: dict[str, list[str]] | None = None  # tags of the chosen implementation
+
+
class ImageResponse(BaseModel):
"""Image/plot response for grid display."""
diff --git a/app/package.json b/app/package.json
index c53784bd80..18958c1fd7 100644
--- a/app/package.json
+++ b/app/package.json
@@ -22,9 +22,11 @@
"@emotion/styled": "^11.14.1",
"@mui/icons-material": "^9.0.0",
"@mui/material": "^9.0.0",
+ "force-graph": "^1.51.4",
"fuse.js": "^7.3.0",
"react": "^19.2.5",
"react-dom": "^19.2.5",
+ "react-force-graph-2d": "^1.29.1",
"react-helmet-async": "^3.0.0",
"react-router-dom": "^7.14.2",
"react-syntax-highlighter": "^16.1.1",
diff --git a/app/src/components/NavBar.tsx b/app/src/components/NavBar.tsx
index 9e9458fe04..df0df25a8e 100644
--- a/app/src/components/NavBar.tsx
+++ b/app/src/components/NavBar.tsx
@@ -10,6 +10,7 @@ const DEBUG_CLICK_WINDOW_MS = 800;
const NAV_LINKS: { label: string; to: string; short?: string }[] = [
{ label: 'specs', to: '/specs' },
{ label: 'plots', to: '/plots' },
+ { label: 'map', to: '/map' },
{ label: 'libraries', to: '/libraries', short: 'libs' },
{ label: 'stats', to: '/stats' },
{ label: 'palette', to: '/palette', short: 'pal' },
diff --git a/app/src/pages/MapPage.helpers.test.ts b/app/src/pages/MapPage.helpers.test.ts
new file mode 100644
index 0000000000..f0bc999ff3
--- /dev/null
+++ b/app/src/pages/MapPage.helpers.test.ts
@@ -0,0 +1,349 @@
+import { describe, it, expect } from 'vitest';
+
+import {
+ flattenTags,
+ computeIDF,
+ weightedJaccard,
+ buildKNNLinks,
+ selectMapThumbUrl,
+ buildVariantUrl,
+ pickTier,
+ pickBestLoadedTier,
+ fitToBox,
+ primaryPlotType,
+ topPlotTypes,
+ type SpecMapItem,
+} from './MapPage.helpers';
+
+
+function spec(id: string, tags: SpecMapItem['tags'], implTags: SpecMapItem['impl_tags'] = null): SpecMapItem {
+  const fixture: SpecMapItem = {
+    id,
+    title: id, // the id doubles as the title
+    preview_url_light: `https://example.com/${id}-light.png`,
+    preview_url_dark: `https://example.com/${id}-dark.png`,
+    quality_score: 90,
+    tags, impl_tags: implTags,
+  };
+  return fixture;
+}
+
+
+describe('flattenTags', () => {
+  it('prefixes values with their category', () => {
+    const tokens = flattenTags(spec('a', { plot_type: ['scatter'], features: ['basic', '2d'] }));
+    expect(tokens.sort()).toEqual(['features:2d', 'features:basic', 'plot_type:scatter']);
+  });
+
+  it('merges spec.tags with impl_tags by default', () => {
+    const tokens = flattenTags(spec('a', { plot_type: ['scatter'] }, { dependencies: ['scipy'] }));
+    expect(tokens.sort()).toEqual(['dependencies:scipy', 'plot_type:scatter']);
+  });
+
+  it('skips impl_tags when includeImpl=false', () => {
+    const withImplTags = spec('a', { plot_type: ['scatter'] }, { dependencies: ['scipy'] });
+    expect(flattenTags(withImplTags, false)).toEqual(['plot_type:scatter']);
+  });
+
+  it('handles missing dicts and empty arrays', () => {
+    const inputs = [spec('a', null, null), spec('a', { plot_type: [] }, null)];
+    expect(inputs.map(item => flattenTags(item))).toEqual([[], []]);
+  });
+
+  it('deduplicates identical category:value pairs', () => {
+    const repeated = spec('a', { plot_type: ['scatter', 'scatter'] }, { plot_type: ['scatter'] });
+    expect(flattenTags(repeated)).toEqual(['plot_type:scatter']);
+  });
+});
+
+
+describe('computeIDF', () => {
+  it('assigns log(N / df) to every tag', () => {
+    const corpus = [
+      spec('a', { plot_type: ['scatter'] }),
+      spec('b', { plot_type: ['scatter'] }),
+      spec('c', { plot_type: ['line'] }),
+    ];
+    const weights = computeIDF(corpus);
+    expect(weights.get('plot_type:scatter')).toBeCloseTo(Math.log(3 / 2));
+    expect(weights.get('plot_type:line')).toBeCloseTo(Math.log(3 / 1));
+  });
+
+  it('gives ubiquitous tags weight ~0', () => {
+    const corpus = [
+      spec('a', { data_type: ['numeric'] }),
+      spec('b', { data_type: ['numeric'] }),
+    ];
+    expect(computeIDF(corpus).get('data_type:numeric')).toBe(0);
+  });
+
+  it('survives empty input without dividing by zero', () => {
+    expect(computeIDF([]).size).toBe(0);
+  });
+
+  it('zeroes out tags above the maxDfRatio cutoff (default 0.67)', () => {
+    // "dependencies:selenium" is carried by 3 of the 4 specs (75 %), above the default cutoff
+    const corpus = [
+      spec('a', { plot_type: ['scatter'] }, { dependencies: ['selenium'] }),
+      spec('b', { plot_type: ['scatter'] }, { dependencies: ['selenium'] }),
+      spec('c', { plot_type: ['line'] }, { dependencies: ['selenium'] }),
+      spec('d', { plot_type: ['bar'] }, { dependencies: ['matplotlib'] }),
+    ];
+    const weights = computeIDF(corpus);
+    expect(weights.get('dependencies:selenium')).toBe(0);
+    // The rare dependency keeps a positive weight
+    expect(weights.get('dependencies:matplotlib')).toBeGreaterThan(0);
+  });
+
+  it('honors a custom maxDfRatio', () => {
+    const corpus = [
+      spec('a', { features: ['basic'] }),
+      spec('b', { features: ['basic'] }),
+      spec('c', { features: ['rare'] }),
+    ];
+    // basic is in 2 of 3 specs (≈ 0.667), just under the default 0.67 cutoff → kept
+    expect(computeIDF(corpus).get('features:basic')).toBeGreaterThan(0);
+    // a tighter 0.5 cutoff turns basic into noise
+    expect(computeIDF(corpus, 0.5).get('features:basic')).toBe(0);
+  });
+});
+
+
+describe('weightedJaccard', () => {
+  const idfTable = new Map([
+    ['plot_type:scatter', 1.0],
+    ['plot_type:line', 1.0],
+    ['features:basic', 0.5],
+  ]);
+
+  it('returns 1 when sets are identical', () => {
+    expect(weightedJaccard(['plot_type:scatter'], ['plot_type:scatter'], idfTable)).toBeCloseTo(1);
+  });
+
+  it('returns 0 when sets are disjoint', () => {
+    expect(weightedJaccard(['plot_type:scatter'], ['plot_type:line'], idfTable)).toBe(0);
+  });
+
+  it('weights overlap by IDF (rare overlap > common overlap)', () => {
+    const skewedIdf = new Map([['plot_type:scatter', 2], ['features:basic', 0.1]]);
+    const viaRareTag = weightedJaccard(['plot_type:scatter'], ['plot_type:scatter', 'features:basic'], skewedIdf);
+    const viaCommonTag = weightedJaccard(['features:basic'], ['features:basic', 'plot_type:scatter'], skewedIdf);
+    expect(viaRareTag).toBeGreaterThan(viaCommonTag);
+  });
+
+  it('returns 0 when either set is empty', () => {
+    expect(weightedJaccard([], ['plot_type:scatter'], idfTable)).toBe(0);
+    expect(weightedJaccard(['plot_type:scatter'], [], idfTable)).toBe(0);
+  });
+});
+
+
+describe('buildKNNLinks', () => {
+  it('keeps top-K neighbors above the similarity threshold', () => {
+    const corpus = [
+      spec('scatter1', { plot_type: ['scatter'], features: ['basic'] }),
+      spec('scatter2', { plot_type: ['scatter'], features: ['basic'] }),
+      spec('line1', { plot_type: ['line'], features: ['basic'] }),
+      spec('bar1', { plot_type: ['bar'] }),
+    ];
+    const weights = computeIDF(corpus);
+    const edges = buildKNNLinks(corpus, weights, 2, 0.0);
+    // the two scatter specs form the most similar pair, so they must be linked
+    const pairs = edges.map(e => `${e.source}-${e.target}`).sort();
+    expect(pairs).toContain('scatter1-scatter2');
+  });
+
+  it('produces undirected links (no A→B and B→A duplicate)', () => {
+    // Three specs are needed so IDF gives scatter a non-zero weight (a tag shared
+    // by every spec weighs 0 and would, correctly, produce no link at all).
+    const corpus = [
+      spec('a', { plot_type: ['scatter'] }),
+      spec('b', { plot_type: ['scatter'] }),
+      spec('c', { plot_type: ['line'] }),
+    ];
+    const edges = buildKNNLinks(corpus, computeIDF(corpus), 5, 0.0);
+    const keys = edges.map(e => `${e.source}|${e.target}`);
+    // the a–b pair must show up exactly once, in either orientation
+    expect(keys.filter(k => k === 'a|b' || k === 'b|a').length).toBe(1);
+  });
+
+  it('drops links below minSim', () => {
+    const corpus = [
+      spec('a', { plot_type: ['scatter'] }),
+      spec('b', { plot_type: ['line'] }),
+    ];
+    const edges = buildKNNLinks(corpus, computeIDF(corpus), 5, 0.5);
+    expect(edges).toHaveLength(0);
+  });
+
+  it('every link weight is in (0, 1]', () => {
+    const corpus = [
+      spec('a', { plot_type: ['scatter'], features: ['basic'] }),
+      spec('b', { plot_type: ['scatter'], features: ['regression'] }),
+      spec('c', { plot_type: ['line'], features: ['basic'] }),
+    ];
+    const edges = buildKNNLinks(corpus, computeIDF(corpus), 3, 0.0);
+    for (const { weight } of edges) {
+      expect(weight).toBeGreaterThan(0);
+      expect(weight).toBeLessThanOrEqual(1);
+    }
+  });
+});
+
+
+describe('selectMapThumbUrl', () => {
+  it('returns the dark URL in dark mode and light URL in light mode', () => {
+    const both = spec('a', null);
+    expect(selectMapThumbUrl(both, true)).toBe('https://example.com/a-dark.png');
+    expect(selectMapThumbUrl(both, false)).toBe('https://example.com/a-light.png');
+  });
+
+  it('falls back to the other theme when the preferred URL is missing', () => {
+    const lightOnly: SpecMapItem = { ...spec('a', null), preview_url_dark: null };
+    expect(selectMapThumbUrl(lightOnly, true)).toBe('https://example.com/a-light.png');
+  });
+
+  it('returns null when no preview URLs at all', () => {
+    const noPreview: SpecMapItem = { ...spec('a', null), preview_url_light: null, preview_url_dark: null };
+    expect(selectMapThumbUrl(noPreview, false)).toBeNull();
+  });
+});
+
+
+describe('buildVariantUrl', () => {
+  it('rewrites .png to _{tier}.webp', () => { // one case per resolution tier
+    expect(buildVariantUrl('https://example.com/plot.png', 400)).toBe('https://example.com/plot_400.webp');
+    expect(buildVariantUrl('https://example.com/plot-light.png', 800)).toBe('https://example.com/plot-light_800.webp');
+    expect(buildVariantUrl('https://example.com/plot-dark.png', 1200)).toBe('https://example.com/plot-dark_1200.webp');
+  });
+
+  it('passes through URLs that do not end in .png', () => { // no variants exist, URL comes back unchanged
+    expect(buildVariantUrl('https://example.com/plot.svg', 400)).toBe('https://example.com/plot.svg');
+  });
+});
+
+
+describe('pickTier', () => {
+  it('returns 400 when device pixel size fits in 400 with headroom', () => {
+    expect(pickTier(100)).toBe(400); // 100 × 1.25 headroom = 125
+    expect(pickTier(300)).toBe(400); // 375, still ≤ 400
+  });
+
+  it('returns 800 when 400 would require upscaling', () => {
+    expect(pickTier(500)).toBe(800); // 625
+    expect(pickTier(600)).toBe(800); // 750
+  });
+
+  it('returns 1200 for very large device sizes', () => {
+    expect(pickTier(1000)).toBe(1200); // 1250 exceeds the 800 tier
+    expect(pickTier(2000)).toBe(1200); // 1200 is the largest tier, even when it is too small
+  });
+});
+
+
+describe('primaryPlotType', () => {
+  it('returns the first plot_type entry', () => {
+    expect(primaryPlotType(spec('a', { plot_type: ['scatter', 'point'] }))).toBe('scatter');
+  });
+
+  it('returns "other" when plot_type is missing', () => {
+    expect(primaryPlotType(spec('a', null))).toBe('other'); // no tags dict at all
+    expect(primaryPlotType(spec('a', { domain: ['statistics'] }))).toBe('other'); // dict without a plot_type key
+  });
+});
+
+
+describe('topPlotTypes', () => {
+  it('returns the N most frequent primary types in descending order', () => {
+    const corpus = [
+      spec('s1', { plot_type: ['line'] }),
+      spec('s2', { plot_type: ['line'] }),
+      spec('s3', { plot_type: ['line'] }),
+      spec('s4', { plot_type: ['scatter'] }),
+      spec('s5', { plot_type: ['scatter'] }),
+      spec('s6', { plot_type: ['bar'] }),
+    ];
+    expect(topPlotTypes(corpus, 3)).toEqual(['line', 'scatter', 'bar']);
+  });
+
+  it('truncates to the requested length', () => {
+    const corpus = [
+      spec('s1', { plot_type: ['a'] }),
+      spec('s2', { plot_type: ['b'] }),
+      spec('s3', { plot_type: ['c'] }),
+    ];
+    expect(topPlotTypes(corpus, 2)).toHaveLength(2);
+  });
+
+  it('breaks ties alphabetically for determinism', () => {
+    const corpus = [
+      spec('s1', { plot_type: ['zebra'] }),
+      spec('s2', { plot_type: ['apple'] }),
+      spec('s3', { plot_type: ['mango'] }),
+    ];
+    // Each type occurs once, so only the alphabetical tiebreak decides the order
+    expect(topPlotTypes(corpus, 3)).toEqual(['apple', 'mango', 'zebra']);
+  });
+
+  it('excludes the synthetic "other" bucket so it does not waste a color slot', () => {
+    const corpus = [
+      spec('s1', null), // without plot_type the primary type resolves to 'other'
+      spec('s2', { plot_type: ['line'] }),
+    ];
+    expect(topPlotTypes(corpus, 5)).toEqual(['line']);
+  });
+});
+
+
+describe('fitToBox', () => {
+  it('returns a square for 1:1 aspect ratio', () => {
+    expect(fitToBox(22, 1)).toEqual({ w: 22, h: 22 });
+  });
+
+  it('keeps width = box and shrinks height for 16:9', () => {
+    const { w, h } = fitToBox(22, 16 / 9);
+    expect(w).toBe(22);
+    expect(h).toBeCloseTo(22 * 9 / 16);
+  });
+
+  it('keeps height = box and shrinks width for portrait (9:16)', () => {
+    const { w, h } = fitToBox(22, 9 / 16);
+    expect(h).toBe(22);
+    expect(w).toBeCloseTo(22 * 9 / 16);
+  });
+
+  it('falls back to a square for invalid aspect ratios', () => {
+    for (const invalid of [0, NaN, Infinity]) {
+      expect(fitToBox(22, invalid)).toEqual({ w: 22, h: 22 });
+    }
+  });
+});
+
+
+describe('pickBestLoadedTier', () => {
+  // Images are only compared by identity, so a blank <img> element is enough.
+  // (Relies on a DOM `document` being available in the test environment.)
+  const img = (): HTMLImageElement => document.createElement('img');
+
+  it('returns the desired tier when loaded', () => {
+    const loaded = img();
+    const cache = new Map([[400 as const, loaded]]);
+    expect(pickBestLoadedTier(cache, 400)).toBe(loaded);
+  });
+
+  it('returns a higher-resolution variant when desired is not loaded', () => {
+    const loaded = img();
+    const cache = new Map([[800 as const, loaded]]);
+    expect(pickBestLoadedTier(cache, 400)).toBe(loaded);
+  });
+
+  it('falls back to a smaller tier when nothing larger is loaded', () => {
+    const loaded = img();
+    const cache = new Map([[400 as const, loaded]]);
+    expect(pickBestLoadedTier(cache, 800)).toBe(loaded);
+  });
+
+  it('returns null when nothing is loaded', () => {
+    expect(pickBestLoadedTier(new Map(), 400)).toBeNull();
+  });
+});
diff --git a/app/src/pages/MapPage.helpers.ts b/app/src/pages/MapPage.helpers.ts
new file mode 100644
index 0000000000..7bc770b103
--- /dev/null
+++ b/app/src/pages/MapPage.helpers.ts
@@ -0,0 +1,456 @@
+/**
+ * Helpers for the /map page: tag flattening, IDF weighting, weighted
+ * Jaccard similarity, KNN edge construction, plus thumbnail-tier
+ * selection and image preloading.
+ *
+ * Most helpers are pure (math + selection logic) so they can be unit
+ * tested in MapPage.helpers.test.ts. The two exceptions — preloadImages
+ * and ensureNodeTier — create DOM HTMLImageElements and trigger network
+ * fetches; their callbacks let the caller hook in cache state and a
+ * canvas refresh.
+ */
+
+import { selectPreviewUrl } from '../utils/themedPreview';
+
+
+/** Backend response shape from GET /api/specs/map. Mirrors api/schemas.py::SpecMapItem. */
+export interface SpecMapItem {
+  id: string;
+  title: string;
+  preview_url_light: string | null; // previews and score come from the spec's best-rated impl
+  preview_url_dark: string | null;
+  quality_score: number | null;
+  tags: Record<string, string[]> | null; // spec-level: category → values
+  impl_tags: Record<string, string[]> | null; // impl-level: category → values
+}
+
+/** Resolution tiers (px) baked by the responsive-image pipeline (responsiveImage.ts); listed in ascending order. */
+export const RESOLUTION_TIERS = [400, 800, 1200] as const;
+export type ResolutionTier = (typeof RESOLUTION_TIERS)[number]; // 400 | 800 | 1200
+
+/**
+ * Node shape passed to ForceGraph2D. Holds a lazy collection of image variants
+ * keyed by resolution tier (400/800/1200). The page populates the 400 tier
+ * eagerly on load and progressively upgrades on zoom-in.
+ */
+export interface MapNode {
+  id: string;
+  title: string;
+  tags: string[]; // presumably the flattenTags() tokens — confirm against the page
+  thumbUrl: string | null; // base theme-aware .png URL
+  imgs: Map<ResolutionTier, HTMLImageElement>; // loaded variants
+  pendingTiers: Set<ResolutionTier>; // tiers with an in-flight fetch
+  // colorBucket = primary plot_type for nodes that fall into the top-N most
+  // frequent plot types; null otherwise. Drives the per-cluster border color
+  // without imposing any spatial bias on the layout.
+  colorBucket: string | null;
+}
+
+/** Link shape passed to ForceGraph2D. `weight` = weighted-Jaccard sim ∈ (0, 1]. */
+export interface MapLink {
+  source: string; // spec id; buildKNNLinks stores the id that compares smaller (`<`) here
+  target: string; // spec id of the other endpoint
+  weight: number;
+}
+
+/**
+ * Flatten a spec's nested tag dicts into a de-duplicated list of `category:value`
+ * tokens (impl_tags are skipped when `includeImpl` is false). Prefixing keeps a
+ * value such as `numeric` distinct across `data_type` and `dataprep`.
+ */
+export function flattenTags(spec: SpecMapItem, includeImpl = true): string[] {
+  const out: string[] = [];
+  const push = (dict: Record<string, string[]> | null | undefined) => {
+    if (!dict) return;
+    for (const [category, values] of Object.entries(dict)) {
+      if (!Array.isArray(values)) continue;
+      for (const v of values) {
+        if (typeof v === 'string' && v.length > 0) out.push(`${category}:${v}`);
+      }
+    }
+  };
+  push(spec.tags);
+  if (includeImpl) push(spec.impl_tags);
+  return Array.from(new Set(out));
+}
+
+/**
+ * Inverse-document-frequency weights: w_t = log(N / df_t), where N is the
+ * number of specs and df_t the number of specs carrying tag t.
+ * Down-weights ubiquitous tags (`data_type:numeric` is in nearly every spec)
+ * and amplifies rare ones. Every weight is ≥ 0; the map only contains tags
+ * seen in `specs`, so callers treat a missing key as weight 0.
+ *
+ * `maxDfRatio` zeroes out tags that appear in more than that fraction of the
+ * corpus (strictly greater, so a tag at exactly the ratio keeps its weight).
+ * Plain log-IDF still gives such tags a small positive weight, which
+ * compounds across many shared common tags into spurious cross-cluster
+ * bridges — e.g. `dependencies:selenium` in ~98 % of specs.
+ */
+export function computeIDF(specs: SpecMapItem[], maxDfRatio = 0.67): Map<string, number> {
+  const N = specs.length || 1;
+  const df = new Map<string, number>();
+  for (const spec of specs) {
+    for (const tag of flattenTags(spec)) {
+      df.set(tag, (df.get(tag) ?? 0) + 1);
+    }
+  }
+  const idf = new Map<string, number>();
+  for (const [tag, count] of df) {
+    if (count / N > maxDfRatio) {
+      idf.set(tag, 0);
+      continue;
+    }
+    idf.set(tag, Math.log(N / count));
+  }
+  return idf;
+}
+
+/**
+ * The 9 known tag categories the catalog uses. The first four come from
+ * specification.yaml (spec-level), the last five from impl metadata yaml.
+ */
+export const TAG_CATEGORIES = [
+  'plot_type', // spec-level categories start here
+  'features',
+  'data_type',
+  'domain',
+  'dependencies', // impl-level categories start here
+  'techniques',
+  'patterns',
+  'dataprep',
+  'styling',
+] as const;
+
+export type TagCategory = (typeof TAG_CATEGORIES)[number]; // union of the nine literals above
+
+/**
+ * Default per-category multipliers applied on top of IDF weighting in the
+ * Jaccard similarity calculation. Users can override these live via the
+ * weights panel; passing a custom `weights` map to {@link weightedJaccard}
+ * or {@link buildKNNLinks} replaces the defaults entirely.
+ *
+ * The defaults privilege plot_type (2.0) with light contributions from
+ * features and data_type (0.5 each); every other category is 0 and therefore
+ * ignored by the similarity. That gives a plot_type-dominant map with subtle
+ * cross-type cohesion. Users can slide secondary categories up via the
+ * weights panel to mix in techniques/patterns/etc. for richer clustering.
+ */
+export const DEFAULT_CATEGORY_WEIGHT: Record<TagCategory, number> = {
+  plot_type: 2.0,
+  features: 0.5,
+  techniques: 0,
+  patterns: 0,
+  dataprep: 0,
+  dependencies: 0,
+  domain: 0,
+  data_type: 0.5,
+  styling: 0,
+};
+
+function categoryOf(prefixedTag: string): string {
+  const colonAt = prefixedTag.indexOf(':'); // the first colon ends the category prefix
+  return colonAt === -1 ? '' : prefixedTag.substring(0, colonAt);
+}
+
+function tagWeight(
+  tag: string,
+  idf: Map<string, number>,
+  weights: Record<string, number>
+): number {
+  return (idf.get(tag) ?? 0) * (weights[categoryOf(tag)] ?? 1); // tag without IDF → 0; unlisted category → ×1
+}
+
+/**
+ * Weighted Jaccard similarity over two tag sets.
+ *   sim = Σ w_t for t∈a∩b  /  Σ w_t for t∈a∪b
+ * Per-tag weight = IDF × weights[category prefix], so the contribution of a
+ * shared tag depends both on its rarity in the corpus and on which category
+ * it belongs to. Returns 0 when either set is empty or the denominator
+ * collapses to zero. `weights` defaults to {@link DEFAULT_CATEGORY_WEIGHT}.
+ */
+export function weightedJaccard(
+  a: string[],
+  b: string[],
+  idf: Map<string, number>,
+  weights: Record<string, number> = DEFAULT_CATEGORY_WEIGHT
+): number {
+  if (a.length === 0 || b.length === 0) return 0;
+  const setA = new Set(a);
+  const setB = new Set(b);
+  let num = 0;
+  let denom = 0;
+  // Every tag of A belongs to the union; those also in B form the intersection.
+  for (const t of setA) {
+    const w = tagWeight(t, idf, weights);
+    denom += w;
+    if (setB.has(t)) num += w;
+  }
+  // Tags only in B complete the union (shared ones were counted above).
+  for (const t of setB) {
+    if (setA.has(t)) continue;
+    denom += tagWeight(t, idf, weights);
+  }
+  return denom > 0 ? num / denom : 0;
+}
+
+/**
+ * Build a sparse KNN link list: each spec keeps its top-K most similar
+ * neighbors with similarity > 0 and ≥ `minSim`. Links are undirected: a pair
+ * picked from both sides is emitted once (smaller id as `source`), keeping the
+ * larger weight. A non-positive `k` yields no links.
+ * With ~327 specs × K=5 the result is ~1.6k edges: dense enough for
+ * cohesive clustering, sparse enough to avoid hairball rendering.
+ */
+export function buildKNNLinks(
+  specs: SpecMapItem[],
+  idf: Map<string, number>,
+  k = 5,
+  minSim = 0.05,
+  weights: Record<string, number> = DEFAULT_CATEGORY_WEIGHT
+): MapLink[] {
+  const tagsByIdx = specs.map(s => flattenTags(s));
+  const linkSet = new Map<string, MapLink>();
+  for (let i = 0; i < specs.length; i++) {
+    const sims: { j: number; sim: number }[] = [];
+    for (let j = 0; j < specs.length; j++) {
+      if (i === j) continue;
+      const sim = weightedJaccard(tagsByIdx[i], tagsByIdx[j], idf, weights);
+      // sim > 0 drops zero-weight links (no shared tags or all-zero IDF) — pure visual noise.
+      if (sim > 0 && sim >= minSim) sims.push({ j, sim });
+    }
+    sims.sort((x, y) => y.sim - x.sim);
+    for (const { j, sim } of sims.slice(0, Math.max(0, k))) { // clamp: a negative end would count from the tail
+      const a = specs[i].id;
+      const b = specs[j].id;
+      const key = a < b ? `${a}|${b}` : `${b}|${a}`;
+      const existing = linkSet.get(key);
+      if (!existing || sim > existing.weight) {
+        linkSet.set(key, { source: a < b ? a : b, target: a < b ? b : a, weight: sim });
+      }
+    }
+  }
+  return Array.from(linkSet.values());
+}
+
+/**
+ * Pick the theme-aware base preview URL (the original `.png`) by delegating to
+ * `selectPreviewUrl`. Variant selection happens at draw time via
+ * {@link buildVariantUrl} + {@link pickTier} so we only fetch higher-resolution
+ * thumbnails for nodes the user actually zooms into.
+ */
+export function selectMapThumbUrl(spec: SpecMapItem, isDark: boolean): string | null {
+  return selectPreviewUrl(spec, isDark);
+}
+
+/**
+ * Map a base `.png` preview URL to one of its resized webp variants, e.g.
+ * `.../plot-light.png` at tier 800 becomes `.../plot-light_800.webp`. A URL
+ * that does not end in `.png` has no variants and is returned untouched.
+ */
+export function buildVariantUrl(baseUrl: string, tier: ResolutionTier): string {
+  const isPng = baseUrl.endsWith('.png');
+  return isPng ? baseUrl.replace(/\.png$/, `_${tier}.webp`) : baseUrl;
+}
+
+/**
+ * Pick the smallest pipeline tier whose source resolution comfortably covers
+ * the requested device-pixel size. The source needs to be ≥ device pixels
+ * for crisp rendering, and the size is first scaled by a 1.25 headroom
+ * factor so a tiny zoom-in nudge doesn't immediately re-fetch the next tier.
+ * Sizes beyond the 800 tier (after headroom) always map to 1200, the largest
+ * tier available.
+ */
+export function pickTier(devicePxSize: number): ResolutionTier {
+  const needed = 1.25 * devicePxSize; // device pixels plus 25 % headroom
+  if (needed <= 400) return 400;
+  return needed <= 800 ? 800 : 1200;
+}
+
+/**
+ * Return the smallest already-loaded tier that's at least as big as
+ * `desired` (we don't waste pixels rendering a 1200 px image at the
+ * 400 px tier). Falls back to the largest loaded tier smaller than
+ * `desired` if no sufficient tier has loaded yet — better than a blank
+ * thumbnail during the lazy upgrade. Returns null when `imgs` is empty.
+ */
+export function pickBestLoadedTier(
+  imgs: Map<ResolutionTier, HTMLImageElement>,
+  desired: ResolutionTier
+): HTMLImageElement | null {
+  for (const t of RESOLUTION_TIERS) {
+    if (t >= desired && imgs.has(t)) return imgs.get(t)!;
+  }
+  for (let i = RESOLUTION_TIERS.length - 1; i >= 0; i--) {
+    const t = RESOLUTION_TIERS[i];
+    if (imgs.has(t)) return imgs.get(t)!;
+  }
+  return null;
+}
+
+/** Tag categories that come from specification.yaml (vs. impl-level metadata); primaryCategoryValue reads these from `spec.tags`. */
+export const SPEC_LEVEL_CATEGORIES: readonly TagCategory[] = [
+  'plot_type',
+  'features',
+  'data_type',
+  'domain',
+] as const;
+
+/**
+ * Return the first value a spec lists under `category`. Spec-level categories
+ * (see SPEC_LEVEL_CATEGORIES) are looked up in `spec.tags`, all others in
+ * `spec.impl_tags`. When the dict, the category, or its first entry is
+ * missing, the synthetic bucket "other" is returned instead.
+ */
+export function primaryCategoryValue(spec: SpecMapItem, category: TagCategory): string {
+  const specLevel: readonly string[] = SPEC_LEVEL_CATEGORIES;
+  const source = specLevel.includes(category) ? spec.tags : spec.impl_tags;
+  const first = source?.[category]?.[0];
+  return first ?? 'other';
+}
+
+/** Convenience wrapper: a spec's primary plot_type (first `tags.plot_type` entry, or "other" when absent). */
+export function primaryPlotType(spec: SpecMapItem): string {
+  return primaryCategoryValue(spec, 'plot_type');
+}
+
+/**
+ * Count specs by their primary value for a given tag category. Specs whose
+ * primary value is `other` (the synthetic "no tag" bucket) are not counted.
+ * Used by the legend to display per-cluster member counts.
+ */
+export function categoryValueCounts(
+  specs: SpecMapItem[],
+  category: TagCategory
+): Map<string, number> {
+  const counts = new Map<string, number>();
+  for (const s of specs) {
+    const v = primaryCategoryValue(s, category);
+    if (v === 'other') continue;
+    counts.set(v, (counts.get(v) ?? 0) + 1);
+  }
+  return counts;
+}
+
+/** Convenience wrapper: per-plot_type spec counts (primary plot_type only, `other` excluded). */
+export function plotTypeCounts(specs: SpecMapItem[]): Map<string, number> {
+  return categoryValueCounts(specs, 'plot_type');
+}
+
+/**
+ * Return the top-N most frequent primary values in the given category.
+ * Ordering is by spec count, highest first; equal counts are ordered by
+ * `localeCompare` on the value name so the result is deterministic. Used to
+ * decide which buckets earn a distinct color border in the map.
+ *
+ * The synthetic `other` bucket (specs missing the category entirely) is
+ * never returned, so it cannot waste a color slot.
+ */
+export function topCategoryValues(
+  specs: SpecMapItem[],
+  category: TagCategory,
+  n: number
+): string[] {
+  const ranked = [...categoryValueCounts(specs, category)];
+  ranked.sort(([nameA, countA], [nameB, countB]) => countB - countA || nameA.localeCompare(nameB));
+  return ranked.slice(0, n).map(([name]) => name);
+}
+
+/** Convenience wrapper: top-N plot_types by spec count (ties ordered by name, `other` excluded). */
+export function topPlotTypes(specs: SpecMapItem[], n: number): string[] {
+  return topCategoryValues(specs, 'plot_type', n);
+}
+
+/**
+ * Intrinsic width/height ratio of a node's thumbnail, taken from the first
+ * tier (smallest first) whose image reports positive natural dimensions.
+ * Yields 1 while no such variant is loaded (the page draws a square fallback
+ * rect then). Most plots are 16:9 (figsize=(16,9)), i.e. roughly 1.78.
+ */
+export function nodeAspectRatio(node: MapNode): number {
+  for (const tier of RESOLUTION_TIERS) {
+    const loaded = node.imgs.get(tier);
+    if (!loaded) continue;
+    const { naturalWidth, naturalHeight } = loaded;
+    if (naturalWidth > 0 && naturalHeight > 0) return naturalWidth / naturalHeight;
+  }
+  return 1;
+}
+
+/**
+ * Given a target box size and an aspect ratio (width/height), return the
+ * width and height that fit inside the box without distortion: the longer
+ * side equals `boxSize`. A ratio that is not finite and positive yields a
+ * square. Used for both canvas drawing and hit-area painting so they agree.
+ */
+export function fitToBox(boxSize: number, aspectRatio: number): { w: number; h: number } {
+  const ratio = isFinite(aspectRatio) && aspectRatio > 0 ? aspectRatio : 1; // invalid → square
+  return ratio >= 1 ? { w: boxSize, h: boxSize / ratio } : { w: boxSize * ratio, h: boxSize };
+}
+
+// Variant URLs whose load failed; ensureNodeTier never requests them again (until page reload).
+const failedVariantUrls = new Set<string>();
+/**
+ * Lazily fetch the requested tier for a node and call `onLoad` when it lands.
+ * Safe to call repeatedly from `nodeCanvasObject` on every paint: tiers that
+ * are loaded or in flight, and variant URLs that already failed, are skipped
+ * (so a missing file is requested once, not once per repaint).
+ */
+export function ensureNodeTier(node: MapNode, tier: ResolutionTier, onLoad: () => void): void {
+  if (!node.thumbUrl) return;
+  if (node.imgs.has(tier) || node.pendingTiers.has(tier)) return;
+  const url = buildVariantUrl(node.thumbUrl, tier);
+  if (failedVariantUrls.has(url)) return;
+  node.pendingTiers.add(tier);
+  const img = document.createElement('img');
+  img.onload = () => {
+    node.imgs.set(tier, img);
+    node.pendingTiers.delete(tier);
+    onLoad();
+  };
+  img.onerror = () => {
+    node.pendingTiers.delete(tier);
+    failedVariantUrls.add(url);
+  };
+  img.src = url;
+}
+
+/**
+ * Eager-preload every node's thumbnail at the smallest tier (400 px wide ≈ 6 KB
+ * webp). Resolves once all images either loaded or errored — failures are
+ * swallowed (the node renders as a plain dot in the fallback path).
+ *
+ * `onLoad` fires per-image so the page can call fgRef.refresh() to re-paint
+ * without re-running the simulation, producing the "thumbnails pop in
+ * organically" UX rather than a blocking wait. Higher-resolution tiers are
+ * lazy-loaded on demand by {@link ensureNodeTier} from `nodeCanvasObject`
+ * when the user zooms in.
+ */
+export async function preloadImages(
+ items: { id: string; thumbUrl: string | null }[],
+ onLoad?: (id: string, tier: ResolutionTier, img: HTMLImageElement) => void
+): Promise