From c9bdfa4393b5cb28199c310ee81cea26a40c890a Mon Sep 17 00:00:00 2001 From: Dennis Trautwein Date: Tue, 21 Apr 2026 14:54:42 +0200 Subject: [PATCH 01/28] feat: anon piece selection and retrieval --- .gitignore | 2 + apps/backend/.env.example | 17 +- apps/backend/README.md | 2 +- apps/backend/src/app.module.ts | 2 + apps/backend/src/config/app.config.ts | 86 +++++- .../data-retention/data-retention.module.ts | 4 +- .../data-retention.service.spec.ts | 182 ++++++------- .../data-retention/data-retention.service.ts | 16 +- apps/backend/src/database/database.module.ts | 9 +- .../entities/anon-retrieval.entity.ts | 100 +++++++ .../entities/job-schedule-state.entity.ts | 1 + .../1762000000000-CreateAnonRetrievals.ts | 64 +++++ .../http-client/http-client.service.spec.ts | 93 ++++++- .../src/http-client/http-client.service.ts | 87 +++++-- apps/backend/src/http-client/types.ts | 2 + apps/backend/src/jobs/job-queues.ts | 1 + apps/backend/src/jobs/jobs.module.ts | 2 + apps/backend/src/jobs/jobs.service.spec.ts | 128 ++++----- apps/backend/src/jobs/jobs.service.ts | 101 +++++++- .../metrics-prometheus/check-metric-labels.ts | 2 +- .../check-metrics.service.ts | 63 +++++ .../metrics-prometheus.module.ts | 53 ++++ .../src/pdp-subgraph/pdp-subgraph.module.ts | 8 - apps/backend/src/pdp-subgraph/queries.ts | 24 -- .../anon-piece-selector.service.spec.ts | 168 ++++++++++++ .../anon-piece-selector.service.ts | 208 +++++++++++++++ .../anon-retrieval.service.spec.ts | 189 ++++++++++++++ .../retrieval-anon/anon-retrieval.service.ts | 244 ++++++++++++++++++ .../retrieval-anon/car-validation.service.ts | 223 ++++++++++++++++ .../retrieval-anon/piece-retrieval.service.ts | 195 ++++++++++++++ .../retrieval-anon/retrieval-anon.module.ts | 27 ++ apps/backend/src/retrieval-anon/types.ts | 35 +++ apps/backend/src/subgraph/queries.ts | 78 ++++++ apps/backend/src/subgraph/subgraph.module.ts | 8 + .../subgraph.service.spec.ts} | 167 +++++++++++- .../subgraph.service.ts} | 232 ++++++++++++++--- 
.../{pdp-subgraph => subgraph}/types.spec.ts | 0 .../src/{pdp-subgraph => subgraph}/types.ts | 101 ++++++++ .../src/wallet-sdk/wallet-sdk.service.spec.ts | 2 +- docs/checks/data-retention.md | 10 +- ...-configuration-and-approval-methodology.md | 2 +- docs/environment-variables.md | 34 ++- .../local/backend-configmap-local.yaml | 2 +- pnpm-lock.yaml | 36 +-- 44 files changed, 2683 insertions(+), 327 deletions(-) create mode 100644 apps/backend/src/database/entities/anon-retrieval.entity.ts create mode 100644 apps/backend/src/database/migrations/1762000000000-CreateAnonRetrievals.ts delete mode 100644 apps/backend/src/pdp-subgraph/pdp-subgraph.module.ts delete mode 100644 apps/backend/src/pdp-subgraph/queries.ts create mode 100644 apps/backend/src/retrieval-anon/anon-piece-selector.service.spec.ts create mode 100644 apps/backend/src/retrieval-anon/anon-piece-selector.service.ts create mode 100644 apps/backend/src/retrieval-anon/anon-retrieval.service.spec.ts create mode 100644 apps/backend/src/retrieval-anon/anon-retrieval.service.ts create mode 100644 apps/backend/src/retrieval-anon/car-validation.service.ts create mode 100644 apps/backend/src/retrieval-anon/piece-retrieval.service.ts create mode 100644 apps/backend/src/retrieval-anon/retrieval-anon.module.ts create mode 100644 apps/backend/src/retrieval-anon/types.ts create mode 100644 apps/backend/src/subgraph/queries.ts create mode 100644 apps/backend/src/subgraph/subgraph.module.ts rename apps/backend/src/{pdp-subgraph/pdp-subgraph.service.spec.ts => subgraph/subgraph.service.spec.ts} (79%) rename apps/backend/src/{pdp-subgraph/pdp-subgraph.service.ts => subgraph/subgraph.service.ts} (52%) rename apps/backend/src/{pdp-subgraph => subgraph}/types.spec.ts (100%) rename apps/backend/src/{pdp-subgraph => subgraph}/types.ts (58%) diff --git a/.gitignore b/.gitignore index fc72832b..cbf7f9d7 100644 --- a/.gitignore +++ b/.gitignore @@ -21,3 +21,5 @@ coverage/ # per-package lockfiles are stray apps/*/pnpm-lock.yaml 
!pnpm-lock.yaml + +.tool-versions diff --git a/apps/backend/.env.example b/apps/backend/.env.example index 6815a66f..26469c52 100644 --- a/apps/backend/.env.example +++ b/apps/backend/.env.example @@ -23,7 +23,8 @@ WALLET_ADDRESS=0x0000000000000000000000000000000000000000 WALLET_PRIVATE_KEY=your_private_key_here CHECK_DATASET_CREATION_FEES=true USE_ONLY_APPROVED_PROVIDERS=true -PDP_SUBGRAPH_ENDPOINT=https://api.thegraph.com/subgraphs/filecoin/pdp +# Point at the dealbot-owned subgraph on Goldsky (see apps/subgraph/README.md). +SUBGRAPH_ENDPOINT=https://api.goldsky.com/api/public//subgraphs/dealbot-subgraph//gn # Minimum number of datasets per SP (default: 1). When > 1, a separate data_set_creation job provisions extra datasets. MIN_NUM_DATASETS_FOR_CHECKS=1 @@ -52,6 +53,9 @@ DEALBOT_MAINTENANCE_WINDOW_MINUTES=20 DEALS_PER_SP_PER_HOUR=2 DATASET_CREATIONS_PER_SP_PER_HOUR=1 RETRIEVALS_PER_SP_PER_HOUR=1 +RETRIEVALS_ANON_PER_SP_PER_HOUR= +ANON_RETRIEVAL_BLOCK_SAMPLE_COUNT=5 +METRICS_PER_HOUR=2 PG_BOSS_LOCAL_CONCURRENCY=20 JOB_SCHEDULER_POLL_SECONDS=300 JOB_WORKER_POLL_SECONDS=60 @@ -60,6 +64,7 @@ JOB_SCHEDULE_PHASE_SECONDS=0 JOB_ENQUEUE_JITTER_SECONDS=0 DEAL_JOB_TIMEOUT_SECONDS=360 # 6m: Max runtime for deal jobs (TODO: reduce default to 3m) RETRIEVAL_JOB_TIMEOUT_SECONDS=60 # 1m: Max runtime for retrieval jobs (TODO: reduce default to 30s) +ANON_RETRIEVAL_JOB_TIMEOUT_SECONDS=360 # 6m: Max runtime for anon retrieval jobs (pieces up to ~70 MiB) IPFS_BLOCK_FETCH_CONCURRENCY=6 # Parallel block fetches when validating IPFS DAGs DEALBOT_PGBOSS_POOL_MAX=1 DEALBOT_PGBOSS_SCHEDULER_ENABLED=true @@ -73,9 +78,13 @@ PROXY_LIST=http://username:password@host:port,http://username:password@host:port PROXY_LOCATIONS=l1,l2 # Timeout Configuration (in milliseconds) -CONNECT_TIMEOUT_MS=10000 # 10s: Initial connection timeout -HTTP_REQUEST_TIMEOUT_MS=240000 # 4m: Total transfer timeout for HTTP/1.1 (10MiB @ 170KB/s + overhead) -HTTP2_REQUEST_TIMEOUT_MS=240000 # 4m: Total transfer timeout 
for HTTP/2 (10MiB @ 170KB/s + overhead) +CONNECT_TIMEOUT_MS=10000 # 10s: Connection + response-headers timeout (scoped to the header phase only) +# HTTP_REQUEST_TIMEOUT_MS and HTTP2_REQUEST_TIMEOUT_MS default to the longest job timeout above +# (max of DEAL_/RETRIEVAL_/ANON_RETRIEVAL_/DATA_SET_CREATION_/MAX_PIECE_CLEANUP_ * 1000 ms) so the +# HTTP-level ceiling never pre-empts a job-scoped AbortSignal. Only override when you have a non-job +# caller of HttpClientService that needs a specific deadline. +# HTTP_REQUEST_TIMEOUT_MS=360000 +# HTTP2_REQUEST_TIMEOUT_MS=360000 # SP Blocklists configuration # BLOCKED_SP_IDS=1234,5678 diff --git a/apps/backend/README.md b/apps/backend/README.md index 19ee970a..4805080f 100644 --- a/apps/backend/README.md +++ b/apps/backend/README.md @@ -104,7 +104,7 @@ All configuration is done via environment variables in `.env`. | `CHECK_DATASET_CREATION_FEES` | Check fees before dataset creation | `true` | | `ENABLE_IPNI_TESTING` | IPNI testing mode (`disabled`/`random`/`always`) | `always` | | `USE_ONLY_APPROVED_PROVIDERS` | Only use approved storage providers | `true` | -| `PDP_SUBGRAPH_ENDPOINT` | PDP subgraph API endpoint for PDP proof-set/data-retention | `https://api.thegraph.com/subgraphs/filecoin/pdp` | +| `SUBGRAPH_ENDPOINT` | Subgraph GraphQL endpoint for PDP proof-set/data-retention and anon-retrieval queries | `https://api.goldsky.com/api/public//subgraphs/dealbot-subgraph//gn` | ### Scheduling Configuration (pg-boss) diff --git a/apps/backend/src/app.module.ts b/apps/backend/src/app.module.ts index 569ec5e4..0580f339 100644 --- a/apps/backend/src/app.module.ts +++ b/apps/backend/src/app.module.ts @@ -13,6 +13,7 @@ import { JobsModule } from "./jobs/jobs.module.js"; import { MetricsPrometheusModule } from "./metrics-prometheus/metrics-prometheus.module.js"; import { ProvidersModule } from "./providers/providers.module.js"; import { RetrievalModule } from "./retrieval/retrieval.module.js"; +import { RetrievalAnonModule } from 
"./retrieval-anon/retrieval-anon.module.js"; @Module({ imports: [ @@ -28,6 +29,7 @@ import { RetrievalModule } from "./retrieval/retrieval.module.js"; JobsModule, DealModule, RetrievalModule, + RetrievalAnonModule, DataSourceModule, ProvidersModule, ...(process.env.ENABLE_DEV_MODE === "true" ? [DevToolsModule] : []), diff --git a/apps/backend/src/config/app.config.ts b/apps/backend/src/config/app.config.ts index b3b32a37..4e49e4d8 100644 --- a/apps/backend/src/config/app.config.ts +++ b/apps/backend/src/config/app.config.ts @@ -56,7 +56,7 @@ export const configValidationSchema = Joi.object({ USE_ONLY_APPROVED_PROVIDERS: Joi.boolean().default(true), DEALBOT_DATASET_VERSION: Joi.string().optional(), MIN_NUM_DATASETS_FOR_CHECKS: Joi.number().integer().min(1).default(1), - PDP_SUBGRAPH_ENDPOINT: Joi.string().uri().optional().allow(""), + SUBGRAPH_ENDPOINT: Joi.string().uri().optional().allow(""), // Scheduling PROVIDERS_REFRESH_INTERVAL_SECONDS: Joi.number().default(4 * 3600), @@ -80,6 +80,7 @@ export const configValidationSchema = Joi.object({ DEALS_PER_SP_PER_HOUR: Joi.number().min(0.001).max(20).default(4), DATASET_CREATIONS_PER_SP_PER_HOUR: Joi.number().min(0.001).max(20).default(1), RETRIEVALS_PER_SP_PER_HOUR: Joi.number().min(0.001).max(20).default(2), + RETRIEVALS_ANON_PER_SP_PER_HOUR: Joi.number().min(0.001).max(20).optional(), // Polling interval for pg-boss scheduler (lower = more responsive, higher = less DB chatter). 
JOB_SCHEDULER_POLL_SECONDS: Joi.number().min(60).default(300), JOB_WORKER_POLL_SECONDS: Joi.number().min(5).default(60), @@ -91,8 +92,10 @@ export const configValidationSchema = Joi.object({ JOB_ENQUEUE_JITTER_SECONDS: Joi.number().min(0).default(0), DEAL_JOB_TIMEOUT_SECONDS: Joi.number().min(120).default(360), // 6 minutes max runtime for data storage jobs (TODO: reduce default to 3 minutes) RETRIEVAL_JOB_TIMEOUT_SECONDS: Joi.number().min(60).default(60), // 1 minute max runtime for retrieval jobs (TODO: reduce default to 30 seconds) + ANON_RETRIEVAL_JOB_TIMEOUT_SECONDS: Joi.number().min(60).default(360), // 6 minutes max runtime for anon retrieval jobs (pieces can be up to ~70 MiB) DATA_SET_CREATION_JOB_TIMEOUT_SECONDS: Joi.number().min(60).default(300), // 5 minutes max runtime for dataset creation jobs IPFS_BLOCK_FETCH_CONCURRENCY: Joi.number().integer().min(1).max(32).default(6), + ANON_RETRIEVAL_BLOCK_SAMPLE_COUNT: Joi.number().integer().min(1).max(50).default(5), // Piece Cleanup MAX_DATASET_STORAGE_SIZE_BYTES: Joi.number() @@ -131,8 +134,9 @@ export const configValidationSchema = Joi.object({ // Timeouts (in milliseconds) CONNECT_TIMEOUT_MS: Joi.number().min(1000).default(10000), // 10 seconds to establish connection/receive headers - HTTP_REQUEST_TIMEOUT_MS: Joi.number().min(1000).default(240000), // 4 minutes total for HTTP requests (10MiB @ 170KB/s + overhead) - HTTP2_REQUEST_TIMEOUT_MS: Joi.number().min(1000).default(240000), // 4 minutes total for HTTP/2 requests (10MiB @ 170KB/s + overhead) + // Defaults intentionally omitted so loadConfig can derive them from the longest job timeout. 
+ HTTP_REQUEST_TIMEOUT_MS: Joi.number().min(1000).optional(), + HTTP2_REQUEST_TIMEOUT_MS: Joi.number().min(1000).optional(), IPNI_VERIFICATION_TIMEOUT_MS: Joi.number().min(1000).default(60000), // 60 seconds max time to wait for IPNI verification IPNI_VERIFICATION_POLLING_MS: Joi.number().min(250).default(2000), // 2 seconds between IPNI verification polls @@ -173,7 +177,7 @@ export interface IBlockchainConfig { useOnlyApprovedProviders: boolean; dealbotDataSetVersion?: string; minNumDataSetsForChecks: number; - pdpSubgraphEndpoint?: string; + subgraphEndpoint?: string; } export interface ISchedulingConfig { @@ -264,6 +268,14 @@ export interface IJobsConfig { * Uses AbortController to actively cancel job execution. */ retrievalJobTimeoutSeconds: number; + /** + * Maximum runtime (seconds) for anonymous retrieval jobs before forced abort. + * + * Anonymous retrievals fetch arbitrary pieces (up to ~70 MiB), so this is + * typically larger than `retrievalJobTimeoutSeconds`. Uses AbortController + * to actively cancel job execution while still persisting partial metrics. + */ + anonRetrievalJobTimeoutSeconds: number; /** * Target number of piece cleanup runs per storage provider per hour. * @@ -278,6 +290,12 @@ export interface IJobsConfig { * Only used when `DEALBOT_JOBS_MODE=pgboss`. */ maxPieceCleanupRuntimeSeconds: number; + + /** + * Target number of anonymous retrieval tests per storage provider per hour. + * Defaults to retrievalsPerSpPerHour when not set. + */ + retrievalsAnonPerSpPerHour: number; } export interface IDatasetConfig { @@ -295,6 +313,10 @@ export interface ITimeoutConfig { export interface IRetrievalConfig { ipfsBlockFetchConcurrency: number; + /** + * Number of CAR blocks to sample for IPNI + block-fetch validation. 
+ */ + anonBlockSampleCount: number; } export interface IPieceCleanupConfig { @@ -336,6 +358,43 @@ export interface IConfig { } export function loadConfig(): IConfig { + const jobTimeoutSeconds = { + deal: Number.parseInt(process.env.DEAL_JOB_TIMEOUT_SECONDS || "360", 10), + retrieval: Number.parseInt(process.env.RETRIEVAL_JOB_TIMEOUT_SECONDS || "60", 10), + anonRetrieval: Number.parseInt(process.env.ANON_RETRIEVAL_JOB_TIMEOUT_SECONDS || "360", 10), + dataSetCreation: Number.parseInt(process.env.DATA_SET_CREATION_JOB_TIMEOUT_SECONDS || "300", 10), + pieceCleanup: Number.parseInt(process.env.MAX_PIECE_CLEANUP_RUNTIME_SECONDS || "300", 10), + }; + + // HTTP-level request timeouts default to the longest job timeout so the + // per-request ceiling never caps below the per-job budget. Any job-scoped + // AbortSignal fires first and is authoritative; the HTTP timer only kicks + // in for callers that do not pass a parent signal. + const longestJobTimeoutMs = Math.max(...Object.values(jobTimeoutSeconds)) * 1000; + + const httpRequestTimeoutMs = Number.parseInt(process.env.HTTP_REQUEST_TIMEOUT_MS || String(longestJobTimeoutMs), 10); + const http2RequestTimeoutMs = Number.parseInt( + process.env.HTTP2_REQUEST_TIMEOUT_MS || String(longestJobTimeoutMs), + 10, + ); + + // Misconfiguration guard: if someone explicitly sets an HTTP timeout below + // the longest job timeout, the HTTP-level timer will abort in-flight work + // before the job signal has a chance to report it. Warn loudly so this is + // caught at boot rather than inferred from short-timeout incidents later. + for (const [name, value] of [ + ["HTTP_REQUEST_TIMEOUT_MS", httpRequestTimeoutMs], + ["HTTP2_REQUEST_TIMEOUT_MS", http2RequestTimeoutMs], + ] as const) { + if (value < longestJobTimeoutMs) { + // eslint-disable-next-line no-console + console.warn( + `[config] ${name}=${value}ms is lower than the longest job timeout (${longestJobTimeoutMs}ms). 
` + + `HTTP requests may abort before the job signal fires, producing short, unexplained timeouts.`, + ); + } + } + return { app: { env: process.env.NODE_ENV || "development", @@ -378,7 +437,7 @@ export function loadConfig(): IConfig { useOnlyApprovedProviders: process.env.USE_ONLY_APPROVED_PROVIDERS !== "false", dealbotDataSetVersion: process.env.DEALBOT_DATASET_VERSION, minNumDataSetsForChecks: Number.parseInt(process.env.MIN_NUM_DATASETS_FOR_CHECKS || "1", 10), - pdpSubgraphEndpoint: process.env.PDP_SUBGRAPH_ENDPOINT || "", + subgraphEndpoint: process.env.SUBGRAPH_ENDPOINT || "", }, scheduling: { providersRefreshIntervalSeconds: Number.parseInt(process.env.PROVIDERS_REFRESH_INTERVAL_SECONDS || "14400", 10), @@ -401,11 +460,15 @@ export function loadConfig(): IConfig { catchupMaxEnqueue: Number.parseInt(process.env.JOB_CATCHUP_MAX_ENQUEUE || "10", 10), schedulePhaseSeconds: Number.parseInt(process.env.JOB_SCHEDULE_PHASE_SECONDS || "0", 10), enqueueJitterSeconds: Number.parseInt(process.env.JOB_ENQUEUE_JITTER_SECONDS || "0", 10), - dealJobTimeoutSeconds: Number.parseInt(process.env.DEAL_JOB_TIMEOUT_SECONDS || "360", 10), - retrievalJobTimeoutSeconds: Number.parseInt(process.env.RETRIEVAL_JOB_TIMEOUT_SECONDS || "60", 10), - dataSetCreationJobTimeoutSeconds: Number.parseInt(process.env.DATA_SET_CREATION_JOB_TIMEOUT_SECONDS || "300", 10), + dealJobTimeoutSeconds: jobTimeoutSeconds.deal, + retrievalJobTimeoutSeconds: jobTimeoutSeconds.retrieval, + anonRetrievalJobTimeoutSeconds: jobTimeoutSeconds.anonRetrieval, + retrievalsAnonPerSpPerHour: Number.parseFloat( + process.env.RETRIEVALS_ANON_PER_SP_PER_HOUR || process.env.RETRIEVALS_PER_SP_PER_HOUR || "2", + ), + dataSetCreationJobTimeoutSeconds: jobTimeoutSeconds.dataSetCreation, pieceCleanupPerSpPerHour: Number.parseFloat(process.env.JOB_PIECE_CLEANUP_PER_SP_PER_HOUR || String(1 / 24)), - maxPieceCleanupRuntimeSeconds: Number.parseInt(process.env.MAX_PIECE_CLEANUP_RUNTIME_SECONDS || "300", 10), + 
maxPieceCleanupRuntimeSeconds: jobTimeoutSeconds.pieceCleanup, }, dataset: { localDatasetsPath: process.env.DEALBOT_LOCAL_DATASETS_PATH || DEFAULT_LOCAL_DATASETS_PATH, @@ -427,13 +490,14 @@ export function loadConfig(): IConfig { }, timeouts: { connectTimeoutMs: Number.parseInt(process.env.CONNECT_TIMEOUT_MS || "10000", 10), - httpRequestTimeoutMs: Number.parseInt(process.env.HTTP_REQUEST_TIMEOUT_MS || "240000", 10), - http2RequestTimeoutMs: Number.parseInt(process.env.HTTP2_REQUEST_TIMEOUT_MS || "240000", 10), + httpRequestTimeoutMs, + http2RequestTimeoutMs, ipniVerificationTimeoutMs: Number.parseInt(process.env.IPNI_VERIFICATION_TIMEOUT_MS || "60000", 10), ipniVerificationPollingMs: Number.parseInt(process.env.IPNI_VERIFICATION_POLLING_MS || "2000", 10), }, retrieval: { ipfsBlockFetchConcurrency: Number.parseInt(process.env.IPFS_BLOCK_FETCH_CONCURRENCY || "6", 10), + anonBlockSampleCount: Number.parseInt(process.env.ANON_RETRIEVAL_BLOCK_SAMPLE_COUNT || "5", 10), }, clickhouse: { url: process.env.CLICKHOUSE_URL || undefined, diff --git a/apps/backend/src/data-retention/data-retention.module.ts b/apps/backend/src/data-retention/data-retention.module.ts index f459570a..f0aec1ec 100644 --- a/apps/backend/src/data-retention/data-retention.module.ts +++ b/apps/backend/src/data-retention/data-retention.module.ts @@ -2,12 +2,12 @@ import { Module } from "@nestjs/common"; import { TypeOrmModule } from "@nestjs/typeorm"; import { DataRetentionBaseline } from "../database/entities/data-retention-baseline.entity.js"; import { StorageProvider } from "../database/entities/storage-provider.entity.js"; -import { PdpSubgraphModule } from "../pdp-subgraph/pdp-subgraph.module.js"; +import { SubgraphModule } from "../subgraph/subgraph.module.js"; import { WalletSdkModule } from "../wallet-sdk/wallet-sdk.module.js"; import { DataRetentionService } from "./data-retention.service.js"; @Module({ - imports: [WalletSdkModule, PdpSubgraphModule, 
TypeOrmModule.forFeature([DataRetentionBaseline, StorageProvider])], + imports: [WalletSdkModule, SubgraphModule, TypeOrmModule.forFeature([DataRetentionBaseline, StorageProvider])], providers: [DataRetentionService], exports: [DataRetentionService], }) diff --git a/apps/backend/src/data-retention/data-retention.service.spec.ts b/apps/backend/src/data-retention/data-retention.service.spec.ts index 87ced66a..d2d539cf 100644 --- a/apps/backend/src/data-retention/data-retention.service.spec.ts +++ b/apps/backend/src/data-retention/data-retention.service.spec.ts @@ -7,8 +7,8 @@ import type { IConfig } from "../config/app.config.js"; import type { DataRetentionBaseline } from "../database/entities/data-retention-baseline.entity.js"; import { StorageProvider } from "../database/entities/storage-provider.entity.js"; import { buildCheckMetricLabels } from "../metrics-prometheus/check-metric-labels.js"; -import type { PDPSubgraphService } from "../pdp-subgraph/pdp-subgraph.service.js"; -import type { ProviderDataSetResponse } from "../pdp-subgraph/types.js"; +import type { SubgraphService } from "../subgraph/subgraph.service.js"; +import type { ProviderDataSetResponse } from "../subgraph/types.js"; import type { WalletSdkService } from "../wallet-sdk/wallet-sdk.service.js"; import { DataRetentionService } from "./data-retention.service.js"; @@ -42,7 +42,7 @@ describe("DataRetentionService", () => { let walletSdkServiceMock: { getTestingProviders: ReturnType; }; - let pdpSubgraphServiceMock: { + let subgraphServiceMock: { fetchSubgraphMeta: ReturnType; fetchProvidersWithDatasets: ReturnType; }; @@ -69,7 +69,7 @@ describe("DataRetentionService", () => { configServiceMock = { get: vi.fn((key: keyof IConfig) => { if (key === "blockchain") { - return { pdpSubgraphEndpoint: "https://example.com/subgraph" }; + return { subgraphEndpoint: "https://example.com/subgraph" }; } if (key === "spBlocklists") { return { ids: new Set(), addresses: new Set() }; @@ -95,7 +95,7 @@ 
describe("DataRetentionService", () => { ]), }; - pdpSubgraphServiceMock = { + subgraphServiceMock = { fetchSubgraphMeta: vi.fn().mockResolvedValue({ _meta: { block: { @@ -146,7 +146,7 @@ describe("DataRetentionService", () => { service = new DataRetentionService( configServiceMock, walletSdkServiceMock as unknown as WalletSdkService, - pdpSubgraphServiceMock as unknown as PDPSubgraphService, + subgraphServiceMock as unknown as SubgraphService, mockBaselineRepository as unknown as Repository, mockSPRepository as unknown as Repository, counterMock as unknown as Counter, @@ -155,15 +155,15 @@ describe("DataRetentionService", () => { ); }); - it("returns early when pdpSubgraphEndpoint is empty", async () => { + it("returns early when subgraphEndpoint is empty", async () => { (configServiceMock.get as ReturnType).mockReturnValue({ - pdpSubgraphEndpoint: "", + subgraphEndpoint: "", }); await service.pollDataRetention(); - expect(pdpSubgraphServiceMock.fetchSubgraphMeta).not.toHaveBeenCalled(); - expect(pdpSubgraphServiceMock.fetchProvidersWithDatasets).not.toHaveBeenCalled(); + expect(subgraphServiceMock.fetchSubgraphMeta).not.toHaveBeenCalled(); + expect(subgraphServiceMock.fetchProvidersWithDatasets).not.toHaveBeenCalled(); }); it("returns early when no testing providers configured", async () => { @@ -171,31 +171,31 @@ describe("DataRetentionService", () => { await service.pollDataRetention(); - expect(pdpSubgraphServiceMock.fetchProvidersWithDatasets).not.toHaveBeenCalled(); + expect(subgraphServiceMock.fetchProvidersWithDatasets).not.toHaveBeenCalled(); }); it("returns early when all providers are blocked for data-retention", async () => { (configServiceMock.get as ReturnType).mockImplementation((key: string) => { - if (key === "blockchain") return { pdpSubgraphEndpoint: "https://example.com/subgraph" }; + if (key === "blockchain") return { subgraphEndpoint: "https://example.com/subgraph" }; if (key === "spBlocklists") return { ids: new Set(), addresses: new 
Set([PROVIDER_A, PROVIDER_B]) }; }); await service.pollDataRetention(); - expect(pdpSubgraphServiceMock.fetchProvidersWithDatasets).not.toHaveBeenCalled(); + expect(subgraphServiceMock.fetchProvidersWithDatasets).not.toHaveBeenCalled(); }); it("excludes blocked providers from data-retention polling while retaining unblocked ones", async () => { (configServiceMock.get as ReturnType).mockImplementation((key: string) => { - if (key === "blockchain") return { pdpSubgraphEndpoint: "https://example.com/subgraph" }; + if (key === "blockchain") return { subgraphEndpoint: "https://example.com/subgraph" }; if (key === "spBlocklists") return { ids: new Set(), addresses: new Set([PROVIDER_A]) }; }); - pdpSubgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([makeProvider({ address: PROVIDER_B })]); + subgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([makeProvider({ address: PROVIDER_B })]); await service.pollDataRetention(); const allAddressesPolled: string[] = ( - pdpSubgraphServiceMock.fetchProvidersWithDatasets.mock.calls as [{ addresses: string[] }][] + subgraphServiceMock.fetchProvidersWithDatasets.mock.calls as [{ addresses: string[] }][] ).flatMap(([{ addresses }]) => addresses); expect(allAddressesPolled).toContain(PROVIDER_B.toLowerCase()); expect(allAddressesPolled).not.toContain(PROVIDER_A.toLowerCase()); @@ -206,16 +206,16 @@ describe("DataRetentionService", () => { await service.pollDataRetention(); - expect(pdpSubgraphServiceMock.fetchProvidersWithDatasets).not.toHaveBeenCalled(); + expect(subgraphServiceMock.fetchProvidersWithDatasets).not.toHaveBeenCalled(); }); it("sets baseline on first poll without emitting counters (fresh deploy / new provider)", async () => { - pdpSubgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([makeProvider()]); + subgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([makeProvider()]); await service.pollDataRetention(); - 
expect(pdpSubgraphServiceMock.fetchSubgraphMeta).toHaveBeenCalled(); - expect(pdpSubgraphServiceMock.fetchProvidersWithDatasets).toHaveBeenCalledWith({ + expect(subgraphServiceMock.fetchSubgraphMeta).toHaveBeenCalled(); + expect(subgraphServiceMock.fetchProvidersWithDatasets).toHaveBeenCalledWith({ blockNumber: 1200, addresses: [PROVIDER_A, PROVIDER_B], }); @@ -239,20 +239,20 @@ describe("DataRetentionService", () => { it("computes deltas correctly on consecutive polls", async () => { // First poll: blockNumber=1200 - pdpSubgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([makeProvider()]); + subgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([makeProvider()]); await service.pollDataRetention(); const firstCallCount = counterMock.labels.mock.calls.length; // Second poll: blockNumber=1300, provider totals changed - pdpSubgraphServiceMock.fetchSubgraphMeta.mockResolvedValueOnce({ + subgraphServiceMock.fetchSubgraphMeta.mockResolvedValueOnce({ _meta: { block: { number: 1300, }, }, }); - pdpSubgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([ + subgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([ makeProvider({ totalFaultedPeriods: 12n, totalProvingPeriods: 105n, @@ -266,7 +266,7 @@ describe("DataRetentionService", () => { }); it("does not increment counters when deltas are zero", async () => { - pdpSubgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValue([makeProvider()]); + subgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValue([makeProvider()]); // First poll await service.pollDataRetention(); @@ -288,7 +288,7 @@ describe("DataRetentionService", () => { const providerA = makeProvider({ address: PROVIDER_A, totalFaultedPeriods: 5n }); const providerB = makeProvider({ address: PROVIDER_B, totalFaultedPeriods: 20n }); - pdpSubgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([providerA, providerB]); + 
subgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([providerA, providerB]); await service.pollDataRetention(); @@ -310,7 +310,7 @@ describe("DataRetentionService", () => { ]); const provider = makeProvider(); - pdpSubgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([provider]); + subgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([provider]); await service.pollDataRetention(); @@ -333,7 +333,7 @@ describe("DataRetentionService", () => { }); it("handles empty providers array without errors", async () => { - pdpSubgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([]); + subgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([]); await service.pollDataRetention(); @@ -347,7 +347,7 @@ describe("DataRetentionService", () => { ]); const provider = makeProvider(); - pdpSubgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([provider]); + subgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([provider]); await service.pollDataRetention(); @@ -370,7 +370,7 @@ describe("DataRetentionService", () => { }); it("catches and logs errors without rethrowing", async () => { - pdpSubgraphServiceMock.fetchProvidersWithDatasets.mockRejectedValueOnce(new Error("subgraph down")); + subgraphServiceMock.fetchProvidersWithDatasets.mockRejectedValueOnce(new Error("subgraph down")); // Should not throw await expect(service.pollDataRetention()).resolves.toBeUndefined(); @@ -378,14 +378,14 @@ describe("DataRetentionService", () => { it("resets baseline on negative deltas without incrementing counters", async () => { // First poll: high values - pdpSubgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([ + subgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([ makeProvider({ totalFaultedPeriods: 100n, totalProvingPeriods: 200n }), ]); await service.pollDataRetention(); counterMock.labels.mockClear(); // Second poll: lower values (e.g., chain reorg or 
subgraph correction) - pdpSubgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([ + subgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([ makeProvider({ totalFaultedPeriods: 50n, totalProvingPeriods: 100n }), ]); await service.pollDataRetention(); @@ -394,7 +394,7 @@ describe("DataRetentionService", () => { expect(counterMock.labels).not.toHaveBeenCalled(); // Third poll: values increase from new baseline - pdpSubgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([ + subgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([ makeProvider({ totalFaultedPeriods: 52n, totalProvingPeriods: 105n }), ]); await service.pollDataRetention(); @@ -412,7 +412,7 @@ describe("DataRetentionService", () => { { providerAddress: PROVIDER_A, faultedPeriods: "0", successPeriods: "0", lastBlockNumber: "1000" }, ]); - pdpSubgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([ + subgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([ makeProvider({ totalFaultedPeriods: largeValue, totalProvingPeriods: largeValue * 2n }), ]); @@ -436,7 +436,7 @@ describe("DataRetentionService", () => { { providerAddress: PROVIDER_A, faultedPeriods: "0", successPeriods: "0", lastBlockNumber: "1000" }, ]); - pdpSubgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([ + subgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([ makeProvider({ totalFaultedPeriods: maxSafeInt, totalProvingPeriods: maxSafeInt * 2n }), ]); @@ -456,7 +456,7 @@ describe("DataRetentionService", () => { totalFaultedPeriods: 5n, totalProvingPeriods: 50n, }); - pdpSubgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([provider]); + subgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([provider]); await service.pollDataRetention(); @@ -475,18 +475,18 @@ describe("DataRetentionService", () => { })); walletSdkServiceMock.getTestingProviders.mockReturnValueOnce(manyProviders); - 
pdpSubgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValue([]); + subgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValue([]); await service.pollDataRetention(); // Should be called twice: once for first 50, once for remaining 25 - expect(pdpSubgraphServiceMock.fetchProvidersWithDatasets).toHaveBeenCalledTimes(2); - expect(pdpSubgraphServiceMock.fetchProvidersWithDatasets).toHaveBeenNthCalledWith(1, { + expect(subgraphServiceMock.fetchProvidersWithDatasets).toHaveBeenCalledTimes(2); + expect(subgraphServiceMock.fetchProvidersWithDatasets).toHaveBeenNthCalledWith(1, { addresses: expect.arrayContaining([expect.any(String)]), blockNumber: 1200, }); - expect(pdpSubgraphServiceMock.fetchProvidersWithDatasets.mock.calls[0][0].addresses).toHaveLength(50); - expect(pdpSubgraphServiceMock.fetchProvidersWithDatasets.mock.calls[1][0].addresses).toHaveLength(25); + expect(subgraphServiceMock.fetchProvidersWithDatasets.mock.calls[0][0].addresses).toHaveLength(50); + expect(subgraphServiceMock.fetchProvidersWithDatasets.mock.calls[1][0].addresses).toHaveLength(25); }); it("continues processing next batch if one batch fails", async () => { @@ -499,20 +499,20 @@ describe("DataRetentionService", () => { walletSdkServiceMock.getTestingProviders.mockReturnValueOnce(manyProviders); // First batch fails, second succeeds - pdpSubgraphServiceMock.fetchProvidersWithDatasets + subgraphServiceMock.fetchProvidersWithDatasets .mockRejectedValueOnce(new Error("Subgraph timeout")) .mockResolvedValueOnce([]); await service.pollDataRetention(); // Both batches should be attempted - expect(pdpSubgraphServiceMock.fetchProvidersWithDatasets).toHaveBeenCalledTimes(2); + expect(subgraphServiceMock.fetchProvidersWithDatasets).toHaveBeenCalledTimes(2); }); it("logs error and skips counter update when provider not found in cache but returned from subgraph", async () => { // Provider C not in cache const PROVIDER_C = "0x1234567890123456789012345678901234567890"; - 
pdpSubgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([makeProvider({ address: PROVIDER_C })]); + subgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([makeProvider({ address: PROVIDER_C })]); await service.pollDataRetention(); @@ -523,7 +523,7 @@ describe("DataRetentionService", () => { describe("cleanupStaleProviders", () => { it("does not cleanup when no stale providers exist", async () => { // First poll establishes baseline for both providers - pdpSubgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([ + subgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([ makeProvider({ address: PROVIDER_A }), makeProvider({ address: PROVIDER_B }), ]); @@ -536,7 +536,7 @@ describe("DataRetentionService", () => { it("successfully cleans up stale provider with valid database entry", async () => { // First poll: establish baseline for PROVIDER_A - pdpSubgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([makeProvider({ address: PROVIDER_A })]); + subgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([makeProvider({ address: PROVIDER_A })]); await service.pollDataRetention(); // Second poll: PROVIDER_A removed from active list, only PROVIDER_B active @@ -558,7 +558,7 @@ describe("DataRetentionService", () => { }, ]); - pdpSubgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([makeProvider({ address: PROVIDER_B })]); + subgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([makeProvider({ address: PROVIDER_B })]); await service.pollDataRetention(); @@ -589,7 +589,7 @@ describe("DataRetentionService", () => { it("skips cleanup entirely when database fetch fails", async () => { // First poll: establish baseline - pdpSubgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([makeProvider({ address: PROVIDER_A })]); + subgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([makeProvider({ address: PROVIDER_A })]); await 
service.pollDataRetention(); // Second poll: provider removed, but DB fails @@ -604,7 +604,7 @@ describe("DataRetentionService", () => { mockSPRepository.find.mockRejectedValueOnce(new Error("Database connection failed")); - pdpSubgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([makeProvider({ address: PROVIDER_B })]); + subgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([makeProvider({ address: PROVIDER_B })]); await service.pollDataRetention(); @@ -624,7 +624,7 @@ describe("DataRetentionService", () => { }, ]); - pdpSubgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([ + subgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([ makeProvider({ address: PROVIDER_A, totalFaultedPeriods: 12n, totalProvingPeriods: 105n }), ]); @@ -637,7 +637,7 @@ describe("DataRetentionService", () => { it("retains baseline when provider not found in database", async () => { // First poll: establish baseline - pdpSubgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([makeProvider({ address: PROVIDER_A })]); + subgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([makeProvider({ address: PROVIDER_A })]); await service.pollDataRetention(); // Second poll: provider removed from active list @@ -653,7 +653,7 @@ describe("DataRetentionService", () => { // Database returns empty array (provider not found) mockSPRepository.find.mockResolvedValueOnce([]); - pdpSubgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([makeProvider({ address: PROVIDER_B })]); + subgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([makeProvider({ address: PROVIDER_B })]); await service.pollDataRetention(); @@ -670,7 +670,7 @@ describe("DataRetentionService", () => { }, ]); - pdpSubgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([ + subgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([ makeProvider({ address: PROVIDER_A, totalFaultedPeriods: 12n, 
totalProvingPeriods: 105n }), ]); @@ -683,7 +683,7 @@ describe("DataRetentionService", () => { it("retains baseline when provider has null providerId", async () => { // First poll: establish baseline - pdpSubgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([makeProvider({ address: PROVIDER_A })]); + subgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([makeProvider({ address: PROVIDER_A })]); await service.pollDataRetention(); // Second poll: provider removed @@ -706,7 +706,7 @@ describe("DataRetentionService", () => { }, ]); - pdpSubgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([makeProvider({ address: PROVIDER_B })]); + subgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([makeProvider({ address: PROVIDER_B })]); await service.pollDataRetention(); @@ -716,7 +716,7 @@ describe("DataRetentionService", () => { it("retains baseline when counter removal throws error", async () => { // First poll: establish baseline - pdpSubgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([makeProvider({ address: PROVIDER_A })]); + subgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([makeProvider({ address: PROVIDER_A })]); await service.pollDataRetention(); // Second poll: provider removed @@ -743,7 +743,7 @@ describe("DataRetentionService", () => { throw new Error("Counter removal failed"); }); - pdpSubgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([makeProvider({ address: PROVIDER_B })]); + subgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([makeProvider({ address: PROVIDER_B })]); await service.pollDataRetention(); @@ -760,7 +760,7 @@ describe("DataRetentionService", () => { }, ]); - pdpSubgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([ + subgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([ makeProvider({ address: PROVIDER_A, totalFaultedPeriods: 12n, totalProvingPeriods: 110n }), ]); @@ -781,7 +781,7 @@ 
describe("DataRetentionService", () => { { id: 3, serviceProvider: PROVIDER_C, name: "Provider C", isApproved: true }, ]); - pdpSubgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([ + subgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([ makeProvider({ address: PROVIDER_A }), makeProvider({ address: PROVIDER_B }), makeProvider({ address: PROVIDER_C }), @@ -799,7 +799,7 @@ describe("DataRetentionService", () => { { address: PROVIDER_C, name: "Provider C", providerId: 3, isApproved: true }, ]); - pdpSubgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([makeProvider({ address: PROVIDER_A })]); + subgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([makeProvider({ address: PROVIDER_A })]); await service.pollDataRetention(); @@ -815,7 +815,7 @@ describe("DataRetentionService", () => { it("skips cleanup when processing errors occurred", async () => { // First poll: establish baseline - pdpSubgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([makeProvider({ address: PROVIDER_A })]); + subgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([makeProvider({ address: PROVIDER_A })]); await service.pollDataRetention(); // Second poll: provider removed, but processing has errors @@ -824,7 +824,7 @@ describe("DataRetentionService", () => { ]); // Simulate processing error - pdpSubgraphServiceMock.fetchProvidersWithDatasets.mockRejectedValueOnce(new Error("Processing failed")); + subgraphServiceMock.fetchProvidersWithDatasets.mockRejectedValueOnce(new Error("Processing failed")); await service.pollDataRetention(); @@ -841,7 +841,7 @@ describe("DataRetentionService", () => { { id: 1, serviceProvider: PROVIDER_MIXED_CASE, name: "Provider A", isApproved: true }, ]); - pdpSubgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([ + subgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([ makeProvider({ address: PROVIDER_MIXED_CASE.toLowerCase() as `0x${string}` 
}), ]); @@ -861,7 +861,7 @@ describe("DataRetentionService", () => { }, ]); - pdpSubgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([makeProvider({ address: PROVIDER_B })]); + subgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([makeProvider({ address: PROVIDER_B })]); await service.pollDataRetention(); @@ -885,7 +885,7 @@ describe("DataRetentionService", () => { // Subgraph returns same values: totalFaultedPeriods=10, totalProvingPeriods=100 // confirmedTotalSuccess = 100 - 10 = 90 // With DB baseline: faultedDelta = 10 - 10 = 0, successDelta = 90 - 90 = 0 - pdpSubgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([makeProvider()]); + subgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([makeProvider()]); await service.pollDataRetention(); @@ -907,7 +907,7 @@ describe("DataRetentionService", () => { // Subgraph returns: totalFaultedPeriods=10, totalProvingPeriods=100 // confirmedTotalSuccess = 100 - 10 = 90 // faultedDelta = 10 - 8 = 2, successDelta = 90 - 85 = 5 - pdpSubgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([makeProvider()]); + subgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([makeProvider()]); await service.pollDataRetention(); @@ -922,7 +922,7 @@ describe("DataRetentionService", () => { }); it("reloads baselines from DB on every poll", async () => { - pdpSubgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValue([makeProvider()]); + subgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValue([makeProvider()]); await service.pollDataRetention(); await service.pollDataRetention(); @@ -932,13 +932,13 @@ describe("DataRetentionService", () => { }); it("does not double-count when poll ownership alternates across worker pods", async () => { - pdpSubgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([makeProvider()]); + subgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([makeProvider()]); await 
service.pollDataRetention(); const secondPod = new DataRetentionService( configServiceMock, walletSdkServiceMock as unknown as WalletSdkService, - pdpSubgraphServiceMock as unknown as PDPSubgraphService, + subgraphServiceMock as unknown as SubgraphService, mockBaselineRepository as unknown as Repository, mockSPRepository as unknown as Repository, counterMock as unknown as Counter, @@ -946,8 +946,8 @@ describe("DataRetentionService", () => { { insert: vi.fn(), probeLocation: "test" } as unknown as ClickhouseService, ); - pdpSubgraphServiceMock.fetchSubgraphMeta.mockResolvedValueOnce({ _meta: { block: { number: 1300 } } }); - pdpSubgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([ + subgraphServiceMock.fetchSubgraphMeta.mockResolvedValueOnce({ _meta: { block: { number: 1300 } } }); + subgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([ makeProvider({ totalFaultedPeriods: 11n, totalProvingPeriods: 102n }), ]); await secondPod.pollDataRetention(); @@ -955,8 +955,8 @@ describe("DataRetentionService", () => { counterMock.labels.mockClear(); counterMock.inc.mockClear(); - pdpSubgraphServiceMock.fetchSubgraphMeta.mockResolvedValueOnce({ _meta: { block: { number: 1400 } } }); - pdpSubgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([ + subgraphServiceMock.fetchSubgraphMeta.mockResolvedValueOnce({ _meta: { block: { number: 1400 } } }); + subgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([ makeProvider({ totalFaultedPeriods: 12n, totalProvingPeriods: 104n }), ]); await service.pollDataRetention(); @@ -972,8 +972,8 @@ describe("DataRetentionService", () => { ]; mockBaselineRepository.upsert.mockRejectedValueOnce(new Error("DB write failed")); - pdpSubgraphServiceMock.fetchSubgraphMeta.mockResolvedValueOnce({ _meta: { block: { number: 1300 } } }); - pdpSubgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([ + subgraphServiceMock.fetchSubgraphMeta.mockResolvedValueOnce({ _meta: { block: { 
number: 1300 } } }); + subgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([ makeProvider({ totalFaultedPeriods: 12n, totalProvingPeriods: 105n }), ]); @@ -981,8 +981,8 @@ describe("DataRetentionService", () => { expect(counterMock.labels).not.toHaveBeenCalled(); - pdpSubgraphServiceMock.fetchSubgraphMeta.mockResolvedValueOnce({ _meta: { block: { number: 1400 } } }); - pdpSubgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([ + subgraphServiceMock.fetchSubgraphMeta.mockResolvedValueOnce({ _meta: { block: { number: 1400 } } }); + subgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([ makeProvider({ totalFaultedPeriods: 12n, totalProvingPeriods: 105n }), ]); @@ -1003,12 +1003,12 @@ describe("DataRetentionService", () => { }, ]); - pdpSubgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValue([makeProvider()]); + subgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValue([makeProvider()]); // First poll: DB load fails, poll bails out to avoid emitting bloated values await service.pollDataRetention(); expect(mockBaselineRepository.find).toHaveBeenCalledTimes(1); - expect(pdpSubgraphServiceMock.fetchSubgraphMeta).not.toHaveBeenCalled(); + expect(subgraphServiceMock.fetchSubgraphMeta).not.toHaveBeenCalled(); expect(counterMock.labels).not.toHaveBeenCalled(); // Second poll: DB load succeeds, baselines restored, normal delta computation @@ -1021,16 +1021,16 @@ describe("DataRetentionService", () => { it("emits real deltas on second poll after fresh deploy baseline-only first poll", async () => { // First poll: fresh deploy, no baselines in DB // Baseline set to: faultedPeriods=10, successPeriods=90 - pdpSubgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([makeProvider()]); + subgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([makeProvider()]); await service.pollDataRetention(); counterMock.labels.mockClear(); counterMock.inc.mockClear(); // Second poll: values have increased - 
pdpSubgraphServiceMock.fetchSubgraphMeta.mockResolvedValueOnce({ + subgraphServiceMock.fetchSubgraphMeta.mockResolvedValueOnce({ _meta: { block: { number: 1300 } }, }); - pdpSubgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([ + subgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([ makeProvider({ totalFaultedPeriods: 12n, totalProvingPeriods: 105n }), ]); @@ -1044,7 +1044,7 @@ describe("DataRetentionService", () => { it("deletes baseline from DB when stale provider is cleaned up", async () => { // First poll: establish baseline for PROVIDER_A - pdpSubgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([makeProvider({ address: PROVIDER_A })]); + subgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([makeProvider({ address: PROVIDER_A })]); await service.pollDataRetention(); // Second poll: PROVIDER_A removed from active list @@ -1056,7 +1056,7 @@ describe("DataRetentionService", () => { { address: PROVIDER_A, name: "Provider A", providerId: 1, isApproved: true }, ]); - pdpSubgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([makeProvider({ address: PROVIDER_B })]); + subgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([makeProvider({ address: PROVIDER_B })]); await service.pollDataRetention(); @@ -1069,7 +1069,7 @@ describe("DataRetentionService", () => { it("emits overdue gauge on first poll (baseline-only)", async () => { // Provider is overdue: currentBlock=1200, // estimatedOverduePeriods = (1200 - 901) / 100 = 2.99 -> 2 - pdpSubgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([makeProvider()]); + subgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([makeProvider()]); await service.pollDataRetention(); @@ -1086,7 +1086,7 @@ describe("DataRetentionService", () => { it("emits overdue gauge = 0 when provider is not overdue", async () => { // nextDeadline=2000 > currentBlock=1200 - 
pdpSubgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([makeProvider({ proofSets: [] })]); + subgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([makeProvider({ proofSets: [] })]); await service.pollDataRetention(); @@ -1095,7 +1095,7 @@ describe("DataRetentionService", () => { it("emits overdue gauge even on negative delta (baseline reset)", async () => { // First poll: high values - pdpSubgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([ + subgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([ makeProvider({ totalFaultedPeriods: 100n, totalProvingPeriods: 200n }), ]); await service.pollDataRetention(); @@ -1103,7 +1103,7 @@ describe("DataRetentionService", () => { gaugeMock.set.mockClear(); // Second poll: lower values (negative delta) but still overdue - pdpSubgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([ + subgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([ makeProvider({ totalFaultedPeriods: 50n, totalProvingPeriods: 100n }), ]); await service.pollDataRetention(); @@ -1115,7 +1115,7 @@ describe("DataRetentionService", () => { it("naturally resets gauge to 0 when subgraph catches up", async () => { // First poll: provider is overdue (currentBlock=1200, nextDeadline=1000) - pdpSubgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([makeProvider()]); + subgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([makeProvider()]); await service.pollDataRetention(); expect(gaugeMock.set).toHaveBeenCalledWith(2); @@ -1124,7 +1124,7 @@ describe("DataRetentionService", () => { gaugeMock.set.mockClear(); // Second poll: subgraph caught up, nextDeadline advanced past currentBlock - pdpSubgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([ + subgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([ makeProvider({ totalFaultedPeriods: 12n, totalProvingPeriods: 102n, @@ -1140,7 +1140,7 @@ 
describe("DataRetentionService", () => { it("removes overdue gauge when stale provider is cleaned up", async () => { // First poll: establish baseline for PROVIDER_A - pdpSubgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([makeProvider({ address: PROVIDER_A })]); + subgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([makeProvider({ address: PROVIDER_A })]); await service.pollDataRetention(); // Second poll: PROVIDER_A removed from active list @@ -1152,7 +1152,7 @@ describe("DataRetentionService", () => { { address: PROVIDER_A, name: "Provider A", providerId: 1, isApproved: true }, ]); - pdpSubgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([makeProvider({ address: PROVIDER_B })]); + subgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([makeProvider({ address: PROVIDER_B })]); await service.pollDataRetention(); diff --git a/apps/backend/src/data-retention/data-retention.service.ts b/apps/backend/src/data-retention/data-retention.service.ts index c6ece7b5..1422bbfd 100644 --- a/apps/backend/src/data-retention/data-retention.service.ts +++ b/apps/backend/src/data-retention/data-retention.service.ts @@ -11,8 +11,8 @@ import { IConfig } from "../config/app.config.js"; import { DataRetentionBaseline } from "../database/entities/data-retention-baseline.entity.js"; import { StorageProvider } from "../database/entities/storage-provider.entity.js"; import { buildCheckMetricLabels, CheckMetricLabels } from "../metrics-prometheus/check-metric-labels.js"; -import { PDPSubgraphService } from "../pdp-subgraph/pdp-subgraph.service.js"; -import { type ProviderDataSetResponse } from "../pdp-subgraph/types.js"; +import { SubgraphService } from "../subgraph/subgraph.service.js"; +import { type ProviderDataSetResponse } from "../subgraph/types.js"; import { WalletSdkService } from "../wallet-sdk/wallet-sdk.service.js"; import { type PDPProviderEx } from "../wallet-sdk/wallet-sdk.types.js"; @@ -41,7 +41,7 @@ export 
class DataRetentionService { constructor( private readonly configService: ConfigService, private readonly walletSdkService: WalletSdkService, - private readonly pdpSubgraphService: PDPSubgraphService, + private readonly subgraphService: SubgraphService, @InjectRepository(DataRetentionBaseline) private readonly baselineRepository: Repository, @InjectRepository(StorageProvider) @@ -59,10 +59,10 @@ export class DataRetentionService { * challenge delta since the last poll. */ async pollDataRetention(): Promise { - const pdpSubgraphEndpoint = this.configService.get("blockchain").pdpSubgraphEndpoint; - if (!pdpSubgraphEndpoint) { + const subgraphEndpoint = this.configService.get("blockchain").subgraphEndpoint; + if (!subgraphEndpoint) { this.logger.warn({ - event: "pdp_subgraph_endpoint_not_configured", + event: "subgraph_endpoint_not_configured", message: "No PDP subgraph endpoint configured", }); return; @@ -75,7 +75,7 @@ export class DataRetentionService { } try { - const subgraphMeta = await this.pdpSubgraphService.fetchSubgraphMeta(); + const subgraphMeta = await this.subgraphService.fetchSubgraphMeta(); const allProviderInfos = this.walletSdkService.getTestingProviders(); const spBlocklists = this.configService.get("spBlocklists"); const providerInfos = allProviderInfos?.filter((p) => !isSpBlocked(spBlocklists, p.serviceProvider, p.id)); @@ -104,7 +104,7 @@ export class DataRetentionService { ); try { - const providersFromSubgraph = await this.pdpSubgraphService.fetchProvidersWithDatasets({ + const providersFromSubgraph = await this.subgraphService.fetchProvidersWithDatasets({ blockNumber, addresses: batchAddresses, }); diff --git a/apps/backend/src/database/database.module.ts b/apps/backend/src/database/database.module.ts index 9249c3a9..f3f9ed09 100644 --- a/apps/backend/src/database/database.module.ts +++ b/apps/backend/src/database/database.module.ts @@ -7,6 +7,7 @@ import { fileURLToPath } from "url"; import { toStructuredError } from "../common/logging.js"; 
import { createPinoExitLogger } from "../common/pino.config.js"; import type { IAppConfig, IConfig, IDatabaseConfig } from "../config/app.config.js"; +import { AnonRetrieval } from "./entities/anon-retrieval.entity.js"; import { DataRetentionBaseline } from "./entities/data-retention-baseline.entity.js"; import { Deal } from "./entities/deal.entity.js"; import { JobScheduleState } from "./entities/job-schedule-state.entity.js"; @@ -49,7 +50,7 @@ function toSafeDataSourceContext(options: DataSourceOptions): Record { + await queryRunner.query(` + CREATE TABLE anon_retrievals ( + id UUID PRIMARY KEY DEFAULT gen_random_uuid(), + sp_address VARCHAR NOT NULL, + piece_cid VARCHAR NOT NULL, + data_set_id BIGINT NOT NULL, + piece_id BIGINT NOT NULL, + raw_size BIGINT NOT NULL, + with_ipfs_indexing BOOLEAN NOT NULL, + ipfs_root_cid VARCHAR NULL, + service_type VARCHAR NOT NULL DEFAULT 'direct_sp', + retrieval_endpoint VARCHAR NOT NULL, + status VARCHAR NOT NULL DEFAULT 'pending', + started_at TIMESTAMPTZ NOT NULL, + completed_at TIMESTAMPTZ NULL, + latency_ms INT NULL, + ttfb_ms INT NULL, + throughput_bps INT NULL, + bytes_retrieved BIGINT NULL, + response_code INT NULL, + error_message VARCHAR NULL, + commp_valid BOOLEAN NULL, + car_valid BOOLEAN NULL, + created_at TIMESTAMPTZ NOT NULL DEFAULT now(), + updated_at TIMESTAMPTZ NOT NULL DEFAULT now() + ) + `); + + // Per-SP dashboards. + await queryRunner.query(` + CREATE INDEX "IDX_anon_retrievals_sp_address" + ON anon_retrievals (sp_address) + `); + + // Used by the recent-dedup query in AnonPieceSelectorService — keeps the + // most-recently-tested CIDs out of the next selection. + await queryRunner.query(` + CREATE INDEX "IDX_anon_retrievals_piece_cid" + ON anon_retrievals (piece_cid) + `); + + // Supports "last N anonymous retrievals" ordering used by the selector. 
+ await queryRunner.query(` + CREATE INDEX "IDX_anon_retrievals_created_at" + ON anon_retrievals (created_at DESC) + `); + } + + public async down(queryRunner: QueryRunner): Promise { + await queryRunner.query(`DROP TABLE IF EXISTS anon_retrievals`); + } +} diff --git a/apps/backend/src/http-client/http-client.service.spec.ts b/apps/backend/src/http-client/http-client.service.spec.ts index 96604139..511910ba 100644 --- a/apps/backend/src/http-client/http-client.service.spec.ts +++ b/apps/backend/src/http-client/http-client.service.spec.ts @@ -64,25 +64,94 @@ describe("HttpClientService", () => { expect(config.timeout).toBe(120000); }); - it("times out HTTP/2 requests using the connection timeout", async () => { + it("passes the configured headersTimeout to undici and translates its error", async () => { const service = await createService(); - if (typeof AbortSignal.timeout !== "function") { - (AbortSignal as any).timeout = () => new AbortController().signal; + let receivedHeadersTimeout: number | undefined; + undiciRequestMock.mockImplementationOnce((_url: string, options: { headersTimeout?: number }) => { + receivedHeadersTimeout = options.headersTimeout; + const err = new Error("Headers Timeout Error") as Error & { code?: string }; + err.name = "HeadersTimeoutError"; + err.code = "UND_ERR_HEADERS_TIMEOUT"; + return Promise.reject(err); + }); + + await expect(service.requestWithMetrics("http://example.com", { httpVersion: "2" })).rejects.toThrow( + "HTTP/2 connection/headers timed out after 25ms", + ); + + expect(receivedHeadersTimeout).toBe(25); + }); + + it("keeps the request signal alive after the connect timeout window elapses", async () => { + const service = await createService(); + + // Previously, connectTimeoutMs (25ms) was folded into the request signal, + // so any download lasting longer than 25ms was aborted mid-stream. The + // signal must now stay live until the transfer timeout or parent signal + // fires. 
+ let sawAbortBeforeResolve = false; + undiciRequestMock.mockImplementationOnce(async (_url: string, options: { signal?: AbortSignal }) => { + await new Promise((r) => setTimeout(r, 75)); + sawAbortBeforeResolve = options.signal?.aborted === true; + async function* body() { + yield Buffer.from("ok"); + } + return { statusCode: 200, body: body() }; + }); + + const result = await service.requestWithMetrics("http://example.com", { httpVersion: "2" }); + + expect(sawAbortBeforeResolve).toBe(false); + expect(result.aborted).toBeUndefined(); + expect(result.metrics.statusCode).toBe(200); + }); + + it("returns partial bytes and metrics when HTTP/2 download is aborted after headers", async () => { + const service = await createService(); + + const parentAbort = new AbortController(); + + async function* abortingBody() { + yield Buffer.from("hello"); + yield Buffer.from(" world"); + // Simulate an abort mid-stream after two chunks. + parentAbort.abort(new Error("Anon retrieval job timeout (60s) for sp1")); + throw new Error("aborted"); } - undiciRequestMock.mockImplementationOnce((_url: string, options: { signal?: AbortSignal }) => { - return new Promise((_resolve, reject) => { - options.signal?.addEventListener("abort", () => reject(new Error("aborted")), { once: true }); - }); + undiciRequestMock.mockImplementationOnce(async () => ({ + statusCode: 200, + body: abortingBody(), + })); + + const result = await service.requestWithMetrics("http://example.com/piece", { + httpVersion: "2", + signal: parentAbort.signal, }); - vi.useFakeTimers(); + expect(result.aborted).toBe(true); + expect(result.abortReason).toContain("timeout"); + expect(result.metrics.statusCode).toBe(200); + expect(result.metrics.responseSize).toBe(11); + expect(Buffer.isBuffer(result.data) ? 
result.data.toString() : "").toBe("hello world"); + }); + + it("rethrows non-abort download errors on HTTP/2", async () => { + const service = await createService(); - const promise = service.requestWithMetrics("http://example.com", { httpVersion: "2" }); - const assertion = expect(promise).rejects.toThrow("HTTP/2 connection/headers timed out after 25ms"); - await vi.advanceTimersByTimeAsync(25); + async function* brokenBody() { + yield Buffer.from("partial"); + throw new Error("network reset"); + } + + undiciRequestMock.mockImplementationOnce(async () => ({ + statusCode: 200, + body: brokenBody(), + })); - await assertion; + await expect(service.requestWithMetrics("http://example.com/piece", { httpVersion: "2" })).rejects.toThrow( + "network reset", + ); }); }); diff --git a/apps/backend/src/http-client/http-client.service.ts b/apps/backend/src/http-client/http-client.service.ts index 48e10e5c..81140162 100644 --- a/apps/backend/src/http-client/http-client.service.ts +++ b/apps/backend/src/http-client/http-client.service.ts @@ -81,12 +81,11 @@ export class HttpClientService { let ttfbTime = 0; let statusCode = 0; - /** - * Dual-timeout strategy for HTTP/2 requests: - * 1. AbortSignal.timeout() - Undici's native timeout (10 min default) - * 2. AbortSignal.timeout() for connection/headers (10 sec default) - */ - const { signal, connectTimeoutSignal } = this.buildHttp2Signals(options.signal); + // Dual-timeout strategy for HTTP/2 requests: + // - `headersTimeout` (undici): scopes the connect + response-headers phase. + // - Combined AbortSignal: transfer-timeout ceiling + parent (job) signal. + const transferTimeoutSignal = AbortSignal.timeout(this.http2TimeoutMs); + const signal = options.signal ? 
anySignal([transferTimeoutSignal, options.signal]) : transferTimeoutSignal; const requestOptions: any = { method, headers: { @@ -94,6 +93,7 @@ export class HttpClientService { ...headers, }, signal, + headersTimeout: this.connectTimeoutMs, }; if (data) { @@ -105,7 +105,8 @@ export class HttpClientService { try { response = await undiciRequest(url, requestOptions); } catch (error) { - if (connectTimeoutSignal.aborted) { + // discern connection error from transfer error + if (isHeadersTimeoutError(error)) { throw new Error(`HTTP/2 connection/headers timed out after ${this.connectTimeoutMs}ms`); } throw error; @@ -115,8 +116,15 @@ export class HttpClientService { statusCode = response.statusCode; const chunks: Buffer[] = []; - for await (const chunk of response.body) { - chunks.push(Buffer.from(chunk)); + let downloadError: unknown; + try { + for await (const chunk of response.body) { + chunks.push(Buffer.from(chunk)); + } + } catch (error) { + // Download-phase failures (e.g. abort signal) fall through so we can + // return the partial buffer + metrics collected so far. 
+ downloadError = error; } const dataBuffer = Buffer.concat(chunks); @@ -133,6 +141,29 @@ export class HttpClientService { httpVersion: "2", }; + if (downloadError !== undefined) { + const aborted = options.signal?.aborted === true || isAbortLikeError(downloadError); + if (!aborted) { + throw downloadError; + } + const abortReason = describeAbortReason(options.signal, downloadError); + this.logger.warn({ + event: "http2_download_aborted", + message: "HTTP/2 download aborted after headers; returning partial data", + url, + bytesReceived: dataBuffer.length, + totalTime: metrics.totalTime, + ttfb: metrics.ttfb, + abortReason, + }); + return { + data: dataBuffer as T, + metrics, + aborted: true, + abortReason, + }; + } + return { data: dataBuffer as T, metrics, @@ -255,24 +286,28 @@ export class HttpClientService { // Fallback for objects/arrays return Buffer.from(JSON.stringify(data)); } +} - private buildHttp2Signals(parentSignal?: AbortSignal): { - signal: AbortSignal; - connectTimeoutSignal: AbortSignal; - } { - const transferTimeoutSignal = AbortSignal.timeout(this.http2TimeoutMs); - const connectTimeoutSignal = AbortSignal.timeout(this.connectTimeoutMs); +function isAbortLikeError(error: unknown): boolean { + if (error instanceof Error) { + return error.name === "AbortError" || error.name === "TimeoutError" || /abort/i.test(error.message); + } + return false; +} - if (parentSignal) { - return { - signal: anySignal([transferTimeoutSignal, connectTimeoutSignal, parentSignal]), - connectTimeoutSignal, - }; - } +/** + * Determines if a given error represents a "Headers Timeout" error. 
+ */ +function isHeadersTimeoutError(error: unknown): boolean { + if (!(error instanceof Error)) return false; + const code = (error as Error & { code?: string }).code; + return error.name === "HeadersTimeoutError" || code === "UND_ERR_HEADERS_TIMEOUT"; +} - return { - signal: anySignal([transferTimeoutSignal, connectTimeoutSignal]), - connectTimeoutSignal, - }; - } +function describeAbortReason(signal: AbortSignal | undefined, fallback: unknown): string { + const reason = signal?.reason; + if (reason instanceof Error && reason.message) return reason.message; + if (typeof reason === "string" && reason.length > 0) return reason; + if (fallback instanceof Error && fallback.message) return fallback.message; + return "aborted"; } diff --git a/apps/backend/src/http-client/types.ts b/apps/backend/src/http-client/types.ts index 7e48ce7d..26892ee6 100644 --- a/apps/backend/src/http-client/types.ts +++ b/apps/backend/src/http-client/types.ts @@ -13,4 +13,6 @@ export interface RequestMetrics { export interface RequestWithMetrics { data: T; metrics: RequestMetrics; + aborted?: boolean; // Set when the request was aborted mid-download after response headers arrived. + abortReason?: string; // Error message when `aborted` is true; human-readable summary of the abort reason. 
} diff --git a/apps/backend/src/jobs/job-queues.ts b/apps/backend/src/jobs/job-queues.ts index 9488ce7b..db475d49 100644 --- a/apps/backend/src/jobs/job-queues.ts +++ b/apps/backend/src/jobs/job-queues.ts @@ -7,3 +7,4 @@ export const LEGACY_DEAL_QUEUE = "deal.run"; export const LEGACY_RETRIEVAL_QUEUE = "retrieval.run"; export const DATA_RETENTION_POLL_QUEUE = "data.retention.poll"; export const PROVIDERS_REFRESH_QUEUE = "providers.refresh"; +export const RETRIEVAL_ANON_QUEUE = "retrieval.anon.run"; diff --git a/apps/backend/src/jobs/jobs.module.ts b/apps/backend/src/jobs/jobs.module.ts index 15ad4d64..69f1edb1 100644 --- a/apps/backend/src/jobs/jobs.module.ts +++ b/apps/backend/src/jobs/jobs.module.ts @@ -7,6 +7,7 @@ import { StorageProvider } from "../database/entities/storage-provider.entity.js import { DealModule } from "../deal/deal.module.js"; import { PieceCleanupModule } from "../piece-cleanup/piece-cleanup.module.js"; import { RetrievalModule } from "../retrieval/retrieval.module.js"; +import { RetrievalAnonModule } from "../retrieval-anon/retrieval-anon.module.js"; import { WalletSdkModule } from "../wallet-sdk/wallet-sdk.module.js"; import { JobsService } from "./jobs.service.js"; import { JobScheduleRepository } from "./repositories/job-schedule.repository.js"; @@ -17,6 +18,7 @@ import { JobScheduleRepository } from "./repositories/job-schedule.repository.js TypeOrmModule.forFeature([StorageProvider, JobScheduleState]), DealModule, RetrievalModule, + RetrievalAnonModule, WalletSdkModule, DataRetentionModule, PieceCleanupModule, diff --git a/apps/backend/src/jobs/jobs.service.spec.ts b/apps/backend/src/jobs/jobs.service.spec.ts index d556f3d6..c20d0890 100644 --- a/apps/backend/src/jobs/jobs.service.spec.ts +++ b/apps/backend/src/jobs/jobs.service.spec.ts @@ -30,18 +30,18 @@ describe("JobsService schedule rows", () => { }; let dataRetentionServiceMock: { pollDataRetention: ReturnType }; let metricsMocks: { - jobsQueuedGauge: JobsServiceDeps[8]; - 
jobsRetryScheduledGauge: JobsServiceDeps[9]; - oldestQueuedAgeGauge: JobsServiceDeps[10]; - oldestInFlightAgeGauge: JobsServiceDeps[11]; - jobsInFlightGauge: JobsServiceDeps[12]; - jobsEnqueueAttemptsCounter: JobsServiceDeps[13]; - jobsStartedCounter: JobsServiceDeps[14]; - jobsCompletedCounter: JobsServiceDeps[15]; - jobsPausedGauge: JobsServiceDeps[16]; - jobDuration: JobsServiceDeps[17]; - storageProvidersActive: JobsServiceDeps[18]; - storageProvidersTested: JobsServiceDeps[19]; + jobsQueuedGauge: JobsServiceDeps[9]; + jobsRetryScheduledGauge: JobsServiceDeps[10]; + oldestQueuedAgeGauge: JobsServiceDeps[11]; + oldestInFlightAgeGauge: JobsServiceDeps[12]; + jobsInFlightGauge: JobsServiceDeps[13]; + jobsEnqueueAttemptsCounter: JobsServiceDeps[14]; + jobsStartedCounter: JobsServiceDeps[15]; + jobsCompletedCounter: JobsServiceDeps[16]; + jobsPausedGauge: JobsServiceDeps[17]; + jobDuration: JobsServiceDeps[18]; + storageProvidersActive: JobsServiceDeps[19]; + storageProvidersTested: JobsServiceDeps[20]; }; let baseConfigValues: Partial; let configService: JobsServiceDeps[0]; @@ -52,21 +52,22 @@ describe("JobsService schedule rows", () => { jobScheduleRepository: JobsServiceDeps[2]; dealService: JobsServiceDeps[3]; retrievalService: JobsServiceDeps[4]; - walletSdkService: JobsServiceDeps[5]; - dataRetentionService: JobsServiceDeps[6]; - pieceCleanupService: JobsServiceDeps[7]; - jobsQueuedGauge: JobsServiceDeps[8]; - jobsRetryScheduledGauge: JobsServiceDeps[9]; - oldestQueuedAgeGauge: JobsServiceDeps[10]; - oldestInFlightAgeGauge: JobsServiceDeps[11]; - jobsInFlightGauge: JobsServiceDeps[12]; - jobsEnqueueAttemptsCounter: JobsServiceDeps[13]; - jobsStartedCounter: JobsServiceDeps[14]; - jobsCompletedCounter: JobsServiceDeps[15]; - jobsPausedGauge: JobsServiceDeps[16]; - jobDuration: JobsServiceDeps[17]; - storageProvidersActive: JobsServiceDeps[18]; - storageProvidersTested: JobsServiceDeps[19]; + anonRetrievalService: JobsServiceDeps[5]; + walletSdkService: 
JobsServiceDeps[6]; + dataRetentionService: JobsServiceDeps[7]; + pieceCleanupService: JobsServiceDeps[8]; + jobsQueuedGauge: JobsServiceDeps[9]; + jobsRetryScheduledGauge: JobsServiceDeps[10]; + oldestQueuedAgeGauge: JobsServiceDeps[11]; + oldestInFlightAgeGauge: JobsServiceDeps[12]; + jobsInFlightGauge: JobsServiceDeps[13]; + jobsEnqueueAttemptsCounter: JobsServiceDeps[14]; + jobsStartedCounter: JobsServiceDeps[15]; + jobsCompletedCounter: JobsServiceDeps[16]; + jobsPausedGauge: JobsServiceDeps[17]; + jobDuration: JobsServiceDeps[18]; + storageProvidersActive: JobsServiceDeps[19]; + storageProvidersTested: JobsServiceDeps[20]; }>, ) => JobsService; @@ -96,18 +97,18 @@ describe("JobsService schedule rows", () => { }; metricsMocks = { - jobsQueuedGauge: { set: vi.fn() } as unknown as JobsServiceDeps[8], - jobsRetryScheduledGauge: { set: vi.fn() } as unknown as JobsServiceDeps[9], - oldestQueuedAgeGauge: { set: vi.fn() } as unknown as JobsServiceDeps[10], - oldestInFlightAgeGauge: { set: vi.fn() } as unknown as JobsServiceDeps[11], - jobsInFlightGauge: { set: vi.fn() } as unknown as JobsServiceDeps[12], - jobsEnqueueAttemptsCounter: { inc: vi.fn() } as unknown as JobsServiceDeps[13], - jobsStartedCounter: { inc: vi.fn() } as unknown as JobsServiceDeps[14], - jobsCompletedCounter: { inc: vi.fn() } as unknown as JobsServiceDeps[15], - jobsPausedGauge: { set: vi.fn() } as unknown as JobsServiceDeps[16], - jobDuration: { observe: vi.fn() } as unknown as JobsServiceDeps[17], - storageProvidersActive: { set: vi.fn() } as unknown as JobsServiceDeps[18], - storageProvidersTested: { set: vi.fn() } as unknown as JobsServiceDeps[19], + jobsQueuedGauge: { set: vi.fn() } as unknown as JobsServiceDeps[9], + jobsRetryScheduledGauge: { set: vi.fn() } as unknown as JobsServiceDeps[10], + oldestQueuedAgeGauge: { set: vi.fn() } as unknown as JobsServiceDeps[11], + oldestInFlightAgeGauge: { set: vi.fn() } as unknown as JobsServiceDeps[12], + jobsInFlightGauge: { set: vi.fn() } as 
unknown as JobsServiceDeps[13], + jobsEnqueueAttemptsCounter: { inc: vi.fn() } as unknown as JobsServiceDeps[14], + jobsStartedCounter: { inc: vi.fn() } as unknown as JobsServiceDeps[15], + jobsCompletedCounter: { inc: vi.fn() } as unknown as JobsServiceDeps[16], + jobsPausedGauge: { set: vi.fn() } as unknown as JobsServiceDeps[17], + jobDuration: { observe: vi.fn() } as unknown as JobsServiceDeps[18], + storageProvidersActive: { set: vi.fn() } as unknown as JobsServiceDeps[19], + storageProvidersTested: { set: vi.fn() } as unknown as JobsServiceDeps[20], }; const emptySpBlocklists: ISpBlocklistConfig = { @@ -133,6 +134,7 @@ describe("JobsService schedule rows", () => { dataSetCreationJobTimeoutSeconds: 300, pieceCleanupPerSpPerHour: 1, maxPieceCleanupRuntimeSeconds: 300, + retrievalsAnonPerSpPerHour: 2, } as IConfig["jobs"], database: { host: "localhost", @@ -158,9 +160,10 @@ describe("JobsService schedule rows", () => { overrides.jobScheduleRepository ?? (jobScheduleRepositoryMock as unknown as JobsServiceDeps[2]), overrides.dealService ?? ({} as JobsServiceDeps[3]), overrides.retrievalService ?? ({} as JobsServiceDeps[4]), - overrides.walletSdkService ?? ({} as JobsServiceDeps[5]), - overrides.dataRetentionService ?? (dataRetentionServiceMock as unknown as JobsServiceDeps[6]), - overrides.pieceCleanupService ?? ({} as JobsServiceDeps[7]), + overrides.anonRetrievalService ?? ({} as JobsServiceDeps[5]), + overrides.walletSdkService ?? ({} as JobsServiceDeps[6]), + overrides.dataRetentionService ?? (dataRetentionServiceMock as unknown as JobsServiceDeps[7]), + overrides.pieceCleanupService ?? ({} as JobsServiceDeps[8]), overrides.jobsQueuedGauge ?? metricsMocks.jobsQueuedGauge, overrides.jobsRetryScheduledGauge ?? metricsMocks.jobsRetryScheduledGauge, overrides.oldestQueuedAgeGauge ?? 
metricsMocks.oldestQueuedAgeGauge, @@ -284,7 +287,7 @@ describe("JobsService schedule rows", () => { service = buildService({ configService, dealService: dealService as unknown as ConstructorParameters[3], - walletSdkService: walletSdkService as unknown as ConstructorParameters[5], + walletSdkService: walletSdkService as unknown as ConstructorParameters[6], }); // Trigger the timeout immediately by using fake timers @@ -343,7 +346,7 @@ describe("JobsService schedule rows", () => { service = buildService({ configService, retrievalService: retrievalService as unknown as ConstructorParameters[4], - walletSdkService: walletSdkService as unknown as ConstructorParameters[5], + walletSdkService: walletSdkService as unknown as ConstructorParameters[6], }); vi.useFakeTimers(); @@ -382,7 +385,7 @@ describe("JobsService schedule rows", () => { service = buildService({ retrievalService: retrievalService as unknown as ConstructorParameters[4], - walletSdkService: walletSdkService as unknown as ConstructorParameters[5], + walletSdkService: walletSdkService as unknown as ConstructorParameters[6], }); await callPrivate(service, "handleRetrievalJob", { @@ -422,7 +425,7 @@ describe("JobsService schedule rows", () => { service = buildService({ retrievalService: retrievalService as unknown as ConstructorParameters[4], - walletSdkService: walletSdkService as unknown as ConstructorParameters[5], + walletSdkService: walletSdkService as unknown as ConstructorParameters[6], }); await expect( @@ -615,12 +618,13 @@ describe("JobsService schedule rows", () => { // Check upserts for providerB const upsertCalls = jobScheduleRepositoryMock.upsertSchedule.mock.calls; const upsertsForB = upsertCalls.filter((call) => call[1] === providerB.address); - expect(upsertsForB).toHaveLength(4); + expect(upsertsForB).toHaveLength(5); expect(upsertsForB.map((call) => call[0]).sort()).toEqual([ "data_set_creation", "deal", "piece_cleanup", "retrieval", + "retrieval_anon", ]); }); @@ -924,7 +928,7 @@ 
describe("JobsService schedule rows", () => { service = buildService({ dealService: dealService as unknown as ConstructorParameters[3], - walletSdkService: walletSdkService as unknown as ConstructorParameters[5], + walletSdkService: walletSdkService as unknown as ConstructorParameters[6], }); await callPrivate(service, "handleDealJob", { @@ -963,8 +967,8 @@ describe("JobsService schedule rows", () => { service = buildService({ dealService: dealService as unknown as ConstructorParameters[3], - walletSdkService: walletSdkService as unknown as ConstructorParameters[5], - pieceCleanupService: pieceCleanupService as unknown as JobsServiceDeps[7], + walletSdkService: walletSdkService as unknown as ConstructorParameters[6], + pieceCleanupService: pieceCleanupService as unknown as JobsServiceDeps[8], }); await callPrivate(service, "handleDealJob", { @@ -976,7 +980,7 @@ describe("JobsService schedule rows", () => { expect(dealService.createDealForProvider).toHaveBeenCalledTimes(1); }); - it("deal job maps DealJobTerminatedDataSetError to handler_result=error", async () => { + it("data storage job does not run data-storage check when data-set selection aborts", async () => { const completedCounter = metricsMocks.jobsCompletedCounter as unknown as { inc: ReturnType }; vi.useFakeTimers(); vi.setSystemTime(new Date("2024-01-01T12:00:00Z")); @@ -996,7 +1000,7 @@ describe("JobsService schedule rows", () => { service = buildService({ dealService: dealService as unknown as ConstructorParameters[3], - walletSdkService: walletSdkService as unknown as ConstructorParameters[5], + walletSdkService: walletSdkService as unknown as ConstructorParameters[6], }); await callPrivate(service, "handleDealJob", { @@ -1025,7 +1029,7 @@ describe("JobsService schedule rows", () => { service = buildService({ dealService: dealService as unknown as ConstructorParameters[3], - walletSdkService: walletSdkService as unknown as ConstructorParameters[5], + walletSdkService: walletSdkService as unknown as 
ConstructorParameters[6], }); await callPrivate(service, "handleDataSetCreationJob", { @@ -1067,7 +1071,7 @@ describe("JobsService schedule rows", () => { service = buildService({ configService, dealService: dealService as unknown as ConstructorParameters[3], - walletSdkService: walletSdkService as unknown as ConstructorParameters[5], + walletSdkService: walletSdkService as unknown as ConstructorParameters[6], }); await callPrivate(service, "handleDataSetCreationJob", { @@ -1108,7 +1112,7 @@ describe("JobsService schedule rows", () => { service = buildService({ configService, dealService: dealService as unknown as ConstructorParameters[3], - walletSdkService: walletSdkService as unknown as ConstructorParameters[5], + walletSdkService: walletSdkService as unknown as ConstructorParameters[6], }); await callPrivate(service, "handleDataSetCreationJob", { @@ -1153,7 +1157,7 @@ describe("JobsService schedule rows", () => { service = buildService({ configService, dealService: dealService as unknown as ConstructorParameters[3], - walletSdkService: walletSdkService as unknown as ConstructorParameters[5], + walletSdkService: walletSdkService as unknown as ConstructorParameters[6], }); await callPrivate(service, "handleDataSetCreationJob", { @@ -1326,7 +1330,7 @@ describe("JobsService schedule rows", () => { service = buildService({ dealService: dealService as unknown as JobsServiceDeps[3], - walletSdkService: walletSdkService as unknown as JobsServiceDeps[5], + walletSdkService: walletSdkService as unknown as JobsServiceDeps[6], }); await callPrivate(service, "handleDealJob", { @@ -1350,7 +1354,7 @@ describe("JobsService schedule rows", () => { service = buildService({ retrievalService: retrievalService as unknown as JobsServiceDeps[4], - walletSdkService: walletSdkService as unknown as JobsServiceDeps[5], + walletSdkService: walletSdkService as unknown as JobsServiceDeps[6], }); await callPrivate(service, "handleRetrievalJob", { @@ -1379,7 +1383,7 @@ describe("JobsService 
schedule rows", () => { service = buildService({ dealService: dealService as unknown as JobsServiceDeps[3], - walletSdkService: walletSdkService as unknown as JobsServiceDeps[5], + walletSdkService: walletSdkService as unknown as JobsServiceDeps[6], }); await callPrivate(service, "handleDataSetCreationJob", { @@ -1421,7 +1425,7 @@ describe("JobsService schedule rows", () => { intervalSeconds: 60, service: buildService({ dealService: dealService as unknown as JobsServiceDeps[3], - walletSdkService: walletSdkService as unknown as JobsServiceDeps[5], + walletSdkService: walletSdkService as unknown as JobsServiceDeps[6], }), expectCheckNotRun: () => expect(dealService.createDealForProvider).not.toHaveBeenCalled(), }, @@ -1431,7 +1435,7 @@ describe("JobsService schedule rows", () => { intervalSeconds: 60, service: buildService({ retrievalService: retrievalService as unknown as JobsServiceDeps[4], - walletSdkService: walletSdkService as unknown as JobsServiceDeps[5], + walletSdkService: walletSdkService as unknown as JobsServiceDeps[6], }), expectCheckNotRun: () => expect(retrievalService.performRandomRetrievalForProvider).not.toHaveBeenCalled(), }, @@ -1441,7 +1445,7 @@ describe("JobsService schedule rows", () => { intervalSeconds: 3600, service: buildService({ dealService: dataSetDealService as unknown as JobsServiceDeps[3], - walletSdkService: walletSdkService as unknown as JobsServiceDeps[5], + walletSdkService: walletSdkService as unknown as JobsServiceDeps[6], }), expectCheckNotRun: () => expect(dataSetDealService.createDataSetWithPiece).not.toHaveBeenCalled(), }, diff --git a/apps/backend/src/jobs/jobs.service.ts b/apps/backend/src/jobs/jobs.service.ts index f8fe1d80..b070de5a 100644 --- a/apps/backend/src/jobs/jobs.service.ts +++ b/apps/backend/src/jobs/jobs.service.ts @@ -16,18 +16,32 @@ import { StorageProvider } from "../database/entities/storage-provider.entity.js import { DealService } from "../deal/deal.service.js"; import { PieceCleanupService } from 
"../piece-cleanup/piece-cleanup.service.js"; import { RetrievalService } from "../retrieval/retrieval.service.js"; +import { AnonRetrievalService } from "../retrieval-anon/anon-retrieval.service.js"; import { WalletSdkService } from "../wallet-sdk/wallet-sdk.service.js"; import { provisionNextMissingDataSet } from "./data-set-creation.handler.js"; -import { DATA_RETENTION_POLL_QUEUE, PROVIDERS_REFRESH_QUEUE, SP_WORK_QUEUE } from "./job-queues.js"; +import { + DATA_RETENTION_POLL_QUEUE, + PROVIDERS_REFRESH_QUEUE, + RETRIEVAL_ANON_QUEUE, + SP_WORK_QUEUE, +} from "./job-queues.js"; import { JobScheduleRepository } from "./repositories/job-schedule.repository.js"; -type SpJobType = "deal" | "retrieval" | "data_set_creation" | "piece_cleanup"; -const SP_JOB_TYPES: ReadonlySet = new Set(["deal", "retrieval", "data_set_creation", "piece_cleanup"]); +type SpJobType = "deal" | "retrieval" | "data_set_creation" | "retrieval_anon" | "piece_cleanup"; +const SP_JOB_TYPES: ReadonlySet = new Set([ + "deal", + "retrieval", + "retrieval_anon", + "data_set_creation", + "piece_cleanup", +]); + function isSpJobType(jobType: string): jobType is SpJobType { return SP_JOB_TYPES.has(jobType); } type SpJobData = { jobType: SpJobType; spAddress: string; intervalSeconds: number }; +type AnonRetrievalJobData = { spAddress: string; intervalSeconds: number }; type ProvidersRefreshJobData = { intervalSeconds: number }; type SpJob = Job; type DataRetentionJobData = { intervalSeconds: number }; @@ -58,6 +72,7 @@ export class JobsService implements OnModuleInit, OnApplicationShutdown { private readonly jobScheduleRepository: JobScheduleRepository, private readonly dealService: DealService, private readonly retrievalService: RetrievalService, + private readonly anonRetrievalService: AnonRetrievalService, private readonly walletSdkService: WalletSdkService, private readonly dataRetentionService: DataRetentionService, private readonly pieceCleanupService: PieceCleanupService, @@ -258,6 +273,7 @@ 
export class JobsService implements OnModuleInit, OnApplicationShutdown { await boss.createQueue(SP_WORK_QUEUE, { policy: "singleton" }); await boss.createQueue(PROVIDERS_REFRESH_QUEUE); await boss.createQueue(DATA_RETENTION_POLL_QUEUE); + await boss.createQueue(RETRIEVAL_ANON_QUEUE); } private registerWorkers(): void { @@ -335,6 +351,23 @@ export class JobsService implements OnModuleInit, OnApplicationShutdown { error: toStructuredError(error), }), ); + void this.boss + .work( + RETRIEVAL_ANON_QUEUE, + { batchSize: 1, localConcurrency: spConcurrency, pollingIntervalSeconds: workerPollSeconds }, + async ([job]) => { + if (!job) return; + await this.handleAnonRetrievalJob(job); + }, + ) + .catch((error) => + this.logger.error({ + event: "worker_register_failed", + message: "Failed to register worker", + queue: RETRIEVAL_ANON_QUEUE, + error: toStructuredError(error), + }), + ); } private getMaintenanceWindowStatus(now: Date = new Date()) { @@ -587,6 +620,51 @@ export class JobsService implements OnModuleInit, OnApplicationShutdown { }); } + private async handleAnonRetrievalJob(job: Job): Promise { + const data = job.data; + const spAddress = data.spAddress; + + // Create AbortController for job timeout enforcement + const abortController = new AbortController(); + const timeoutSeconds = this.configService.get("jobs").anonRetrievalJobTimeoutSeconds; + const timeoutMs = Math.max(60000, timeoutSeconds * 1000); + const effectiveTimeoutSeconds = Math.round(timeoutMs / 1000); + const abortReason = new Error(`Anon retrieval job timeout (${effectiveTimeoutSeconds}s) for ${spAddress}`); + const timeoutId = setTimeout(() => { + abortController.abort(abortReason); + }, timeoutMs); + + await this.recordJobExecution("retrieval_anon", async () => { + const logContext = await this.resolveProviderJobContext(spAddress, job.id); + try { + await this.anonRetrievalService.performForProvider(spAddress, abortController.signal, logContext); + return "success"; + } catch (error) { + if 
(abortController.signal.aborted) { + const reason = abortController.signal.reason; + const reasonMessage = reason instanceof Error ? reason.message : String(reason ?? ""); + this.logger.error({ + ...logContext, + event: "anon_retrieval_job_aborted", + message: reasonMessage || "Anon retrieval job aborted after timeout", + timeoutSeconds: effectiveTimeoutSeconds, + error: toStructuredError(reason ?? error), + }); + return "aborted"; + } + this.logger.error({ + ...logContext, + event: "anon_retrieval_job_failed", + message: "Anon retrieval job failed", + error: toStructuredError(error), + }); + throw error; + } finally { + clearTimeout(timeoutId); + } + }); + } + private async handleDataRetentionJob(data: DataRetentionJobData): Promise { void data; await this.recordJobExecution("data_retention_poll", async () => { @@ -865,6 +943,7 @@ export class JobsService implements OnModuleInit, OnApplicationShutdown { private getIntervalSecondsForRates(): { dealIntervalSeconds: number; retrievalIntervalSeconds: number; + retrievalAnonIntervalSeconds: number; dataSetCreationIntervalSeconds: number; dataRetentionPollIntervalSeconds: number; providersRefreshIntervalSeconds: number; @@ -885,9 +964,13 @@ export class JobsService implements OnModuleInit, OnApplicationShutdown { const dataRetentionPollIntervalSeconds = scheduling.dataRetentionPollIntervalSeconds; const providersRefreshIntervalSeconds = scheduling.providersRefreshIntervalSeconds; + const retrievalsAnonPerHour = jobsConfig.retrievalsAnonPerSpPerHour; + const retrievalAnonIntervalSeconds = Math.max(1, Math.round(3600 / retrievalsAnonPerHour)); + return { dealIntervalSeconds, retrievalIntervalSeconds, + retrievalAnonIntervalSeconds, dataSetCreationIntervalSeconds, dataRetentionPollIntervalSeconds, providersRefreshIntervalSeconds, @@ -907,6 +990,7 @@ export class JobsService implements OnModuleInit, OnApplicationShutdown { const { dealIntervalSeconds, retrievalIntervalSeconds, + retrievalAnonIntervalSeconds, 
dataSetCreationIntervalSeconds, dataRetentionPollIntervalSeconds, providersRefreshIntervalSeconds, @@ -924,6 +1008,7 @@ export class JobsService implements OnModuleInit, OnApplicationShutdown { const phaseMs = this.schedulePhaseSeconds() * 1000; const dealStartAt = new Date(now.getTime() + phaseMs); const retrievalStartAt = new Date(now.getTime() + phaseMs); + const retrievalAnonStartAt = new Date(now.getTime() + phaseMs); const dataSetCreationStartAt = new Date(now.getTime() + phaseMs); const dataRetentionPollStartAt = new Date(now.getTime() + phaseMs); const providersRefreshStartAt = new Date(now.getTime() + phaseMs); @@ -947,6 +1032,12 @@ export class JobsService implements OnModuleInit, OnApplicationShutdown { for (const address of unblockedAddresses) { await this.jobScheduleRepository.upsertSchedule("deal", address, dealIntervalSeconds, dealStartAt); await this.jobScheduleRepository.upsertSchedule("retrieval", address, retrievalIntervalSeconds, retrievalStartAt); + await this.jobScheduleRepository.upsertSchedule( + "retrieval_anon", + address, + retrievalAnonIntervalSeconds, + retrievalAnonStartAt, + ); if (minDataSets >= 1) { await this.jobScheduleRepository.upsertSchedule( "data_set_creation", @@ -1104,6 +1195,8 @@ export class JobsService implements OnModuleInit, OnApplicationShutdown { return SP_WORK_QUEUE; case "piece_cleanup": return SP_WORK_QUEUE; + case "retrieval_anon": + return RETRIEVAL_ANON_QUEUE; case "data_retention_poll": return DATA_RETENTION_POLL_QUEUE; case "providers_refresh": @@ -1123,6 +1216,7 @@ export class JobsService implements OnModuleInit, OnApplicationShutdown { if ( row.job_type === "deal" || row.job_type === "retrieval" || + row.job_type === "retrieval_anon" || row.job_type === "data_set_creation" || row.job_type === "piece_cleanup" ) { @@ -1195,6 +1289,7 @@ export class JobsService implements OnModuleInit, OnApplicationShutdown { const jobTypes: JobType[] = [ "deal", "retrieval", + "retrieval_anon", "data_set_creation", 
"piece_cleanup", "data_retention_poll", diff --git a/apps/backend/src/metrics-prometheus/check-metric-labels.ts b/apps/backend/src/metrics-prometheus/check-metric-labels.ts index d8447160..9d776586 100644 --- a/apps/backend/src/metrics-prometheus/check-metric-labels.ts +++ b/apps/backend/src/metrics-prometheus/check-metric-labels.ts @@ -1,4 +1,4 @@ -export type CheckType = "dataStorage" | "retrieval" | "dataRetention" | "dataSetCreation"; +export type CheckType = "dataStorage" | "retrieval" | "anon_retrieval" | "dataRetention" | "dataSetCreation"; export type ProviderStatus = "approved" | "unapproved"; export type CheckMetricLabels = { diff --git a/apps/backend/src/metrics-prometheus/check-metrics.service.ts b/apps/backend/src/metrics-prometheus/check-metrics.service.ts index 55975cad..85f1cdcf 100644 --- a/apps/backend/src/metrics-prometheus/check-metrics.service.ts +++ b/apps/backend/src/metrics-prometheus/check-metrics.service.ts @@ -248,3 +248,66 @@ export class DataSetCreationCheckMetrics { this.dataSetCreationStatusCounter.inc({ ...labels, value }); } } + +@Injectable() +export class AnonRetrievalCheckMetrics { + constructor( + @InjectMetric("anonPieceRetrievalFirstByteMs") + private readonly firstByteMs: Histogram, + @InjectMetric("anonPieceRetrievalLastByteMs") + private readonly lastByteMs: Histogram, + @InjectMetric("anonPieceRetrievalThroughputBps") + private readonly throughputBps: Histogram, + @InjectMetric("anonRetrievalCheckMs") + private readonly checkMs: Histogram, + @InjectMetric("anonRetrievalStatus") + private readonly statusCounter: Counter, + @InjectMetric("anonPieceHttpResponseCode") + private readonly httpResponseCounter: Counter, + @InjectMetric("anonCarParseStatus") + private readonly carParseCounter: Counter, + @InjectMetric("anonIpniStatus") + private readonly ipniCounter: Counter, + @InjectMetric("anonBlockFetchStatus") + private readonly blockFetchCounter: Counter, + ) {} + + observeFirstByteMs(labels: CheckMetricLabels, value: number 
| null | undefined): void { + observePositive(this.firstByteMs, labels, value); + } + + observeLastByteMs(labels: CheckMetricLabels, value: number | null | undefined): void { + observePositive(this.lastByteMs, labels, value); + } + + observeThroughput(labels: CheckMetricLabels, value: number | null | undefined): void { + observePositive(this.throughputBps, labels, value); + } + + observeCheckDuration(labels: CheckMetricLabels, value: number | null | undefined): void { + observePositive(this.checkMs, labels, value); + } + + recordStatus(labels: CheckMetricLabels, value: string): void { + this.statusCounter.inc({ ...labels, value }); + } + + recordHttpResponseCode(labels: CheckMetricLabels, statusCode: number): void { + this.httpResponseCounter.inc({ + ...labels, + value: classifyHttpResponseCode(statusCode), + }); + } + + recordCarParseStatus(labels: CheckMetricLabels, parseable: boolean): void { + this.carParseCounter.inc({ ...labels, value: parseable ? "parseable" : "not_parseable" }); + } + + recordIpniStatus(labels: CheckMetricLabels, value: "valid" | "invalid" | "skipped"): void { + this.ipniCounter.inc({ ...labels, value }); + } + + recordBlockFetchStatus(labels: CheckMetricLabels, value: "valid" | "invalid" | "skipped"): void { + this.blockFetchCounter.inc({ ...labels, value }); + } +} diff --git a/apps/backend/src/metrics-prometheus/metrics-prometheus.module.ts b/apps/backend/src/metrics-prometheus/metrics-prometheus.module.ts index 18bda30d..45f728b6 100644 --- a/apps/backend/src/metrics-prometheus/metrics-prometheus.module.ts +++ b/apps/backend/src/metrics-prometheus/metrics-prometheus.module.ts @@ -8,6 +8,7 @@ import { } from "@willsoto/nestjs-prometheus"; import { WalletSdkModule } from "../wallet-sdk/wallet-sdk.module.js"; import { + AnonRetrievalCheckMetrics, DataSetCreationCheckMetrics, DataStorageCheckMetrics, DiscoverabilityCheckMetrics, @@ -207,6 +208,56 @@ const metricProviders = [ help: "Estimated number of unrecorded overdue proving periods per 
provider. Resets to 0 when the subgraph catches up.", labelNames: ["checkType", "providerId", "providerName", "providerStatus"] as const, }), + // Anonymous Retrieval Metrics + makeHistogramProvider({ + name: "anonPieceRetrievalFirstByteMs", + help: "Time to first byte for anonymous piece retrievals via /piece/{cid} (ms)", + labelNames: ["checkType", "providerId", "providerName", "providerStatus"] as const, + buckets: [1, 5, 10, 50, 100, 250, 500, 1000, 2000, 5000, 10000, 30000], + }), + makeHistogramProvider({ + name: "anonPieceRetrievalLastByteMs", + help: "Total time to retrieve an anonymous piece via /piece/{cid} (ms)", + labelNames: ["checkType", "providerId", "providerName", "providerStatus"] as const, + buckets: [1, 5, 10, 50, 100, 250, 500, 1000, 2000, 5000, 10000, 30000, 60000, 120000, 300000], + }), + makeHistogramProvider({ + name: "anonPieceRetrievalThroughputBps", + help: "Throughput for anonymous piece retrievals (bytes/s)", + labelNames: ["checkType", "providerId", "providerName", "providerStatus"] as const, + buckets: throughputBuckets, + }), + makeHistogramProvider({ + name: "anonRetrievalCheckMs", + help: "End-to-end anonymous retrieval check duration (ms)", + labelNames: ["checkType", "providerId", "providerName", "providerStatus"] as const, + buckets: [100, 500, 1000, 2000, 5000, 10000, 30000, 60000, 120000, 300000, 600000], + }), + makeCounterProvider({ + name: "anonRetrievalStatus", + help: "Anonymous retrieval overall outcome", + labelNames: ["checkType", "providerId", "providerName", "providerStatus", "value"] as const, + }), + makeCounterProvider({ + name: "anonPieceHttpResponseCode", + help: "HTTP response codes for anonymous piece retrieval requests", + labelNames: ["checkType", "providerId", "providerName", "providerStatus", "value"] as const, + }), + makeCounterProvider({ + name: "anonCarParseStatus", + help: "Anonymous retrieval CAR parse outcomes (parseable / not_parseable)", + labelNames: ["checkType", "providerId", "providerName", 
"providerStatus", "value"] as const, + }), + makeCounterProvider({ + name: "anonIpniStatus", + help: "Anonymous retrieval IPNI check outcomes (valid / invalid / skipped)", + labelNames: ["checkType", "providerId", "providerName", "providerStatus", "value"] as const, + }), + makeCounterProvider({ + name: "anonBlockFetchStatus", + help: "Anonymous retrieval block fetch validation outcomes (valid / invalid / skipped)", + labelNames: ["checkType", "providerId", "providerName", "providerStatus", "value"] as const, + }), // Storage provider metrics: absolute counts, independent of query filters. makeGaugeProvider({ name: "storage_providers_active", @@ -333,6 +384,7 @@ const metricProviders = [ RetrievalCheckMetrics, DiscoverabilityCheckMetrics, DataSetCreationCheckMetrics, + AnonRetrievalCheckMetrics, WalletBalanceCollector, // HTTP metrics interceptor { @@ -347,6 +399,7 @@ const metricProviders = [ RetrievalCheckMetrics, DiscoverabilityCheckMetrics, DataSetCreationCheckMetrics, + AnonRetrievalCheckMetrics, WalletBalanceCollector, ], }) diff --git a/apps/backend/src/pdp-subgraph/pdp-subgraph.module.ts b/apps/backend/src/pdp-subgraph/pdp-subgraph.module.ts deleted file mode 100644 index 6e084fc1..00000000 --- a/apps/backend/src/pdp-subgraph/pdp-subgraph.module.ts +++ /dev/null @@ -1,8 +0,0 @@ -import { Module } from "@nestjs/common"; -import { PDPSubgraphService } from "./pdp-subgraph.service.js"; - -@Module({ - providers: [PDPSubgraphService], - exports: [PDPSubgraphService], -}) -export class PdpSubgraphModule {} diff --git a/apps/backend/src/pdp-subgraph/queries.ts b/apps/backend/src/pdp-subgraph/queries.ts deleted file mode 100644 index a21a3991..00000000 --- a/apps/backend/src/pdp-subgraph/queries.ts +++ /dev/null @@ -1,24 +0,0 @@ -export const Queries = { - GET_PROVIDERS_WITH_DATASETS: ` - query GetProvidersWithDataSet($addresses: [Bytes!], $blockNumber: BigInt!) 
{ - providers(where: {address_in: $addresses}) { - address - totalFaultedPeriods - totalProvingPeriods - proofSets (where: {nextDeadline_lt: $blockNumber, status: PROVING}) { - nextDeadline - maxProvingPeriod - } - } - } - `, - GET_SUBGRAPH_META: ` - query GetSubgraphMeta { - _meta { - block { - number - } - } - } - `, -} as const; diff --git a/apps/backend/src/retrieval-anon/anon-piece-selector.service.spec.ts b/apps/backend/src/retrieval-anon/anon-piece-selector.service.spec.ts new file mode 100644 index 00000000..b822fe5f --- /dev/null +++ b/apps/backend/src/retrieval-anon/anon-piece-selector.service.spec.ts @@ -0,0 +1,168 @@ +import type { ConfigService } from "@nestjs/config"; +import type { Repository } from "typeorm"; +import { beforeEach, describe, expect, it, vi } from "vitest"; +import type { IConfig } from "../config/app.config.js"; +import type { AnonRetrieval } from "../database/entities/anon-retrieval.entity.js"; +import type { SampleAnonPieceParams, SubgraphService } from "../subgraph/subgraph.service.js"; +import type { AnonCandidatePiece } from "../subgraph/types.js"; +import { AnonPieceSelectorService } from "./anon-piece-selector.service.js"; + +const SP_ADDRESS = "0xAaAaAAaAaaaAaAAAAaaaaAAaaAaaaAAaaaaa1111"; +const DEALBOT_PAYER = "0xBbBBBbBBbbbBbBBBBBbbbbbBBbbBbbbBBbbbb2222"; + +const makePiece = (overrides: Partial = {}): AnonCandidatePiece => ({ + pieceCid: `baga6ea4seaqpiece${Math.random().toString(36).slice(2, 10)}`, + pieceId: "1", + dataSetId: "42", + rawSize: "1048576", + withIPFSIndexing: true, + ipfsRootCid: "bafyroot", + indexedAtBlock: 12345, + pdpPaymentEndEpoch: null, + ...overrides, +}); + +const makeRetrievalRepository = (recentPieceCids: string[]): Repository => { + const queryBuilder = { + select: vi.fn().mockReturnThis(), + orderBy: vi.fn().mockReturnThis(), + limit: vi.fn().mockReturnThis(), + getRawMany: vi.fn().mockResolvedValue(recentPieceCids.map((c) => ({ pieceCid: c }))), + }; + return { + createQueryBuilder: 
vi.fn().mockReturnValue(queryBuilder), + } as unknown as Repository; +}; + +const makeConfigService = (): ConfigService => + ({ + get: vi.fn((key: string) => { + if (key === "blockchain") { + return { walletAddress: DEALBOT_PAYER }; + } + return undefined; + }), + }) as unknown as ConfigService; + +describe("AnonPieceSelectorService", () => { + let subgraphService: SubgraphService; + let sampleAnonPiece: ReturnType; + + beforeEach(() => { + sampleAnonPiece = vi.fn(); + subgraphService = { sampleAnonPiece } as unknown as SubgraphService; + }); + + it("returns null when every fallback attempt yields no piece", async () => { + sampleAnonPiece.mockResolvedValue(null); + const service = new AnonPieceSelectorService(subgraphService, makeConfigService(), makeRetrievalRepository([])); + + const result = await service.selectPieceForProvider(SP_ADDRESS); + + expect(result).toBeNull(); + expect(sampleAnonPiece).toHaveBeenCalled(); + }); + + it("returns the sampled piece with SP address lowercased", async () => { + sampleAnonPiece.mockResolvedValueOnce(makePiece({ pieceCid: "baga-the-one" })); + const service = new AnonPieceSelectorService(subgraphService, makeConfigService(), makeRetrievalRepository([])); + + const result = await service.selectPieceForProvider(SP_ADDRESS); + + expect(result).not.toBeNull(); + expect(result?.pieceCid).toBe("baga-the-one"); + expect(result?.serviceProvider).toBe(SP_ADDRESS.toLowerCase()); + }); + + it("passes the dealbot payer address to sampleAnonPiece for exclusion", async () => { + sampleAnonPiece.mockResolvedValueOnce(makePiece()); + const service = new AnonPieceSelectorService(subgraphService, makeConfigService(), makeRetrievalRepository([])); + + await service.selectPieceForProvider(SP_ADDRESS); + + const call = sampleAnonPiece.mock.calls[0][0] as SampleAnonPieceParams; + expect(call.payer).toBe(DEALBOT_PAYER); + expect(call.serviceProvider).toBe(SP_ADDRESS); + }); + + it("redraws when the first sampled piece's payment has already 
terminated", async () => { + const staleCid = "baga-terminated"; + const freshCid = "baga-live"; + sampleAnonPiece + .mockResolvedValueOnce(makePiece({ pieceCid: staleCid, pdpPaymentEndEpoch: 100n, indexedAtBlock: 200 })) + .mockResolvedValueOnce(makePiece({ pieceCid: freshCid, pdpPaymentEndEpoch: null })); + + const service = new AnonPieceSelectorService(subgraphService, makeConfigService(), makeRetrievalRepository([])); + const result = await service.selectPieceForProvider(SP_ADDRESS); + + expect(result?.pieceCid).toBe(freshCid); + }); + + it("redraws when the first sampled piece was recently tested", async () => { + const staleCid = "baga-stale"; + const freshCid = "baga-fresh"; + sampleAnonPiece + .mockResolvedValueOnce(makePiece({ pieceCid: staleCid })) + .mockResolvedValueOnce(makePiece({ pieceCid: freshCid })); + + const service = new AnonPieceSelectorService( + subgraphService, + makeConfigService(), + makeRetrievalRepository([staleCid]), + ); + const result = await service.selectPieceForProvider(SP_ADDRESS); + + expect(result?.pieceCid).toBe(freshCid); + }); + + it("falls back to the opposite pool when the preferred one is empty", async () => { + // First pool call returns nothing twice (both attempts), second pool succeeds. + const fresh = makePiece({ pieceCid: "baga-other-pool" }); + sampleAnonPiece.mockResolvedValueOnce(null).mockResolvedValueOnce(null).mockResolvedValueOnce(fresh); + + const service = new AnonPieceSelectorService(subgraphService, makeConfigService(), makeRetrievalRepository([])); + const result = await service.selectPieceForProvider(SP_ADDRESS); + + expect(result?.pieceCid).toBe("baga-other-pool"); + + // The second (fallback) call should target the opposite pool. 
+    const firstCall = sampleAnonPiece.mock.calls[0][0] as SampleAnonPieceParams;
+    const fallbackCall = sampleAnonPiece.mock.calls[2][0] as SampleAnonPieceParams;
+    expect(fallbackCall.pool).not.toBe(firstCall.pool);
+  });
+
+  it("widens size bucket to 'any' after both pools fail in the primary bucket", async () => {
+    // 4 empty attempts across (bucket × both pools × 2 draws each) then
+    // succeed on the first `any` bucket call.
+    sampleAnonPiece
+      .mockResolvedValueOnce(null)
+      .mockResolvedValueOnce(null)
+      .mockResolvedValueOnce(null)
+      .mockResolvedValueOnce(null)
+      .mockResolvedValueOnce(makePiece({ pieceCid: "baga-any-bucket" }));
+
+    const service = new AnonPieceSelectorService(subgraphService, makeConfigService(), makeRetrievalRepository([]));
+    const result = await service.selectPieceForProvider(SP_ADDRESS);
+
+    expect(result?.pieceCid).toBe("baga-any-bucket");
+
+    // The 5th call (index 4) should be the widened-bucket attempt; its size
+    // range must extend well past the "large" bucket's 500 MiB ceiling.
+ const widened = sampleAnonPiece.mock.calls[4][0] as SampleAnonPieceParams; + expect(BigInt(widened.maxSize)).toBeGreaterThanOrEqual(32n * 1024n * 1024n * 1024n); + expect(widened.minSize).toBe("0"); + }); + + it("draws a fresh sampleKey for each subgraph call", async () => { + sampleAnonPiece.mockResolvedValueOnce(null).mockResolvedValueOnce(makePiece()); + + const service = new AnonPieceSelectorService(subgraphService, makeConfigService(), makeRetrievalRepository([])); + await service.selectPieceForProvider(SP_ADDRESS); + + const call1 = sampleAnonPiece.mock.calls[0][0] as SampleAnonPieceParams; + const call2 = sampleAnonPiece.mock.calls[1][0] as SampleAnonPieceParams; + expect(call1.sampleKey).toMatch(/^0x[0-9a-f]{64}$/); + expect(call2.sampleKey).toMatch(/^0x[0-9a-f]{64}$/); + expect(call1.sampleKey).not.toBe(call2.sampleKey); + }); +}); diff --git a/apps/backend/src/retrieval-anon/anon-piece-selector.service.ts b/apps/backend/src/retrieval-anon/anon-piece-selector.service.ts new file mode 100644 index 00000000..acc19832 --- /dev/null +++ b/apps/backend/src/retrieval-anon/anon-piece-selector.service.ts @@ -0,0 +1,208 @@ +import { randomBytes } from "node:crypto"; +import { Injectable, Logger } from "@nestjs/common"; +import { ConfigService } from "@nestjs/config"; +import { InjectRepository } from "@nestjs/typeorm"; +import type { Repository } from "typeorm"; +import type { IConfig } from "../config/app.config.js"; +import { AnonRetrieval } from "../database/entities/anon-retrieval.entity.js"; +import type { AnonPiecePool, SampleAnonPieceParams } from "../subgraph/subgraph.service.js"; +import { SubgraphService } from "../subgraph/subgraph.service.js"; +import type { AnonCandidatePiece } from "../subgraph/types.js"; +import type { AnonPiece } from "./types.js"; + +/** + * Number of most-recently-tested anonymous pieces to exclude from selection + * to avoid immediately retesting the same piece. 
Piece CIDs are globally + * unique and each one lives on a single SP's dataset, so scoping by CID + * is equivalent to scoping by (SP, CID) for this workload. + */ +const RECENT_DEDUP_WINDOW = 500; + +/** + * Piece size buckets, in raw (unpadded) bytes. Weighted sampling across + * these buckets keeps tests meaningful for bandwidth measurement without + * locking out SPs whose corpus skews small or large. + */ +type SizeBucket = "small" | "medium" | "large"; +type SizeRange = { min: bigint; max: bigint }; + +const MIB = 1024n * 1024n; + +// All downloads are buffered in-memory, so we need to keep piece sizes reasonable +const SIZE_BUCKETS: Record = { + small: { min: 1n * MIB, max: 20n * MIB - 1n }, + medium: { min: 20n * MIB, max: 100n * MIB - 1n }, + large: { min: 100n * MIB, max: 500n * MIB - 1n }, +}; + +/** Weights for choosing a bucket per selection. Must sum to 1. */ +const BUCKET_WEIGHTS: Record = { + small: 0.2, + medium: 0.5, + large: 0.3, +}; + +/** + * Probability the primary draw targets the withIPFSIndexing pool. + * The rest of the time we sample across all FWSS pieces so SPs can't + * optimise only their CAR corpus. + */ +const IPFS_INDEXED_SAMPLE_RATE = 0.8; + +@Injectable() +export class AnonPieceSelectorService { + private readonly logger = new Logger(AnonPieceSelectorService.name); + + constructor( + private readonly subgraphService: SubgraphService, + private readonly configService: ConfigService, + @InjectRepository(AnonRetrieval) + private readonly anonRetrievalRepository: Repository, + ) {} + + /** + * Select an anonymous piece to test against the given SP. + * + * Strategy: + * 1. Pick a size bucket by weighted random. + * 2. Pick a pool (`indexed` 80% / `any` 20%). + * 3. Generate a uniform-random sampleKey and query the subgraph for the + * smallest `Root.sampleKey ≥ $sampleKey` matching the filters. + * 4. Drop the pick if `pdpPaymentEndEpoch` has passed or it was tested + * recently; redraw once. + * 5. 
If still empty, fall back through: (same bucket, opposite pool) → + * (any bucket, indexed) → (any bucket, any). + */ + async selectPieceForProvider(spAddress: string): Promise { + const dealbotPayer = this.configService.get("blockchain", { infer: true }).walletAddress; + const recentlyTested = await this.loadRecentlyTestedPieceCids(); + + const bucket = this.pickBucket(); + const pool: AnonPiecePool = Math.random() < IPFS_INDEXED_SAMPLE_RATE ? "indexed" : "any"; + + const attempts: Array<{ bucket: SizeBucket | "any"; pool: AnonPiecePool }> = [ + { bucket, pool }, + { bucket, pool: pool === "indexed" ? "any" : "indexed" }, + { bucket: "any", pool: "indexed" }, + { bucket: "any", pool: "any" }, + ]; + + for (const attempt of attempts) { + const piece = await this.drawPiece({ + spAddress, + dealbotPayer, + bucket: attempt.bucket, + pool: attempt.pool, + recentlyTested, + }); + + if (piece) { + this.logger.log({ + event: "anon_piece_selected", + message: "Selected anonymous piece for retrieval test", + spAddress, + pieceCid: piece.pieceCid, + dataSetId: piece.dataSetId, + withIPFSIndexing: piece.withIPFSIndexing, + bucket: attempt.bucket, + pool: attempt.pool, + }); + return { + pieceCid: piece.pieceCid, + dataSetId: piece.dataSetId, + pieceId: piece.pieceId, + serviceProvider: spAddress.toLowerCase(), + withIPFSIndexing: piece.withIPFSIndexing, + ipfsRootCid: piece.ipfsRootCid, + rawSize: piece.rawSize, + }; + } + } + + this.logger.warn({ + event: "anon_no_candidates", + message: "No anonymous piece found after all fallbacks", + spAddress, + }); + return null; + } + + /** + * Try to draw a piece for one (bucket, pool) combination. Up to two draws + * with fresh sampleKeys, each filtered by dedup + epoch-termination. + */ + private async drawPiece(args: { + spAddress: string; + dealbotPayer: string; + bucket: SizeBucket | "any"; + pool: AnonPiecePool; + recentlyTested: Set; + }): Promise { + const range = args.bucket === "any" ? 
fullRange() : SIZE_BUCKETS[args.bucket]; + + for (let attempt = 0; attempt < 2; attempt++) { + const params: SampleAnonPieceParams = { + serviceProvider: args.spAddress, + payer: args.dealbotPayer, + sampleKey: randomSampleKey(), + minSize: range.min.toString(), + maxSize: range.max.toString(), + pool: args.pool, + }; + + const piece = await this.subgraphService.sampleAnonPiece(params); + if (!piece) { + continue; + } + + if (piece.pdpPaymentEndEpoch != null && piece.pdpPaymentEndEpoch <= BigInt(piece.indexedAtBlock)) { + continue; + } + + if (args.recentlyTested.has(piece.pieceCid)) { + continue; + } + + return piece; + } + + return null; + } + + private pickBucket(): SizeBucket { + const r = Math.random(); + let acc = 0; + for (const [name, weight] of Object.entries(BUCKET_WEIGHTS) as Array<[SizeBucket, number]>) { + acc += weight; + if (r < acc) { + return name; + } + } + return "medium"; + } + + /** + * Return the set of piece CIDs tested in the last RECENT_DEDUP_WINDOW + * anonymous retrievals across all SPs. + */ + private async loadRecentlyTestedPieceCids(): Promise> { + const rows = await this.anonRetrievalRepository + .createQueryBuilder("r") + .select("r.piece_cid", "pieceCid") + .orderBy("r.created_at", "DESC") + .limit(RECENT_DEDUP_WINDOW) + .getRawMany<{ pieceCid: string }>(); + + return new Set(rows.map((row) => row.pieceCid)); + } +} + +/** Uniform-random 32-byte sort key as `0x`-prefixed hex. */ +function randomSampleKey(): string { + return `0x${randomBytes(32).toString("hex")}`; +} + +/** The full size range (used when bucket fallback is "any"). 
*/ +function fullRange(): SizeRange { + return { min: 0n, max: (1n << 63n) - 1n }; +} diff --git a/apps/backend/src/retrieval-anon/anon-retrieval.service.spec.ts b/apps/backend/src/retrieval-anon/anon-retrieval.service.spec.ts new file mode 100644 index 00000000..61e97105 --- /dev/null +++ b/apps/backend/src/retrieval-anon/anon-retrieval.service.spec.ts @@ -0,0 +1,189 @@ +import type { Repository } from "typeorm"; +import { beforeEach, describe, expect, it, vi } from "vitest"; +import type { AnonRetrieval } from "../database/entities/anon-retrieval.entity.js"; +import type { StorageProvider } from "../database/entities/storage-provider.entity.js"; +import { RetrievalStatus } from "../database/types.js"; +import type { AnonRetrievalCheckMetrics } from "../metrics-prometheus/check-metrics.service.js"; +import type { WalletSdkService } from "../wallet-sdk/wallet-sdk.service.js"; +import type { AnonPieceSelectorService } from "./anon-piece-selector.service.js"; +import { AnonRetrievalService } from "./anon-retrieval.service.js"; +import type { CarValidationService } from "./car-validation.service.js"; +import type { PieceRetrievalService } from "./piece-retrieval.service.js"; +import type { PieceRetrievalResult } from "./types.js"; + +const SP_ADDRESS = "0xaaaa0000000000000000000000000000000000aa"; + +const PIECE = { + pieceCid: "baga6ea4seaqpiece", + pieceId: "1", + dataSetId: "42", + rawSize: "1048576", + withIPFSIndexing: false, + ipfsRootCid: null, + serviceProvider: SP_ADDRESS, +}; + +function makeProvider(): StorageProvider { + return { + address: SP_ADDRESS, + providerId: 7, + name: "sp-test", + isApproved: true, + } as unknown as StorageProvider; +} + +function makeService(opts: { + pieceResult: PieceRetrievalResult; + fetchPieceImpl?: (signal?: AbortSignal) => Promise; +}): { + service: AnonRetrievalService; + saveSpy: ReturnType; + fetchSpy: ReturnType; +} { + const saveSpy = vi.fn(async (entity: AnonRetrieval) => entity); + const createdEntities: Partial[] = 
[]; + const anonRetrievalRepository = { + create: vi.fn((data: Partial) => { + createdEntities.push(data); + return data; + }), + save: saveSpy, + } as unknown as Repository; + + const spRepository = { + findOne: vi.fn(async () => makeProvider()), + } as unknown as Repository; + + const anonPieceSelector = { + selectPieceForProvider: vi.fn(async () => PIECE), + } as unknown as AnonPieceSelectorService; + + const fetchSpy = vi.fn(opts.fetchPieceImpl ?? (async () => opts.pieceResult)); + const pieceRetrievalService = { + fetchPiece: fetchSpy, + } as unknown as PieceRetrievalService; + + const carValidationService = { + validateCarPiece: vi.fn(), + } as unknown as CarValidationService; + + const walletSdkService = { + getProviderInfo: vi.fn(() => ({ pdp: { serviceURL: "https://sp.test/" } })), + } as unknown as WalletSdkService; + + const metrics = { + observeFirstByteMs: vi.fn(), + observeLastByteMs: vi.fn(), + observeThroughput: vi.fn(), + observeCheckDuration: vi.fn(), + recordStatus: vi.fn(), + recordHttpResponseCode: vi.fn(), + recordCarParseStatus: vi.fn(), + recordIpniStatus: vi.fn(), + recordBlockFetchStatus: vi.fn(), + } as unknown as AnonRetrievalCheckMetrics; + + const service = new AnonRetrievalService( + anonPieceSelector, + pieceRetrievalService, + carValidationService, + walletSdkService, + metrics, + anonRetrievalRepository, + spRepository, + ); + + return { service, saveSpy, fetchSpy }; +} + +describe("AnonRetrievalService", () => { + beforeEach(() => { + vi.clearAllMocks(); + }); + + it("persists partial metrics when fetchPiece returns aborted=true", async () => { + const partial: PieceRetrievalResult = { + success: false, + pieceCid: PIECE.pieceCid, + bytesReceived: 524288, + pieceBytes: null, + latencyMs: 42000, + ttfbMs: 150, + throughputBps: 12500, + statusCode: 200, + commPValid: false, + errorMessage: "Anon retrieval job timeout (60s) for sp1", + aborted: true, + }; + + const { service, saveSpy } = makeService({ pieceResult: partial }); + + 
await service.performForProvider(SP_ADDRESS); + + expect(saveSpy).toHaveBeenCalledTimes(1); + const saved = saveSpy.mock.calls[0][0] as Partial; + expect(saved.status).toBe(RetrievalStatus.FAILED); + expect(saved.bytesRetrieved).toBe(524288); + expect(saved.ttfbMs).toBe(150); + expect(saved.latencyMs).toBe(42000); + expect(saved.throughputBps).toBe(12500); + expect(saved.responseCode).toBe(200); + expect(saved.errorMessage).toContain("Anon retrieval job timeout"); + }); + + it("still saves a row when the signal aborts before fetchPiece runs", async () => { + const ac = new AbortController(); + ac.abort(new Error("Anon retrieval job timeout (60s) for sp1")); + + const never: PieceRetrievalResult = { + success: false, + pieceCid: PIECE.pieceCid, + bytesReceived: 0, + pieceBytes: null, + latencyMs: 0, + ttfbMs: 0, + throughputBps: 0, + statusCode: 0, + commPValid: false, + }; + + const { service, saveSpy, fetchSpy } = makeService({ pieceResult: never }); + + await service.performForProvider(SP_ADDRESS, ac.signal); + + expect(fetchSpy).not.toHaveBeenCalled(); + expect(saveSpy).toHaveBeenCalledTimes(1); + const saved = saveSpy.mock.calls[0][0] as Partial; + expect(saved.status).toBe(RetrievalStatus.FAILED); + expect(saved.errorMessage).toContain("Anon retrieval job timeout"); + expect(saved.bytesRetrieved).toBeNull(); + expect(saved.ttfbMs).toBeNull(); + }); + + it("still saves a row when fetchPiece throws unexpectedly", async () => { + const never: PieceRetrievalResult = { + success: false, + pieceCid: PIECE.pieceCid, + bytesReceived: 0, + pieceBytes: null, + latencyMs: 0, + ttfbMs: 0, + throughputBps: 0, + statusCode: 0, + commPValid: false, + }; + + const { service, saveSpy } = makeService({ + pieceResult: never, + fetchPieceImpl: async () => { + throw new Error("network down"); + }, + }); + + await expect(service.performForProvider(SP_ADDRESS)).rejects.toThrow("network down"); + + expect(saveSpy).toHaveBeenCalledTimes(1); + const saved = saveSpy.mock.calls[0][0] as 
Partial; + expect(saved.status).toBe(RetrievalStatus.FAILED); + }); +}); diff --git a/apps/backend/src/retrieval-anon/anon-retrieval.service.ts b/apps/backend/src/retrieval-anon/anon-retrieval.service.ts new file mode 100644 index 00000000..d40fe315 --- /dev/null +++ b/apps/backend/src/retrieval-anon/anon-retrieval.service.ts @@ -0,0 +1,244 @@ +import { Injectable, Logger } from "@nestjs/common"; +import { InjectRepository } from "@nestjs/typeorm"; +import type { Repository } from "typeorm"; +import { type ProviderJobContext, toStructuredError } from "../common/logging.js"; +import { AnonRetrieval } from "../database/entities/anon-retrieval.entity.js"; +import { StorageProvider } from "../database/entities/storage-provider.entity.js"; +import { RetrievalStatus, ServiceType } from "../database/types.js"; +import { buildCheckMetricLabels } from "../metrics-prometheus/check-metric-labels.js"; +import { AnonRetrievalCheckMetrics } from "../metrics-prometheus/check-metrics.service.js"; +import { WalletSdkService } from "../wallet-sdk/wallet-sdk.service.js"; +import { AnonPieceSelectorService } from "./anon-piece-selector.service.js"; +import { CarValidationService } from "./car-validation.service.js"; +import { PieceRetrievalService } from "./piece-retrieval.service.js"; +import type { CarValidationResult, PieceRetrievalResult } from "./types.js"; + +@Injectable() +export class AnonRetrievalService { + private readonly logger = new Logger(AnonRetrievalService.name); + + constructor( + private readonly anonPieceSelectorService: AnonPieceSelectorService, + private readonly pieceRetrievalService: PieceRetrievalService, + private readonly carValidationService: CarValidationService, + private readonly walletSdkService: WalletSdkService, + private readonly metrics: AnonRetrievalCheckMetrics, + @InjectRepository(AnonRetrieval) + private readonly anonRetrievalRepository: Repository, + @InjectRepository(StorageProvider) + private readonly spRepository: Repository, + ) {} + + 
async performForProvider( + spAddress: string, + signal?: AbortSignal, + logContext?: ProviderJobContext, + ): Promise { + // Build metric labels + const provider = await this.spRepository.findOne({ where: { address: spAddress } }); + const labels = buildCheckMetricLabels({ + checkType: "anon_retrieval", + providerId: provider?.providerId, + providerName: provider?.name, + providerIsApproved: provider?.isApproved, + }); + + // 1. Select an anonymous piece + const piece = await this.anonPieceSelectorService.selectPieceForProvider(spAddress); + if (!piece) { + this.logger.warn({ + ...logContext, + event: "anon_retrieval_no_piece", + message: "No anonymous piece found for SP", + spAddress, + }); + this.metrics.recordStatus(labels, "failure.no_piece"); + return null; + } + + this.logger.log({ + ...logContext, + event: "anon_retrieval_started", + message: "Starting anonymous retrieval test", + pieceCid: piece.pieceCid, + dataSetId: piece.dataSetId, + pieceId: piece.pieceId, + withIPFSIndexing: piece.withIPFSIndexing, + spAddress, + }); + + const checkStart = Date.now(); + const startedAt = new Date(); + + let pieceResult: PieceRetrievalResult | null = null; + let carResult: CarValidationResult | null = null; + let saved: AnonRetrieval | null = null; + + try { + // 2. Fetch the piece. fetchPiece never throws on abort — it returns a + // result with partial metrics so we can persist what we have. + if (signal?.aborted) { + pieceResult = buildAbortedPlaceholder(piece.pieceCid, signal.reason); + } else { + pieceResult = await this.pieceRetrievalService.fetchPiece(spAddress, piece.pieceCid, signal); + } + + // Emit piece retrieval metrics + this.metrics.observeFirstByteMs(labels, pieceResult.ttfbMs); + this.metrics.observeLastByteMs(labels, pieceResult.latencyMs); + this.metrics.observeThroughput(labels, pieceResult.throughputBps); + this.metrics.recordHttpResponseCode(labels, pieceResult.statusCode); + + // 3. 
CAR validation (only if piece was successfully retrieved and has IPFS indexing) + if ( + pieceResult.success && + piece.withIPFSIndexing && + piece.ipfsRootCid && + pieceResult.pieceBytes && + provider && + !signal?.aborted + ) { + try { + carResult = await this.carValidationService.validateCarPiece( + pieceResult.pieceBytes, + provider, + piece.ipfsRootCid, + signal, + ); + } catch (error) { + this.logger.warn({ + ...logContext, + event: "anon_retrieval_car_validation_failed", + message: "CAR validation threw an error", + pieceCid: piece.pieceCid, + spAddress, + error: toStructuredError(error), + }); + } + } + + // Emit CAR validation metrics + if (carResult) { + this.metrics.recordCarParseStatus(labels, carResult.carParseable); + this.metrics.recordIpniStatus( + labels, + carResult.ipniValid === null ? "skipped" : carResult.ipniValid ? "valid" : "invalid", + ); + this.metrics.recordBlockFetchStatus( + labels, + carResult.blockFetchValid === null ? "skipped" : carResult.blockFetchValid ? "valid" : "invalid", + ); + } else if (!pieceResult.success) { + // Piece retrieval failed — IPNI and block fetch were skipped + this.metrics.recordIpniStatus(labels, "skipped"); + this.metrics.recordBlockFetchStatus(labels, "skipped"); + } + + // Overall check duration and status + this.metrics.observeCheckDuration(labels, Date.now() - checkStart); + this.metrics.recordStatus( + labels, + pieceResult.success ? "success" : pieceResult.aborted ? "failure.aborted" : "failure.http", + ); + } finally { + // Always save a record — even on abort or unexpected error — so we never + // lose the evidence (ttfb, bytes, response code) we already collected. 
+ pieceResult ??= buildAbortedPlaceholder(piece.pieceCid, signal?.reason); + saved = await this.saveRetrievalRecord(spAddress, piece, pieceResult, carResult, startedAt, logContext); + } + + return saved; + } + + private async saveRetrievalRecord( + spAddress: string, + piece: { + pieceCid: string; + dataSetId: string; + pieceId: string; + rawSize: string; + withIPFSIndexing: boolean; + ipfsRootCid: string | null; + }, + pieceResult: PieceRetrievalResult, + carResult: CarValidationResult | null, + startedAt: Date, + logContext?: ProviderJobContext, + ): Promise { + const providerInfo = this.walletSdkService.getProviderInfo(spAddress); + const spBaseUrl = providerInfo?.pdp.serviceURL.replace(/\/$/, "") ?? spAddress; + + const retrieval = this.anonRetrievalRepository.create({ + spAddress, + pieceCid: piece.pieceCid, + dataSetId: BigInt(piece.dataSetId), + pieceId: BigInt(piece.pieceId), + rawSize: BigInt(piece.rawSize), + withIpfsIndexing: piece.withIPFSIndexing, + ipfsRootCid: piece.ipfsRootCid, + serviceType: ServiceType.DIRECT_SP, + retrievalEndpoint: `${spBaseUrl}/piece/${piece.pieceCid}`, + status: pieceResult.success ? RetrievalStatus.SUCCESS : RetrievalStatus.FAILED, + startedAt, + completedAt: new Date(), + latencyMs: pieceResult.latencyMs > 0 ? Math.round(pieceResult.latencyMs) : null, + ttfbMs: pieceResult.ttfbMs > 0 ? Math.round(pieceResult.ttfbMs) : null, + throughputBps: pieceResult.throughputBps > 0 ? Math.round(pieceResult.throughputBps) : null, + bytesRetrieved: pieceResult.bytesReceived > 0 ? pieceResult.bytesReceived : null, + responseCode: pieceResult.statusCode > 0 ? pieceResult.statusCode : null, + errorMessage: pieceResult.errorMessage ?? null, + commpValid: pieceResult.success ? pieceResult.commPValid : null, + carValid: carResult ? 
carResult.ipniValid !== false && carResult.blockFetchValid !== false : null, + }); + + try { + await this.anonRetrievalRepository.save(retrieval); + } catch (error) { + this.logger.warn({ + ...logContext, + event: "anon_retrieval_save_failed", + message: "Failed to save anonymous retrieval record", + pieceCid: piece.pieceCid, + spAddress, + error: toStructuredError(error), + }); + return null; + } + + this.logger.log({ + ...logContext, + event: "anon_retrieval_completed", + message: "Anonymous retrieval test completed", + pieceCid: piece.pieceCid, + spAddress, + success: pieceResult.success, + aborted: pieceResult.aborted === true, + latencyMs: pieceResult.latencyMs, + ttfbMs: pieceResult.ttfbMs, + bytesRetrieved: pieceResult.bytesReceived, + carParseable: carResult?.carParseable, + ipniValid: carResult?.ipniValid, + blockFetchValid: carResult?.blockFetchValid, + }); + + return retrieval; + } +} + +function buildAbortedPlaceholder(pieceCid: string, reason: unknown): PieceRetrievalResult { + const message = + reason instanceof Error && reason.message ? reason.message : typeof reason === "string" ? 
reason : "aborted"; + return { + success: false, + pieceCid, + bytesReceived: 0, + pieceBytes: null, + latencyMs: 0, + ttfbMs: 0, + throughputBps: 0, + statusCode: 0, + commPValid: false, + errorMessage: message, + aborted: true, + }; +} diff --git a/apps/backend/src/retrieval-anon/car-validation.service.ts b/apps/backend/src/retrieval-anon/car-validation.service.ts new file mode 100644 index 00000000..8019b8df --- /dev/null +++ b/apps/backend/src/retrieval-anon/car-validation.service.ts @@ -0,0 +1,223 @@ +import { CarReader } from "@ipld/car"; +import * as dagPB from "@ipld/dag-pb"; +import { Injectable, Logger } from "@nestjs/common"; +import { ConfigService } from "@nestjs/config"; +import { create as createBlock } from "multiformats/block"; +import { CID } from "multiformats/cid"; +import * as raw from "multiformats/codecs/raw"; +import { sha256 } from "multiformats/hashes/sha2"; +import { toStructuredError } from "../common/logging.js"; +import type { IConfig } from "../config/app.config.js"; +import type { StorageProvider } from "../database/entities/storage-provider.entity.js"; +import { HttpClientService } from "../http-client/http-client.service.js"; +import { IpniVerificationService } from "../ipni/ipni-verification.service.js"; +import { WalletSdkService } from "../wallet-sdk/wallet-sdk.service.js"; +import type { CarValidationResult } from "./types.js"; + +// UnixFS DAGs use only dag-pb (interior nodes) and raw (leaf data) codecs +const unixfsCodecs: Record unknown }> = { + [dagPB.code]: dagPB, + [raw.code]: raw, +}; + +@Injectable() +export class CarValidationService { + private readonly logger = new Logger(CarValidationService.name); + + constructor( + private readonly configService: ConfigService, + private readonly httpClientService: HttpClientService, + private readonly walletSdkService: WalletSdkService, + private readonly ipniVerificationService: IpniVerificationService, + ) {} + + /** + * Validate an anonymous piece retrieved as a CAR: + * 1. 
parse the CAR, + * 2. sample random blocks, + * 3. confirm the SP is advertised for the root + sampled CIDs via IPNI, + * 4. fetch each sampled block from the SP and hash-verify it. + * + * CAR parse failure is attributed to the client (bad upload), not the SP. + */ + async validateCarPiece( + pieceBytes: Buffer, + provider: StorageProvider, + ipfsRootCid: string, + signal?: AbortSignal, + ): Promise { + const blocks = await this.parseCar(pieceBytes, provider.address, ipfsRootCid); + if (blocks === null) { + return { carParseable: false, blockCount: 0, sampledCidCount: 0, ipniValid: null, blockFetchValid: null }; + } + if (blocks.length === 0) { + return { + carParseable: true, + blockCount: 0, + sampledCidCount: 0, + ipniValid: null, + blockFetchValid: null, + errorMessage: "CAR contained no blocks", + }; + } + + const sampleCount = this.configService.get("retrieval", { infer: true }).anonBlockSampleCount; + const shuffled = [...blocks].sort(() => Math.random() - 0.5); + const sampledBlocks = shuffled.slice(0, sampleCount); + + const ipniValid = await this.checkIpni(provider, ipfsRootCid, sampledBlocks, signal); + const blockFetchResult = await this.checkBlockFetch(sampledBlocks, provider.address, signal); + + return { + carParseable: true, + blockCount: blocks.length, + sampledCidCount: sampledBlocks.length, + ipniValid, + blockFetchValid: blockFetchResult.valid, + errorMessage: blockFetchResult.errorMessage, + }; + } + + private async parseCar( + pieceBytes: Buffer, + spAddress: string, + ipfsRootCid: string, + ): Promise<{ cid: CID; bytes: Uint8Array }[] | null> { + try { + const reader = await CarReader.fromBytes(new Uint8Array(pieceBytes)); + const blocks: { cid: CID; bytes: Uint8Array }[] = []; + for await (const block of reader.blocks()) { + blocks.push({ cid: block.cid, bytes: block.bytes }); + } + return blocks; + } catch (error) { + this.logger.debug({ + event: "car_parse_failed", + message: "Failed to parse piece bytes as CAR - client fault, not SP", + 
spAddress, + ipfsRootCid, + error: toStructuredError(error), + }); + return null; + } + } + + /** + * Verify via IPNI that the SP is advertised for the root CID and each sampled child CID. + * Delegates to the shared IpniVerificationService which uses filecoin-pin's provider-scoped check. + */ + private async checkIpni( + provider: StorageProvider, + ipfsRootCid: string, + sampledBlocks: ReadonlyArray<{ cid: CID }>, + signal?: AbortSignal, + ): Promise { + const timeouts = this.configService.get("timeouts", { infer: true }); + let rootCid: CID; + try { + rootCid = CID.parse(ipfsRootCid); + } catch (error) { + this.logger.warn({ + event: "ipni_root_cid_invalid", + message: "Failed to parse ipfsRootCID", + ipfsRootCid, + providerAddress: provider.address, + error: toStructuredError(error), + }); + return false; + } + + const result = await this.ipniVerificationService.verify({ + rootCid, + blockCids: sampledBlocks.map((b) => b.cid), + storageProvider: provider, + timeoutMs: timeouts.ipniVerificationTimeoutMs, + pollIntervalMs: timeouts.ipniVerificationPollingMs, + signal, + }); + + return result.rootCIDVerified; + } + + /** + * Fetch each sampled block from the SP endpoint and hash-verify the response + * against the declared CID. Mirrors IpfsBlockRetrievalStrategy's per-block + * verification for the sampled subset (no DAG traversal). 
+ */ + private async checkBlockFetch( + sampledBlocks: ReadonlyArray<{ cid: CID; bytes: Uint8Array }>, + spAddress: string, + signal?: AbortSignal, + ): Promise<{ valid: boolean | null; errorMessage?: string }> { + const providerInfo = this.walletSdkService.getProviderInfo(spAddress); + if (!providerInfo) { + return { valid: null, errorMessage: `Provider info not found for ${spAddress}` }; + } + + const spBaseUrl = providerInfo.pdp.serviceURL.replace(/\/$/, ""); + let allValid = true; + + for (const block of sampledBlocks) { + signal?.throwIfAborted(); + const cidStr = block.cid.toString(); + const blockUrl = `${spBaseUrl}/ipfs/${cidStr}?format=raw`; + + try { + const resp = await this.httpClientService.requestWithMetrics(blockUrl, { + headers: { Accept: "application/vnd.ipld.raw" }, + httpVersion: "2", + signal, + }); + + if (resp.metrics.statusCode < 200 || resp.metrics.statusCode >= 300) { + allValid = false; + this.logger.warn({ + event: "block_fetch_non_2xx", + message: "Block fetch returned non-2xx status", + cid: cidStr, + spAddress, + statusCode: resp.metrics.statusCode, + }); + continue; + } + + if (block.cid.multihash.code !== sha256.code) { + this.logger.warn({ + event: "block_unsupported_hash", + message: `Unsupported hash algorithm 0x${block.cid.multihash.code.toString(16)}`, + cid: cidStr, + spAddress, + }); + allValid = false; + continue; + } + + const codec = unixfsCodecs[block.cid.code]; + if (!codec) { + this.logger.warn({ + event: "block_unsupported_codec", + message: `Unsupported codec 0x${block.cid.code.toString(16)}`, + cid: cidStr, + spAddress, + }); + allValid = false; + continue; + } + + // Hash-verifies and decodes; throws on mismatch + await createBlock({ bytes: resp.data, cid: block.cid, hasher: sha256, codec }); + } catch (error) { + allValid = false; + this.logger.warn({ + event: "block_fetch_failed", + message: "Block fetch or hash verification failed", + cid: cidStr, + spAddress, + error: toStructuredError(error), + }); + } + } + + 
return { valid: allValid }; + } +} diff --git a/apps/backend/src/retrieval-anon/piece-retrieval.service.ts b/apps/backend/src/retrieval-anon/piece-retrieval.service.ts new file mode 100644 index 00000000..51150661 --- /dev/null +++ b/apps/backend/src/retrieval-anon/piece-retrieval.service.ts @@ -0,0 +1,195 @@ +import { asPieceCID, calculate as calculatePieceCid } from "@filoz/synapse-core/piece"; +import { Injectable, Logger } from "@nestjs/common"; +import { toStructuredError } from "../common/logging.js"; +import { HttpClientService } from "../http-client/http-client.service.js"; +import { WalletSdkService } from "../wallet-sdk/wallet-sdk.service.js"; +import type { PieceRetrievalResult } from "./types.js"; + +@Injectable() +export class PieceRetrievalService { + private readonly logger = new Logger(PieceRetrievalService.name); + + constructor( + private readonly walletSdkService: WalletSdkService, + private readonly httpClientService: HttpClientService, + ) {} + + async fetchPiece(spAddress: string, pieceCid: string, signal?: AbortSignal): Promise<PieceRetrievalResult> { + const providerInfo = this.walletSdkService.getProviderInfo(spAddress); + + if (!providerInfo) { + this.logger.warn({ + event: "provider_info_not_found", + message: "Cannot fetch piece: provider info not found", + spAddress, + pieceCid, + }); + + return { + success: false, + pieceCid, + bytesReceived: 0, + pieceBytes: null, + latencyMs: 0, + ttfbMs: 0, + throughputBps: 0, + statusCode: 0, + commPValid: false, + errorMessage: `Provider info not found for ${spAddress}`, + }; + } + + const baseUrl = providerInfo.pdp.serviceURL.replace(/\/$/, ""); + const url = `${baseUrl}/piece/${pieceCid}`; + + try { + const result = await this.httpClientService.requestWithMetrics(url, { + httpVersion: "2", + signal, + }); + + const { metrics } = result; + const isSuccess = metrics.statusCode >= 200 && metrics.statusCode < 300; + const throughputBps = metrics.totalTime > 0 ? 
metrics.responseSize / (metrics.totalTime / 1000) : 0; + + if (result.aborted) { + this.logger.warn({ + event: "piece_fetch_aborted", + message: "Piece fetch aborted mid-download; returning partial metrics", + url, + pieceCid, + spAddress, + bytesReceived: metrics.responseSize, + ttfbMs: metrics.ttfb, + abortReason: result.abortReason, + }); + + return { + success: false, + pieceCid, + bytesReceived: metrics.responseSize, + pieceBytes: null, + latencyMs: metrics.totalTime, + ttfbMs: metrics.ttfb, + throughputBps, + statusCode: metrics.statusCode, + commPValid: false, + errorMessage: result.abortReason ?? "aborted", + aborted: true, + }; + } + + if (!isSuccess) { + this.logger.warn({ + event: "piece_fetch_non_2xx", + message: "Piece fetch returned non-2xx status", + url, + statusCode: metrics.statusCode, + pieceCid, + spAddress, + }); + + return { + success: false, + pieceCid, + bytesReceived: metrics.responseSize, + pieceBytes: null, + latencyMs: metrics.totalTime, + ttfbMs: metrics.ttfb, + throughputBps, + statusCode: metrics.statusCode, + commPValid: false, + errorMessage: `HTTP ${metrics.statusCode}`, + }; + } + + const pieceBytes = Buffer.isBuffer(result.data) ? 
result.data : Buffer.from(result.data); + const commPValid = await this.validateCommP(pieceBytes, pieceCid); + + this.logger.debug({ + event: "piece_fetch_success", + message: "Piece fetched successfully", + pieceCid, + spAddress, + bytesReceived: metrics.responseSize, + latencyMs: metrics.totalTime, + ttfbMs: metrics.ttfb, + }); + + return { + success: true, + pieceCid, + bytesReceived: metrics.responseSize, + pieceBytes, + latencyMs: metrics.totalTime, + ttfbMs: metrics.ttfb, + throughputBps, + statusCode: metrics.statusCode, + commPValid, + }; + } catch (error) { + const aborted = signal?.aborted === true; + this.logger.warn({ + event: "piece_fetch_failed", + message: "Piece fetch threw an error", + url, + pieceCid, + spAddress, + aborted, + error: toStructuredError(error), + }); + + return { + success: false, + pieceCid, + bytesReceived: 0, + pieceBytes: null, + latencyMs: 0, + ttfbMs: 0, + throughputBps: 0, + statusCode: 0, + commPValid: false, + errorMessage: error instanceof Error ? error.message : String(error), + aborted, + }; + } + } + + /** + * Compute the piece CID (sha2-256-trunc254-padded) of the retrieved bytes and compare + * against the expected CID. Returns false on parse failure, computation failure, or mismatch. 
+ */ + private async validateCommP(bytes: Buffer, pieceCid: string): Promise<boolean> { + const expected = asPieceCID(pieceCid); + if (!expected) { + this.logger.warn({ + event: "commp_invalid_piece_cid", + message: "Cannot parse expected piece CID for CommP validation", + pieceCid, + }); + return false; + } + + try { + const computed = calculatePieceCid(bytes); + const matches = computed.toString() === expected.toString(); + if (!matches) { + this.logger.warn({ + event: "commp_mismatch", + message: "Piece CID mismatch: SP-returned bytes hash to a different CID", + expected: expected.toString(), + computed: computed.toString(), + }); + } + return matches; + } catch (error) { + this.logger.warn({ + event: "commp_validation_error", + message: "CommP computation threw an error", + pieceCid, + error: toStructuredError(error), + }); + return false; + } + } +} diff --git a/apps/backend/src/retrieval-anon/retrieval-anon.module.ts b/apps/backend/src/retrieval-anon/retrieval-anon.module.ts new file mode 100644 index 00000000..4e9e38df --- /dev/null +++ b/apps/backend/src/retrieval-anon/retrieval-anon.module.ts @@ -0,0 +1,27 @@ +import { Module } from "@nestjs/common"; +import { ConfigModule } from "@nestjs/config"; +import { TypeOrmModule } from "@nestjs/typeorm"; +import { AnonRetrieval } from "../database/entities/anon-retrieval.entity.js"; +import { StorageProvider } from "../database/entities/storage-provider.entity.js"; +import { HttpClientModule } from "../http-client/http-client.module.js"; +import { IpniModule } from "../ipni/ipni.module.js"; +import { SubgraphModule } from "../subgraph/subgraph.module.js"; +import { WalletSdkModule } from "../wallet-sdk/wallet-sdk.module.js"; +import { AnonPieceSelectorService } from "./anon-piece-selector.service.js"; +import { AnonRetrievalService } from "./anon-retrieval.service.js"; +import { CarValidationService } from "./car-validation.service.js"; +import { PieceRetrievalService } from "./piece-retrieval.service.js"; + +@Module({ + 
imports: [ + ConfigModule, + TypeOrmModule.forFeature([AnonRetrieval, StorageProvider]), + SubgraphModule, + WalletSdkModule, + HttpClientModule, + IpniModule, + ], + providers: [AnonPieceSelectorService, PieceRetrievalService, CarValidationService, AnonRetrievalService], + exports: [AnonRetrievalService], +}) +export class RetrievalAnonModule {} diff --git a/apps/backend/src/retrieval-anon/types.ts b/apps/backend/src/retrieval-anon/types.ts new file mode 100644 index 00000000..2c3384d5 --- /dev/null +++ b/apps/backend/src/retrieval-anon/types.ts @@ -0,0 +1,35 @@ +/** The result of anonymous piece selection. */ +export type AnonPiece = { + pieceCid: string; + dataSetId: string; + pieceId: string; + serviceProvider: string; + withIPFSIndexing: boolean; + ipfsRootCid: string | null; + rawSize: string; +}; + +/** Result of piece retrieval. */ +export type PieceRetrievalResult = { + success: boolean; + pieceCid: string; + bytesReceived: number; + pieceBytes: Buffer | null; + latencyMs: number; + ttfbMs: number; + throughputBps: number; + statusCode: number; + commPValid: boolean; + errorMessage?: string; + aborted?: boolean; +}; + +/** Result of CAR validation. */ +export type CarValidationResult = { + carParseable: boolean; + blockCount: number; + sampledCidCount: number; + ipniValid: boolean | null; + blockFetchValid: boolean | null; + errorMessage?: string; +}; diff --git a/apps/backend/src/subgraph/queries.ts b/apps/backend/src/subgraph/queries.ts new file mode 100644 index 00000000..74802ddf --- /dev/null +++ b/apps/backend/src/subgraph/queries.ts @@ -0,0 +1,78 @@ +export const Queries = { + GET_PROVIDERS_WITH_DATASETS: ` + query GetProvidersWithDataSet($addresses: [Bytes!], $blockNumber: BigInt!) 
{ + providers(where: {address_in: $addresses}) { + address + totalFaultedPeriods + totalProvingPeriods + proofSets (where: {nextDeadline_lt: $blockNumber, status: PROVING}) { + nextDeadline + maxProvingPeriod + } + } + } + `, + GET_SUBGRAPH_META: ` + query GetSubgraphMeta { + _meta { + block { + number + } + } + } + `, +} as const; + +/** + * Build a sampleAnonPiece query scoped to the requested pool. The single + * piece of query shape that differs is whether the proofSet filter pins + * `withIPFSIndexing: true`; assembling the fragment here keeps the rest + * of the query and the returned selection set shared. + */ +export function buildSampleAnonPieceQuery(pool: "indexed" | "any"): string { + const indexingFilter = pool === "indexed" ? "withIPFSIndexing: true" : ""; + return ` + query SampleAnonPiece( + $serviceProvider: Bytes! + $payer: Bytes! + $sampleKey: Bytes! + $minSize: BigInt! + $maxSize: BigInt! + ) { + _meta { + block { + number + } + } + roots( + first: 1 + orderBy: sampleKey + orderDirection: asc + where: { + sampleKey_gte: $sampleKey + removed: false + rawSize_gte: $minSize + rawSize_lte: $maxSize + proofSet_: { + fwssServiceProvider: $serviceProvider + fwssPayer_not: $payer + isActive: true + ${indexingFilter} + } + } + subgraphError: allow + ) { + rootId + cid + rawSize + ipfsRootCID + proofSet { + setId + withIPFSIndexing + fwssPayer + pdpPaymentEndEpoch + } + } + } + `; +} diff --git a/apps/backend/src/subgraph/subgraph.module.ts b/apps/backend/src/subgraph/subgraph.module.ts new file mode 100644 index 00000000..7834c39b --- /dev/null +++ b/apps/backend/src/subgraph/subgraph.module.ts @@ -0,0 +1,8 @@ +import { Module } from "@nestjs/common"; +import { SubgraphService } from "./subgraph.service.js"; + +@Module({ + providers: [SubgraphService], + exports: [SubgraphService], +}) +export class SubgraphModule {} diff --git a/apps/backend/src/pdp-subgraph/pdp-subgraph.service.spec.ts b/apps/backend/src/subgraph/subgraph.service.spec.ts similarity 
index 79% rename from apps/backend/src/pdp-subgraph/pdp-subgraph.service.spec.ts rename to apps/backend/src/subgraph/subgraph.service.spec.ts index cd3a1ea8..4dc2cd5e 100644 --- a/apps/backend/src/pdp-subgraph/pdp-subgraph.service.spec.ts +++ b/apps/backend/src/subgraph/subgraph.service.spec.ts @@ -1,7 +1,8 @@ import type { ConfigService } from "@nestjs/config"; +import { CID } from "multiformats/cid"; import { afterEach, beforeEach, describe, expect, it, vi } from "vitest"; import type { IConfig } from "../config/app.config.js"; -import { PDPSubgraphService } from "./pdp-subgraph.service.js"; +import { SubgraphService } from "./subgraph.service.js"; const VALID_ADDRESS = "0xd8da6bf26964af9d7eed9e03e53415d37aa96045" as const; const SUBGRAPH_ENDPOINT = "https://api.thegraph.com/subgraphs/filecoin/pdp" as const; @@ -35,21 +36,57 @@ const makeSubgraphMetaResponse = (blockNumber = 12345) => ({ }, }); -describe("PDPSubgraphService", () => { - let service: PDPSubgraphService; +const FWSS_SP_ADDRESS = "0xAaaaAAaaaaAAaaaAaAaAaaAaaaAaAaAaaAaaa111"; +const FWSS_PAYER = "0xBBbbBBbbBBbBBbBbbBBbbBBbbbbBbBBbbBBbb222"; +const EXAMPLE_PIECE_CID = "baga6ea4seaqpzwrimvoc4jp4l7mk6knsknf6owsc2ev4krrs2peenl5qelh6u4y"; +const pieceCidHex = `0x${Buffer.from(CID.parse(EXAMPLE_PIECE_CID).bytes).toString("hex")}`; + +const makeSampleRoot = (overrides: Record = {}) => ({ + rootId: "1", + cid: pieceCidHex, + rawSize: "1048576", + ipfsRootCID: "bafyroot", + proofSet: { + setId: "42", + withIPFSIndexing: true, + fwssPayer: FWSS_PAYER.toLowerCase(), + pdpPaymentEndEpoch: null, + }, + ...overrides, +}); + +const makeSampleResponse = (roots: Record[] = [], blockNumber = 12345) => ({ + data: { + _meta: { block: { number: blockNumber } }, + roots, + }, +}); + +const SAMPLE_KEY = "0x0000000000000000000000000000000000000000000000000000000000000001"; +const defaultSampleParams = { + serviceProvider: FWSS_SP_ADDRESS, + payer: FWSS_PAYER, + sampleKey: SAMPLE_KEY, + minSize: "0", + maxSize: 
"1000000000000", + pool: "indexed" as const, +}; + +describe("SubgraphService", () => { + let service: SubgraphService; let fetchMock: ReturnType; beforeEach(() => { const configService = { get: vi.fn((key: keyof IConfig) => { if (key === "blockchain") { - return { pdpSubgraphEndpoint: SUBGRAPH_ENDPOINT }; + return { subgraphEndpoint: SUBGRAPH_ENDPOINT }; } return undefined; }), } as unknown as ConfigService; - service = new PDPSubgraphService(configService); + service = new SubgraphService(configService); fetchMock = vi.fn(); vi.stubGlobal("fetch", fetchMock); @@ -362,10 +399,10 @@ describe("PDPSubgraphService", () => { it("throws when PDP subgraph endpoint is not configured", async () => { const configService = { - get: vi.fn(() => ({ pdpSubgraphEndpoint: "" })), + get: vi.fn(() => ({ subgraphEndpoint: "" })), } as unknown as ConfigService; - const serviceWithoutEndpoint = new PDPSubgraphService(configService); + const serviceWithoutEndpoint = new SubgraphService(configService); await expect(serviceWithoutEndpoint.fetchSubgraphMeta()).rejects.toThrow("No PDP subgraph endpoint configured"); }); @@ -691,4 +728,120 @@ describe("PDPSubgraphService", () => { expect(timestamps.length).toBe(1); }); }); + + describe("sampleAnonPiece", () => { + it("returns null when endpoint is not configured", async () => { + const noEndpointConfig = { + get: vi.fn(() => ({ subgraphEndpoint: "" })), + } as unknown as ConfigService; + const noEndpointService = new SubgraphService(noEndpointConfig); + + const piece = await noEndpointService.sampleAnonPiece(defaultSampleParams); + expect(piece).toBeNull(); + expect(fetchMock).not.toHaveBeenCalled(); + }); + + it("returns null when the subgraph yields no matching root", async () => { + fetchMock.mockResolvedValueOnce({ + ok: true, + json: async () => makeSampleResponse([]), + }); + + const piece = await service.sampleAnonPiece(defaultSampleParams); + expect(piece).toBeNull(); + }); + + it("parses the sampled root into a decoded candidate 
piece", async () => { + fetchMock.mockResolvedValueOnce({ + ok: true, + json: async () => makeSampleResponse([makeSampleRoot()]), + }); + + const piece = await service.sampleAnonPiece(defaultSampleParams); + + expect(piece).toMatchObject({ + pieceCid: EXAMPLE_PIECE_CID, + pieceId: "1", + dataSetId: "42", + rawSize: "1048576", + withIPFSIndexing: true, + ipfsRootCid: "bafyroot", + pdpPaymentEndEpoch: null, + indexedAtBlock: 12345, + }); + }); + + it("returns pdpPaymentEndEpoch as bigint when the dataset is terminating", async () => { + fetchMock.mockResolvedValueOnce({ + ok: true, + json: async () => + makeSampleResponse([ + makeSampleRoot({ + proofSet: { + setId: "42", + withIPFSIndexing: true, + fwssPayer: FWSS_PAYER.toLowerCase(), + pdpPaymentEndEpoch: "5000", + }, + }), + ]), + }); + + const piece = await service.sampleAnonPiece(defaultSampleParams); + expect(piece?.pdpPaymentEndEpoch).toBe(5000n); + }); + + it("lowercases SP and payer addresses before querying", async () => { + fetchMock.mockResolvedValueOnce({ ok: true, json: async () => makeSampleResponse([]) }); + + await service.sampleAnonPiece(defaultSampleParams); + + const [, opts] = fetchMock.mock.calls[0]; + const body = JSON.parse(opts.body as string); + expect(body.variables.serviceProvider).toBe(FWSS_SP_ADDRESS.toLowerCase()); + expect(body.variables.payer).toBe(FWSS_PAYER.toLowerCase()); + expect(body.query).toContain("withIPFSIndexing: true"); + }); + + it("uses the any-pool query when pool is 'any'", async () => { + fetchMock.mockResolvedValueOnce({ ok: true, json: async () => makeSampleResponse([]) }); + + await service.sampleAnonPiece({ ...defaultSampleParams, pool: "any" }); + + const [, opts] = fetchMock.mock.calls[0]; + const body = JSON.parse(opts.body as string); + expect(body.query).not.toContain("withIPFSIndexing: true"); + }); + + it("returns null when the sampled root has an undecodable CID", async () => { + fetchMock.mockResolvedValueOnce({ + ok: true, + json: async () => 
makeSampleResponse([makeSampleRoot({ cid: "0xdeadbeef" })]), + }); + + const piece = await service.sampleAnonPiece(defaultSampleParams); + expect(piece).toBeNull(); + }); + + it("throws after max retries on repeated HTTP errors", async () => { + fetchMock.mockResolvedValue({ ok: false, status: 500, statusText: "Internal Server Error" }); + + const promise = service.sampleAnonPiece(defaultSampleParams); + promise.catch(() => {}); + await vi.runAllTimersAsync(); + + await expect(promise).rejects.toThrow("Failed to fetch subgraph sample_anon_piece_indexed after 3 attempts"); + expect(fetchMock).toHaveBeenCalledTimes(3); + }); + + it("does not retry on schema validation failure", async () => { + fetchMock.mockResolvedValueOnce({ + ok: true, + json: async () => ({ data: { _meta: { block: { number: 1 } } } }), // missing roots + }); + + await expect(service.sampleAnonPiece(defaultSampleParams)).rejects.toThrow(/validation failed/i); + expect(fetchMock).toHaveBeenCalledTimes(1); + }); + }); }); diff --git a/apps/backend/src/pdp-subgraph/pdp-subgraph.service.ts b/apps/backend/src/subgraph/subgraph.service.ts similarity index 52% rename from apps/backend/src/pdp-subgraph/pdp-subgraph.service.ts rename to apps/backend/src/subgraph/subgraph.service.ts index aedd8bce..55359179 100644 --- a/apps/backend/src/pdp-subgraph/pdp-subgraph.service.ts +++ b/apps/backend/src/subgraph/subgraph.service.ts @@ -2,9 +2,40 @@ import { Injectable, Logger } from "@nestjs/common"; import { ConfigService } from "@nestjs/config"; import { toStructuredError } from "../common/logging.js"; import type { IBlockchainConfig, IConfig } from "../config/app.config.js"; -import { Queries } from "./queries.js"; -import type { GraphQLResponse, ProviderDataSetResponse, ProvidersWithDataSetsOptions, SubgraphMeta } from "./types.js"; -import { validateProviderDataSetResponse, validateSubgraphMetaResponse } from "./types.js"; +import { buildSampleAnonPieceQuery, Queries } from "./queries.js"; +import type { + 
AnonCandidatePiece, + GraphQLResponse, + ProviderDataSetResponse, + ProvidersWithDataSetsOptions, + RawSampleAnonPieceResponse, + SubgraphMeta, +} from "./types.js"; +import { + decodePieceCid, + validateProviderDataSetResponse, + validateSampleAnonPieceResponse, + validateSubgraphMetaResponse, +} from "./types.js"; + +/** Pool of pieces to sample from. */ +export type AnonPiecePool = "indexed" | "any"; + +/** Inputs for a single anonymous piece sample query. */ +export type SampleAnonPieceParams = { + /** Service provider address (lowercase hex). */ + serviceProvider: string; + /** Dealbot's own payer address (excluded to keep the sample non-dealbot). */ + payer: string; + /** Uniform-random 32-byte sort key as `0x`-prefixed hex. */ + sampleKey: string; + /** Inclusive lower bound on raw piece size in bytes (decimal string). */ + minSize: string; + /** Inclusive upper bound on raw piece size in bytes (decimal string). */ + maxSize: string; + /** Which pool to sample from. */ + pool: AnonPiecePool; +}; /** * Error thrown when data validation fails. 
@@ -21,8 +52,8 @@ class ValidationError extends Error { } @Injectable() -export class PDPSubgraphService { - private readonly logger: Logger = new Logger(PDPSubgraphService.name); +export class SubgraphService { + private readonly logger: Logger = new Logger(SubgraphService.name); private readonly blockchainConfig: IBlockchainConfig; private static readonly MAX_PROVIDERS_PER_QUERY = 100; @@ -45,14 +76,14 @@ export class PDPSubgraphService { * @throws Error if endpoint is not configured or after MAX_RETRIES attempts */ async fetchSubgraphMeta(attempt: number = 1): Promise { - if (!this.blockchainConfig.pdpSubgraphEndpoint) { + if (!this.blockchainConfig.subgraphEndpoint) { throw new Error("No PDP subgraph endpoint configured"); } try { await this.enforceRateLimit(); - const response = await fetch(this.blockchainConfig.pdpSubgraphEndpoint, { + const response = await fetch(this.blockchainConfig.subgraphEndpoint, { method: "POST", headers: { "Content-Type": "application/json", @@ -95,13 +126,13 @@ export class PDPSubgraphService { } // Retry on network/HTTP errors - if (attempt < PDPSubgraphService.MAX_RETRIES) { - const delay = PDPSubgraphService.INITIAL_RETRY_DELAY_MS * (1 << (attempt - 1)); + if (attempt < SubgraphService.MAX_RETRIES) { + const delay = SubgraphService.INITIAL_RETRY_DELAY_MS * (1 << (attempt - 1)); this.logger.warn({ event: "subgraph_meta_request_retry", message: "Subgraph meta request failed. 
Retrying...", attempt, - maxRetries: PDPSubgraphService.MAX_RETRIES, + maxRetries: SubgraphService.MAX_RETRIES, retryDelayMs: delay, error: toStructuredError(error), }); @@ -112,11 +143,11 @@ export class PDPSubgraphService { this.logger.error({ event: "subgraph_meta_request_failed", message: "Subgraph meta request failed after maximum retries", - maxRetries: PDPSubgraphService.MAX_RETRIES, + maxRetries: SubgraphService.MAX_RETRIES, error: toStructuredError(error), }); throw new Error( - `Failed to fetch subgraph metadata after ${PDPSubgraphService.MAX_RETRIES} attempts: ${errorMessage}`, + `Failed to fetch subgraph metadata after ${SubgraphService.MAX_RETRIES} attempts: ${errorMessage}`, ); } } @@ -136,13 +167,154 @@ export class PDPSubgraphService { return []; } - if (addresses.length <= PDPSubgraphService.MAX_PROVIDERS_PER_QUERY) { + if (addresses.length <= SubgraphService.MAX_PROVIDERS_PER_QUERY) { return this.fetchWithRetry(blockNumber, addresses); } return this.fetchMultipleBatchesWithRateLimit(blockNumber, addresses); } + /** + * Draw a single random anonymous piece for retrieval testing. + * + * Uses the Root.sampleKey (keccak256 of the entity id) to pick the + * smallest key ≥ `params.sampleKey` that matches the filters — a uniform + * random pick when `sampleKey` is generated uniformly. Server-side filters + * cover SP, payer-exclusion, active status, size range, and optionally + * `withIPFSIndexing`. Returns null when no piece matches (callers should + * retry with a fresh sampleKey or relax the pool/bucket). + * + * `pdpPaymentEndEpoch` is returned to the caller for a cheap client-side + * epoch comparison — GraphQL filters on nullable BigInts are awkward. 
+ */ + async sampleAnonPiece(params: SampleAnonPieceParams): Promise<AnonCandidatePiece | null> { + if (!this.blockchainConfig.subgraphEndpoint) { + return null; + } + + const query = buildSampleAnonPieceQuery(params.pool); + const variables = { + serviceProvider: params.serviceProvider.toLowerCase(), + payer: params.payer.toLowerCase(), + sampleKey: params.sampleKey, + minSize: params.minSize, + maxSize: params.maxSize, + }; + + const validated = await this.executeQuery( + `sample_anon_piece_${params.pool}`, + query, + variables, + validateSampleAnonPieceResponse, + ); + + const root = validated.roots[0]; + if (!root) { + return null; + } + + try { + return { + pieceCid: decodePieceCid(root.cid), + pieceId: root.rootId, + dataSetId: root.proofSet.setId, + rawSize: root.rawSize, + withIPFSIndexing: root.proofSet.withIPFSIndexing, + ipfsRootCid: root.ipfsRootCID ?? null, + indexedAtBlock: validated._meta.block.number, + pdpPaymentEndEpoch: root.proofSet.pdpPaymentEndEpoch != null ? BigInt(root.proofSet.pdpPaymentEndEpoch) : null, + }; + } catch (error) { + this.logger.warn({ + event: "anon_piece_cid_decode_failed", + message: "Failed to decode piece CID from subgraph data", + dataSetId: root.proofSet.setId, + pieceId: root.rootId, + error: toStructuredError(error), + }); + return null; + } + } + + /** + * Generic single-query helper with retry and rate limiting. Used by queries that + * don't fit the batched provider-fetch shape. 
+ */ + private async executeQuery<T>( + operationName: string, + query: string, + variables: Record<string, unknown>, + transform: (data: unknown) => T, + attempt: number = 1, + ): Promise<T> { + if (!this.blockchainConfig.subgraphEndpoint) { + throw new Error("No PDP subgraph endpoint configured"); + } + + try { + await this.enforceRateLimit(); + + const response = await fetch(this.blockchainConfig.subgraphEndpoint, { + method: "POST", + headers: { "Content-Type": "application/json" }, + body: JSON.stringify({ query, variables }), + }); + + if (!response.ok) { + throw new Error(`HTTP ${response.status}: ${response.statusText}`); + } + + const result = (await response.json()) as GraphQLResponse; + + if (result.errors) { + const errorMessage = result.errors?.[0]?.message || "Unknown GraphQL error"; + throw new Error(`GraphQL error: ${errorMessage}`); + } + + try { + return transform(result.data); + } catch (validationError) { + const errorMessage = validationError instanceof Error ? validationError.message : "Unknown validation error"; + throw new ValidationError(`Data validation failed: ${errorMessage}`); + } + } catch (error) { + const errorMessage = error instanceof Error ? error.message : "Unknown error"; + + if (error instanceof ValidationError) { + this.logger.error({ + event: `subgraph_${operationName}_validation_failed`, + message: `Subgraph ${operationName} validation failed`, + error: toStructuredError(error), + }); + throw error; + } + + if (attempt < SubgraphService.MAX_RETRIES) { + const delay = SubgraphService.INITIAL_RETRY_DELAY_MS * (1 << (attempt - 1)); + this.logger.warn({ + event: `subgraph_${operationName}_request_retry`, + message: `Subgraph ${operationName} request failed. 
Retrying...`, + attempt, + maxRetries: SubgraphService.MAX_RETRIES, + retryDelayMs: delay, + error: toStructuredError(error), + }); + await new Promise((resolve) => setTimeout(resolve, delay)); + return this.executeQuery(operationName, query, variables, transform, attempt + 1); + } + + this.logger.error({ + event: `subgraph_${operationName}_request_failed`, + message: `Subgraph ${operationName} request failed after maximum retries`, + maxRetries: SubgraphService.MAX_RETRIES, + error: toStructuredError(error), + }); + throw new Error( + `Failed to fetch subgraph ${operationName} after ${SubgraphService.MAX_RETRIES} attempts: ${errorMessage}`, + ); + } + } + /** * Fetch multiple batches with rate limiting and concurrency control */ @@ -151,15 +323,15 @@ export class PDPSubgraphService { addresses: string[], ): Promise { const batches: string[][] = []; - for (let i = 0; i < addresses.length; i += PDPSubgraphService.MAX_PROVIDERS_PER_QUERY) { - const addressesLimit = Math.min(addresses.length, i + PDPSubgraphService.MAX_PROVIDERS_PER_QUERY); + for (let i = 0; i < addresses.length; i += SubgraphService.MAX_PROVIDERS_PER_QUERY) { + const addressesLimit = Math.min(addresses.length, i + SubgraphService.MAX_PROVIDERS_PER_QUERY); batches.push(addresses.slice(i, addressesLimit)); } const allProviders: ProviderDataSetResponse["providers"] = []; - for (let i = 0; i < batches.length; i += PDPSubgraphService.MAX_CONCURRENT_REQUESTS) { - const batchGroup = batches.slice(i, i + PDPSubgraphService.MAX_CONCURRENT_REQUESTS); + for (let i = 0; i < batches.length; i += SubgraphService.MAX_CONCURRENT_REQUESTS) { + const batchGroup = batches.slice(i, i + SubgraphService.MAX_CONCURRENT_REQUESTS); const results = await Promise.all(batchGroup.map((batch) => this.fetchWithRetry(blockNumber, batch))); @@ -178,7 +350,7 @@ export class PDPSubgraphService { addresses: string[], attempt: number = 1, ): Promise { - if (!this.blockchainConfig.pdpSubgraphEndpoint) { + if 
(!this.blockchainConfig.subgraphEndpoint) { throw new Error("No PDP subgraph endpoint configured"); } @@ -190,7 +362,7 @@ export class PDPSubgraphService { try { await this.enforceRateLimit(); - const response = await fetch(this.blockchainConfig.pdpSubgraphEndpoint, { + const response = await fetch(this.blockchainConfig.subgraphEndpoint, { method: "POST", headers: { "Content-Type": "application/json", @@ -235,13 +407,13 @@ export class PDPSubgraphService { } // Retry on network/HTTP errors - if (attempt < PDPSubgraphService.MAX_RETRIES) { - const delay = PDPSubgraphService.INITIAL_RETRY_DELAY_MS * (1 << (attempt - 1)); + if (attempt < SubgraphService.MAX_RETRIES) { + const delay = SubgraphService.INITIAL_RETRY_DELAY_MS * (1 << (attempt - 1)); this.logger.warn({ event: "subgraph_provider_request_retry", message: "Subgraph provider request failed. Retrying...", attempt, - maxRetries: PDPSubgraphService.MAX_RETRIES, + maxRetries: SubgraphService.MAX_RETRIES, retryDelayMs: delay, addressCount: addresses.length, error: toStructuredError(error), @@ -253,14 +425,12 @@ export class PDPSubgraphService { this.logger.error({ event: "subgraph_provider_request_failed", message: "Subgraph provider request failed after maximum retries", - maxRetries: PDPSubgraphService.MAX_RETRIES, + maxRetries: SubgraphService.MAX_RETRIES, blockNumber, addressCount: addresses.length, error: toStructuredError(error), }); - throw new Error( - `Failed to fetch provider data after ${PDPSubgraphService.MAX_RETRIES} attempts: ${errorMessage}`, - ); + throw new Error(`Failed to fetch provider data after ${SubgraphService.MAX_RETRIES} attempts: ${errorMessage}`); } } @@ -270,18 +440,18 @@ export class PDPSubgraphService { * Read more here: https://docs.goldsky.com/subgraphs/graphql-endpoints#public-endpoints */ private async enforceRateLimit(requestCount: number = 1): Promise { - if (requestCount > PDPSubgraphService.MAX_CONCURRENT_REQUESTS) { + if (requestCount > 
SubgraphService.MAX_CONCURRENT_REQUESTS) { throw new Error( - `Cannot request ${requestCount} items; exceeds rate limit window of ${PDPSubgraphService.MAX_CONCURRENT_REQUESTS}`, + `Cannot request ${requestCount} items; exceeds rate limit window of ${SubgraphService.MAX_CONCURRENT_REQUESTS}`, ); } const now = Date.now(); - const windowStart = now - PDPSubgraphService.RATE_LIMIT_WINDOW_MS; + const windowStart = now - SubgraphService.RATE_LIMIT_WINDOW_MS; this.requestTimestamps = this.requestTimestamps.filter((timestamp) => timestamp > windowStart); - const availableSlots = PDPSubgraphService.MAX_CONCURRENT_REQUESTS - this.requestTimestamps.length; + const availableSlots = SubgraphService.MAX_CONCURRENT_REQUESTS - this.requestTimestamps.length; if (requestCount > availableSlots) { const requiredSlots = requestCount - availableSlots; @@ -290,7 +460,7 @@ export class PDPSubgraphService { const oldestTimestamp = this.requestTimestamps[index] || now; // wait time with 10ms buffer - const waitTime = oldestTimestamp + PDPSubgraphService.RATE_LIMIT_WINDOW_MS - now + 10; + const waitTime = oldestTimestamp + SubgraphService.RATE_LIMIT_WINDOW_MS - now + 10; if (waitTime > 0) { await new Promise((resolve) => setTimeout(resolve, waitTime)); diff --git a/apps/backend/src/pdp-subgraph/types.spec.ts b/apps/backend/src/subgraph/types.spec.ts similarity index 100% rename from apps/backend/src/pdp-subgraph/types.spec.ts rename to apps/backend/src/subgraph/types.spec.ts diff --git a/apps/backend/src/pdp-subgraph/types.ts b/apps/backend/src/subgraph/types.ts similarity index 58% rename from apps/backend/src/pdp-subgraph/types.ts rename to apps/backend/src/subgraph/types.ts index ad8dcdc4..3a89f360 100644 --- a/apps/backend/src/pdp-subgraph/types.ts +++ b/apps/backend/src/subgraph/types.ts @@ -1,4 +1,5 @@ import Joi from "joi"; +import { CID } from "multiformats/cid"; import { Hex, isAddress } from "viem"; // ----------------------------------------- @@ -54,6 +55,58 @@ export type 
ProviderDataSetResponse = { }[]; }; +/** A piece eligible for anonymous retrieval. */ +export type AnonCandidatePiece = { + /** Decoded piece CID string (e.g. "bafk..."). */ + pieceCid: string; + /** On-chain piece ID (rootId) as a decimal string. */ + pieceId: string; + /** On-chain dataset ID (setId) as a decimal string. */ + dataSetId: string; + /** Raw piece size in bytes, as a decimal string. */ + rawSize: string; + /** True iff the parent dataset declared withIPFSIndexing metadata. */ + withIPFSIndexing: boolean; + /** IPFS root CID declared by the client when uploading, or null. */ + ipfsRootCid: string | null; + /** Subgraph-indexed block number at query time. */ + indexedAtBlock: number; + /** pdpPaymentEndEpoch from the parent dataset, or null. */ + pdpPaymentEndEpoch: bigint | null; +}; + +/** + * Validated raw shape of the anonymous piece sampling subgraph response. + * At most one root is returned (`first: 1`). + */ +export type RawSampleAnonPieceResponse = { + _meta: { block: { number: number } }; + roots: Array<{ + rootId: string; + cid: string; + rawSize: string; + ipfsRootCID: string | null; + proofSet: { + setId: string; + withIPFSIndexing: boolean; + fwssPayer: string | null; + pdpPaymentEndEpoch: string | null; + }; + }>; +}; + +// ----------------------------------------- +// Helpers +// ----------------------------------------- + +/** + * Decodes a hex-encoded CID (0x...) into its string representation. 
+ */ +export function decodePieceCid(hexData: string): string { + const bytes = Buffer.from(hexData.slice(2), "hex"); + return CID.decode(new Uint8Array(bytes)).toString(); +} + // ----------------------------------------- // Joi Custom Schema Converters // ----------------------------------------- @@ -117,6 +170,41 @@ const providerDataSetResponseSchema = Joi.object({ .unknown(true) .required(); +const sampleRootProofSetSchema = Joi.object({ + setId: Joi.string().pattern(/^\d+$/).required(), + withIPFSIndexing: Joi.boolean().required(), + fwssPayer: Joi.string() + .pattern(/^0x[0-9a-fA-F]{40}$/) + .allow(null) + .optional(), + pdpPaymentEndEpoch: Joi.string().pattern(/^\d+$/).allow(null).optional(), +}).unknown(true); + +const sampleRootSchema = Joi.object({ + rootId: Joi.string().pattern(/^\d+$/).required(), + cid: Joi.string() + .pattern(/^0x[0-9a-fA-F]+$/) + .required(), + rawSize: Joi.string().pattern(/^\d+$/).required(), + ipfsRootCID: Joi.string().allow(null).optional(), + proofSet: sampleRootProofSetSchema.required(), +}).unknown(true); + +const sampleAnonPieceResponseSchema = Joi.object({ + _meta: Joi.object({ + block: Joi.object({ + number: Joi.number().integer().positive().required(), + }) + .unknown(true) + .required(), + }) + .unknown(true) + .required(), + roots: Joi.array().items(sampleRootSchema).max(1).required(), +}) + .unknown(true) + .required(); + // ----------------------------------------- // Validator Functions // ----------------------------------------- @@ -149,3 +237,16 @@ export function validateProviderDataSetResponse(value: unknown): ProviderDataSet } return validated as ProviderDataSetResponse; } + +/** + * Validates the raw sampleAnonPiece response from the subgraph. 
+ * + * @throws Error if validation fails + */ +export function validateSampleAnonPieceResponse(value: unknown): RawSampleAnonPieceResponse { + const { error, value: validated } = sampleAnonPieceResponseSchema.validate(value, { abortEarly: false }); + if (error) { + throw new Error(`Invalid sampleAnonPiece response format: ${error.message}`); + } + return validated as RawSampleAnonPieceResponse; +} diff --git a/apps/backend/src/wallet-sdk/wallet-sdk.service.spec.ts b/apps/backend/src/wallet-sdk/wallet-sdk.service.spec.ts index d6613a31..195db19f 100644 --- a/apps/backend/src/wallet-sdk/wallet-sdk.service.spec.ts +++ b/apps/backend/src/wallet-sdk/wallet-sdk.service.spec.ts @@ -18,7 +18,7 @@ const baseConfig: IBlockchainConfig = { checkDatasetCreationFees: false, useOnlyApprovedProviders: false, minNumDataSetsForChecks: 1, - pdpSubgraphEndpoint: "https://api.thegraph.com/subgraphs/filecoin/pdp", + subgraphEndpoint: "https://api.thegraph.com/subgraphs/filecoin/pdp", }; const makeProvider = (overrides: Partial): PDPProviderEx => diff --git a/docs/checks/data-retention.md b/docs/checks/data-retention.md index 605753e7..4eb7a912 100644 --- a/docs/checks/data-retention.md +++ b/docs/checks/data-retention.md @@ -27,7 +27,7 @@ Dealbot polls The Graph API endpoint for PDP (Proof of Data Possession) data at **Subgraph repository**: [FilOzone/pdp-explorer](https://github.com/FilOzone/pdp-explorer/blob/main/subgraph/src/pdp-verifier.ts) -**Subgraph endpoint**: Configured via `PDP_SUBGRAPH_ENDPOINT` environment variable (see [environment-variables.md](../environment-variables.md#pdp_subgraph_endpoint)) +**Subgraph endpoint**: Configured via `SUBGRAPH_ENDPOINT` environment variable (see [environment-variables.md](../environment-variables.md#subgraph_endpoint)) > **Note**: The production subgraph URL is currently being finalized [here](https://github.com/FilOzone/pdp-explorer/pull/86). 
@@ -48,7 +48,7 @@ From `GET_PROVIDERS_WITH_DATASETS` query for each provider: > **Note**: The subgraph query uses the field name `proofSets`, but this refers to "dataSets" in the current codebase. The terminology was updated from "proof set" to "data set" but the subgraph schema retains the old naming. -Source: [`pdp-subgraph.service.ts` (`fetchSubgraphMeta`, `fetchProvidersWithDatasets`)](../../apps/backend/src/pdp-subgraph/pdp-subgraph.service.ts) +Source: [`subgraph.service.ts` (`fetchSubgraphMeta`, `fetchProvidersWithDatasets`)](../../apps/backend/src/subgraph/subgraph.service.ts) ### 2. Compute Challenge Totals and Overdue Estimates @@ -170,7 +170,7 @@ The PDP subgraph service enforces Goldsky's public endpoint rate limits: Rate limiting is enforced client-side to prevent 429 errors. -Source: [`pdp-subgraph.service.ts` (`enforceRateLimit`)](../../apps/backend/src/pdp-subgraph/pdp-subgraph.service.ts) +Source: [`subgraph.service.ts` (`enforceRateLimit`)](../../apps/backend/src/subgraph/subgraph.service.ts) ## Metrics Recorded @@ -210,11 +210,11 @@ Key environment variables that control data retention check behavior: | Variable | Required | Default | Description | | ----------------------- | -------- | ------------ | ------------------------------------------------------------------------------------------------ | -| `PDP_SUBGRAPH_ENDPOINT` | No | Empty string | The Graph API endpoint for PDP subgraph queries. When empty, data retention checks are disabled. | +| `SUBGRAPH_ENDPOINT` | No | Empty string | The Graph API endpoint for PDP subgraph queries. When empty, data retention checks are disabled. | Source: [`app.config.ts`](../../apps/backend/src/config/app.config.ts) -See also: [`environment-variables.md`](../environment-variables.md#pdp_subgraph_endpoint) for the full configuration reference. +See also: [`environment-variables.md`](../environment-variables.md#subgraph_endpoint) for the full configuration reference. 
## Error Handling diff --git a/docs/checks/production-configuration-and-approval-methodology.md b/docs/checks/production-configuration-and-approval-methodology.md index 5566904d..3d956aa4 100644 --- a/docs/checks/production-configuration-and-approval-methodology.md +++ b/docs/checks/production-configuration-and-approval-methodology.md @@ -40,7 +40,7 @@ Relevant parameters include: | Parameter | Value | Notes | |-----------|-------|-------| -| [`PDP_SUBGRAPH_ENDPOINT`](../environment-variables.md#pdp_subgraph_endpoint) | TODO: fill this in | Uses the subgraph from [pdp-explorer](https://github.com/FilOzone/pdp-explorer). | +| [`SUBGRAPH_ENDPOINT`](../environment-variables.md#subgraph_endpoint) | TODO: fill this in | Uses the subgraph from [pdp-explorer](https://github.com/FilOzone/pdp-explorer). | | [`MIN_NUM_DATASETS_FOR_CHECKS`](../environment-variables.md#dataset-configuration) | 15 | Ensure there are enough datasets with pieces being added so that statistical significance for [Data Retention Fault Rate](#data-retention-fault-rate) can be achieved quicker. Note that on mainnet each dataset incurs 5 challenges[^1] per daily proof[^2]. With this many datasets, an SP can be approved for data retention after a faultless ~7 days even if the SP doesn't have other datasets. | See [How are data retention statistics/thresholds calculated?](#how-are-data-retention-statisticsthresholds-calculated) for more details. 
diff --git a/docs/environment-variables.md b/docs/environment-variables.md index 359d86da..2f25943c 100644 --- a/docs/environment-variables.md +++ b/docs/environment-variables.md @@ -8,10 +8,10 @@ This document provides a comprehensive guide to all environment variables used b | ----------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------ | | [Application](#application-configuration) | `NODE_ENV`, `DEALBOT_PORT`, `DEALBOT_HOST`, `DEALBOT_RUN_MODE`, `DEALBOT_METRICS_PORT`, `DEALBOT_METRICS_HOST`, `DEALBOT_ALLOWED_ORIGINS`, `ENABLE_DEV_MODE` | | [Database](#database-configuration) | `DATABASE_HOST`, `DATABASE_PORT`, `DATABASE_POOL_MAX`, `DATABASE_USER`, `DATABASE_PASSWORD`, `DATABASE_NAME` | -| [Blockchain](#blockchain-configuration) | `NETWORK`, `RPC_URL`, `WALLET_ADDRESS`, `WALLET_PRIVATE_KEY`, `SESSION_KEY_PRIVATE_KEY`, `CHECK_DATASET_CREATION_FEES`, `USE_ONLY_APPROVED_PROVIDERS`, `PDP_SUBGRAPH_ENDPOINT` | +| [Blockchain](#blockchain-configuration) | `NETWORK`, `RPC_URL`, `WALLET_ADDRESS`, `WALLET_PRIVATE_KEY`, `SESSION_KEY_PRIVATE_KEY`, `CHECK_DATASET_CREATION_FEES`, `USE_ONLY_APPROVED_PROVIDERS`, `SUBGRAPH_ENDPOINT` | | [Dataset Versioning](#dataset-versioning) | `DEALBOT_DATASET_VERSION` | | [Scheduling](#scheduling-configuration) | `PROVIDERS_REFRESH_INTERVAL_SECONDS`, `DATA_RETENTION_POLL_INTERVAL_SECONDS`, `DEALBOT_MAINTENANCE_WINDOWS_UTC`, `DEALBOT_MAINTENANCE_WINDOW_MINUTES` | -| [Jobs (pg-boss)](#jobs-pg-boss) | `DEALBOT_PGBOSS_SCHEDULER_ENABLED`, `DEALBOT_PGBOSS_POOL_MAX`, `DEALS_PER_SP_PER_HOUR`, `DATASET_CREATIONS_PER_SP_PER_HOUR`, `RETRIEVALS_PER_SP_PER_HOUR`, `JOB_SCHEDULER_POLL_SECONDS`, `JOB_WORKER_POLL_SECONDS`, `PG_BOSS_LOCAL_CONCURRENCY`, `JOB_CATCHUP_MAX_ENQUEUE`, `JOB_SCHEDULE_PHASE_SECONDS`, `JOB_ENQUEUE_JITTER_SECONDS`, `DEAL_JOB_TIMEOUT_SECONDS`, `RETRIEVAL_JOB_TIMEOUT_SECONDS`, 
`IPFS_BLOCK_FETCH_CONCURRENCY` | +| [Jobs (pg-boss)](#jobs-pg-boss) | `DEALBOT_PGBOSS_SCHEDULER_ENABLED`, `DEALBOT_PGBOSS_POOL_MAX`, `DEALS_PER_SP_PER_HOUR`, `DATASET_CREATIONS_PER_SP_PER_HOUR`, `RETRIEVALS_PER_SP_PER_HOUR`, `JOB_SCHEDULER_POLL_SECONDS`, `JOB_WORKER_POLL_SECONDS`, `PG_BOSS_LOCAL_CONCURRENCY`, `JOB_CATCHUP_MAX_ENQUEUE`, `JOB_SCHEDULE_PHASE_SECONDS`, `JOB_ENQUEUE_JITTER_SECONDS`, `DEAL_JOB_TIMEOUT_SECONDS`, `RETRIEVAL_JOB_TIMEOUT_SECONDS`, `ANON_RETRIEVAL_JOB_TIMEOUT_SECONDS`, `IPFS_BLOCK_FETCH_CONCURRENCY` | | [Dataset](#dataset-configuration) | `DEALBOT_LOCAL_DATASETS_PATH`, `RANDOM_PIECE_SIZES` | | [ClickHouse](#clickhouse-configuration) | `CLICKHOUSE_URL`, `CLICKHOUSE_BATCH_SIZE`, `CLICKHOUSE_FLUSH_INTERVAL_MS`, `DEALBOT_PROBE_LOCATION` | | [Timeouts](#timeout-configuration) | `CONNECT_TIMEOUT_MS`, `HTTP_REQUEST_TIMEOUT_MS`, `HTTP2_REQUEST_TIMEOUT_MS`, `IPNI_VERIFICATION_TIMEOUT_MS`, `IPNI_VERIFICATION_POLLING_MS` | @@ -425,22 +425,25 @@ Session keys are scoped (only storage operations, not deposits or withdrawals) a --- -### `PDP_SUBGRAPH_ENDPOINT` +### `SUBGRAPH_ENDPOINT` - **Type**: `string` (URL) - **Required**: No - **Default**: Empty string (feature disabled) -**Role**: The Graph API endpoint for querying PDP (Proof of Data Possession) subgraph data. This endpoint is used to retrieve data retention info for provider data. +**Role**: The Graph API endpoint for querying PDP (Proof of Data Possession) subgraph data. Drives the overdue-periods metric and the anonymous-retrieval candidate-piece query. + +The dealbot-owned subgraph lives at `apps/subgraph/` (package `@dealbot/subgraph`) and is deployed to Goldsky. Point this variable at one of those slots; the exact slugs are documented in `apps/subgraph/README.md`. **When to update**: -- When switching between different Graph API endpoints +- When swapping between the dealbot-owned subgraph slots on Goldsky (mainnet vs calibnet). +- When deploying a new subgraph version. 
**Example**: ```bash -PDP_SUBGRAPH_ENDPOINT=https://api.thegraph.com/subgraphs/filecoin/pdp +SUBGRAPH_ENDPOINT=https://api.goldsky.com/api/public//subgraphs/dealbot-subgraph//gn ``` --- @@ -784,6 +787,25 @@ Use this to stagger multiple dealbot deployments that are not sharing a database **Note**: This is independent of HTTP-level timeouts. The job timeout enforces end-to-end execution time of a Retrieval Check job. +--- + +### `ANON_RETRIEVAL_JOB_TIMEOUT_SECONDS` + +- **Type**: `number` +- **Required**: No +- **Default**: `360` (6 minutes) +- **Minimum**: `60` +- **Enforced**: Yes (config validation) + +**Role**: Maximum runtime for anonymous retrieval jobs before forced abort. Anonymous retrievals fetch arbitrary pieces (up to ~70 MiB) that were not produced by the dealbot, so this is typically larger than `RETRIEVAL_JOB_TIMEOUT_SECONDS`. When the timeout trips, partial metrics (`ttfb_ms`, `bytes_retrieved`, `response_code`) are still persisted so the abort is not silently lost. + +**When to update**: + +- Increase if large pieces are consistently being cut off mid-download +- Decrease to detect and fail stuck retrievals faster + +**Note**: This is independent of HTTP-level timeouts (`CONNECT_TIMEOUT_MS`, `HTTP2_REQUEST_TIMEOUT_MS`). The job timeout covers the end-to-end execution of an Anon Retrieval Check (piece selection, download, CommP validation, CAR/IPNI validation). 
+ --- ### `IPFS_BLOCK_FETCH_CONCURRENCY` diff --git a/kustomize/overlays/local/backend-configmap-local.yaml b/kustomize/overlays/local/backend-configmap-local.yaml index 9226d24e..b4febf61 100644 --- a/kustomize/overlays/local/backend-configmap-local.yaml +++ b/kustomize/overlays/local/backend-configmap-local.yaml @@ -26,7 +26,7 @@ data: PG_BOSS_LOCAL_CONCURRENCY: "3" JOB_WORKER_POLL_SECONDS: "60" RANDOM_PIECE_SIZES: "10485760" - PDP_SUBGRAPH_ENDPOINT: "https://api.goldsky.com/api/public/project_cmdfaaxeuz6us01u359yjdctw/subgraphs/pdp-explorer/calibration311a/gn" + SUBGRAPH_ENDPOINT: "https://api.goldsky.com/api/public/project_cmdfaaxeuz6us01u359yjdctw/subgraphs/pdp-explorer/calibration311a/gn" JOB_SCHEDULER_POLL_SECONDS: "60" CLICKHOUSE_URL: "http://default:@dealbot-clickhouse:8123/dealbot" DEALBOT_PROBE_LOCATION: "local" diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index 0495aa11..8089b756 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -1513,24 +1513,24 @@ packages: engines: {node: ^14.18.0 || >=16.10.0, npm: '>=5.10.0'} hasBin: true - '@oclif/core@4.10.5': - resolution: {integrity: sha512-qcdCF7NrdWPfme6Kr34wwljRCXbCVpL1WVxiNy0Ep6vbWKjxAjFQwuhqkoyL0yjI+KdwtLcOCGn5z2yzdijc8w==} + '@oclif/core@4.10.6': + resolution: {integrity: sha512-ySCOYnPKZE3KACT1V9It99hWG9b8E5MpagbRdWxPNRO3beMqmbr4SLUQoFtZ9XRtW++kks1ZVwZOdpnR8rpb9A==} engines: {node: '>=18.0.0'} '@oclif/core@4.5.5': resolution: {integrity: sha512-iQzlaJQgPeUXrtrX71OzDwxPikQ7c2FhNd8U8rBB7BCtj2XYfmzBT/Hmbc+g9OKDIG/JkbJT0fXaWMMBrhi+1A==} engines: {node: '>=18.0.0'} - '@oclif/plugin-autocomplete@3.2.45': - resolution: {integrity: sha512-ENrUg8rbVCjh40uvi3MC9kGbiUoEf11nyqE59RBzegeeLpRXNo/Zp27L9j1tUmPEqGgfS2/wvHPihNzkpK1FDw==} + '@oclif/plugin-autocomplete@3.2.46': + resolution: {integrity: sha512-TFvuD6JlmqEVsEvMqunyj3cyCz/l2Q4MqCjp/XtlSLS9x3xTlam7PGlqWi4WAhxl/K8CtpYqVlMYFEnlLTHspw==} engines: {node: '>=18.0.0'} - '@oclif/plugin-not-found@3.2.80': - resolution: {integrity: 
sha512-yTLjWvR1r/Rd/cO2LxHdMCDoL5sQhBYRUcOMCmxZtWVWhx4rAZ8KVUPDVsb+SvjJDV5ADTDBgt1H52fFx7YWqg==} + '@oclif/plugin-not-found@3.2.81': + resolution: {integrity: sha512-M88tLONBH36hLAbkFbmCo1hoZPSdU5l8Px1xEIlIgSmGMam+CoAzx4kGqpLbokgfpaHeP8/Jx3QJ18u9ef/2Qw==} engines: {node: '>=18.0.0'} - '@oclif/plugin-warn-if-update-available@3.1.60': - resolution: {integrity: sha512-cRKBZm14IuA6G8W84dfd3iXj3BTAoxQ5o3pUE8DKEQ4n/tVha20t5nkVeD+ISC68e0Fuw5koTMvRwXb1lJSnzg==} + '@oclif/plugin-warn-if-update-available@3.1.61': + resolution: {integrity: sha512-4XcrTxcCs+brR/eZ0BPeuiREiH3USlJiaHbUqPhnIBuyxhhUSYVd8ZO6s5MQN7AXJq4SMQ+B5zLaHq+ep/afIw==} engines: {node: '>=18.0.0'} '@open-draft/deferred-promise@2.2.0': @@ -7599,9 +7599,9 @@ snapshots: dependencies: '@float-capital/float-subgraph-uncrashable': 0.0.0-internal-testing.5 '@oclif/core': 4.5.5 - '@oclif/plugin-autocomplete': 3.2.45 - '@oclif/plugin-not-found': 3.2.80(@types/node@25.6.2) - '@oclif/plugin-warn-if-update-available': 3.1.60 + '@oclif/plugin-autocomplete': 3.2.46 + '@oclif/plugin-not-found': 3.2.81(@types/node@25.2.3) + '@oclif/plugin-warn-if-update-available': 3.1.61 '@pinax/graph-networks-registry': 0.7.1 '@whatwg-node/fetch': 0.10.13 assemblyscript: 0.19.23 @@ -8937,7 +8937,7 @@ snapshots: dependencies: consola: 3.4.2 - '@oclif/core@4.10.5': + '@oclif/core@4.10.6': dependencies: ansi-escapes: 4.3.2 ansis: 3.17.0 @@ -8979,7 +8979,7 @@ snapshots: wordwrap: 1.0.0 wrap-ansi: 7.0.0 - '@oclif/plugin-autocomplete@3.2.45': + '@oclif/plugin-autocomplete@3.2.46': dependencies: '@oclif/core': 4.5.5 ansis: 3.17.0 @@ -8988,16 +8988,16 @@ snapshots: transitivePeerDependencies: - supports-color - '@oclif/plugin-not-found@3.2.80(@types/node@25.6.2)': + '@oclif/plugin-not-found@3.2.81(@types/node@25.2.3)': dependencies: - '@inquirer/prompts': 7.10.1(@types/node@25.6.2) - '@oclif/core': 4.10.5 + '@inquirer/prompts': 7.10.1(@types/node@25.2.3) + '@oclif/core': 4.10.6 ansis: 3.17.0 fast-levenshtein: 3.0.0 transitivePeerDependencies: - 
'@types/node' - '@oclif/plugin-warn-if-update-available@3.1.60': + '@oclif/plugin-warn-if-update-available@3.1.61': dependencies: '@oclif/core': 4.5.5 ansis: 3.17.0 @@ -11779,7 +11779,7 @@ snapshots: dependencies: foreground-child: 3.3.1 jackspeak: 4.2.3 - minimatch: 10.2.4 + minimatch: 10.2.5 minipass: 7.1.2 package-json-from-dist: 1.0.1 path-scurry: 2.0.1 From 96c82c66f050f5de83c2530f74dda0b18c68618a Mon Sep 17 00:00:00 2001 From: Dennis Trautwein Date: Wed, 29 Apr 2026 10:14:56 +0200 Subject: [PATCH 02/28] refactor(anon): only use clickhouse --- .../src/clickhouse/clickhouse.schema.ts | 36 ++++++ apps/backend/src/database/database.module.ts | 9 +- .../entities/anon-retrieval.entity.ts | 100 ----------------- .../1762000000000-CreateAnonRetrievals.ts | 64 ----------- .../anon-piece-selector.service.spec.ts | 49 ++++----- .../anon-piece-selector.service.ts | 54 ++++----- .../anon-retrieval.service.spec.ts | 98 ++++++++++------- .../retrieval-anon/anon-retrieval.service.ts | 104 ++++++++++-------- .../retrieval-anon/retrieval-anon.module.ts | 3 +- 9 files changed, 208 insertions(+), 309 deletions(-) delete mode 100644 apps/backend/src/database/entities/anon-retrieval.entity.ts delete mode 100644 apps/backend/src/database/migrations/1762000000000-CreateAnonRetrievals.ts diff --git a/apps/backend/src/clickhouse/clickhouse.schema.ts b/apps/backend/src/clickhouse/clickhouse.schema.ts index 85d91052..8af769d7 100644 --- a/apps/backend/src/clickhouse/clickhouse.schema.ts +++ b/apps/backend/src/clickhouse/clickhouse.schema.ts @@ -62,6 +62,42 @@ export function buildMigrations(database: string): string[] { PARTITION BY toStartOfMonth(timestamp) TTL toDateTime(timestamp) + INTERVAL 1 YEAR`, + `CREATE TABLE IF NOT EXISTS ${database}.anon_retrieval_checks +( + timestamp DateTime64(3, 'UTC'), -- when the check completed + probe_location LowCardinality(String), -- dealbot location + sp_address String, -- storage provider address (lowercased) + sp_id Nullable(UInt64), -- storage 
provider numeric id + sp_name Nullable(String), -- storage provider name + + retrieval_id UUID, -- per-event correlation id (log/Prometheus join) + + piece_cid String, -- piece CID (v2/CommP) sampled from the subgraph + data_set_id UInt64, -- on-chain data set id + piece_id UInt64, -- on-chain piece id within the data set + raw_size UInt64, -- raw (unpadded) piece size, bytes + with_ipfs_indexing Bool, -- whether the piece advertises IPNI metadata + ipfs_root_cid Nullable(String), -- root CID of the contained DAG; null when not IPFS-indexed + + service_type LowCardinality(String), -- 'direct_sp' (only mode for anon retrievals today) + retrieval_endpoint String, -- URL probed (e.g. {spBaseUrl}/piece/{pieceCid}) + + status LowCardinality(String), -- RetrievalStatus: 'success' | 'failed' | 'pending' | 'in_progress' | 'timeout' + http_response_code Nullable(UInt16), -- raw HTTP status; null on transport failure + first_byte_ms Nullable(Float64), -- time to first response byte + last_byte_ms Nullable(Float64), -- time to last response byte + bytes_retrieved Nullable(UInt64), -- bytes received from /piece/{cid} + throughput_bps Nullable(UInt64), -- effective throughput, bytes per second + + commp_valid Nullable(Bool), -- null when retrieval failed before CommP could be hashed + car_valid Nullable(Bool), -- null when CAR validation was skipped (no IPFS indexing or piece fetch failed) + + error_message Nullable(String) -- failure reason; null on success +) ENGINE MergeTree() + PRIMARY KEY (probe_location, sp_address, timestamp) + PARTITION BY toStartOfMonth(timestamp) + TTL toDateTime(timestamp) + INTERVAL 1 YEAR`, + `CREATE TABLE IF NOT EXISTS ${database}.data_retention_challenges ( timestamp DateTime64(3, 'UTC'), -- when the poll ran and detected these periods diff --git a/apps/backend/src/database/database.module.ts b/apps/backend/src/database/database.module.ts index f3f9ed09..9249c3a9 100644 --- a/apps/backend/src/database/database.module.ts +++ 
b/apps/backend/src/database/database.module.ts @@ -7,7 +7,6 @@ import { fileURLToPath } from "url"; import { toStructuredError } from "../common/logging.js"; import { createPinoExitLogger } from "../common/pino.config.js"; import type { IAppConfig, IConfig, IDatabaseConfig } from "../config/app.config.js"; -import { AnonRetrieval } from "./entities/anon-retrieval.entity.js"; import { DataRetentionBaseline } from "./entities/data-retention-baseline.entity.js"; import { Deal } from "./entities/deal.entity.js"; import { JobScheduleState } from "./entities/job-schedule-state.entity.js"; @@ -50,7 +49,7 @@ function toSafeDataSourceContext(options: DataSourceOptions): Record { - await queryRunner.query(` - CREATE TABLE anon_retrievals ( - id UUID PRIMARY KEY DEFAULT gen_random_uuid(), - sp_address VARCHAR NOT NULL, - piece_cid VARCHAR NOT NULL, - data_set_id BIGINT NOT NULL, - piece_id BIGINT NOT NULL, - raw_size BIGINT NOT NULL, - with_ipfs_indexing BOOLEAN NOT NULL, - ipfs_root_cid VARCHAR NULL, - service_type VARCHAR NOT NULL DEFAULT 'direct_sp', - retrieval_endpoint VARCHAR NOT NULL, - status VARCHAR NOT NULL DEFAULT 'pending', - started_at TIMESTAMPTZ NOT NULL, - completed_at TIMESTAMPTZ NULL, - latency_ms INT NULL, - ttfb_ms INT NULL, - throughput_bps INT NULL, - bytes_retrieved BIGINT NULL, - response_code INT NULL, - error_message VARCHAR NULL, - commp_valid BOOLEAN NULL, - car_valid BOOLEAN NULL, - created_at TIMESTAMPTZ NOT NULL DEFAULT now(), - updated_at TIMESTAMPTZ NOT NULL DEFAULT now() - ) - `); - - // Per-SP dashboards. - await queryRunner.query(` - CREATE INDEX "IDX_anon_retrievals_sp_address" - ON anon_retrievals (sp_address) - `); - - // Used by the recent-dedup query in AnonPieceSelectorService — keeps the - // most-recently-tested CIDs out of the next selection. - await queryRunner.query(` - CREATE INDEX "IDX_anon_retrievals_piece_cid" - ON anon_retrievals (piece_cid) - `); - - // Supports "last N anonymous retrievals" ordering used by the selector. 
- await queryRunner.query(` - CREATE INDEX "IDX_anon_retrievals_created_at" - ON anon_retrievals (created_at DESC) - `); - } - - public async down(queryRunner: QueryRunner): Promise { - await queryRunner.query(`DROP TABLE IF EXISTS anon_retrievals`); - } -} diff --git a/apps/backend/src/retrieval-anon/anon-piece-selector.service.spec.ts b/apps/backend/src/retrieval-anon/anon-piece-selector.service.spec.ts index b822fe5f..6a787fbb 100644 --- a/apps/backend/src/retrieval-anon/anon-piece-selector.service.spec.ts +++ b/apps/backend/src/retrieval-anon/anon-piece-selector.service.spec.ts @@ -1,8 +1,6 @@ import type { ConfigService } from "@nestjs/config"; -import type { Repository } from "typeorm"; import { beforeEach, describe, expect, it, vi } from "vitest"; import type { IConfig } from "../config/app.config.js"; -import type { AnonRetrieval } from "../database/entities/anon-retrieval.entity.js"; import type { SampleAnonPieceParams, SubgraphService } from "../subgraph/subgraph.service.js"; import type { AnonCandidatePiece } from "../subgraph/types.js"; import { AnonPieceSelectorService } from "./anon-piece-selector.service.js"; @@ -22,18 +20,6 @@ const makePiece = (overrides: Partial = {}): AnonCandidatePi ...overrides, }); -const makeRetrievalRepository = (recentPieceCids: string[]): Repository => { - const queryBuilder = { - select: vi.fn().mockReturnThis(), - orderBy: vi.fn().mockReturnThis(), - limit: vi.fn().mockReturnThis(), - getRawMany: vi.fn().mockResolvedValue(recentPieceCids.map((c) => ({ pieceCid: c }))), - }; - return { - createQueryBuilder: vi.fn().mockReturnValue(queryBuilder), - } as unknown as Repository; -}; - const makeConfigService = (): ConfigService => ({ get: vi.fn((key: string) => { @@ -55,7 +41,7 @@ describe("AnonPieceSelectorService", () => { it("returns null when every fallback attempt yields no piece", async () => { sampleAnonPiece.mockResolvedValue(null); - const service = new AnonPieceSelectorService(subgraphService, makeConfigService(), 
makeRetrievalRepository([])); + const service = new AnonPieceSelectorService(subgraphService, makeConfigService()); const result = await service.selectPieceForProvider(SP_ADDRESS); @@ -65,7 +51,7 @@ describe("AnonPieceSelectorService", () => { it("returns the sampled piece with SP address lowercased", async () => { sampleAnonPiece.mockResolvedValueOnce(makePiece({ pieceCid: "baga-the-one" })); - const service = new AnonPieceSelectorService(subgraphService, makeConfigService(), makeRetrievalRepository([])); + const service = new AnonPieceSelectorService(subgraphService, makeConfigService()); const result = await service.selectPieceForProvider(SP_ADDRESS); @@ -76,7 +62,7 @@ describe("AnonPieceSelectorService", () => { it("passes the dealbot payer address to sampleAnonPiece for exclusion", async () => { sampleAnonPiece.mockResolvedValueOnce(makePiece()); - const service = new AnonPieceSelectorService(subgraphService, makeConfigService(), makeRetrievalRepository([])); + const service = new AnonPieceSelectorService(subgraphService, makeConfigService()); await service.selectPieceForProvider(SP_ADDRESS); @@ -92,27 +78,30 @@ describe("AnonPieceSelectorService", () => { .mockResolvedValueOnce(makePiece({ pieceCid: staleCid, pdpPaymentEndEpoch: 100n, indexedAtBlock: 200 })) .mockResolvedValueOnce(makePiece({ pieceCid: freshCid, pdpPaymentEndEpoch: null })); - const service = new AnonPieceSelectorService(subgraphService, makeConfigService(), makeRetrievalRepository([])); + const service = new AnonPieceSelectorService(subgraphService, makeConfigService()); const result = await service.selectPieceForProvider(SP_ADDRESS); expect(result?.pieceCid).toBe(freshCid); }); - it("redraws when the first sampled piece was recently tested", async () => { + it("redraws when the first sampled piece was recently selected by this process", async () => { const staleCid = "baga-stale"; const freshCid = "baga-fresh"; + + const service = new AnonPieceSelectorService(subgraphService, 
makeConfigService()); + + // Prime the in-memory ring buffer by first selecting `staleCid`. + sampleAnonPiece.mockResolvedValueOnce(makePiece({ pieceCid: staleCid })); + const first = await service.selectPieceForProvider(SP_ADDRESS); + expect(first?.pieceCid).toBe(staleCid); + + // Now the second selection should skip `staleCid` and use `freshCid`. sampleAnonPiece .mockResolvedValueOnce(makePiece({ pieceCid: staleCid })) .mockResolvedValueOnce(makePiece({ pieceCid: freshCid })); + const second = await service.selectPieceForProvider(SP_ADDRESS); - const service = new AnonPieceSelectorService( - subgraphService, - makeConfigService(), - makeRetrievalRepository([staleCid]), - ); - const result = await service.selectPieceForProvider(SP_ADDRESS); - - expect(result?.pieceCid).toBe(freshCid); + expect(second?.pieceCid).toBe(freshCid); }); it("falls back to the opposite pool when the preferred one is empty", async () => { @@ -120,7 +109,7 @@ describe("AnonPieceSelectorService", () => { const fresh = makePiece({ pieceCid: "baga-other-pool" }); sampleAnonPiece.mockResolvedValueOnce(null).mockResolvedValueOnce(null).mockResolvedValueOnce(fresh); - const service = new AnonPieceSelectorService(subgraphService, makeConfigService(), makeRetrievalRepository([])); + const service = new AnonPieceSelectorService(subgraphService, makeConfigService()); const result = await service.selectPieceForProvider(SP_ADDRESS); expect(result?.pieceCid).toBe("baga-other-pool"); @@ -141,7 +130,7 @@ describe("AnonPieceSelectorService", () => { .mockResolvedValueOnce(null) .mockResolvedValueOnce(makePiece({ pieceCid: "baga-any-bucket" })); - const service = new AnonPieceSelectorService(subgraphService, makeConfigService(), makeRetrievalRepository([])); + const service = new AnonPieceSelectorService(subgraphService, makeConfigService()); const result = await service.selectPieceForProvider(SP_ADDRESS); expect(result?.pieceCid).toBe("baga-any-bucket"); @@ -156,7 +145,7 @@ 
describe("AnonPieceSelectorService", () => { it("draws a fresh sampleKey for each subgraph call", async () => { sampleAnonPiece.mockResolvedValueOnce(null).mockResolvedValueOnce(makePiece()); - const service = new AnonPieceSelectorService(subgraphService, makeConfigService(), makeRetrievalRepository([])); + const service = new AnonPieceSelectorService(subgraphService, makeConfigService()); await service.selectPieceForProvider(SP_ADDRESS); const call1 = sampleAnonPiece.mock.calls[0][0] as SampleAnonPieceParams; diff --git a/apps/backend/src/retrieval-anon/anon-piece-selector.service.ts b/apps/backend/src/retrieval-anon/anon-piece-selector.service.ts index acc19832..8de50fa3 100644 --- a/apps/backend/src/retrieval-anon/anon-piece-selector.service.ts +++ b/apps/backend/src/retrieval-anon/anon-piece-selector.service.ts @@ -1,10 +1,7 @@ import { randomBytes } from "node:crypto"; import { Injectable, Logger } from "@nestjs/common"; import { ConfigService } from "@nestjs/config"; -import { InjectRepository } from "@nestjs/typeorm"; -import type { Repository } from "typeorm"; import type { IConfig } from "../config/app.config.js"; -import { AnonRetrieval } from "../database/entities/anon-retrieval.entity.js"; import type { AnonPiecePool, SampleAnonPieceParams } from "../subgraph/subgraph.service.js"; import { SubgraphService } from "../subgraph/subgraph.service.js"; import type { AnonCandidatePiece } from "../subgraph/types.js"; @@ -15,6 +12,9 @@ import type { AnonPiece } from "./types.js"; * to avoid immediately retesting the same piece. Piece CIDs are globally * unique and each one lives on a single SP's dataset, so scoping by CID * is equivalent to scoping by (SP, CID) for this workload. + * + * The buffer is process-local: a duplicate piece that gets retested shortly + * after a restart is harmless (still a valid measurement, just less diverse). 
*/ const RECENT_DEDUP_WINDOW = 500; @@ -44,7 +44,7 @@ const BUCKET_WEIGHTS: Record = { /** * Probability the primary draw targets the withIPFSIndexing pool. - * The rest of the time we sample across all FWSS pieces so SPs can't + * The rest of the time we sample across all FWSS pieces, so SPs can't * optimise only their CAR corpus. */ const IPFS_INDEXED_SAMPLE_RATE = 0.8; @@ -53,11 +53,13 @@ const IPFS_INDEXED_SAMPLE_RATE = 0.8; export class AnonPieceSelectorService { private readonly logger = new Logger(AnonPieceSelectorService.name); + /** Bounded FIFO of recently-selected piece CIDs. Process-local; lost on restart. */ + private readonly recentlyTested = new Set(); + private readonly recentlyTestedQueue: string[] = []; + constructor( private readonly subgraphService: SubgraphService, private readonly configService: ConfigService, - @InjectRepository(AnonRetrieval) - private readonly anonRetrievalRepository: Repository, ) {} /** @@ -75,14 +77,13 @@ export class AnonPieceSelectorService { */ async selectPieceForProvider(spAddress: string): Promise { const dealbotPayer = this.configService.get("blockchain", { infer: true }).walletAddress; - const recentlyTested = await this.loadRecentlyTestedPieceCids(); const bucket = this.pickBucket(); const pool: AnonPiecePool = Math.random() < IPFS_INDEXED_SAMPLE_RATE ? "indexed" : "any"; const attempts: Array<{ bucket: SizeBucket | "any"; pool: AnonPiecePool }> = [ - { bucket, pool }, - { bucket, pool: pool === "indexed" ? "any" : "indexed" }, + { bucket: bucket, pool: pool }, + { bucket: bucket, pool: pool === "indexed" ? 
"any" : "indexed" }, { bucket: "any", pool: "indexed" }, { bucket: "any", pool: "any" }, ]; @@ -93,10 +94,10 @@ export class AnonPieceSelectorService { dealbotPayer, bucket: attempt.bucket, pool: attempt.pool, - recentlyTested, }); if (piece) { + this.rememberRecent(piece.pieceCid); this.logger.log({ event: "anon_piece_selected", message: "Selected anonymous piece for retrieval test", @@ -107,6 +108,7 @@ export class AnonPieceSelectorService { bucket: attempt.bucket, pool: attempt.pool, }); + return { pieceCid: piece.pieceCid, dataSetId: piece.dataSetId, @@ -124,6 +126,7 @@ export class AnonPieceSelectorService { message: "No anonymous piece found after all fallbacks", spAddress, }); + return null; } @@ -136,7 +139,6 @@ export class AnonPieceSelectorService { dealbotPayer: string; bucket: SizeBucket | "any"; pool: AnonPiecePool; - recentlyTested: Set; }): Promise { const range = args.bucket === "any" ? fullRange() : SIZE_BUCKETS[args.bucket]; @@ -159,7 +161,7 @@ export class AnonPieceSelectorService { continue; } - if (args.recentlyTested.has(piece.pieceCid)) { + if (this.recentlyTested.has(piece.pieceCid)) { continue; } @@ -181,19 +183,21 @@ export class AnonPieceSelectorService { return "medium"; } - /** - * Return the set of piece CIDs tested in the last RECENT_DEDUP_WINDOW - * anonymous retrievals across all SPs. - */ - private async loadRecentlyTestedPieceCids(): Promise> { - const rows = await this.anonRetrievalRepository - .createQueryBuilder("r") - .select("r.piece_cid", "pieceCid") - .orderBy("r.created_at", "DESC") - .limit(RECENT_DEDUP_WINDOW) - .getRawMany<{ pieceCid: string }>(); - - return new Set(rows.map((row) => row.pieceCid)); + /** Push a CID into the bounded FIFO; evict the oldest when at capacity. 
*/ + private rememberRecent(pieceCid: string): void { + if (this.recentlyTested.has(pieceCid)) { + return; + } + + this.recentlyTested.add(pieceCid); + this.recentlyTestedQueue.push(pieceCid); + + while (this.recentlyTestedQueue.length > RECENT_DEDUP_WINDOW) { + const evicted = this.recentlyTestedQueue.shift(); + if (evicted !== undefined) { + this.recentlyTested.delete(evicted); + } + } } } diff --git a/apps/backend/src/retrieval-anon/anon-retrieval.service.spec.ts b/apps/backend/src/retrieval-anon/anon-retrieval.service.spec.ts index 61e97105..e6619e32 100644 --- a/apps/backend/src/retrieval-anon/anon-retrieval.service.spec.ts +++ b/apps/backend/src/retrieval-anon/anon-retrieval.service.spec.ts @@ -1,6 +1,6 @@ import type { Repository } from "typeorm"; import { beforeEach, describe, expect, it, vi } from "vitest"; -import type { AnonRetrieval } from "../database/entities/anon-retrieval.entity.js"; +import type { ClickhouseService } from "../clickhouse/clickhouse.service.js"; import type { StorageProvider } from "../database/entities/storage-provider.entity.js"; import { RetrievalStatus } from "../database/types.js"; import type { AnonRetrievalCheckMetrics } from "../metrics-prometheus/check-metrics.service.js"; @@ -35,20 +35,18 @@ function makeProvider(): StorageProvider { function makeService(opts: { pieceResult: PieceRetrievalResult; fetchPieceImpl?: (signal?: AbortSignal) => Promise; + clickhouseEnabled?: boolean; }): { service: AnonRetrievalService; - saveSpy: ReturnType; + insertSpy: ReturnType; fetchSpy: ReturnType; } { - const saveSpy = vi.fn(async (entity: AnonRetrieval) => entity); - const createdEntities: Partial[] = []; - const anonRetrievalRepository = { - create: vi.fn((data: Partial) => { - createdEntities.push(data); - return data; - }), - save: saveSpy, - } as unknown as Repository; + const insertSpy = vi.fn(); + const clickhouseService = { + insert: insertSpy, + enabled: opts.clickhouseEnabled ?? 
true, + probeLocation: "test-location", + } as unknown as ClickhouseService; const spRepository = { findOne: vi.fn(async () => makeProvider()), @@ -89,11 +87,11 @@ function makeService(opts: { carValidationService, walletSdkService, metrics, - anonRetrievalRepository, + clickhouseService, spRepository, ); - return { service, saveSpy, fetchSpy }; + return { service, insertSpy, fetchSpy }; } describe("AnonRetrievalService", () => { @@ -101,7 +99,7 @@ describe("AnonRetrievalService", () => { vi.clearAllMocks(); }); - it("persists partial metrics when fetchPiece returns aborted=true", async () => { + it("emits a ClickHouse row with partial metrics when fetchPiece returns aborted=true", async () => { const partial: PieceRetrievalResult = { success: false, pieceCid: PIECE.pieceCid, @@ -116,22 +114,28 @@ describe("AnonRetrievalService", () => { aborted: true, }; - const { service, saveSpy } = makeService({ pieceResult: partial }); + const { service, insertSpy } = makeService({ pieceResult: partial }); await service.performForProvider(SP_ADDRESS); - expect(saveSpy).toHaveBeenCalledTimes(1); - const saved = saveSpy.mock.calls[0][0] as Partial; - expect(saved.status).toBe(RetrievalStatus.FAILED); - expect(saved.bytesRetrieved).toBe(524288); - expect(saved.ttfbMs).toBe(150); - expect(saved.latencyMs).toBe(42000); - expect(saved.throughputBps).toBe(12500); - expect(saved.responseCode).toBe(200); - expect(saved.errorMessage).toContain("Anon retrieval job timeout"); + expect(insertSpy).toHaveBeenCalledTimes(1); + const [table, row] = insertSpy.mock.calls[0] as [string, Record]; + expect(table).toBe("anon_retrieval_checks"); + expect(row.status).toBe(RetrievalStatus.FAILED); + expect(row.bytes_retrieved).toBe(524288); + expect(row.first_byte_ms).toBe(150); + expect(row.last_byte_ms).toBe(42000); + expect(row.throughput_bps).toBe(12500); + expect(row.http_response_code).toBe(200); + expect(row.error_message).toContain("Anon retrieval job timeout"); + 
expect(row.piece_cid).toBe(PIECE.pieceCid); + expect(row.sp_address).toBe(SP_ADDRESS); + expect(row.sp_id).toBe(7); + expect(row.probe_location).toBe("test-location"); + expect(typeof row.retrieval_id).toBe("string"); }); - it("still saves a row when the signal aborts before fetchPiece runs", async () => { + it("still emits a row when the signal aborts before fetchPiece runs", async () => { const ac = new AbortController(); ac.abort(new Error("Anon retrieval job timeout (60s) for sp1")); @@ -147,20 +151,20 @@ describe("AnonRetrievalService", () => { commPValid: false, }; - const { service, saveSpy, fetchSpy } = makeService({ pieceResult: never }); + const { service, insertSpy, fetchSpy } = makeService({ pieceResult: never }); await service.performForProvider(SP_ADDRESS, ac.signal); expect(fetchSpy).not.toHaveBeenCalled(); - expect(saveSpy).toHaveBeenCalledTimes(1); - const saved = saveSpy.mock.calls[0][0] as Partial; - expect(saved.status).toBe(RetrievalStatus.FAILED); - expect(saved.errorMessage).toContain("Anon retrieval job timeout"); - expect(saved.bytesRetrieved).toBeNull(); - expect(saved.ttfbMs).toBeNull(); + expect(insertSpy).toHaveBeenCalledTimes(1); + const [, row] = insertSpy.mock.calls[0] as [string, Record]; + expect(row.status).toBe(RetrievalStatus.FAILED); + expect(row.error_message).toContain("Anon retrieval job timeout"); + expect(row.bytes_retrieved).toBeNull(); + expect(row.first_byte_ms).toBeNull(); }); - it("still saves a row when fetchPiece throws unexpectedly", async () => { + it("still emits a row when fetchPiece throws unexpectedly", async () => { const never: PieceRetrievalResult = { success: false, pieceCid: PIECE.pieceCid, @@ -173,7 +177,7 @@ describe("AnonRetrievalService", () => { commPValid: false, }; - const { service, saveSpy } = makeService({ + const { service, insertSpy } = makeService({ pieceResult: never, fetchPieceImpl: async () => { throw new Error("network down"); @@ -182,8 +186,28 @@ describe("AnonRetrievalService", () => { 
await expect(service.performForProvider(SP_ADDRESS)).rejects.toThrow("network down"); - expect(saveSpy).toHaveBeenCalledTimes(1); - const saved = saveSpy.mock.calls[0][0] as Partial; - expect(saved.status).toBe(RetrievalStatus.FAILED); + expect(insertSpy).toHaveBeenCalledTimes(1); + const [, row] = insertSpy.mock.calls[0] as [string, Record]; + expect(row.status).toBe(RetrievalStatus.FAILED); + }); + + it("skips ClickHouse insert when ClickHouse is disabled", async () => { + const ok: PieceRetrievalResult = { + success: true, + pieceCid: PIECE.pieceCid, + bytesReceived: 1024, + pieceBytes: null, + latencyMs: 100, + ttfbMs: 10, + throughputBps: 10240, + statusCode: 200, + commPValid: true, + }; + + const { service, insertSpy } = makeService({ pieceResult: ok, clickhouseEnabled: false }); + + await service.performForProvider(SP_ADDRESS); + + expect(insertSpy).not.toHaveBeenCalled(); }); }); diff --git a/apps/backend/src/retrieval-anon/anon-retrieval.service.ts b/apps/backend/src/retrieval-anon/anon-retrieval.service.ts index d40fe315..1d56d2f0 100644 --- a/apps/backend/src/retrieval-anon/anon-retrieval.service.ts +++ b/apps/backend/src/retrieval-anon/anon-retrieval.service.ts @@ -1,8 +1,9 @@ +import { randomUUID } from "node:crypto"; import { Injectable, Logger } from "@nestjs/common"; import { InjectRepository } from "@nestjs/typeorm"; import type { Repository } from "typeorm"; +import { ClickhouseService } from "../clickhouse/clickhouse.service.js"; import { type ProviderJobContext, toStructuredError } from "../common/logging.js"; -import { AnonRetrieval } from "../database/entities/anon-retrieval.entity.js"; import { StorageProvider } from "../database/entities/storage-provider.entity.js"; import { RetrievalStatus, ServiceType } from "../database/types.js"; import { buildCheckMetricLabels } from "../metrics-prometheus/check-metric-labels.js"; @@ -13,6 +14,8 @@ import { CarValidationService } from "./car-validation.service.js"; import { PieceRetrievalService } from 
"./piece-retrieval.service.js"; import type { CarValidationResult, PieceRetrievalResult } from "./types.js"; +const ANON_RETRIEVAL_CHECKS_TABLE = "anon_retrieval_checks"; + @Injectable() export class AnonRetrievalService { private readonly logger = new Logger(AnonRetrievalService.name); @@ -23,17 +26,12 @@ export class AnonRetrievalService { private readonly carValidationService: CarValidationService, private readonly walletSdkService: WalletSdkService, private readonly metrics: AnonRetrievalCheckMetrics, - @InjectRepository(AnonRetrieval) - private readonly anonRetrievalRepository: Repository, + private readonly clickhouseService: ClickhouseService, @InjectRepository(StorageProvider) private readonly spRepository: Repository, ) {} - async performForProvider( - spAddress: string, - signal?: AbortSignal, - logContext?: ProviderJobContext, - ): Promise { + async performForProvider(spAddress: string, signal?: AbortSignal, logContext?: ProviderJobContext): Promise { // Build metric labels const provider = await this.spRepository.findOne({ where: { address: spAddress } }); const labels = buildCheckMetricLabels({ @@ -53,7 +51,7 @@ export class AnonRetrievalService { spAddress, }); this.metrics.recordStatus(labels, "failure.no_piece"); - return null; + return; } this.logger.log({ @@ -72,7 +70,6 @@ export class AnonRetrievalService { let pieceResult: PieceRetrievalResult | null = null; let carResult: CarValidationResult | null = null; - let saved: AnonRetrieval | null = null; try { // 2. Fetch the piece. fetchPiece never throws on abort — it returns a @@ -141,16 +138,15 @@ export class AnonRetrievalService { pieceResult.success ? "success" : pieceResult.aborted ? "failure.aborted" : "failure.http", ); } finally { - // Always save a record — even on abort or unexpected error — so we never - // lose the evidence (ttfb, bytes, response code) we already collected. 
+ // Always emit a ClickHouse row — even on abort or unexpected error — so + // we never lose the evidence (ttfb, bytes, response code) we already + // collected. pieceResult ??= buildAbortedPlaceholder(piece.pieceCid, signal?.reason); - saved = await this.saveRetrievalRecord(spAddress, piece, pieceResult, carResult, startedAt, logContext); + this.emitClickhouseRow(spAddress, piece, pieceResult, carResult, startedAt, provider, logContext); } - - return saved; } - private async saveRetrievalRecord( + private emitClickhouseRow( spAddress: string, piece: { pieceCid: string; @@ -163,52 +159,70 @@ export class AnonRetrievalService { pieceResult: PieceRetrievalResult, carResult: CarValidationResult | null, startedAt: Date, + provider: StorageProvider | null, logContext?: ProviderJobContext, - ): Promise { + ): void { + if (!this.clickhouseService.enabled) { + this.logger.debug({ + ...logContext, + event: "anon_retrieval_clickhouse_disabled", + message: "ClickHouse disabled — anon retrieval row not emitted", + pieceCid: piece.pieceCid, + spAddress, + }); + return; + } + const providerInfo = this.walletSdkService.getProviderInfo(spAddress); const spBaseUrl = providerInfo?.pdp.serviceURL.replace(/\/$/, "") ?? spAddress; - - const retrieval = this.anonRetrievalRepository.create({ - spAddress, - pieceCid: piece.pieceCid, - dataSetId: BigInt(piece.dataSetId), - pieceId: BigInt(piece.pieceId), - rawSize: BigInt(piece.rawSize), - withIpfsIndexing: piece.withIPFSIndexing, - ipfsRootCid: piece.ipfsRootCid, - serviceType: ServiceType.DIRECT_SP, - retrievalEndpoint: `${spBaseUrl}/piece/${piece.pieceCid}`, - status: pieceResult.success ? RetrievalStatus.SUCCESS : RetrievalStatus.FAILED, - startedAt, - completedAt: new Date(), - latencyMs: pieceResult.latencyMs > 0 ? Math.round(pieceResult.latencyMs) : null, - ttfbMs: pieceResult.ttfbMs > 0 ? Math.round(pieceResult.ttfbMs) : null, - throughputBps: pieceResult.throughputBps > 0 ? 
Math.round(pieceResult.throughputBps) : null, - bytesRetrieved: pieceResult.bytesReceived > 0 ? pieceResult.bytesReceived : null, - responseCode: pieceResult.statusCode > 0 ? pieceResult.statusCode : null, - errorMessage: pieceResult.errorMessage ?? null, - commpValid: pieceResult.success ? pieceResult.commPValid : null, - carValid: carResult ? carResult.ipniValid !== false && carResult.blockFetchValid !== false : null, - }); + const status = pieceResult.success ? RetrievalStatus.SUCCESS : RetrievalStatus.FAILED; + const carValid = carResult ? carResult.ipniValid !== false && carResult.blockFetchValid !== false : null; + const retrievalId = randomUUID(); try { - await this.anonRetrievalRepository.save(retrieval); + this.clickhouseService.insert(ANON_RETRIEVAL_CHECKS_TABLE, { + timestamp: startedAt.getTime(), + probe_location: this.clickhouseService.probeLocation, + sp_address: spAddress, + sp_id: provider?.providerId != null ? Number(provider.providerId) : null, + sp_name: provider?.name ?? null, + retrieval_id: retrievalId, + piece_cid: piece.pieceCid, + data_set_id: piece.dataSetId, + piece_id: piece.pieceId, + raw_size: piece.rawSize, + with_ipfs_indexing: piece.withIPFSIndexing, + ipfs_root_cid: piece.ipfsRootCid, + service_type: ServiceType.DIRECT_SP, + retrieval_endpoint: `${spBaseUrl}/piece/${piece.pieceCid}`, + status, + http_response_code: pieceResult.statusCode > 0 ? pieceResult.statusCode : null, + first_byte_ms: pieceResult.ttfbMs > 0 ? pieceResult.ttfbMs : null, + last_byte_ms: pieceResult.latencyMs > 0 ? pieceResult.latencyMs : null, + bytes_retrieved: pieceResult.bytesReceived > 0 ? pieceResult.bytesReceived : null, + throughput_bps: pieceResult.throughputBps > 0 ? Math.round(pieceResult.throughputBps) : null, + commp_valid: pieceResult.success ? pieceResult.commPValid : null, + car_valid: carValid, + error_message: pieceResult.errorMessage ?? 
null, + }); } catch (error) { + // ClickhouseService.insert is buffered/non-throwing in normal operation, but + // guard against unexpected runtime errors so we don't break the probe cycle. this.logger.warn({ ...logContext, - event: "anon_retrieval_save_failed", - message: "Failed to save anonymous retrieval record", + event: "anon_retrieval_clickhouse_insert_failed", + message: "Failed to enqueue anonymous retrieval row to ClickHouse", pieceCid: piece.pieceCid, spAddress, error: toStructuredError(error), }); - return null; } this.logger.log({ ...logContext, event: "anon_retrieval_completed", message: "Anonymous retrieval test completed", + retrievalId, pieceCid: piece.pieceCid, spAddress, success: pieceResult.success, @@ -220,8 +234,6 @@ export class AnonRetrievalService { ipniValid: carResult?.ipniValid, blockFetchValid: carResult?.blockFetchValid, }); - - return retrieval; } } diff --git a/apps/backend/src/retrieval-anon/retrieval-anon.module.ts b/apps/backend/src/retrieval-anon/retrieval-anon.module.ts index 4e9e38df..c05dcb5f 100644 --- a/apps/backend/src/retrieval-anon/retrieval-anon.module.ts +++ b/apps/backend/src/retrieval-anon/retrieval-anon.module.ts @@ -1,7 +1,6 @@ import { Module } from "@nestjs/common"; import { ConfigModule } from "@nestjs/config"; import { TypeOrmModule } from "@nestjs/typeorm"; -import { AnonRetrieval } from "../database/entities/anon-retrieval.entity.js"; import { StorageProvider } from "../database/entities/storage-provider.entity.js"; import { HttpClientModule } from "../http-client/http-client.module.js"; import { IpniModule } from "../ipni/ipni.module.js"; @@ -15,7 +14,7 @@ import { PieceRetrievalService } from "./piece-retrieval.service.js"; @Module({ imports: [ ConfigModule, - TypeOrmModule.forFeature([AnonRetrieval, StorageProvider]), + TypeOrmModule.forFeature([StorageProvider]), SubgraphModule, WalletSdkModule, HttpClientModule, From 81a38b1fa9fa62e8cd6707e74058bb1b0454c084 Mon Sep 17 00:00:00 2001 From: Dennis Trautwein 
Date: Wed, 29 Apr 2026 11:01:55 +0200 Subject: [PATCH 03/28] feat(retrieval-anon): track ipni metrics --- .../src/clickhouse/clickhouse.schema.ts | 68 +++++++++++-------- .../anon-retrieval.service.spec.ts | 6 +- .../retrieval-anon/anon-retrieval.service.ts | 18 +++-- .../retrieval-anon/car-validation.service.ts | 65 ++++++++++++++---- apps/backend/src/retrieval-anon/types.ts | 5 ++ 5 files changed, 112 insertions(+), 50 deletions(-) diff --git a/apps/backend/src/clickhouse/clickhouse.schema.ts b/apps/backend/src/clickhouse/clickhouse.schema.ts index 8af769d7..e30f6151 100644 --- a/apps/backend/src/clickhouse/clickhouse.schema.ts +++ b/apps/backend/src/clickhouse/clickhouse.schema.ts @@ -64,35 +64,45 @@ export function buildMigrations(database: string): string[] { `CREATE TABLE IF NOT EXISTS ${database}.anon_retrieval_checks ( - timestamp DateTime64(3, 'UTC'), -- when the check completed - probe_location LowCardinality(String), -- dealbot location - sp_address String, -- storage provider address (lowercased) - sp_id Nullable(UInt64), -- storage provider numeric id - sp_name Nullable(String), -- storage provider name - - retrieval_id UUID, -- per-event correlation id (log/Prometheus join) - - piece_cid String, -- piece CID (v2/CommP) sampled from the subgraph - data_set_id UInt64, -- on-chain data set id - piece_id UInt64, -- on-chain piece id within the data set - raw_size UInt64, -- raw (unpadded) piece size, bytes - with_ipfs_indexing Bool, -- whether the piece advertises IPNI metadata - ipfs_root_cid Nullable(String), -- root CID of the contained DAG; null when not IPFS-indexed - - service_type LowCardinality(String), -- 'direct_sp' (only mode for anon retrievals today) - retrieval_endpoint String, -- URL probed (e.g. 
{spBaseUrl}/piece/{pieceCid}) - - status LowCardinality(String), -- RetrievalStatus: 'success' | 'failed' | 'pending' | 'in_progress' | 'timeout' - http_response_code Nullable(UInt16), -- raw HTTP status; null on transport failure - first_byte_ms Nullable(Float64), -- time to first response byte - last_byte_ms Nullable(Float64), -- time to last response byte - bytes_retrieved Nullable(UInt64), -- bytes received from /piece/{cid} - throughput_bps Nullable(UInt64), -- effective throughput, bytes per second - - commp_valid Nullable(Bool), -- null when retrieval failed before CommP could be hashed - car_valid Nullable(Bool), -- null when CAR validation was skipped (no IPFS indexing or piece fetch failed) - - error_message Nullable(String) -- failure reason; null on success + timestamp DateTime64(3, 'UTC'), -- when the check completed + probe_location LowCardinality(String), -- dealbot location + sp_address String, -- storage provider address (lowercased) + sp_id Nullable(UInt64), -- storage provider numeric id + sp_name Nullable(String), -- storage provider name + + retrieval_id UUID, -- per-event correlation id (log/Prometheus join) + + piece_cid String, -- piece CID (v2/CommP) sampled from the subgraph + data_set_id UInt64, -- on-chain data set id + piece_id UInt64, -- on-chain piece id within the data set + raw_size UInt64, -- raw (unpadded) piece size, bytes + with_ipfs_indexing Bool, -- whether the piece advertises IPNI metadata + ipfs_root_cid Nullable(String), -- root CID of the contained DAG; null when not IPFS-indexed + + service_type LowCardinality(String), -- 'direct_sp' (only mode for anon retrievals today) + retrieval_endpoint String, -- URL probed (e.g. {spBaseUrl}/piece/{pieceCid}) + + piece_fetch_status LowCardinality(String), -- 'success' | 'failed' — outcome of GET /piece/ (HTTP 2xx AND CommP match). CAR/IPNI/block-fetch outcomes live in their own columns. 
+ http_response_code Nullable(UInt16), -- raw HTTP status; null on transport failure + first_byte_ms Nullable(Float64), -- time to first response byte + last_byte_ms Nullable(Float64), -- time to last response byte + bytes_retrieved Nullable(UInt64), -- bytes received from /piece/{cid} + throughput_bps Nullable(UInt64), -- effective throughput, bytes per second + + commp_valid Nullable(Bool), -- null when retrieval failed before CommP could be hashed + car_parseable Nullable(Bool), -- null when CAR validation was skipped (no IPFS indexing or piece fetch failed); true if bytes parsed as a CAR + car_block_count Nullable(UInt32), -- total number of blocks observed inside the CAR; null when skipped or unparseable + block_fetch_endpoint Nullable(String), -- gateway base URL probed for block fetch (e.g. {spBaseUrl}/ipfs/); null when skipped + block_fetch_valid Nullable(Bool), -- null when skipped; true if all sampled blocks fetched + hash-verified + block_fetch_sampled_count Nullable(UInt32), -- number of blocks sampled and probed via /ipfs/?format=raw + block_fetch_failed_count Nullable(UInt32), -- number of sampled blocks that failed (non-2xx, hash mismatch, unsupported codec, or transport error) + + ipni_status LowCardinality(String), -- 'valid' | 'invalid' | 'skipped' (mirrors data_storage_checks naming) + ipni_verify_ms Nullable(Float64), -- IPNI verification duration; null when skipped + ipni_verified_cids_count Nullable(UInt32), -- CIDs confirmed findable via IPNI + ipni_unverified_cids_count Nullable(UInt32), -- CIDs checked but not findable + + error_message Nullable(String) -- failure reason; null on success ) ENGINE MergeTree() PRIMARY KEY (probe_location, sp_address, timestamp) PARTITION BY toStartOfMonth(timestamp) diff --git a/apps/backend/src/retrieval-anon/anon-retrieval.service.spec.ts b/apps/backend/src/retrieval-anon/anon-retrieval.service.spec.ts index e6619e32..275a3de2 100644 --- a/apps/backend/src/retrieval-anon/anon-retrieval.service.spec.ts +++ 
b/apps/backend/src/retrieval-anon/anon-retrieval.service.spec.ts @@ -121,7 +121,7 @@ describe("AnonRetrievalService", () => { expect(insertSpy).toHaveBeenCalledTimes(1); const [table, row] = insertSpy.mock.calls[0] as [string, Record]; expect(table).toBe("anon_retrieval_checks"); - expect(row.status).toBe(RetrievalStatus.FAILED); + expect(row.piece_fetch_status).toBe(RetrievalStatus.FAILED); expect(row.bytes_retrieved).toBe(524288); expect(row.first_byte_ms).toBe(150); expect(row.last_byte_ms).toBe(42000); @@ -158,7 +158,7 @@ describe("AnonRetrievalService", () => { expect(fetchSpy).not.toHaveBeenCalled(); expect(insertSpy).toHaveBeenCalledTimes(1); const [, row] = insertSpy.mock.calls[0] as [string, Record]; - expect(row.status).toBe(RetrievalStatus.FAILED); + expect(row.piece_fetch_status).toBe(RetrievalStatus.FAILED); expect(row.error_message).toContain("Anon retrieval job timeout"); expect(row.bytes_retrieved).toBeNull(); expect(row.first_byte_ms).toBeNull(); @@ -188,7 +188,7 @@ describe("AnonRetrievalService", () => { expect(insertSpy).toHaveBeenCalledTimes(1); const [, row] = insertSpy.mock.calls[0] as [string, Record]; - expect(row.status).toBe(RetrievalStatus.FAILED); + expect(row.piece_fetch_status).toBe(RetrievalStatus.FAILED); }); it("skips ClickHouse insert when ClickHouse is disabled", async () => { diff --git a/apps/backend/src/retrieval-anon/anon-retrieval.service.ts b/apps/backend/src/retrieval-anon/anon-retrieval.service.ts index 1d56d2f0..8f2e135a 100644 --- a/apps/backend/src/retrieval-anon/anon-retrieval.service.ts +++ b/apps/backend/src/retrieval-anon/anon-retrieval.service.ts @@ -175,8 +175,9 @@ export class AnonRetrievalService { const providerInfo = this.walletSdkService.getProviderInfo(spAddress); const spBaseUrl = providerInfo?.pdp.serviceURL.replace(/\/$/, "") ?? spAddress; - const status = pieceResult.success ? RetrievalStatus.SUCCESS : RetrievalStatus.FAILED; - const carValid = carResult ? 
carResult.ipniValid !== false && carResult.blockFetchValid !== false : null; + const pieceFetchStatus = pieceResult.success ? RetrievalStatus.SUCCESS : RetrievalStatus.FAILED; + const ipniStatus = + carResult == null || carResult.ipniValid === null ? "skipped" : carResult.ipniValid ? "valid" : "invalid"; const retrievalId = randomUUID(); try { @@ -195,14 +196,23 @@ export class AnonRetrievalService { ipfs_root_cid: piece.ipfsRootCid, service_type: ServiceType.DIRECT_SP, retrieval_endpoint: `${spBaseUrl}/piece/${piece.pieceCid}`, - status, + piece_fetch_status: pieceFetchStatus, http_response_code: pieceResult.statusCode > 0 ? pieceResult.statusCode : null, first_byte_ms: pieceResult.ttfbMs > 0 ? pieceResult.ttfbMs : null, last_byte_ms: pieceResult.latencyMs > 0 ? pieceResult.latencyMs : null, bytes_retrieved: pieceResult.bytesReceived > 0 ? pieceResult.bytesReceived : null, throughput_bps: pieceResult.throughputBps > 0 ? Math.round(pieceResult.throughputBps) : null, commp_valid: pieceResult.success ? pieceResult.commPValid : null, - car_valid: carValid, + car_parseable: carResult ? carResult.carParseable : null, + car_block_count: carResult?.carParseable ? carResult.blockCount : null, + block_fetch_endpoint: carResult?.blockFetchEndpoint ?? null, + block_fetch_valid: carResult ? carResult.blockFetchValid : null, + block_fetch_sampled_count: carResult?.carParseable ? carResult.sampledCidCount : null, + block_fetch_failed_count: carResult?.blockFetchFailedCount ?? null, + ipni_status: ipniStatus, + ipni_verify_ms: carResult?.ipniVerifyMs ?? null, + ipni_verified_cids_count: carResult?.ipniVerifiedCidsCount ?? null, + ipni_unverified_cids_count: carResult?.ipniUnverifiedCidsCount ?? null, error_message: pieceResult.errorMessage ?? 
null, }); } catch (error) { diff --git a/apps/backend/src/retrieval-anon/car-validation.service.ts b/apps/backend/src/retrieval-anon/car-validation.service.ts index 8019b8df..017a38e8 100644 --- a/apps/backend/src/retrieval-anon/car-validation.service.ts +++ b/apps/backend/src/retrieval-anon/car-validation.service.ts @@ -48,7 +48,18 @@ export class CarValidationService { ): Promise { const blocks = await this.parseCar(pieceBytes, provider.address, ipfsRootCid); if (blocks === null) { - return { carParseable: false, blockCount: 0, sampledCidCount: 0, ipniValid: null, blockFetchValid: null }; + return { + carParseable: false, + blockCount: 0, + sampledCidCount: 0, + ipniValid: null, + ipniVerifyMs: null, + ipniVerifiedCidsCount: null, + ipniUnverifiedCidsCount: null, + blockFetchValid: null, + blockFetchFailedCount: null, + blockFetchEndpoint: null, + }; } if (blocks.length === 0) { return { @@ -56,7 +67,12 @@ export class CarValidationService { blockCount: 0, sampledCidCount: 0, ipniValid: null, + ipniVerifyMs: null, + ipniVerifiedCidsCount: null, + ipniUnverifiedCidsCount: null, blockFetchValid: null, + blockFetchFailedCount: null, + blockFetchEndpoint: null, errorMessage: "CAR contained no blocks", }; } @@ -65,15 +81,20 @@ export class CarValidationService { const shuffled = [...blocks].sort(() => Math.random() - 0.5); const sampledBlocks = shuffled.slice(0, sampleCount); - const ipniValid = await this.checkIpni(provider, ipfsRootCid, sampledBlocks, signal); + const ipni = await this.checkIpni(provider, ipfsRootCid, sampledBlocks, signal); const blockFetchResult = await this.checkBlockFetch(sampledBlocks, provider.address, signal); return { carParseable: true, blockCount: blocks.length, sampledCidCount: sampledBlocks.length, - ipniValid, + ipniValid: ipni.valid, + ipniVerifyMs: ipni.durationMs, + ipniVerifiedCidsCount: ipni.verifiedCount, + ipniUnverifiedCidsCount: ipni.unverifiedCount, blockFetchValid: blockFetchResult.valid, + blockFetchFailedCount: 
blockFetchResult.failedCount, + blockFetchEndpoint: blockFetchResult.endpoint, errorMessage: blockFetchResult.errorMessage, }; } @@ -111,7 +132,12 @@ export class CarValidationService { ipfsRootCid: string, sampledBlocks: ReadonlyArray<{ cid: CID }>, signal?: AbortSignal, - ): Promise { + ): Promise<{ + valid: boolean; + durationMs: number | null; + verifiedCount: number | null; + unverifiedCount: number | null; + }> { const timeouts = this.configService.get("timeouts", { infer: true }); let rootCid: CID; try { @@ -124,7 +150,7 @@ export class CarValidationService { providerAddress: provider.address, error: toStructuredError(error), }); - return false; + return { valid: false, durationMs: null, verifiedCount: null, unverifiedCount: null }; } const result = await this.ipniVerificationService.verify({ @@ -136,7 +162,12 @@ export class CarValidationService { signal, }); - return result.rootCIDVerified; + return { + valid: result.rootCIDVerified, + durationMs: result.durationMs, + verifiedCount: result.verified, + unverifiedCount: result.unverified, + }; } /** @@ -148,14 +179,20 @@ export class CarValidationService { sampledBlocks: ReadonlyArray<{ cid: CID; bytes: Uint8Array }>, spAddress: string, signal?: AbortSignal, - ): Promise<{ valid: boolean | null; errorMessage?: string }> { + ): Promise<{ valid: boolean | null; failedCount: number | null; endpoint: string | null; errorMessage?: string }> { const providerInfo = this.walletSdkService.getProviderInfo(spAddress); if (!providerInfo) { - return { valid: null, errorMessage: `Provider info not found for ${spAddress}` }; + return { + valid: null, + failedCount: null, + endpoint: null, + errorMessage: `Provider info not found for ${spAddress}`, + }; } const spBaseUrl = providerInfo.pdp.serviceURL.replace(/\/$/, ""); - let allValid = true; + const endpoint = `${spBaseUrl}/ipfs/`; + let failedCount = 0; for (const block of sampledBlocks) { signal?.throwIfAborted(); @@ -170,7 +207,7 @@ export class CarValidationService { 
}); if (resp.metrics.statusCode < 200 || resp.metrics.statusCode >= 300) { - allValid = false; + failedCount += 1; this.logger.warn({ event: "block_fetch_non_2xx", message: "Block fetch returned non-2xx status", @@ -188,7 +225,7 @@ export class CarValidationService { cid: cidStr, spAddress, }); - allValid = false; + failedCount += 1; continue; } @@ -200,14 +237,14 @@ export class CarValidationService { cid: cidStr, spAddress, }); - allValid = false; + failedCount += 1; continue; } // Hash-verifies and decodes; throws on mismatch await createBlock({ bytes: resp.data, cid: block.cid, hasher: sha256, codec }); } catch (error) { - allValid = false; + failedCount += 1; this.logger.warn({ event: "block_fetch_failed", message: "Block fetch or hash verification failed", @@ -218,6 +255,6 @@ export class CarValidationService { } } - return { valid: allValid }; + return { valid: failedCount === 0, failedCount, endpoint }; } } diff --git a/apps/backend/src/retrieval-anon/types.ts b/apps/backend/src/retrieval-anon/types.ts index 2c3384d5..3ba2b9f9 100644 --- a/apps/backend/src/retrieval-anon/types.ts +++ b/apps/backend/src/retrieval-anon/types.ts @@ -30,6 +30,11 @@ export type CarValidationResult = { blockCount: number; sampledCidCount: number; ipniValid: boolean | null; + ipniVerifyMs: number | null; + ipniVerifiedCidsCount: number | null; + ipniUnverifiedCidsCount: number | null; blockFetchValid: boolean | null; + blockFetchFailedCount: number | null; + blockFetchEndpoint: string | null; errorMessage?: string; }; From 072a096b44ca2194bf2607f96abbba66364aae11 Mon Sep 17 00:00:00 2001 From: Dennis Trautwein Date: Wed, 29 Apr 2026 12:57:14 +0200 Subject: [PATCH 04/28] test(retrieval-anon): new ipni fields --- .../anon-retrieval.service.spec.ts | 157 +++++++++++++++++- 1 file changed, 153 insertions(+), 4 deletions(-) diff --git a/apps/backend/src/retrieval-anon/anon-retrieval.service.spec.ts b/apps/backend/src/retrieval-anon/anon-retrieval.service.spec.ts index 
275a3de2..812b8169 100644 --- a/apps/backend/src/retrieval-anon/anon-retrieval.service.spec.ts +++ b/apps/backend/src/retrieval-anon/anon-retrieval.service.spec.ts @@ -9,7 +9,7 @@ import type { AnonPieceSelectorService } from "./anon-piece-selector.service.js" import { AnonRetrievalService } from "./anon-retrieval.service.js"; import type { CarValidationService } from "./car-validation.service.js"; import type { PieceRetrievalService } from "./piece-retrieval.service.js"; -import type { PieceRetrievalResult } from "./types.js"; +import type { AnonPiece, CarValidationResult, PieceRetrievalResult } from "./types.js"; const SP_ADDRESS = "0xaaaa0000000000000000000000000000000000aa"; @@ -36,10 +36,13 @@ function makeService(opts: { pieceResult: PieceRetrievalResult; fetchPieceImpl?: (signal?: AbortSignal) => Promise; clickhouseEnabled?: boolean; + piece?: AnonPiece; + carResult?: CarValidationResult; }): { service: AnonRetrievalService; insertSpy: ReturnType; fetchSpy: ReturnType; + validateCarSpy: ReturnType; } { const insertSpy = vi.fn(); const clickhouseService = { @@ -53,7 +56,7 @@ function makeService(opts: { } as unknown as Repository; const anonPieceSelector = { - selectPieceForProvider: vi.fn(async () => PIECE), + selectPieceForProvider: vi.fn(async () => opts.piece ?? PIECE), } as unknown as AnonPieceSelectorService; const fetchSpy = vi.fn(opts.fetchPieceImpl ?? 
(async () => opts.pieceResult)); @@ -61,8 +64,9 @@ function makeService(opts: { fetchPiece: fetchSpy, } as unknown as PieceRetrievalService; + const validateCarSpy = vi.fn(async () => opts.carResult); const carValidationService = { - validateCarPiece: vi.fn(), + validateCarPiece: validateCarSpy, } as unknown as CarValidationService; const walletSdkService = { @@ -91,7 +95,7 @@ function makeService(opts: { spRepository, ); - return { service, insertSpy, fetchSpy }; + return { service, insertSpy, fetchSpy, validateCarSpy }; } describe("AnonRetrievalService", () => { @@ -133,6 +137,19 @@ describe("AnonRetrievalService", () => { expect(row.sp_id).toBe(7); expect(row.probe_location).toBe("test-location"); expect(typeof row.retrieval_id).toBe("string"); + + // CAR/IPNI/block-fetch were never run on a non-IPFS-indexed piece — every + // dimension column should explicitly say "skipped" (ipni_status) or null. + expect(row.car_parseable).toBeNull(); + expect(row.car_block_count).toBeNull(); + expect(row.block_fetch_endpoint).toBeNull(); + expect(row.block_fetch_valid).toBeNull(); + expect(row.block_fetch_sampled_count).toBeNull(); + expect(row.block_fetch_failed_count).toBeNull(); + expect(row.ipni_status).toBe("skipped"); + expect(row.ipni_verify_ms).toBeNull(); + expect(row.ipni_verified_cids_count).toBeNull(); + expect(row.ipni_unverified_cids_count).toBeNull(); }); it("still emits a row when the signal aborts before fetchPiece runs", async () => { @@ -210,4 +227,136 @@ describe("AnonRetrievalService", () => { expect(insertSpy).not.toHaveBeenCalled(); }); + + describe("with IPFS indexing", () => { + const INDEXED_PIECE: AnonPiece = { + ...PIECE, + withIPFSIndexing: true, + ipfsRootCid: "bafyrootcid", + }; + + function okPiece(bytes: Buffer): PieceRetrievalResult { + return { + success: true, + pieceCid: INDEXED_PIECE.pieceCid, + bytesReceived: bytes.length, + pieceBytes: bytes, + latencyMs: 200, + ttfbMs: 20, + throughputBps: 51200, + statusCode: 200, + commPValid: true, 
+ }; + } + + it("emits populated CAR/IPNI/block-fetch columns when validation fully succeeds", async () => { + const carResult: CarValidationResult = { + carParseable: true, + blockCount: 42, + sampledCidCount: 5, + ipniValid: true, + ipniVerifyMs: 137, + ipniVerifiedCidsCount: 6, + ipniUnverifiedCidsCount: 0, + blockFetchValid: true, + blockFetchFailedCount: 0, + blockFetchEndpoint: "https://sp.test/ipfs/", + }; + + const { service, insertSpy, validateCarSpy } = makeService({ + pieceResult: okPiece(Buffer.from("car-bytes")), + piece: INDEXED_PIECE, + carResult, + }); + + await service.performForProvider(SP_ADDRESS); + + expect(validateCarSpy).toHaveBeenCalledTimes(1); + const [, row] = insertSpy.mock.calls[0] as [string, Record]; + expect(row.piece_fetch_status).toBe(RetrievalStatus.SUCCESS); + expect(row.commp_valid).toBe(true); + expect(row.car_parseable).toBe(true); + expect(row.car_block_count).toBe(42); + expect(row.block_fetch_endpoint).toBe("https://sp.test/ipfs/"); + expect(row.block_fetch_valid).toBe(true); + expect(row.block_fetch_sampled_count).toBe(5); + expect(row.block_fetch_failed_count).toBe(0); + expect(row.ipni_status).toBe("valid"); + expect(row.ipni_verify_ms).toBe(137); + expect(row.ipni_verified_cids_count).toBe(6); + expect(row.ipni_unverified_cids_count).toBe(0); + }); + + it("distinguishes IPNI invalid from block-fetch failures with explicit counts", async () => { + const carResult: CarValidationResult = { + carParseable: true, + blockCount: 100, + sampledCidCount: 5, + ipniValid: false, + ipniVerifyMs: 250, + ipniVerifiedCidsCount: 0, + ipniUnverifiedCidsCount: 6, + blockFetchValid: false, + blockFetchFailedCount: 2, + blockFetchEndpoint: "https://sp.test/ipfs/", + }; + + const { service, insertSpy } = makeService({ + pieceResult: okPiece(Buffer.from("car-bytes")), + piece: INDEXED_PIECE, + carResult, + }); + + await service.performForProvider(SP_ADDRESS); + + const [, row] = insertSpy.mock.calls[0] as [string, Record]; + // The 
piece-fetch path still succeeded — failures are surfaced as + // independent dimensions, not folded into piece_fetch_status. + expect(row.piece_fetch_status).toBe(RetrievalStatus.SUCCESS); + expect(row.car_parseable).toBe(true); + expect(row.ipni_status).toBe("invalid"); + expect(row.ipni_verified_cids_count).toBe(0); + expect(row.ipni_unverified_cids_count).toBe(6); + expect(row.block_fetch_valid).toBe(false); + expect(row.block_fetch_sampled_count).toBe(5); + expect(row.block_fetch_failed_count).toBe(2); + }); + + it("emits car_parseable=false with skipped IPNI/block-fetch when bytes don't parse as CAR", async () => { + const carResult: CarValidationResult = { + carParseable: false, + blockCount: 0, + sampledCidCount: 0, + ipniValid: null, + ipniVerifyMs: null, + ipniVerifiedCidsCount: null, + ipniUnverifiedCidsCount: null, + blockFetchValid: null, + blockFetchFailedCount: null, + blockFetchEndpoint: null, + }; + + const { service, insertSpy } = makeService({ + pieceResult: okPiece(Buffer.from("not-a-car")), + piece: INDEXED_PIECE, + carResult, + }); + + await service.performForProvider(SP_ADDRESS); + + const [, row] = insertSpy.mock.calls[0] as [string, Record]; + expect(row.car_parseable).toBe(false); + // car_block_count and block_fetch_sampled_count are gated on carParseable + // so an unparseable CAR doesn't emit a misleading 0. 
+ expect(row.car_block_count).toBeNull(); + expect(row.block_fetch_sampled_count).toBeNull(); + expect(row.block_fetch_endpoint).toBeNull(); + expect(row.block_fetch_valid).toBeNull(); + expect(row.block_fetch_failed_count).toBeNull(); + expect(row.ipni_status).toBe("skipped"); + expect(row.ipni_verify_ms).toBeNull(); + expect(row.ipni_verified_cids_count).toBeNull(); + expect(row.ipni_unverified_cids_count).toBeNull(); + }); + }); }); From 1fcee6001cda14f6ead2117c68ee1c40b2b927ff Mon Sep 17 00:00:00 2001 From: Dennis Trautwein Date: Wed, 29 Apr 2026 13:10:13 +0200 Subject: [PATCH 05/28] refactor(retrieval-anon): function signatures --- .../retrieval-anon/anon-retrieval.service.ts | 171 ++++++++---------- .../retrieval-anon/car-validation.service.ts | 40 ++-- 2 files changed, 93 insertions(+), 118 deletions(-) diff --git a/apps/backend/src/retrieval-anon/anon-retrieval.service.ts b/apps/backend/src/retrieval-anon/anon-retrieval.service.ts index 8f2e135a..4c6ade8a 100644 --- a/apps/backend/src/retrieval-anon/anon-retrieval.service.ts +++ b/apps/backend/src/retrieval-anon/anon-retrieval.service.ts @@ -141,109 +141,90 @@ export class AnonRetrievalService { // Always emit a ClickHouse row — even on abort or unexpected error — so // we never lose the evidence (ttfb, bytes, response code) we already // collected. - pieceResult ??= buildAbortedPlaceholder(piece.pieceCid, signal?.reason); - this.emitClickhouseRow(spAddress, piece, pieceResult, carResult, startedAt, provider, logContext); - } - } + const finalPieceResult = pieceResult ?? 
buildAbortedPlaceholder(piece.pieceCid, signal?.reason); + const retrievalId = randomUUID(); - private emitClickhouseRow( - spAddress: string, - piece: { - pieceCid: string; - dataSetId: string; - pieceId: string; - rawSize: string; - withIPFSIndexing: boolean; - ipfsRootCid: string | null; - }, - pieceResult: PieceRetrievalResult, - carResult: CarValidationResult | null, - startedAt: Date, - provider: StorageProvider | null, - logContext?: ProviderJobContext, - ): void { - if (!this.clickhouseService.enabled) { - this.logger.debug({ - ...logContext, - event: "anon_retrieval_clickhouse_disabled", - message: "ClickHouse disabled — anon retrieval row not emitted", - pieceCid: piece.pieceCid, - spAddress, - }); - return; - } + if (this.clickhouseService.enabled) { + const providerInfo = this.walletSdkService.getProviderInfo(spAddress); + const spBaseUrl = providerInfo?.pdp.serviceURL.replace(/\/$/, "") ?? spAddress; + const pieceFetchStatus = finalPieceResult.success ? RetrievalStatus.SUCCESS : RetrievalStatus.FAILED; + const ipniStatus = + carResult == null || carResult.ipniValid === null ? "skipped" : carResult.ipniValid ? "valid" : "invalid"; - const providerInfo = this.walletSdkService.getProviderInfo(spAddress); - const spBaseUrl = providerInfo?.pdp.serviceURL.replace(/\/$/, "") ?? spAddress; - const pieceFetchStatus = pieceResult.success ? RetrievalStatus.SUCCESS : RetrievalStatus.FAILED; - const ipniStatus = - carResult == null || carResult.ipniValid === null ? "skipped" : carResult.ipniValid ? "valid" : "invalid"; - const retrievalId = randomUUID(); + try { + this.clickhouseService.insert(ANON_RETRIEVAL_CHECKS_TABLE, { + timestamp: startedAt.getTime(), + probe_location: this.clickhouseService.probeLocation, + sp_address: spAddress, + sp_id: provider?.providerId != null ? Number(provider.providerId) : null, + sp_name: provider?.name ?? 
null, + retrieval_id: retrievalId, + piece_cid: piece.pieceCid, + data_set_id: piece.dataSetId, + piece_id: piece.pieceId, + raw_size: piece.rawSize, + with_ipfs_indexing: piece.withIPFSIndexing, + ipfs_root_cid: piece.ipfsRootCid, + service_type: ServiceType.DIRECT_SP, + retrieval_endpoint: `${spBaseUrl}/piece/${piece.pieceCid}`, + piece_fetch_status: pieceFetchStatus, + http_response_code: finalPieceResult.statusCode > 0 ? finalPieceResult.statusCode : null, + first_byte_ms: finalPieceResult.ttfbMs > 0 ? finalPieceResult.ttfbMs : null, + last_byte_ms: finalPieceResult.latencyMs > 0 ? finalPieceResult.latencyMs : null, + bytes_retrieved: finalPieceResult.bytesReceived > 0 ? finalPieceResult.bytesReceived : null, + throughput_bps: finalPieceResult.throughputBps > 0 ? Math.round(finalPieceResult.throughputBps) : null, + commp_valid: finalPieceResult.success ? finalPieceResult.commPValid : null, + car_parseable: carResult ? carResult.carParseable : null, + car_block_count: carResult != null && carResult.carParseable ? carResult.blockCount : null, + block_fetch_endpoint: carResult?.blockFetchEndpoint ?? null, + block_fetch_valid: carResult ? carResult.blockFetchValid : null, + block_fetch_sampled_count: carResult != null && carResult.carParseable ? carResult.sampledCidCount : null, + block_fetch_failed_count: carResult?.blockFetchFailedCount ?? null, + ipni_status: ipniStatus, + ipni_verify_ms: carResult?.ipniVerifyMs ?? null, + ipni_verified_cids_count: carResult?.ipniVerifiedCidsCount ?? null, + ipni_unverified_cids_count: carResult?.ipniUnverifiedCidsCount ?? null, + error_message: finalPieceResult.errorMessage ?? null, + }); + } catch (error) { + // ClickhouseService.insert is buffered/non-throwing in normal operation, but + // guard against unexpected runtime errors so we don't break the probe cycle. 
+ this.logger.warn({ + ...logContext, + event: "anon_retrieval_clickhouse_insert_failed", + message: "Failed to enqueue anonymous retrieval row to ClickHouse", + pieceCid: piece.pieceCid, + spAddress, + error: toStructuredError(error), + }); + } + } else { + this.logger.debug({ + ...logContext, + event: "anon_retrieval_clickhouse_disabled", + message: "ClickHouse disabled — anon retrieval row not emitted", + pieceCid: piece.pieceCid, + spAddress, + }); + } - try { - this.clickhouseService.insert(ANON_RETRIEVAL_CHECKS_TABLE, { - timestamp: startedAt.getTime(), - probe_location: this.clickhouseService.probeLocation, - sp_address: spAddress, - sp_id: provider?.providerId != null ? Number(provider.providerId) : null, - sp_name: provider?.name ?? null, - retrieval_id: retrievalId, - piece_cid: piece.pieceCid, - data_set_id: piece.dataSetId, - piece_id: piece.pieceId, - raw_size: piece.rawSize, - with_ipfs_indexing: piece.withIPFSIndexing, - ipfs_root_cid: piece.ipfsRootCid, - service_type: ServiceType.DIRECT_SP, - retrieval_endpoint: `${spBaseUrl}/piece/${piece.pieceCid}`, - piece_fetch_status: pieceFetchStatus, - http_response_code: pieceResult.statusCode > 0 ? pieceResult.statusCode : null, - first_byte_ms: pieceResult.ttfbMs > 0 ? pieceResult.ttfbMs : null, - last_byte_ms: pieceResult.latencyMs > 0 ? pieceResult.latencyMs : null, - bytes_retrieved: pieceResult.bytesReceived > 0 ? pieceResult.bytesReceived : null, - throughput_bps: pieceResult.throughputBps > 0 ? Math.round(pieceResult.throughputBps) : null, - commp_valid: pieceResult.success ? pieceResult.commPValid : null, - car_parseable: carResult ? carResult.carParseable : null, - car_block_count: carResult?.carParseable ? carResult.blockCount : null, - block_fetch_endpoint: carResult?.blockFetchEndpoint ?? null, - block_fetch_valid: carResult ? carResult.blockFetchValid : null, - block_fetch_sampled_count: carResult?.carParseable ? 
carResult.sampledCidCount : null, - block_fetch_failed_count: carResult?.blockFetchFailedCount ?? null, - ipni_status: ipniStatus, - ipni_verify_ms: carResult?.ipniVerifyMs ?? null, - ipni_verified_cids_count: carResult?.ipniVerifiedCidsCount ?? null, - ipni_unverified_cids_count: carResult?.ipniUnverifiedCidsCount ?? null, - error_message: pieceResult.errorMessage ?? null, - }); - } catch (error) { - // ClickhouseService.insert is buffered/non-throwing in normal operation, but - // guard against unexpected runtime errors so we don't break the probe cycle. - this.logger.warn({ + this.logger.log({ ...logContext, - event: "anon_retrieval_clickhouse_insert_failed", - message: "Failed to enqueue anonymous retrieval row to ClickHouse", + event: "anon_retrieval_completed", + message: "Anonymous retrieval test completed", + retrievalId, pieceCid: piece.pieceCid, spAddress, - error: toStructuredError(error), + success: finalPieceResult.success, + aborted: finalPieceResult.aborted === true, + latencyMs: finalPieceResult.latencyMs, + ttfbMs: finalPieceResult.ttfbMs, + bytesRetrieved: finalPieceResult.bytesReceived, + carParseable: carResult?.carParseable, + ipniValid: carResult?.ipniValid, + blockFetchValid: carResult?.blockFetchValid, }); } - - this.logger.log({ - ...logContext, - event: "anon_retrieval_completed", - message: "Anonymous retrieval test completed", - retrievalId, - pieceCid: piece.pieceCid, - spAddress, - success: pieceResult.success, - aborted: pieceResult.aborted === true, - latencyMs: pieceResult.latencyMs, - ttfbMs: pieceResult.ttfbMs, - bytesRetrieved: pieceResult.bytesReceived, - carParseable: carResult?.carParseable, - ipniValid: carResult?.ipniValid, - blockFetchValid: carResult?.blockFetchValid, - }); } } diff --git a/apps/backend/src/retrieval-anon/car-validation.service.ts b/apps/backend/src/retrieval-anon/car-validation.service.ts index 017a38e8..789f5ba6 100644 --- a/apps/backend/src/retrieval-anon/car-validation.service.ts +++ 
b/apps/backend/src/retrieval-anon/car-validation.service.ts @@ -46,8 +46,17 @@ export class CarValidationService { ipfsRootCid: string, signal?: AbortSignal, ): Promise { - const blocks = await this.parseCar(pieceBytes, provider.address, ipfsRootCid); - if (blocks === null) { + let blocks: { cid: CID; bytes: Uint8Array }[]; + try { + blocks = await this.parseCar(pieceBytes); + } catch (error) { + this.logger.debug({ + event: "car_parse_failed", + message: "Failed to parse piece bytes as CAR - client fault, not SP", + spAddress: provider.address, + ipfsRootCid, + error: toStructuredError(error), + }); return { carParseable: false, blockCount: 0, @@ -99,28 +108,13 @@ export class CarValidationService { }; } - private async parseCar( - pieceBytes: Buffer, - spAddress: string, - ipfsRootCid: string, - ): Promise<{ cid: CID; bytes: Uint8Array }[] | null> { - try { - const reader = await CarReader.fromBytes(new Uint8Array(pieceBytes)); - const blocks: { cid: CID; bytes: Uint8Array }[] = []; - for await (const block of reader.blocks()) { - blocks.push({ cid: block.cid, bytes: block.bytes }); - } - return blocks; - } catch (error) { - this.logger.debug({ - event: "car_parse_failed", - message: "Failed to parse piece bytes as CAR - client fault, not SP", - spAddress, - ipfsRootCid, - error: toStructuredError(error), - }); - return null; + private async parseCar(pieceBytes: Buffer): Promise<{ cid: CID; bytes: Uint8Array }[]> { + const reader = await CarReader.fromBytes(new Uint8Array(pieceBytes)); + const blocks: { cid: CID; bytes: Uint8Array }[] = []; + for await (const block of reader.blocks()) { + blocks.push({ cid: block.cid, bytes: block.bytes }); } + return blocks; } /** From 4527d292c1cb537287274eb9638a46a5641eff21 Mon Sep 17 00:00:00 2001 From: Dennis Trautwein Date: Wed, 29 Apr 2026 14:24:41 +0200 Subject: [PATCH 06/28] refactor(retrieval-anon): cleanup --- .../check-metrics.service.ts | 4 +- .../anon-piece-selector.service.spec.ts | 16 ++++ 
.../anon-piece-selector.service.ts | 11 +-- .../anon-retrieval.service.spec.ts | 53 +++++++++-- .../retrieval-anon/anon-retrieval.service.ts | 33 +++---- .../retrieval-anon/car-validation.service.ts | 1 - .../src/subgraph/subgraph.service.spec.ts | 10 ++- apps/backend/src/subgraph/subgraph.service.ts | 89 +++---------------- 8 files changed, 103 insertions(+), 114 deletions(-) diff --git a/apps/backend/src/metrics-prometheus/check-metrics.service.ts b/apps/backend/src/metrics-prometheus/check-metrics.service.ts index 85f1cdcf..8d4be313 100644 --- a/apps/backend/src/metrics-prometheus/check-metrics.service.ts +++ b/apps/backend/src/metrics-prometheus/check-metrics.service.ts @@ -303,11 +303,11 @@ export class AnonRetrievalCheckMetrics { this.carParseCounter.inc({ ...labels, value: parseable ? "parseable" : "not_parseable" }); } - recordIpniStatus(labels: CheckMetricLabels, value: "valid" | "invalid" | "skipped"): void { + recordIpniStatus(labels: CheckMetricLabels, value: "valid" | "invalid" | "skipped" | "error"): void { this.ipniCounter.inc({ ...labels, value }); } - recordBlockFetchStatus(labels: CheckMetricLabels, value: "valid" | "invalid" | "skipped"): void { + recordBlockFetchStatus(labels: CheckMetricLabels, value: "valid" | "invalid" | "skipped" | "error"): void { this.blockFetchCounter.inc({ ...labels, value }); } } diff --git a/apps/backend/src/retrieval-anon/anon-piece-selector.service.spec.ts b/apps/backend/src/retrieval-anon/anon-piece-selector.service.spec.ts index 6a787fbb..32d13719 100644 --- a/apps/backend/src/retrieval-anon/anon-piece-selector.service.spec.ts +++ b/apps/backend/src/retrieval-anon/anon-piece-selector.service.spec.ts @@ -84,6 +84,22 @@ describe("AnonPieceSelectorService", () => { expect(result?.pieceCid).toBe(freshCid); }); + it("treats payment-end exactly equal to current epoch as terminated (boundary)", async () => { + // pdpPaymentEndEpoch === indexedAtBlock should be rejected (<=, not <). 
+ // This guards against an off-by-one regression where pieces in the final + // payment epoch silently slip through. + const boundaryCid = "baga-boundary"; + const liveCid = "baga-still-live"; + sampleAnonPiece + .mockResolvedValueOnce(makePiece({ pieceCid: boundaryCid, pdpPaymentEndEpoch: 200n, indexedAtBlock: 200 })) + .mockResolvedValueOnce(makePiece({ pieceCid: liveCid, pdpPaymentEndEpoch: 201n, indexedAtBlock: 200 })); + + const service = new AnonPieceSelectorService(subgraphService, makeConfigService()); + const result = await service.selectPieceForProvider(SP_ADDRESS); + + expect(result?.pieceCid).toBe(liveCid); + }); + it("redraws when the first sampled piece was recently selected by this process", async () => { const staleCid = "baga-stale"; const freshCid = "baga-fresh"; diff --git a/apps/backend/src/retrieval-anon/anon-piece-selector.service.ts b/apps/backend/src/retrieval-anon/anon-piece-selector.service.ts index 8de50fa3..342a4780 100644 --- a/apps/backend/src/retrieval-anon/anon-piece-selector.service.ts +++ b/apps/backend/src/retrieval-anon/anon-piece-selector.service.ts @@ -8,13 +8,7 @@ import type { AnonCandidatePiece } from "../subgraph/types.js"; import type { AnonPiece } from "./types.js"; /** - * Number of most-recently-tested anonymous pieces to exclude from selection - * to avoid immediately retesting the same piece. Piece CIDs are globally - * unique and each one lives on a single SP's dataset, so scoping by CID - * is equivalent to scoping by (SP, CID) for this workload. - * - * The buffer is process-local: a duplicate piece that gets retested shortly - * after a restart is harmless (still a valid measurement, just less diverse). + * Number of most-recently-tested piece CIDs to exclude from re-selection. 
*/ const RECENT_DEDUP_WINDOW = 500; @@ -157,6 +151,9 @@ export class AnonPieceSelectorService { continue; } + // On Filecoin FEVM the EVM block number IS the chain epoch (one block per + // epoch), so the subgraph's indexedAtBlock is a safe proxy for "now" when + // checking if PDP payment for this piece has already terminated. if (piece.pdpPaymentEndEpoch != null && piece.pdpPaymentEndEpoch <= BigInt(piece.indexedAtBlock)) { continue; } diff --git a/apps/backend/src/retrieval-anon/anon-retrieval.service.spec.ts b/apps/backend/src/retrieval-anon/anon-retrieval.service.spec.ts index 812b8169..b5f17c57 100644 --- a/apps/backend/src/retrieval-anon/anon-retrieval.service.spec.ts +++ b/apps/backend/src/retrieval-anon/anon-retrieval.service.spec.ts @@ -36,13 +36,17 @@ function makeService(opts: { pieceResult: PieceRetrievalResult; fetchPieceImpl?: (signal?: AbortSignal) => Promise; clickhouseEnabled?: boolean; - piece?: AnonPiece; + piece?: AnonPiece | null; carResult?: CarValidationResult; + validateCarImpl?: () => Promise; }): { service: AnonRetrievalService; insertSpy: ReturnType; fetchSpy: ReturnType; validateCarSpy: ReturnType; + metricsRecordStatusSpy: ReturnType; + metricsRecordIpniSpy: ReturnType; + metricsRecordBlockFetchSpy: ReturnType; } { const insertSpy = vi.fn(); const clickhouseService = { @@ -56,7 +60,7 @@ function makeService(opts: { } as unknown as Repository; const anonPieceSelector = { - selectPieceForProvider: vi.fn(async () => opts.piece ?? PIECE), + selectPieceForProvider: vi.fn(async () => (opts.piece === null ? null : (opts.piece ?? PIECE))), } as unknown as AnonPieceSelectorService; const fetchSpy = vi.fn(opts.fetchPieceImpl ?? (async () => opts.pieceResult)); @@ -64,7 +68,7 @@ function makeService(opts: { fetchPiece: fetchSpy, } as unknown as PieceRetrievalService; - const validateCarSpy = vi.fn(async () => opts.carResult); + const validateCarSpy = vi.fn(opts.validateCarImpl ?? 
(async () => opts.carResult)); const carValidationService = { validateCarPiece: validateCarSpy, } as unknown as CarValidationService; @@ -73,16 +77,19 @@ function makeService(opts: { getProviderInfo: vi.fn(() => ({ pdp: { serviceURL: "https://sp.test/" } })), } as unknown as WalletSdkService; + const metricsRecordStatusSpy = vi.fn(); + const metricsRecordIpniSpy = vi.fn(); + const metricsRecordBlockFetchSpy = vi.fn(); const metrics = { observeFirstByteMs: vi.fn(), observeLastByteMs: vi.fn(), observeThroughput: vi.fn(), observeCheckDuration: vi.fn(), - recordStatus: vi.fn(), + recordStatus: metricsRecordStatusSpy, recordHttpResponseCode: vi.fn(), recordCarParseStatus: vi.fn(), - recordIpniStatus: vi.fn(), - recordBlockFetchStatus: vi.fn(), + recordIpniStatus: metricsRecordIpniSpy, + recordBlockFetchStatus: metricsRecordBlockFetchSpy, } as unknown as AnonRetrievalCheckMetrics; const service = new AnonRetrievalService( @@ -95,7 +102,15 @@ function makeService(opts: { spRepository, ); - return { service, insertSpy, fetchSpy, validateCarSpy }; + return { + service, + insertSpy, + fetchSpy, + validateCarSpy, + metricsRecordStatusSpy, + metricsRecordIpniSpy, + metricsRecordBlockFetchSpy, + }; } describe("AnonRetrievalService", () => { @@ -322,6 +337,30 @@ describe("AnonRetrievalService", () => { expect(row.block_fetch_failed_count).toBe(2); }); + it("emits ipni_status='error' (not 'skipped') when CAR validation throws on a successful piece", async () => { + // Distinguishes a real infra outage (e.g. IpniVerificationService down) + // from a piece that legitimately had no IPFS indexing. Without the + // distinction, an outage looks like normal non-IPFS volume in dashboards. 
+ const { service, insertSpy, metricsRecordIpniSpy, metricsRecordBlockFetchSpy } = makeService({ + pieceResult: okPiece(Buffer.from("car-bytes")), + piece: INDEXED_PIECE, + validateCarImpl: async () => { + throw new Error("IpniVerificationService down"); + }, + }); + + await service.performForProvider(SP_ADDRESS); + + expect(metricsRecordIpniSpy).toHaveBeenCalledWith(expect.anything(), "error"); + expect(metricsRecordBlockFetchSpy).toHaveBeenCalledWith(expect.anything(), "error"); + + const [, row] = insertSpy.mock.calls[0] as [string, Record]; + expect(row.ipni_status).toBe("error"); + // Piece-fetch path itself succeeded — only the validation pipeline failed. + expect(row.commp_valid).toBe(true); + expect(row.car_parseable).toBeNull(); + }); + it("emits car_parseable=false with skipped IPNI/block-fetch when bytes don't parse as CAR", async () => { const carResult: CarValidationResult = { carParseable: false, diff --git a/apps/backend/src/retrieval-anon/anon-retrieval.service.ts b/apps/backend/src/retrieval-anon/anon-retrieval.service.ts index 4c6ade8a..418ea8d2 100644 --- a/apps/backend/src/retrieval-anon/anon-retrieval.service.ts +++ b/apps/backend/src/retrieval-anon/anon-retrieval.service.ts @@ -70,6 +70,7 @@ export class AnonRetrievalService { let pieceResult: PieceRetrievalResult | null = null; let carResult: CarValidationResult | null = null; + let validatedCarPiece: boolean = false; try { // 2. Fetch the piece. fetchPiece never throws on abort — it returns a @@ -96,13 +97,24 @@ export class AnonRetrievalService { !signal?.aborted ) { try { + validatedCarPiece = true; carResult = await this.carValidationService.validateCarPiece( pieceResult.pieceBytes, provider, piece.ipfsRootCid, signal, ); + this.metrics.recordCarParseStatus(labels, carResult.carParseable); + this.metrics.recordIpniStatus(labels, ipniStatusFromResult(carResult)); + this.metrics.recordBlockFetchStatus( + labels, + carResult.blockFetchValid === null ? "skipped" : carResult.blockFetchValid ? 
"valid" : "invalid", + ); } catch (error) { + // Validation was attempted on a successful piece retrieval but threw. + this.metrics.recordCarParseStatus(labels, false); + this.metrics.recordIpniStatus(labels, "error"); + this.metrics.recordBlockFetchStatus(labels, "error"); this.logger.warn({ ...logContext, event: "anon_retrieval_car_validation_failed", @@ -112,19 +124,6 @@ export class AnonRetrievalService { error: toStructuredError(error), }); } - } - - // Emit CAR validation metrics - if (carResult) { - this.metrics.recordCarParseStatus(labels, carResult.carParseable); - this.metrics.recordIpniStatus( - labels, - carResult.ipniValid === null ? "skipped" : carResult.ipniValid ? "valid" : "invalid", - ); - this.metrics.recordBlockFetchStatus( - labels, - carResult.blockFetchValid === null ? "skipped" : carResult.blockFetchValid ? "valid" : "invalid", - ); } else if (!pieceResult.success) { // Piece retrieval failed — IPNI and block fetch were skipped this.metrics.recordIpniStatus(labels, "skipped"); @@ -148,8 +147,7 @@ export class AnonRetrievalService { const providerInfo = this.walletSdkService.getProviderInfo(spAddress); const spBaseUrl = providerInfo?.pdp.serviceURL.replace(/\/$/, "") ?? spAddress; const pieceFetchStatus = finalPieceResult.success ? RetrievalStatus.SUCCESS : RetrievalStatus.FAILED; - const ipniStatus = - carResult == null || carResult.ipniValid === null ? "skipped" : carResult.ipniValid ? "valid" : "invalid"; + const ipniStatus = !validatedCarPiece ? "skipped" : carResult ? ipniStatusFromResult(carResult) : "error"; try { this.clickhouseService.insert(ANON_RETRIEVAL_CHECKS_TABLE, { @@ -228,6 +226,11 @@ export class AnonRetrievalService { } } +function ipniStatusFromResult(result: CarValidationResult): "valid" | "invalid" | "skipped" { + if (result.ipniValid === null) return "skipped"; + return result.ipniValid ? 
"valid" : "invalid"; +} + function buildAbortedPlaceholder(pieceCid: string, reason: unknown): PieceRetrievalResult { const message = reason instanceof Error && reason.message ? reason.message : typeof reason === "string" ? reason : "aborted"; diff --git a/apps/backend/src/retrieval-anon/car-validation.service.ts b/apps/backend/src/retrieval-anon/car-validation.service.ts index 789f5ba6..27ec2744 100644 --- a/apps/backend/src/retrieval-anon/car-validation.service.ts +++ b/apps/backend/src/retrieval-anon/car-validation.service.ts @@ -189,7 +189,6 @@ export class CarValidationService { let failedCount = 0; for (const block of sampledBlocks) { - signal?.throwIfAborted(); const cidStr = block.cid.toString(); const blockUrl = `${spBaseUrl}/ipfs/${cidStr}?format=raw`; diff --git a/apps/backend/src/subgraph/subgraph.service.spec.ts b/apps/backend/src/subgraph/subgraph.service.spec.ts index 4dc2cd5e..8703b2c5 100644 --- a/apps/backend/src/subgraph/subgraph.service.spec.ts +++ b/apps/backend/src/subgraph/subgraph.service.spec.ts @@ -730,14 +730,18 @@ describe("SubgraphService", () => { }); describe("sampleAnonPiece", () => { - it("returns null when endpoint is not configured", async () => { + it("throws when endpoint is not configured (distinct from empty result)", async () => { + // Returning null here would make a misconfigured deployment indistinguishable + // from a genuinely empty candidate pool — every anon job would silently + // no-op forever. Fail loudly instead. 
const noEndpointConfig = { get: vi.fn(() => ({ subgraphEndpoint: "" })), } as unknown as ConfigService; const noEndpointService = new SubgraphService(noEndpointConfig); - const piece = await noEndpointService.sampleAnonPiece(defaultSampleParams); - expect(piece).toBeNull(); + await expect(noEndpointService.sampleAnonPiece(defaultSampleParams)).rejects.toThrow( + "No PDP subgraph endpoint configured", + ); expect(fetchMock).not.toHaveBeenCalled(); }); diff --git a/apps/backend/src/subgraph/subgraph.service.ts b/apps/backend/src/subgraph/subgraph.service.ts index 55359179..3067532c 100644 --- a/apps/backend/src/subgraph/subgraph.service.ts +++ b/apps/backend/src/subgraph/subgraph.service.ts @@ -69,87 +69,12 @@ export class SubgraphService { } /** - * Fetch subgraph metadata including the latest indexed block number + * Fetch subgraph metadata including the latest indexed block number. * - * @param attempt - Current retry attempt number (default: 1) - * @returns Subgraph metadata with block number * @throws Error if endpoint is not configured or after MAX_RETRIES attempts */ - async fetchSubgraphMeta(attempt: number = 1): Promise { - if (!this.blockchainConfig.subgraphEndpoint) { - throw new Error("No PDP subgraph endpoint configured"); - } - - try { - await this.enforceRateLimit(); - - const response = await fetch(this.blockchainConfig.subgraphEndpoint, { - method: "POST", - headers: { - "Content-Type": "application/json", - }, - body: JSON.stringify({ - query: Queries.GET_SUBGRAPH_META, - }), - }); - - if (!response.ok) { - throw new Error(`HTTP ${response.status}: ${response.statusText}`); - } - - const result = (await response.json()) as GraphQLResponse; - - if (result.errors) { - const errorMessage = result.errors?.[0]?.message || "Unknown GraphQL error"; - throw new Error(`GraphQL error: ${errorMessage}`); - } - let validated: SubgraphMeta; - try { - validated = validateSubgraphMetaResponse(result.data); - } catch (validationError) { - const errorMessage = 
validationError instanceof Error ? validationError.message : "Unknown validation error"; - throw new ValidationError(`Data validation failed: ${errorMessage}`); - } - - return validated; - } catch (error) { - const errorMessage = error instanceof Error ? error.message : "Unknown error"; - - // No need to retry on validation errors - they indicate schema/data issues, not transient failures - if (error instanceof ValidationError) { - this.logger.error({ - event: "subgraph_meta_validation_failed", - message: "Subgraph data validation failed", - error: toStructuredError(error), - }); - throw error; - } - - // Retry on network/HTTP errors - if (attempt < SubgraphService.MAX_RETRIES) { - const delay = SubgraphService.INITIAL_RETRY_DELAY_MS * (1 << (attempt - 1)); - this.logger.warn({ - event: "subgraph_meta_request_retry", - message: "Subgraph meta request failed. Retrying...", - attempt, - maxRetries: SubgraphService.MAX_RETRIES, - retryDelayMs: delay, - error: toStructuredError(error), - }); - await new Promise((resolve) => setTimeout(resolve, delay)); - return this.fetchSubgraphMeta(attempt + 1); - } - - this.logger.error({ - event: "subgraph_meta_request_failed", - message: "Subgraph meta request failed after maximum retries", - maxRetries: SubgraphService.MAX_RETRIES, - error: toStructuredError(error), - }); - throw new Error( - `Failed to fetch subgraph metadata after ${SubgraphService.MAX_RETRIES} attempts: ${errorMessage}`, - ); - } + async fetchSubgraphMeta(): Promise { + return this.executeQuery("metadata", Queries.GET_SUBGRAPH_META, {}, validateSubgraphMetaResponse); } /** @@ -189,7 +114,13 @@ export class SubgraphService { */ async sampleAnonPiece(params: SampleAnonPieceParams): Promise { if (!this.blockchainConfig.subgraphEndpoint) { - return null; + // Surface misconfiguration distinctly so it does not look like an empty + // candidate pool (which silently no-ops every anon retrieval job). 
+ this.logger.error({ + event: "subgraph_endpoint_not_configured", + message: "Cannot sample anonymous piece — no PDP subgraph endpoint configured", + }); + throw new Error("No PDP subgraph endpoint configured"); } const query = buildSampleAnonPieceQuery(params.pool); From a797c15255549fe57510301da22e6010086f2989 Mon Sep 17 00:00:00 2001 From: Dennis Trautwein Date: Wed, 29 Apr 2026 14:27:11 +0200 Subject: [PATCH 07/28] chore: format code --- apps/backend/src/retrieval-anon/anon-retrieval.service.ts | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/apps/backend/src/retrieval-anon/anon-retrieval.service.ts b/apps/backend/src/retrieval-anon/anon-retrieval.service.ts index 418ea8d2..c11daa19 100644 --- a/apps/backend/src/retrieval-anon/anon-retrieval.service.ts +++ b/apps/backend/src/retrieval-anon/anon-retrieval.service.ts @@ -107,8 +107,8 @@ export class AnonRetrievalService { this.metrics.recordCarParseStatus(labels, carResult.carParseable); this.metrics.recordIpniStatus(labels, ipniStatusFromResult(carResult)); this.metrics.recordBlockFetchStatus( - labels, - carResult.blockFetchValid === null ? "skipped" : carResult.blockFetchValid ? "valid" : "invalid", + labels, + carResult.blockFetchValid === null ? "skipped" : carResult.blockFetchValid ? "valid" : "invalid", ); } catch (error) { // Validation was attempted on a successful piece retrieval but threw. 
From 54cc48719c1fb24222ce63ef7f78216061f9c8bc Mon Sep 17 00:00:00 2001 From: Dennis Trautwein Date: Thu, 30 Apr 2026 09:59:58 +0200 Subject: [PATCH 08/28] fix: biome checks --- apps/backend/src/retrieval-anon/anon-retrieval.service.ts | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/apps/backend/src/retrieval-anon/anon-retrieval.service.ts b/apps/backend/src/retrieval-anon/anon-retrieval.service.ts index c11daa19..5343d59a 100644 --- a/apps/backend/src/retrieval-anon/anon-retrieval.service.ts +++ b/apps/backend/src/retrieval-anon/anon-retrieval.service.ts @@ -173,10 +173,10 @@ export class AnonRetrievalService { throughput_bps: finalPieceResult.throughputBps > 0 ? Math.round(finalPieceResult.throughputBps) : null, commp_valid: finalPieceResult.success ? finalPieceResult.commPValid : null, car_parseable: carResult ? carResult.carParseable : null, - car_block_count: carResult != null && carResult.carParseable ? carResult.blockCount : null, + car_block_count: carResult?.carParseable ? carResult?.blockCount : null, block_fetch_endpoint: carResult?.blockFetchEndpoint ?? null, block_fetch_valid: carResult ? carResult.blockFetchValid : null, - block_fetch_sampled_count: carResult != null && carResult.carParseable ? carResult.sampledCidCount : null, + block_fetch_sampled_count: carResult?.carParseable ? carResult?.sampledCidCount : null, block_fetch_failed_count: carResult?.blockFetchFailedCount ?? null, ipni_status: ipniStatus, ipni_verify_ms: carResult?.ipniVerifyMs ?? 
null, From fcfe569e5c4bac09e96388cd78a90951e493ddfd Mon Sep 17 00:00:00 2001 From: Dennis Trautwein Date: Thu, 30 Apr 2026 10:00:28 +0200 Subject: [PATCH 09/28] fix(ipni): return actual verified/unverified counts --- .../src/ipni/ipni-verification.service.ts | 122 +++++++++++------- 1 file changed, 72 insertions(+), 50 deletions(-) diff --git a/apps/backend/src/ipni/ipni-verification.service.ts b/apps/backend/src/ipni/ipni-verification.service.ts index 3d7d52f9..51fcc8e0 100644 --- a/apps/backend/src/ipni/ipni-verification.service.ts +++ b/apps/backend/src/ipni/ipni-verification.service.ts @@ -3,7 +3,7 @@ import { PDPProvider } from "filecoin-pin"; import { waitForIpniProviderResults } from "filecoin-pin/core/utils"; import { CID } from "multiformats/cid"; import type { StorageProvider } from "../database/entities/storage-provider.entity.js"; -import type { IPNIVerificationResult } from "../deal-addons/strategies/ipni.types.js"; +import type { FailedCID, IPNIVerificationResult } from "../deal-addons/strategies/ipni.types.js"; export type IpniVerificationInput = { rootCid: CID; @@ -44,7 +44,6 @@ export class IpniVerificationService { const expectedProviders = [this.buildExpectedProviderInfo(storageProvider as StorageProviderWithUrl)]; const timeoutSignal = AbortSignal.timeout(timeoutMs); const verificationSignal = signal ?
AbortSignal.any([signal, timeoutSignal]) : timeoutSignal; - let failureReason = "IPNI did not return expected provider results via filecoin-pin"; this.logger.log({ event: "ipni_verification_started", @@ -61,56 +60,69 @@ export class IpniVerificationService { }); const ipniVerificationStartTime = Date.now(); + const cidsToValidate: { cid: CID; isRoot: boolean }[] = [ + { cid: rootCid, isRoot: true }, + ...blockCids.map((cid) => ({ cid, isRoot: false })), + ]; - const ipniValidated = await waitForIpniProviderResults(rootCid, { - childBlocks: blockCids, - maxAttempts, - delayMs, - expectedProviders, - signal: verificationSignal, - }).catch((error) => { + let verified = 0; + const failedCIDs: FailedCID[] = []; + let rootCIDVerified = false; + + // waitForIpniProviderResults is all-or-nothing per call (throws on first failure), + // so we invoke it once per CID to get accurate per-CID verified/unverified counts. + // The shared verificationSignal bounds total wall-clock time across all CIDs. + for (const { cid, isRoot } of cidsToValidate) { if (signal?.aborted) { signal.throwIfAborted(); } + if (verificationSignal.aborted) { - failureReason = `IPNI verification timed out after ${timeoutMs}ms`; - this.logger.error({ - event: "ipni_verification_timed_out", - message: failureReason, - rootCID: rootCid.toString(), + failedCIDs.push({ cid: cid.toString(), reason: `IPNI verification timed out after ${timeoutMs}ms` }); + continue; + } + + try { + await waitForIpniProviderResults(cid, { + maxAttempts, + delayMs, + expectedProviders, + signal: verificationSignal, + }); + verified += 1; + if (isRoot) rootCIDVerified = true; + } catch (error) { + if (signal?.aborted) { + signal.throwIfAborted(); + } + + const reason = verificationSignal.aborted + ? `IPNI verification timed out after ${timeoutMs}ms` + : error instanceof Error + ? 
error.message + : String(error); + + failedCIDs.push({ cid: cid.toString(), reason }); + + this.logger.warn({ + event: "ipni_cid_verification_failed", + message: "IPNI verification failed for CID", + cid: cid.toString(), + isRoot, providerAddress: storageProvider.address, providerId: storageProvider.providerId, providerName: storageProvider.name, serviceUrl: storageProvider.serviceUrl, - blockCIDCount: blockCids.length, - timeoutMs, - pollIntervalMs: delayMs, - maxAttempts, + failureReason: reason, }); - return false; } - const errorMessage = error instanceof Error ? error.message : String(error); - failureReason = errorMessage; - this.logger.error({ - event: "ipni_verification_failed", - message: "IPNI verification failed", - rootCID: rootCid.toString(), - providerAddress: storageProvider.address, - providerId: storageProvider.providerId, - providerName: storageProvider.name, - serviceUrl: storageProvider.serviceUrl, - blockCIDCount: blockCids.length, - timeoutMs, - pollIntervalMs: delayMs, - maxAttempts, - failureReason, - }); - return false; - }); + } const ipniVerificationDurationMs = Date.now() - ipniVerificationStartTime; + const total = cidsToValidate.length; + const unverified = total - verified; - if (ipniValidated) { + if (verified === total) { this.logger.log({ event: "ipni_verification_succeeded", message: "IPNI verification succeeded", @@ -121,22 +133,32 @@ export class IpniVerificationService { verifyDurationMs: ipniVerificationDurationMs, blockCIDCount: blockCids.length, }); + } else { + this.logger.error({ + event: verificationSignal.aborted ? 
"ipni_verification_timed_out" : "ipni_verification_failed", + message: "IPNI verification did not fully succeed", + rootCID: rootCid.toString(), + providerAddress: storageProvider.address, + providerId: storageProvider.providerId, + providerName: storageProvider.name, + serviceUrl: storageProvider.serviceUrl, + blockCIDCount: blockCids.length, + timeoutMs, + pollIntervalMs: delayMs, + maxAttempts, + verified, + unverified, + total, + }); } return { - verified: ipniValidated ? 1 : 0, - unverified: ipniValidated ? 0 : 1, - total: 1, - rootCIDVerified: ipniValidated, + verified: verified, + unverified: unverified, + total: total, + rootCIDVerified: rootCIDVerified, durationMs: ipniVerificationDurationMs, - failedCIDs: ipniValidated - ? [] - : [ - { - cid: rootCid.toString(), - reason: failureReason, - }, - ], + failedCIDs: failedCIDs, verifiedAt: new Date().toISOString(), }; } From fb45bd076600779eac47999e0a8a26d45182c542 Mon Sep 17 00:00:00 2001 From: Dennis Trautwein Date: Thu, 30 Apr 2026 13:17:03 +0200 Subject: [PATCH 10/28] refactor: store anon retrieval data primarily in postgres --- .../src/clickhouse/clickhouse.schema.ts | 14 +- apps/backend/src/database/database.module.ts | 9 +- .../entities/anon-retrieval.entity.ts | 120 +++++++++++ .../1776300000000-CreateAnonRetrievals.ts | 72 +++++++ apps/backend/src/database/types.ts | 12 ++ .../anon-retrieval.service.spec.ts | 203 +++++++++++------- .../retrieval-anon/anon-retrieval.service.ts | 172 +++++++++------ .../retrieval-anon/retrieval-anon.module.ts | 3 +- 8 files changed, 444 insertions(+), 161 deletions(-) create mode 100644 apps/backend/src/database/entities/anon-retrieval.entity.ts create mode 100644 apps/backend/src/database/migrations/1776300000000-CreateAnonRetrievals.ts diff --git a/apps/backend/src/clickhouse/clickhouse.schema.ts b/apps/backend/src/clickhouse/clickhouse.schema.ts index e30f6151..5a9a805e 100644 --- a/apps/backend/src/clickhouse/clickhouse.schema.ts +++ 
b/apps/backend/src/clickhouse/clickhouse.schema.ts @@ -70,17 +70,12 @@ export function buildMigrations(database: string): string[] { sp_id Nullable(UInt64), -- storage provider numeric id sp_name Nullable(String), -- storage provider name - retrieval_id UUID, -- per-event correlation id (log/Prometheus join) + retrieval_id UUID, -- per-event correlation id (matches anon_retrievals.id in Postgres) - piece_cid String, -- piece CID (v2/CommP) sampled from the subgraph - data_set_id UInt64, -- on-chain data set id - piece_id UInt64, -- on-chain piece id within the data set raw_size UInt64, -- raw (unpadded) piece size, bytes with_ipfs_indexing Bool, -- whether the piece advertises IPNI metadata - ipfs_root_cid Nullable(String), -- root CID of the contained DAG; null when not IPFS-indexed service_type LowCardinality(String), -- 'direct_sp' (only mode for anon retrievals today) - retrieval_endpoint String, -- URL probed (e.g. {spBaseUrl}/piece/{pieceCid}) piece_fetch_status LowCardinality(String), -- 'success' | 'failed' — outcome of GET /piece/ (HTTP 2xx AND CommP match). CAR/IPNI/block-fetch outcomes live in their own columns. http_response_code Nullable(UInt16), -- raw HTTP status; null on transport failure @@ -92,17 +87,14 @@ export function buildMigrations(database: string): string[] { commp_valid Nullable(Bool), -- null when retrieval failed before CommP could be hashed car_parseable Nullable(Bool), -- null when CAR validation was skipped (no IPFS indexing or piece fetch failed); true if bytes parsed as a CAR car_block_count Nullable(UInt32), -- total number of blocks observed inside the CAR; null when skipped or unparseable - block_fetch_endpoint Nullable(String), -- gateway base URL probed for block fetch (e.g. 
{spBaseUrl}/ipfs/); null when skipped block_fetch_valid Nullable(Bool), -- null when skipped; true if all sampled blocks fetched + hash-verified block_fetch_sampled_count Nullable(UInt32), -- number of blocks sampled and probed via /ipfs/?format=raw block_fetch_failed_count Nullable(UInt32), -- number of sampled blocks that failed (non-2xx, hash mismatch, unsupported codec, or transport error) - ipni_status LowCardinality(String), -- 'valid' | 'invalid' | 'skipped' (mirrors data_storage_checks naming) + ipni_status LowCardinality(String), -- 'valid' | 'invalid' | 'skipped' | 'error' ipni_verify_ms Nullable(Float64), -- IPNI verification duration; null when skipped ipni_verified_cids_count Nullable(UInt32), -- CIDs confirmed findable via IPNI - ipni_unverified_cids_count Nullable(UInt32), -- CIDs checked but not findable - - error_message Nullable(String) -- failure reason; null on success + ipni_unverified_cids_count Nullable(UInt32) -- CIDs checked but not findable ) ENGINE MergeTree() PRIMARY KEY (probe_location, sp_address, timestamp) PARTITION BY toStartOfMonth(timestamp) diff --git a/apps/backend/src/database/database.module.ts b/apps/backend/src/database/database.module.ts index 9249c3a9..f3f9ed09 100644 --- a/apps/backend/src/database/database.module.ts +++ b/apps/backend/src/database/database.module.ts @@ -7,6 +7,7 @@ import { fileURLToPath } from "url"; import { toStructuredError } from "../common/logging.js"; import { createPinoExitLogger } from "../common/pino.config.js"; import type { IAppConfig, IConfig, IDatabaseConfig } from "../config/app.config.js"; +import { AnonRetrieval } from "./entities/anon-retrieval.entity.js"; import { DataRetentionBaseline } from "./entities/data-retention-baseline.entity.js"; import { Deal } from "./entities/deal.entity.js"; import { JobScheduleState } from "./entities/job-schedule-state.entity.js"; @@ -49,7 +50,7 @@ function toSafeDataSourceContext(options: DataSourceOptions): Record { + await queryRunner.query(` + 
CREATE TYPE anon_retrievals_piece_fetch_status_enum AS ENUM ('success', 'failed') + `); + await queryRunner.query(` + CREATE TYPE anon_retrievals_ipni_status_enum AS ENUM ('valid', 'invalid', 'skipped', 'error') + `); + await queryRunner.query(` + CREATE TYPE anon_retrievals_service_type_enum AS ENUM ('direct_sp', 'ipfs_pin') + `); + + await queryRunner.query(` + CREATE TABLE IF NOT EXISTS anon_retrievals ( + id UUID NOT NULL PRIMARY KEY DEFAULT gen_random_uuid(), + started_at TIMESTAMPTZ NOT NULL, + probe_location VARCHAR NOT NULL, + sp_address VARCHAR NOT NULL, + sp_id BIGINT, + sp_name VARCHAR, + piece_cid VARCHAR NOT NULL, + data_set_id BIGINT NOT NULL, + piece_id BIGINT NOT NULL, + raw_size BIGINT NOT NULL, + with_ipfs_indexing BOOLEAN NOT NULL, + ipfs_root_cid VARCHAR, + service_type anon_retrievals_service_type_enum NOT NULL DEFAULT 'direct_sp', + retrieval_endpoint VARCHAR NOT NULL, + piece_fetch_status anon_retrievals_piece_fetch_status_enum NOT NULL, + http_response_code INTEGER, + first_byte_ms DOUBLE PRECISION, + last_byte_ms DOUBLE PRECISION, + bytes_retrieved BIGINT, + throughput_bps BIGINT, + commp_valid BOOLEAN, + car_parseable BOOLEAN, + car_block_count INTEGER, + block_fetch_endpoint VARCHAR, + block_fetch_valid BOOLEAN, + block_fetch_sampled_count INTEGER, + block_fetch_failed_count INTEGER, + ipni_status anon_retrievals_ipni_status_enum NOT NULL, + ipni_verify_ms DOUBLE PRECISION, + ipni_verified_cids_count INTEGER, + ipni_unverified_cids_count INTEGER, + error_message VARCHAR, + created_at TIMESTAMPTZ NOT NULL DEFAULT NOW() + ) + `); + + await queryRunner.query(` + CREATE INDEX IF NOT EXISTS "IDX_anon_retrievals_sp_address_started_at" + ON anon_retrievals (sp_address, started_at) + `); + + await queryRunner.query(` + CREATE INDEX IF NOT EXISTS "IDX_anon_retrievals_started_at" + ON anon_retrievals (started_at) + `); + } + + public async down(queryRunner: QueryRunner): Promise { + await queryRunner.query(`DROP TABLE IF EXISTS anon_retrievals 
CASCADE`); + await queryRunner.query(`DROP TYPE IF EXISTS anon_retrievals_service_type_enum`); + await queryRunner.query(`DROP TYPE IF EXISTS anon_retrievals_ipni_status_enum`); + await queryRunner.query(`DROP TYPE IF EXISTS anon_retrievals_piece_fetch_status_enum`); + } +} diff --git a/apps/backend/src/database/types.ts b/apps/backend/src/database/types.ts index 46fd5d28..e09d1dd3 100644 --- a/apps/backend/src/database/types.ts +++ b/apps/backend/src/database/types.ts @@ -28,6 +28,18 @@ export enum IpniStatus { FAILED = "failed", } +export enum PieceFetchStatus { + SUCCESS = "success", + FAILED = "failed", +} + +export enum IpniCheckStatus { + VALID = "valid", + INVALID = "invalid", + SKIPPED = "skipped", + ERROR = "error", +} + /** * Metadata schema for deal storage and retrieval */ diff --git a/apps/backend/src/retrieval-anon/anon-retrieval.service.spec.ts b/apps/backend/src/retrieval-anon/anon-retrieval.service.spec.ts index b5f17c57..4f775150 100644 --- a/apps/backend/src/retrieval-anon/anon-retrieval.service.spec.ts +++ b/apps/backend/src/retrieval-anon/anon-retrieval.service.spec.ts @@ -1,8 +1,9 @@ import type { Repository } from "typeorm"; import { beforeEach, describe, expect, it, vi } from "vitest"; import type { ClickhouseService } from "../clickhouse/clickhouse.service.js"; +import type { AnonRetrieval } from "../database/entities/anon-retrieval.entity.js"; import type { StorageProvider } from "../database/entities/storage-provider.entity.js"; -import { RetrievalStatus } from "../database/types.js"; +import { IpniCheckStatus, PieceFetchStatus } from "../database/types.js"; import type { AnonRetrievalCheckMetrics } from "../metrics-prometheus/check-metrics.service.js"; import type { WalletSdkService } from "../wallet-sdk/wallet-sdk.service.js"; import type { AnonPieceSelectorService } from "./anon-piece-selector.service.js"; @@ -26,7 +27,7 @@ const PIECE = { function makeProvider(): StorageProvider { return { address: SP_ADDRESS, - providerId: 7, + 
providerId: 7n, name: "sp-test", isApproved: true, } as unknown as StorageProvider; @@ -39,6 +40,7 @@ function makeService(opts: { piece?: AnonPiece | null; carResult?: CarValidationResult; validateCarImpl?: () => Promise; + saveImpl?: (entity: AnonRetrieval) => Promise; }): { service: AnonRetrievalService; insertSpy: ReturnType; @@ -47,6 +49,7 @@ function makeService(opts: { metricsRecordStatusSpy: ReturnType; metricsRecordIpniSpy: ReturnType; metricsRecordBlockFetchSpy: ReturnType; + saveSpy: ReturnType; } { const insertSpy = vi.fn(); const clickhouseService = { @@ -59,6 +62,11 @@ function makeService(opts: { findOne: vi.fn(async () => makeProvider()), } as unknown as Repository; + const saveSpy = vi.fn(opts.saveImpl ?? (async (entity: AnonRetrieval) => entity)); + const anonRetrievalRepository = { + save: saveSpy, + } as unknown as Repository; + const anonPieceSelector = { selectPieceForProvider: vi.fn(async () => (opts.piece === null ? null : (opts.piece ?? PIECE))), } as unknown as AnonPieceSelectorService; @@ -100,6 +108,7 @@ function makeService(opts: { metrics, clickhouseService, spRepository, + anonRetrievalRepository, ); return { @@ -110,6 +119,7 @@ function makeService(opts: { metricsRecordStatusSpy, metricsRecordIpniSpy, metricsRecordBlockFetchSpy, + saveSpy, }; } @@ -118,7 +128,7 @@ describe("AnonRetrievalService", () => { vi.clearAllMocks(); }); - it("emits a ClickHouse row with partial metrics when fetchPiece returns aborted=true", async () => { + it("persists a Postgres row with partial metrics when fetchPiece returns aborted=true", async () => { const partial: PieceRetrievalResult = { success: false, pieceCid: PIECE.pieceCid, @@ -133,41 +143,59 @@ describe("AnonRetrievalService", () => { aborted: true, }; - const { service, insertSpy } = makeService({ pieceResult: partial }); + const { service, saveSpy, insertSpy } = makeService({ pieceResult: partial }); await service.performForProvider(SP_ADDRESS); + expect(saveSpy).toHaveBeenCalledTimes(1); + 
const entity = saveSpy.mock.calls[0]?.[0] as AnonRetrieval; + expect(entity.pieceFetchStatus).toBe(PieceFetchStatus.FAILED); + expect(entity.bytesRetrieved).toBe(524288n); + expect(entity.firstByteMs).toBe(150); + expect(entity.lastByteMs).toBe(42000); + expect(entity.throughputBps).toBe(12500n); + expect(entity.httpResponseCode).toBe(200); + expect(entity.errorMessage).toContain("Anon retrieval job timeout"); + expect(entity.pieceCid).toBe(PIECE.pieceCid); + expect(entity.spAddress).toBe(SP_ADDRESS); + expect(entity.spId).toBe(7n); + expect(entity.probeLocation).toBe("test-location"); + expect(entity.retrievalEndpoint).toBe(`https://sp.test/piece/${PIECE.pieceCid}`); + expect(typeof entity.id).toBe("string"); + + // CAR/IPNI/block-fetch were never run on a non-IPFS-indexed piece. + expect(entity.carParseable).toBeNull(); + expect(entity.carBlockCount).toBeNull(); + expect(entity.blockFetchEndpoint).toBeNull(); + expect(entity.blockFetchValid).toBeNull(); + expect(entity.blockFetchSampledCount).toBeNull(); + expect(entity.blockFetchFailedCount).toBeNull(); + expect(entity.ipniStatus).toBe(IpniCheckStatus.SKIPPED); + + // ClickHouse mirror is also written. 
expect(insertSpy).toHaveBeenCalledTimes(1); const [table, row] = insertSpy.mock.calls[0] as [string, Record]; expect(table).toBe("anon_retrieval_checks"); - expect(row.piece_fetch_status).toBe(RetrievalStatus.FAILED); + expect(row.retrieval_id).toBe(entity.id); + expect(row.piece_fetch_status).toBe(PieceFetchStatus.FAILED); expect(row.bytes_retrieved).toBe(524288); expect(row.first_byte_ms).toBe(150); expect(row.last_byte_ms).toBe(42000); expect(row.throughput_bps).toBe(12500); expect(row.http_response_code).toBe(200); - expect(row.error_message).toContain("Anon retrieval job timeout"); - expect(row.piece_cid).toBe(PIECE.pieceCid); - expect(row.sp_address).toBe(SP_ADDRESS); - expect(row.sp_id).toBe(7); - expect(row.probe_location).toBe("test-location"); - expect(typeof row.retrieval_id).toBe("string"); - - // CAR/IPNI/block-fetch were never run on a non-IPFS-indexed piece — every - // dimension column should explicitly say "skipped" (ipni_status) or null. - expect(row.car_parseable).toBeNull(); - expect(row.car_block_count).toBeNull(); - expect(row.block_fetch_endpoint).toBeNull(); - expect(row.block_fetch_valid).toBeNull(); - expect(row.block_fetch_sampled_count).toBeNull(); - expect(row.block_fetch_failed_count).toBeNull(); - expect(row.ipni_status).toBe("skipped"); - expect(row.ipni_verify_ms).toBeNull(); - expect(row.ipni_verified_cids_count).toBeNull(); - expect(row.ipni_unverified_cids_count).toBeNull(); + expect(row.ipni_status).toBe(IpniCheckStatus.SKIPPED); + + // Trimmed CH columns must NOT appear (they live only in Postgres). 
+ expect(row).not.toHaveProperty("piece_cid"); + expect(row).not.toHaveProperty("data_set_id"); + expect(row).not.toHaveProperty("piece_id"); + expect(row).not.toHaveProperty("ipfs_root_cid"); + expect(row).not.toHaveProperty("retrieval_endpoint"); + expect(row).not.toHaveProperty("block_fetch_endpoint"); + expect(row).not.toHaveProperty("error_message"); }); - it("still emits a row when the signal aborts before fetchPiece runs", async () => { + it("still persists when the signal aborts before fetchPiece runs", async () => { const ac = new AbortController(); ac.abort(new Error("Anon retrieval job timeout (60s) for sp1")); @@ -183,20 +211,21 @@ describe("AnonRetrievalService", () => { commPValid: false, }; - const { service, insertSpy, fetchSpy } = makeService({ pieceResult: never }); + const { service, saveSpy, insertSpy, fetchSpy } = makeService({ pieceResult: never }); await service.performForProvider(SP_ADDRESS, ac.signal); expect(fetchSpy).not.toHaveBeenCalled(); + expect(saveSpy).toHaveBeenCalledTimes(1); + const entity = saveSpy.mock.calls[0]?.[0] as AnonRetrieval; + expect(entity.pieceFetchStatus).toBe(PieceFetchStatus.FAILED); + expect(entity.errorMessage).toContain("Anon retrieval job timeout"); + expect(entity.bytesRetrieved).toBeNull(); + expect(entity.firstByteMs).toBeNull(); expect(insertSpy).toHaveBeenCalledTimes(1); - const [, row] = insertSpy.mock.calls[0] as [string, Record]; - expect(row.piece_fetch_status).toBe(RetrievalStatus.FAILED); - expect(row.error_message).toContain("Anon retrieval job timeout"); - expect(row.bytes_retrieved).toBeNull(); - expect(row.first_byte_ms).toBeNull(); }); - it("still emits a row when fetchPiece throws unexpectedly", async () => { + it("still persists when fetchPiece throws unexpectedly", async () => { const never: PieceRetrievalResult = { success: false, pieceCid: PIECE.pieceCid, @@ -209,7 +238,7 @@ describe("AnonRetrievalService", () => { commPValid: false, }; - const { service, insertSpy } = makeService({ + 
const { service, saveSpy } = makeService({ pieceResult: never, fetchPieceImpl: async () => { throw new Error("network down"); @@ -218,12 +247,12 @@ describe("AnonRetrievalService", () => { await expect(service.performForProvider(SP_ADDRESS)).rejects.toThrow("network down"); - expect(insertSpy).toHaveBeenCalledTimes(1); - const [, row] = insertSpy.mock.calls[0] as [string, Record]; - expect(row.piece_fetch_status).toBe(RetrievalStatus.FAILED); + expect(saveSpy).toHaveBeenCalledTimes(1); + const entity = saveSpy.mock.calls[0]?.[0] as AnonRetrieval; + expect(entity.pieceFetchStatus).toBe(PieceFetchStatus.FAILED); }); - it("skips ClickHouse insert when ClickHouse is disabled", async () => { + it("does not throw when Postgres save fails and still attempts the CH insert", async () => { const ok: PieceRetrievalResult = { success: true, pieceCid: PIECE.pieceCid, @@ -236,11 +265,20 @@ describe("AnonRetrievalService", () => { commPValid: true, }; - const { service, insertSpy } = makeService({ pieceResult: ok, clickhouseEnabled: false }); + const { service, saveSpy, insertSpy } = makeService({ + pieceResult: ok, + saveImpl: async () => { + throw new Error("connection refused"); + }, + }); - await service.performForProvider(SP_ADDRESS); + await expect(service.performForProvider(SP_ADDRESS)).resolves.toBeUndefined(); - expect(insertSpy).not.toHaveBeenCalled(); + expect(saveSpy).toHaveBeenCalledTimes(1); + // CH still gets the row keyed by the client-side uuid. 
+ expect(insertSpy).toHaveBeenCalledTimes(1); + const [, row] = insertSpy.mock.calls[0] as [string, Record]; + expect(typeof row.retrieval_id).toBe("string"); }); describe("with IPFS indexing", () => { @@ -264,7 +302,7 @@ describe("AnonRetrievalService", () => { }; } - it("emits populated CAR/IPNI/block-fetch columns when validation fully succeeds", async () => { + it("populates CAR/IPNI/block-fetch columns when validation fully succeeds", async () => { const carResult: CarValidationResult = { carParseable: true, blockCount: 42, @@ -278,7 +316,7 @@ describe("AnonRetrievalService", () => { blockFetchEndpoint: "https://sp.test/ipfs/", }; - const { service, insertSpy, validateCarSpy } = makeService({ + const { service, saveSpy, insertSpy, validateCarSpy } = makeService({ pieceResult: okPiece(Buffer.from("car-bytes")), piece: INDEXED_PIECE, carResult, @@ -287,19 +325,24 @@ describe("AnonRetrievalService", () => { await service.performForProvider(SP_ADDRESS); expect(validateCarSpy).toHaveBeenCalledTimes(1); + const entity = saveSpy.mock.calls[0]?.[0] as AnonRetrieval; + expect(entity.pieceFetchStatus).toBe(PieceFetchStatus.SUCCESS); + expect(entity.commpValid).toBe(true); + expect(entity.carParseable).toBe(true); + expect(entity.carBlockCount).toBe(42); + expect(entity.blockFetchEndpoint).toBe("https://sp.test/ipfs/"); + expect(entity.blockFetchValid).toBe(true); + expect(entity.blockFetchSampledCount).toBe(5); + expect(entity.blockFetchFailedCount).toBe(0); + expect(entity.ipniStatus).toBe(IpniCheckStatus.VALID); + expect(entity.ipniVerifyMs).toBe(137); + expect(entity.ipniVerifiedCidsCount).toBe(6); + expect(entity.ipniUnverifiedCidsCount).toBe(0); + const [, row] = insertSpy.mock.calls[0] as [string, Record]; - expect(row.piece_fetch_status).toBe(RetrievalStatus.SUCCESS); - expect(row.commp_valid).toBe(true); + expect(row.piece_fetch_status).toBe(PieceFetchStatus.SUCCESS); expect(row.car_parseable).toBe(true); - expect(row.car_block_count).toBe(42); - 
expect(row.block_fetch_endpoint).toBe("https://sp.test/ipfs/"); - expect(row.block_fetch_valid).toBe(true); - expect(row.block_fetch_sampled_count).toBe(5); - expect(row.block_fetch_failed_count).toBe(0); - expect(row.ipni_status).toBe("valid"); - expect(row.ipni_verify_ms).toBe(137); - expect(row.ipni_verified_cids_count).toBe(6); - expect(row.ipni_unverified_cids_count).toBe(0); + expect(row.ipni_status).toBe(IpniCheckStatus.VALID); }); it("distinguishes IPNI invalid from block-fetch failures with explicit counts", async () => { @@ -316,7 +359,7 @@ describe("AnonRetrievalService", () => { blockFetchEndpoint: "https://sp.test/ipfs/", }; - const { service, insertSpy } = makeService({ + const { service, saveSpy } = makeService({ pieceResult: okPiece(Buffer.from("car-bytes")), piece: INDEXED_PIECE, carResult, @@ -324,24 +367,24 @@ describe("AnonRetrievalService", () => { await service.performForProvider(SP_ADDRESS); - const [, row] = insertSpy.mock.calls[0] as [string, Record]; + const entity = saveSpy.mock.calls[0]?.[0] as AnonRetrieval; // The piece-fetch path still succeeded — failures are surfaced as // independent dimensions, not folded into piece_fetch_status. 
- expect(row.piece_fetch_status).toBe(RetrievalStatus.SUCCESS); - expect(row.car_parseable).toBe(true); - expect(row.ipni_status).toBe("invalid"); - expect(row.ipni_verified_cids_count).toBe(0); - expect(row.ipni_unverified_cids_count).toBe(6); - expect(row.block_fetch_valid).toBe(false); - expect(row.block_fetch_sampled_count).toBe(5); - expect(row.block_fetch_failed_count).toBe(2); + expect(entity.pieceFetchStatus).toBe(PieceFetchStatus.SUCCESS); + expect(entity.carParseable).toBe(true); + expect(entity.ipniStatus).toBe(IpniCheckStatus.INVALID); + expect(entity.ipniVerifiedCidsCount).toBe(0); + expect(entity.ipniUnverifiedCidsCount).toBe(6); + expect(entity.blockFetchValid).toBe(false); + expect(entity.blockFetchSampledCount).toBe(5); + expect(entity.blockFetchFailedCount).toBe(2); }); it("emits ipni_status='error' (not 'skipped') when CAR validation throws on a successful piece", async () => { // Distinguishes a real infra outage (e.g. IpniVerificationService down) // from a piece that legitimately had no IPFS indexing. Without the // distinction, an outage looks like normal non-IPFS volume in dashboards. - const { service, insertSpy, metricsRecordIpniSpy, metricsRecordBlockFetchSpy } = makeService({ + const { service, saveSpy, metricsRecordIpniSpy, metricsRecordBlockFetchSpy } = makeService({ pieceResult: okPiece(Buffer.from("car-bytes")), piece: INDEXED_PIECE, validateCarImpl: async () => { @@ -354,11 +397,11 @@ describe("AnonRetrievalService", () => { expect(metricsRecordIpniSpy).toHaveBeenCalledWith(expect.anything(), "error"); expect(metricsRecordBlockFetchSpy).toHaveBeenCalledWith(expect.anything(), "error"); - const [, row] = insertSpy.mock.calls[0] as [string, Record]; - expect(row.ipni_status).toBe("error"); + const entity = saveSpy.mock.calls[0]?.[0] as AnonRetrieval; + expect(entity.ipniStatus).toBe(IpniCheckStatus.ERROR); // Piece-fetch path itself succeeded — only the validation pipeline failed. 
- expect(row.commp_valid).toBe(true); - expect(row.car_parseable).toBeNull(); + expect(entity.commpValid).toBe(true); + expect(entity.carParseable).toBeNull(); }); it("emits car_parseable=false with skipped IPNI/block-fetch when bytes don't parse as CAR", async () => { @@ -375,7 +418,7 @@ describe("AnonRetrievalService", () => { blockFetchEndpoint: null, }; - const { service, insertSpy } = makeService({ + const { service, saveSpy } = makeService({ pieceResult: okPiece(Buffer.from("not-a-car")), piece: INDEXED_PIECE, carResult, @@ -383,19 +426,19 @@ describe("AnonRetrievalService", () => { await service.performForProvider(SP_ADDRESS); - const [, row] = insertSpy.mock.calls[0] as [string, Record]; - expect(row.car_parseable).toBe(false); + const entity = saveSpy.mock.calls[0]?.[0] as AnonRetrieval; + expect(entity.carParseable).toBe(false); // car_block_count and block_fetch_sampled_count are gated on carParseable // so an unparseable CAR doesn't emit a misleading 0. - expect(row.car_block_count).toBeNull(); - expect(row.block_fetch_sampled_count).toBeNull(); - expect(row.block_fetch_endpoint).toBeNull(); - expect(row.block_fetch_valid).toBeNull(); - expect(row.block_fetch_failed_count).toBeNull(); - expect(row.ipni_status).toBe("skipped"); - expect(row.ipni_verify_ms).toBeNull(); - expect(row.ipni_verified_cids_count).toBeNull(); - expect(row.ipni_unverified_cids_count).toBeNull(); + expect(entity.carBlockCount).toBeNull(); + expect(entity.blockFetchSampledCount).toBeNull(); + expect(entity.blockFetchEndpoint).toBeNull(); + expect(entity.blockFetchValid).toBeNull(); + expect(entity.blockFetchFailedCount).toBeNull(); + expect(entity.ipniStatus).toBe(IpniCheckStatus.SKIPPED); + expect(entity.ipniVerifyMs).toBeNull(); + expect(entity.ipniVerifiedCidsCount).toBeNull(); + expect(entity.ipniUnverifiedCidsCount).toBeNull(); }); }); }); diff --git a/apps/backend/src/retrieval-anon/anon-retrieval.service.ts b/apps/backend/src/retrieval-anon/anon-retrieval.service.ts index 
5343d59a..d8298776 100644 --- a/apps/backend/src/retrieval-anon/anon-retrieval.service.ts +++ b/apps/backend/src/retrieval-anon/anon-retrieval.service.ts @@ -4,8 +4,9 @@ import { InjectRepository } from "@nestjs/typeorm"; import type { Repository } from "typeorm"; import { ClickhouseService } from "../clickhouse/clickhouse.service.js"; import { type ProviderJobContext, toStructuredError } from "../common/logging.js"; +import { AnonRetrieval } from "../database/entities/anon-retrieval.entity.js"; import { StorageProvider } from "../database/entities/storage-provider.entity.js"; -import { RetrievalStatus, ServiceType } from "../database/types.js"; +import { IpniCheckStatus, PieceFetchStatus, ServiceType } from "../database/types.js"; import { buildCheckMetricLabels } from "../metrics-prometheus/check-metric-labels.js"; import { AnonRetrievalCheckMetrics } from "../metrics-prometheus/check-metrics.service.js"; import { WalletSdkService } from "../wallet-sdk/wallet-sdk.service.js"; @@ -29,6 +30,8 @@ export class AnonRetrievalService { private readonly clickhouseService: ClickhouseService, @InjectRepository(StorageProvider) private readonly spRepository: Repository, + @InjectRepository(AnonRetrieval) + private readonly anonRetrievalRepository: Repository, ) {} async performForProvider(spAddress: string, signal?: AbortSignal, logContext?: ProviderJobContext): Promise { @@ -137,80 +140,75 @@ export class AnonRetrievalService { pieceResult.success ? "success" : pieceResult.aborted ? "failure.aborted" : "failure.http", ); } finally { - // Always emit a ClickHouse row — even on abort or unexpected error — so - // we never lose the evidence (ttfb, bytes, response code) we already - // collected. + // Always persist a row — even on abort or unexpected error — so we never + // lose the evidence (ttfb, bytes, response code) we already collected. const finalPieceResult = pieceResult ?? 
buildAbortedPlaceholder(piece.pieceCid, signal?.reason); - const retrievalId = randomUUID(); + const providerInfo = this.walletSdkService.getProviderInfo(spAddress); + const spBaseUrl = providerInfo?.pdp.serviceURL.replace(/\/$/, "") ?? spAddress; + const retrievalEndpoint = `${spBaseUrl}/piece/${piece.pieceCid}`; + const pieceFetchStatus = finalPieceResult.success ? PieceFetchStatus.SUCCESS : PieceFetchStatus.FAILED; + const ipniStatus: IpniCheckStatus = !validatedCarPiece + ? IpniCheckStatus.SKIPPED + : carResult + ? ipniStatusFromResult(carResult) + : IpniCheckStatus.ERROR; - if (this.clickhouseService.enabled) { - const providerInfo = this.walletSdkService.getProviderInfo(spAddress); - const spBaseUrl = providerInfo?.pdp.serviceURL.replace(/\/$/, "") ?? spAddress; - const pieceFetchStatus = finalPieceResult.success ? RetrievalStatus.SUCCESS : RetrievalStatus.FAILED; - const ipniStatus = !validatedCarPiece ? "skipped" : carResult ? ipniStatusFromResult(carResult) : "error"; + const entity: AnonRetrieval = { + id: randomUUID(), + createdAt: startedAt, + startedAt, + probeLocation: this.clickhouseService.probeLocation, + spAddress, + spId: provider?.providerId ?? null, + spName: provider?.name ?? null, + pieceCid: piece.pieceCid, + dataSetId: BigInt(piece.dataSetId), + pieceId: BigInt(piece.pieceId), + rawSize: BigInt(piece.rawSize), + withIpfsIndexing: piece.withIPFSIndexing, + ipfsRootCid: piece.ipfsRootCid, + serviceType: ServiceType.DIRECT_SP, + retrievalEndpoint, + pieceFetchStatus, + httpResponseCode: finalPieceResult.statusCode > 0 ? finalPieceResult.statusCode : null, + firstByteMs: finalPieceResult.ttfbMs > 0 ? finalPieceResult.ttfbMs : null, + lastByteMs: finalPieceResult.latencyMs > 0 ? finalPieceResult.latencyMs : null, + bytesRetrieved: finalPieceResult.bytesReceived > 0 ? BigInt(finalPieceResult.bytesReceived) : null, + throughputBps: finalPieceResult.throughputBps > 0 ? 
BigInt(Math.round(finalPieceResult.throughputBps)) : null, + commpValid: finalPieceResult.success ? finalPieceResult.commPValid : null, + carParseable: carResult ? carResult.carParseable : null, + carBlockCount: carResult?.carParseable ? carResult.blockCount : null, + blockFetchEndpoint: carResult?.blockFetchEndpoint ?? null, + blockFetchValid: carResult ? carResult.blockFetchValid : null, + blockFetchSampledCount: carResult?.carParseable ? carResult.sampledCidCount : null, + blockFetchFailedCount: carResult?.blockFetchFailedCount ?? null, + ipniStatus, + ipniVerifyMs: carResult?.ipniVerifyMs ?? null, + ipniVerifiedCidsCount: carResult?.ipniVerifiedCidsCount ?? null, + ipniUnverifiedCidsCount: carResult?.ipniUnverifiedCidsCount ?? null, + errorMessage: finalPieceResult.errorMessage ?? null, + }; - try { - this.clickhouseService.insert(ANON_RETRIEVAL_CHECKS_TABLE, { - timestamp: startedAt.getTime(), - probe_location: this.clickhouseService.probeLocation, - sp_address: spAddress, - sp_id: provider?.providerId != null ? Number(provider.providerId) : null, - sp_name: provider?.name ?? null, - retrieval_id: retrievalId, - piece_cid: piece.pieceCid, - data_set_id: piece.dataSetId, - piece_id: piece.pieceId, - raw_size: piece.rawSize, - with_ipfs_indexing: piece.withIPFSIndexing, - ipfs_root_cid: piece.ipfsRootCid, - service_type: ServiceType.DIRECT_SP, - retrieval_endpoint: `${spBaseUrl}/piece/${piece.pieceCid}`, - piece_fetch_status: pieceFetchStatus, - http_response_code: finalPieceResult.statusCode > 0 ? finalPieceResult.statusCode : null, - first_byte_ms: finalPieceResult.ttfbMs > 0 ? finalPieceResult.ttfbMs : null, - last_byte_ms: finalPieceResult.latencyMs > 0 ? finalPieceResult.latencyMs : null, - bytes_retrieved: finalPieceResult.bytesReceived > 0 ? finalPieceResult.bytesReceived : null, - throughput_bps: finalPieceResult.throughputBps > 0 ? Math.round(finalPieceResult.throughputBps) : null, - commp_valid: finalPieceResult.success ? 
finalPieceResult.commPValid : null, - car_parseable: carResult ? carResult.carParseable : null, - car_block_count: carResult?.carParseable ? carResult?.blockCount : null, - block_fetch_endpoint: carResult?.blockFetchEndpoint ?? null, - block_fetch_valid: carResult ? carResult.blockFetchValid : null, - block_fetch_sampled_count: carResult?.carParseable ? carResult?.sampledCidCount : null, - block_fetch_failed_count: carResult?.blockFetchFailedCount ?? null, - ipni_status: ipniStatus, - ipni_verify_ms: carResult?.ipniVerifyMs ?? null, - ipni_verified_cids_count: carResult?.ipniVerifiedCidsCount ?? null, - ipni_unverified_cids_count: carResult?.ipniUnverifiedCidsCount ?? null, - error_message: finalPieceResult.errorMessage ?? null, - }); - } catch (error) { - // ClickhouseService.insert is buffered/non-throwing in normal operation, but - // guard against unexpected runtime errors so we don't break the probe cycle. - this.logger.warn({ - ...logContext, - event: "anon_retrieval_clickhouse_insert_failed", - message: "Failed to enqueue anonymous retrieval row to ClickHouse", - pieceCid: piece.pieceCid, - spAddress, - error: toStructuredError(error), - }); - } - } else { - this.logger.debug({ + try { + await this.anonRetrievalRepository.save(entity); + } catch (error) { + this.logger.warn({ ...logContext, - event: "anon_retrieval_clickhouse_disabled", - message: "ClickHouse disabled — anon retrieval row not emitted", + event: "anon_retrieval_save_failed", + message: "Failed to persist anonymous retrieval row to Postgres", pieceCid: piece.pieceCid, spAddress, + error: toStructuredError(error), }); } + this.clickhouseService.insert(ANON_RETRIEVAL_CHECKS_TABLE, toClickhouseRow(entity)); + this.logger.log({ ...logContext, event: "anon_retrieval_completed", message: "Anonymous retrieval test completed", - retrievalId, + retrievalId: entity.id, pieceCid: piece.pieceCid, spAddress, success: finalPieceResult.success, @@ -226,9 +224,53 @@ export class AnonRetrievalService { } } 
-function ipniStatusFromResult(result: CarValidationResult): "valid" | "invalid" | "skipped" { - if (result.ipniValid === null) return "skipped"; - return result.ipniValid ? "valid" : "invalid"; +function ipniStatusFromResult(result: CarValidationResult): IpniCheckStatus { + switch (result.ipniValid) { + case null: + return IpniCheckStatus.SKIPPED; + case true: + return IpniCheckStatus.VALID; + case false: + return IpniCheckStatus.INVALID; + default: + throw new Error(`Unexpected IPNI validation result: ${result.ipniValid}`); + } +} + +/** + * Project an AnonRetrieval entity to the chartable subset stored in ClickHouse. + * High-cardinality identifiers (piece_cid, data_set_id, piece_id, ipfs_root_cid), + * URLs (retrieval_endpoint, block_fetch_endpoint), and free-text columns + * (error_message) are intentionally dropped — they live only in Postgres. + */ +function toClickhouseRow(entity: AnonRetrieval): Record { + return { + timestamp: entity.startedAt.getTime(), + probe_location: entity.probeLocation, + sp_address: entity.spAddress, + sp_id: entity.spId != null ? Number(entity.spId) : null, + sp_name: entity.spName, + retrieval_id: entity.id, + raw_size: Number(entity.rawSize), + with_ipfs_indexing: entity.withIpfsIndexing, + service_type: entity.serviceType, + piece_fetch_status: entity.pieceFetchStatus, + http_response_code: entity.httpResponseCode, + first_byte_ms: entity.firstByteMs, + last_byte_ms: entity.lastByteMs, + bytes_retrieved: entity.bytesRetrieved != null ? Number(entity.bytesRetrieved) : null, + throughput_bps: entity.throughputBps != null ? 
Number(entity.throughputBps) : null, + commp_valid: entity.commpValid, + car_parseable: entity.carParseable, + car_block_count: entity.carBlockCount, + block_fetch_valid: entity.blockFetchValid, + block_fetch_sampled_count: entity.blockFetchSampledCount, + block_fetch_failed_count: entity.blockFetchFailedCount, + ipni_status: entity.ipniStatus, + ipni_verify_ms: entity.ipniVerifyMs, + ipni_verified_cids_count: entity.ipniVerifiedCidsCount, + ipni_unverified_cids_count: entity.ipniUnverifiedCidsCount, + }; } function buildAbortedPlaceholder(pieceCid: string, reason: unknown): PieceRetrievalResult { diff --git a/apps/backend/src/retrieval-anon/retrieval-anon.module.ts b/apps/backend/src/retrieval-anon/retrieval-anon.module.ts index c05dcb5f..4e9e38df 100644 --- a/apps/backend/src/retrieval-anon/retrieval-anon.module.ts +++ b/apps/backend/src/retrieval-anon/retrieval-anon.module.ts @@ -1,6 +1,7 @@ import { Module } from "@nestjs/common"; import { ConfigModule } from "@nestjs/config"; import { TypeOrmModule } from "@nestjs/typeorm"; +import { AnonRetrieval } from "../database/entities/anon-retrieval.entity.js"; import { StorageProvider } from "../database/entities/storage-provider.entity.js"; import { HttpClientModule } from "../http-client/http-client.module.js"; import { IpniModule } from "../ipni/ipni.module.js"; @@ -14,7 +15,7 @@ import { PieceRetrievalService } from "./piece-retrieval.service.js"; @Module({ imports: [ ConfigModule, - TypeOrmModule.forFeature([StorageProvider]), + TypeOrmModule.forFeature([AnonRetrieval, StorageProvider]), SubgraphModule, WalletSdkModule, HttpClientModule, From 92c40a85fb4798aa74ad03d8490ea4f1e0e62899 Mon Sep 17 00:00:00 2001 From: Dennis Trautwein Date: Mon, 4 May 2026 08:12:26 +0200 Subject: [PATCH 11/28] Revert "refactor: store anon retrieval data primarily in postgres" This reverts commit 6824f752b106f8bbd8e443aa2f74f680a8afe4c1. 
--- .../src/clickhouse/clickhouse.schema.ts | 14 +- apps/backend/src/database/database.module.ts | 9 +- .../entities/anon-retrieval.entity.ts | 120 ----------- .../1776300000000-CreateAnonRetrievals.ts | 72 ------- apps/backend/src/database/types.ts | 12 -- .../anon-retrieval.service.spec.ts | 203 +++++++----------- .../retrieval-anon/anon-retrieval.service.ts | 172 ++++++--------- .../retrieval-anon/retrieval-anon.module.ts | 3 +- 8 files changed, 161 insertions(+), 444 deletions(-) delete mode 100644 apps/backend/src/database/entities/anon-retrieval.entity.ts delete mode 100644 apps/backend/src/database/migrations/1776300000000-CreateAnonRetrievals.ts diff --git a/apps/backend/src/clickhouse/clickhouse.schema.ts b/apps/backend/src/clickhouse/clickhouse.schema.ts index 5a9a805e..e30f6151 100644 --- a/apps/backend/src/clickhouse/clickhouse.schema.ts +++ b/apps/backend/src/clickhouse/clickhouse.schema.ts @@ -70,12 +70,17 @@ export function buildMigrations(database: string): string[] { sp_id Nullable(UInt64), -- storage provider numeric id sp_name Nullable(String), -- storage provider name - retrieval_id UUID, -- per-event correlation id (matches anon_retrievals.id in Postgres) + retrieval_id UUID, -- per-event correlation id (log/Prometheus join) + piece_cid String, -- piece CID (v2/CommP) sampled from the subgraph + data_set_id UInt64, -- on-chain data set id + piece_id UInt64, -- on-chain piece id within the data set raw_size UInt64, -- raw (unpadded) piece size, bytes with_ipfs_indexing Bool, -- whether the piece advertises IPNI metadata + ipfs_root_cid Nullable(String), -- root CID of the contained DAG; null when not IPFS-indexed service_type LowCardinality(String), -- 'direct_sp' (only mode for anon retrievals today) + retrieval_endpoint String, -- URL probed (e.g. {spBaseUrl}/piece/{pieceCid}) piece_fetch_status LowCardinality(String), -- 'success' | 'failed' — outcome of GET /piece/ (HTTP 2xx AND CommP match). 
CAR/IPNI/block-fetch outcomes live in their own columns. http_response_code Nullable(UInt16), -- raw HTTP status; null on transport failure @@ -87,14 +92,17 @@ export function buildMigrations(database: string): string[] { commp_valid Nullable(Bool), -- null when retrieval failed before CommP could be hashed car_parseable Nullable(Bool), -- null when CAR validation was skipped (no IPFS indexing or piece fetch failed); true if bytes parsed as a CAR car_block_count Nullable(UInt32), -- total number of blocks observed inside the CAR; null when skipped or unparseable + block_fetch_endpoint Nullable(String), -- gateway base URL probed for block fetch (e.g. {spBaseUrl}/ipfs/); null when skipped block_fetch_valid Nullable(Bool), -- null when skipped; true if all sampled blocks fetched + hash-verified block_fetch_sampled_count Nullable(UInt32), -- number of blocks sampled and probed via /ipfs/?format=raw block_fetch_failed_count Nullable(UInt32), -- number of sampled blocks that failed (non-2xx, hash mismatch, unsupported codec, or transport error) - ipni_status LowCardinality(String), -- 'valid' | 'invalid' | 'skipped' | 'error' + ipni_status LowCardinality(String), -- 'valid' | 'invalid' | 'skipped' (mirrors data_storage_checks naming) ipni_verify_ms Nullable(Float64), -- IPNI verification duration; null when skipped ipni_verified_cids_count Nullable(UInt32), -- CIDs confirmed findable via IPNI - ipni_unverified_cids_count Nullable(UInt32) -- CIDs checked but not findable + ipni_unverified_cids_count Nullable(UInt32), -- CIDs checked but not findable + + error_message Nullable(String) -- failure reason; null on success ) ENGINE MergeTree() PRIMARY KEY (probe_location, sp_address, timestamp) PARTITION BY toStartOfMonth(timestamp) diff --git a/apps/backend/src/database/database.module.ts b/apps/backend/src/database/database.module.ts index f3f9ed09..9249c3a9 100644 --- a/apps/backend/src/database/database.module.ts +++ b/apps/backend/src/database/database.module.ts @@ -7,7 
+7,6 @@ import { fileURLToPath } from "url"; import { toStructuredError } from "../common/logging.js"; import { createPinoExitLogger } from "../common/pino.config.js"; import type { IAppConfig, IConfig, IDatabaseConfig } from "../config/app.config.js"; -import { AnonRetrieval } from "./entities/anon-retrieval.entity.js"; import { DataRetentionBaseline } from "./entities/data-retention-baseline.entity.js"; import { Deal } from "./entities/deal.entity.js"; import { JobScheduleState } from "./entities/job-schedule-state.entity.js"; @@ -50,7 +49,7 @@ function toSafeDataSourceContext(options: DataSourceOptions): Record { - await queryRunner.query(` - CREATE TYPE anon_retrievals_piece_fetch_status_enum AS ENUM ('success', 'failed') - `); - await queryRunner.query(` - CREATE TYPE anon_retrievals_ipni_status_enum AS ENUM ('valid', 'invalid', 'skipped', 'error') - `); - await queryRunner.query(` - CREATE TYPE anon_retrievals_service_type_enum AS ENUM ('direct_sp', 'ipfs_pin') - `); - - await queryRunner.query(` - CREATE TABLE IF NOT EXISTS anon_retrievals ( - id UUID NOT NULL PRIMARY KEY DEFAULT gen_random_uuid(), - started_at TIMESTAMPTZ NOT NULL, - probe_location VARCHAR NOT NULL, - sp_address VARCHAR NOT NULL, - sp_id BIGINT, - sp_name VARCHAR, - piece_cid VARCHAR NOT NULL, - data_set_id BIGINT NOT NULL, - piece_id BIGINT NOT NULL, - raw_size BIGINT NOT NULL, - with_ipfs_indexing BOOLEAN NOT NULL, - ipfs_root_cid VARCHAR, - service_type anon_retrievals_service_type_enum NOT NULL DEFAULT 'direct_sp', - retrieval_endpoint VARCHAR NOT NULL, - piece_fetch_status anon_retrievals_piece_fetch_status_enum NOT NULL, - http_response_code INTEGER, - first_byte_ms DOUBLE PRECISION, - last_byte_ms DOUBLE PRECISION, - bytes_retrieved BIGINT, - throughput_bps BIGINT, - commp_valid BOOLEAN, - car_parseable BOOLEAN, - car_block_count INTEGER, - block_fetch_endpoint VARCHAR, - block_fetch_valid BOOLEAN, - block_fetch_sampled_count INTEGER, - block_fetch_failed_count INTEGER, - ipni_status 
anon_retrievals_ipni_status_enum NOT NULL, - ipni_verify_ms DOUBLE PRECISION, - ipni_verified_cids_count INTEGER, - ipni_unverified_cids_count INTEGER, - error_message VARCHAR, - created_at TIMESTAMPTZ NOT NULL DEFAULT NOW() - ) - `); - - await queryRunner.query(` - CREATE INDEX IF NOT EXISTS "IDX_anon_retrievals_sp_address_started_at" - ON anon_retrievals (sp_address, started_at) - `); - - await queryRunner.query(` - CREATE INDEX IF NOT EXISTS "IDX_anon_retrievals_started_at" - ON anon_retrievals (started_at) - `); - } - - public async down(queryRunner: QueryRunner): Promise { - await queryRunner.query(`DROP TABLE IF EXISTS anon_retrievals CASCADE`); - await queryRunner.query(`DROP TYPE IF EXISTS anon_retrievals_service_type_enum`); - await queryRunner.query(`DROP TYPE IF EXISTS anon_retrievals_ipni_status_enum`); - await queryRunner.query(`DROP TYPE IF EXISTS anon_retrievals_piece_fetch_status_enum`); - } -} diff --git a/apps/backend/src/database/types.ts b/apps/backend/src/database/types.ts index e09d1dd3..46fd5d28 100644 --- a/apps/backend/src/database/types.ts +++ b/apps/backend/src/database/types.ts @@ -28,18 +28,6 @@ export enum IpniStatus { FAILED = "failed", } -export enum PieceFetchStatus { - SUCCESS = "success", - FAILED = "failed", -} - -export enum IpniCheckStatus { - VALID = "valid", - INVALID = "invalid", - SKIPPED = "skipped", - ERROR = "error", -} - /** * Metadata schema for deal storage and retrieval */ diff --git a/apps/backend/src/retrieval-anon/anon-retrieval.service.spec.ts b/apps/backend/src/retrieval-anon/anon-retrieval.service.spec.ts index 4f775150..b5f17c57 100644 --- a/apps/backend/src/retrieval-anon/anon-retrieval.service.spec.ts +++ b/apps/backend/src/retrieval-anon/anon-retrieval.service.spec.ts @@ -1,9 +1,8 @@ import type { Repository } from "typeorm"; import { beforeEach, describe, expect, it, vi } from "vitest"; import type { ClickhouseService } from "../clickhouse/clickhouse.service.js"; -import type { AnonRetrieval } from 
"../database/entities/anon-retrieval.entity.js"; import type { StorageProvider } from "../database/entities/storage-provider.entity.js"; -import { IpniCheckStatus, PieceFetchStatus } from "../database/types.js"; +import { RetrievalStatus } from "../database/types.js"; import type { AnonRetrievalCheckMetrics } from "../metrics-prometheus/check-metrics.service.js"; import type { WalletSdkService } from "../wallet-sdk/wallet-sdk.service.js"; import type { AnonPieceSelectorService } from "./anon-piece-selector.service.js"; @@ -27,7 +26,7 @@ const PIECE = { function makeProvider(): StorageProvider { return { address: SP_ADDRESS, - providerId: 7n, + providerId: 7, name: "sp-test", isApproved: true, } as unknown as StorageProvider; @@ -40,7 +39,6 @@ function makeService(opts: { piece?: AnonPiece | null; carResult?: CarValidationResult; validateCarImpl?: () => Promise; - saveImpl?: (entity: AnonRetrieval) => Promise; }): { service: AnonRetrievalService; insertSpy: ReturnType; @@ -49,7 +47,6 @@ function makeService(opts: { metricsRecordStatusSpy: ReturnType; metricsRecordIpniSpy: ReturnType; metricsRecordBlockFetchSpy: ReturnType; - saveSpy: ReturnType; } { const insertSpy = vi.fn(); const clickhouseService = { @@ -62,11 +59,6 @@ function makeService(opts: { findOne: vi.fn(async () => makeProvider()), } as unknown as Repository; - const saveSpy = vi.fn(opts.saveImpl ?? (async (entity: AnonRetrieval) => entity)); - const anonRetrievalRepository = { - save: saveSpy, - } as unknown as Repository; - const anonPieceSelector = { selectPieceForProvider: vi.fn(async () => (opts.piece === null ? null : (opts.piece ?? 
PIECE))), } as unknown as AnonPieceSelectorService; @@ -108,7 +100,6 @@ function makeService(opts: { metrics, clickhouseService, spRepository, - anonRetrievalRepository, ); return { @@ -119,7 +110,6 @@ function makeService(opts: { metricsRecordStatusSpy, metricsRecordIpniSpy, metricsRecordBlockFetchSpy, - saveSpy, }; } @@ -128,7 +118,7 @@ describe("AnonRetrievalService", () => { vi.clearAllMocks(); }); - it("persists a Postgres row with partial metrics when fetchPiece returns aborted=true", async () => { + it("emits a ClickHouse row with partial metrics when fetchPiece returns aborted=true", async () => { const partial: PieceRetrievalResult = { success: false, pieceCid: PIECE.pieceCid, @@ -143,59 +133,41 @@ describe("AnonRetrievalService", () => { aborted: true, }; - const { service, saveSpy, insertSpy } = makeService({ pieceResult: partial }); + const { service, insertSpy } = makeService({ pieceResult: partial }); await service.performForProvider(SP_ADDRESS); - expect(saveSpy).toHaveBeenCalledTimes(1); - const entity = saveSpy.mock.calls[0]?.[0] as AnonRetrieval; - expect(entity.pieceFetchStatus).toBe(PieceFetchStatus.FAILED); - expect(entity.bytesRetrieved).toBe(524288n); - expect(entity.firstByteMs).toBe(150); - expect(entity.lastByteMs).toBe(42000); - expect(entity.throughputBps).toBe(12500n); - expect(entity.httpResponseCode).toBe(200); - expect(entity.errorMessage).toContain("Anon retrieval job timeout"); - expect(entity.pieceCid).toBe(PIECE.pieceCid); - expect(entity.spAddress).toBe(SP_ADDRESS); - expect(entity.spId).toBe(7n); - expect(entity.probeLocation).toBe("test-location"); - expect(entity.retrievalEndpoint).toBe(`https://sp.test/piece/${PIECE.pieceCid}`); - expect(typeof entity.id).toBe("string"); - - // CAR/IPNI/block-fetch were never run on a non-IPFS-indexed piece. 
- expect(entity.carParseable).toBeNull(); - expect(entity.carBlockCount).toBeNull(); - expect(entity.blockFetchEndpoint).toBeNull(); - expect(entity.blockFetchValid).toBeNull(); - expect(entity.blockFetchSampledCount).toBeNull(); - expect(entity.blockFetchFailedCount).toBeNull(); - expect(entity.ipniStatus).toBe(IpniCheckStatus.SKIPPED); - - // ClickHouse mirror is also written. expect(insertSpy).toHaveBeenCalledTimes(1); const [table, row] = insertSpy.mock.calls[0] as [string, Record]; expect(table).toBe("anon_retrieval_checks"); - expect(row.retrieval_id).toBe(entity.id); - expect(row.piece_fetch_status).toBe(PieceFetchStatus.FAILED); + expect(row.piece_fetch_status).toBe(RetrievalStatus.FAILED); expect(row.bytes_retrieved).toBe(524288); expect(row.first_byte_ms).toBe(150); expect(row.last_byte_ms).toBe(42000); expect(row.throughput_bps).toBe(12500); expect(row.http_response_code).toBe(200); - expect(row.ipni_status).toBe(IpniCheckStatus.SKIPPED); - - // Trimmed CH columns must NOT appear (they live only in Postgres). - expect(row).not.toHaveProperty("piece_cid"); - expect(row).not.toHaveProperty("data_set_id"); - expect(row).not.toHaveProperty("piece_id"); - expect(row).not.toHaveProperty("ipfs_root_cid"); - expect(row).not.toHaveProperty("retrieval_endpoint"); - expect(row).not.toHaveProperty("block_fetch_endpoint"); - expect(row).not.toHaveProperty("error_message"); + expect(row.error_message).toContain("Anon retrieval job timeout"); + expect(row.piece_cid).toBe(PIECE.pieceCid); + expect(row.sp_address).toBe(SP_ADDRESS); + expect(row.sp_id).toBe(7); + expect(row.probe_location).toBe("test-location"); + expect(typeof row.retrieval_id).toBe("string"); + + // CAR/IPNI/block-fetch were never run on a non-IPFS-indexed piece — every + // dimension column should explicitly say "skipped" (ipni_status) or null. 
+ expect(row.car_parseable).toBeNull(); + expect(row.car_block_count).toBeNull(); + expect(row.block_fetch_endpoint).toBeNull(); + expect(row.block_fetch_valid).toBeNull(); + expect(row.block_fetch_sampled_count).toBeNull(); + expect(row.block_fetch_failed_count).toBeNull(); + expect(row.ipni_status).toBe("skipped"); + expect(row.ipni_verify_ms).toBeNull(); + expect(row.ipni_verified_cids_count).toBeNull(); + expect(row.ipni_unverified_cids_count).toBeNull(); }); - it("still persists when the signal aborts before fetchPiece runs", async () => { + it("still emits a row when the signal aborts before fetchPiece runs", async () => { const ac = new AbortController(); ac.abort(new Error("Anon retrieval job timeout (60s) for sp1")); @@ -211,21 +183,20 @@ describe("AnonRetrievalService", () => { commPValid: false, }; - const { service, saveSpy, insertSpy, fetchSpy } = makeService({ pieceResult: never }); + const { service, insertSpy, fetchSpy } = makeService({ pieceResult: never }); await service.performForProvider(SP_ADDRESS, ac.signal); expect(fetchSpy).not.toHaveBeenCalled(); - expect(saveSpy).toHaveBeenCalledTimes(1); - const entity = saveSpy.mock.calls[0]?.[0] as AnonRetrieval; - expect(entity.pieceFetchStatus).toBe(PieceFetchStatus.FAILED); - expect(entity.errorMessage).toContain("Anon retrieval job timeout"); - expect(entity.bytesRetrieved).toBeNull(); - expect(entity.firstByteMs).toBeNull(); expect(insertSpy).toHaveBeenCalledTimes(1); + const [, row] = insertSpy.mock.calls[0] as [string, Record]; + expect(row.piece_fetch_status).toBe(RetrievalStatus.FAILED); + expect(row.error_message).toContain("Anon retrieval job timeout"); + expect(row.bytes_retrieved).toBeNull(); + expect(row.first_byte_ms).toBeNull(); }); - it("still persists when fetchPiece throws unexpectedly", async () => { + it("still emits a row when fetchPiece throws unexpectedly", async () => { const never: PieceRetrievalResult = { success: false, pieceCid: PIECE.pieceCid, @@ -238,7 +209,7 @@ 
describe("AnonRetrievalService", () => { commPValid: false, }; - const { service, saveSpy } = makeService({ + const { service, insertSpy } = makeService({ pieceResult: never, fetchPieceImpl: async () => { throw new Error("network down"); @@ -247,12 +218,12 @@ describe("AnonRetrievalService", () => { await expect(service.performForProvider(SP_ADDRESS)).rejects.toThrow("network down"); - expect(saveSpy).toHaveBeenCalledTimes(1); - const entity = saveSpy.mock.calls[0]?.[0] as AnonRetrieval; - expect(entity.pieceFetchStatus).toBe(PieceFetchStatus.FAILED); + expect(insertSpy).toHaveBeenCalledTimes(1); + const [, row] = insertSpy.mock.calls[0] as [string, Record]; + expect(row.piece_fetch_status).toBe(RetrievalStatus.FAILED); }); - it("does not throw when Postgres save fails and still attempts the CH insert", async () => { + it("skips ClickHouse insert when ClickHouse is disabled", async () => { const ok: PieceRetrievalResult = { success: true, pieceCid: PIECE.pieceCid, @@ -265,20 +236,11 @@ describe("AnonRetrievalService", () => { commPValid: true, }; - const { service, saveSpy, insertSpy } = makeService({ - pieceResult: ok, - saveImpl: async () => { - throw new Error("connection refused"); - }, - }); + const { service, insertSpy } = makeService({ pieceResult: ok, clickhouseEnabled: false }); - await expect(service.performForProvider(SP_ADDRESS)).resolves.toBeUndefined(); + await service.performForProvider(SP_ADDRESS); - expect(saveSpy).toHaveBeenCalledTimes(1); - // CH still gets the row keyed by the client-side uuid. 
- expect(insertSpy).toHaveBeenCalledTimes(1); - const [, row] = insertSpy.mock.calls[0] as [string, Record]; - expect(typeof row.retrieval_id).toBe("string"); + expect(insertSpy).not.toHaveBeenCalled(); }); describe("with IPFS indexing", () => { @@ -302,7 +264,7 @@ describe("AnonRetrievalService", () => { }; } - it("populates CAR/IPNI/block-fetch columns when validation fully succeeds", async () => { + it("emits populated CAR/IPNI/block-fetch columns when validation fully succeeds", async () => { const carResult: CarValidationResult = { carParseable: true, blockCount: 42, @@ -316,7 +278,7 @@ describe("AnonRetrievalService", () => { blockFetchEndpoint: "https://sp.test/ipfs/", }; - const { service, saveSpy, insertSpy, validateCarSpy } = makeService({ + const { service, insertSpy, validateCarSpy } = makeService({ pieceResult: okPiece(Buffer.from("car-bytes")), piece: INDEXED_PIECE, carResult, @@ -325,24 +287,19 @@ describe("AnonRetrievalService", () => { await service.performForProvider(SP_ADDRESS); expect(validateCarSpy).toHaveBeenCalledTimes(1); - const entity = saveSpy.mock.calls[0]?.[0] as AnonRetrieval; - expect(entity.pieceFetchStatus).toBe(PieceFetchStatus.SUCCESS); - expect(entity.commpValid).toBe(true); - expect(entity.carParseable).toBe(true); - expect(entity.carBlockCount).toBe(42); - expect(entity.blockFetchEndpoint).toBe("https://sp.test/ipfs/"); - expect(entity.blockFetchValid).toBe(true); - expect(entity.blockFetchSampledCount).toBe(5); - expect(entity.blockFetchFailedCount).toBe(0); - expect(entity.ipniStatus).toBe(IpniCheckStatus.VALID); - expect(entity.ipniVerifyMs).toBe(137); - expect(entity.ipniVerifiedCidsCount).toBe(6); - expect(entity.ipniUnverifiedCidsCount).toBe(0); - const [, row] = insertSpy.mock.calls[0] as [string, Record]; - expect(row.piece_fetch_status).toBe(PieceFetchStatus.SUCCESS); + expect(row.piece_fetch_status).toBe(RetrievalStatus.SUCCESS); + expect(row.commp_valid).toBe(true); expect(row.car_parseable).toBe(true); - 
expect(row.ipni_status).toBe(IpniCheckStatus.VALID); + expect(row.car_block_count).toBe(42); + expect(row.block_fetch_endpoint).toBe("https://sp.test/ipfs/"); + expect(row.block_fetch_valid).toBe(true); + expect(row.block_fetch_sampled_count).toBe(5); + expect(row.block_fetch_failed_count).toBe(0); + expect(row.ipni_status).toBe("valid"); + expect(row.ipni_verify_ms).toBe(137); + expect(row.ipni_verified_cids_count).toBe(6); + expect(row.ipni_unverified_cids_count).toBe(0); }); it("distinguishes IPNI invalid from block-fetch failures with explicit counts", async () => { @@ -359,7 +316,7 @@ describe("AnonRetrievalService", () => { blockFetchEndpoint: "https://sp.test/ipfs/", }; - const { service, saveSpy } = makeService({ + const { service, insertSpy } = makeService({ pieceResult: okPiece(Buffer.from("car-bytes")), piece: INDEXED_PIECE, carResult, @@ -367,24 +324,24 @@ describe("AnonRetrievalService", () => { await service.performForProvider(SP_ADDRESS); - const entity = saveSpy.mock.calls[0]?.[0] as AnonRetrieval; + const [, row] = insertSpy.mock.calls[0] as [string, Record]; // The piece-fetch path still succeeded — failures are surfaced as // independent dimensions, not folded into piece_fetch_status. 
- expect(entity.pieceFetchStatus).toBe(PieceFetchStatus.SUCCESS); - expect(entity.carParseable).toBe(true); - expect(entity.ipniStatus).toBe(IpniCheckStatus.INVALID); - expect(entity.ipniVerifiedCidsCount).toBe(0); - expect(entity.ipniUnverifiedCidsCount).toBe(6); - expect(entity.blockFetchValid).toBe(false); - expect(entity.blockFetchSampledCount).toBe(5); - expect(entity.blockFetchFailedCount).toBe(2); + expect(row.piece_fetch_status).toBe(RetrievalStatus.SUCCESS); + expect(row.car_parseable).toBe(true); + expect(row.ipni_status).toBe("invalid"); + expect(row.ipni_verified_cids_count).toBe(0); + expect(row.ipni_unverified_cids_count).toBe(6); + expect(row.block_fetch_valid).toBe(false); + expect(row.block_fetch_sampled_count).toBe(5); + expect(row.block_fetch_failed_count).toBe(2); }); it("emits ipni_status='error' (not 'skipped') when CAR validation throws on a successful piece", async () => { // Distinguishes a real infra outage (e.g. IpniVerificationService down) // from a piece that legitimately had no IPFS indexing. Without the // distinction, an outage looks like normal non-IPFS volume in dashboards. - const { service, saveSpy, metricsRecordIpniSpy, metricsRecordBlockFetchSpy } = makeService({ + const { service, insertSpy, metricsRecordIpniSpy, metricsRecordBlockFetchSpy } = makeService({ pieceResult: okPiece(Buffer.from("car-bytes")), piece: INDEXED_PIECE, validateCarImpl: async () => { @@ -397,11 +354,11 @@ describe("AnonRetrievalService", () => { expect(metricsRecordIpniSpy).toHaveBeenCalledWith(expect.anything(), "error"); expect(metricsRecordBlockFetchSpy).toHaveBeenCalledWith(expect.anything(), "error"); - const entity = saveSpy.mock.calls[0]?.[0] as AnonRetrieval; - expect(entity.ipniStatus).toBe(IpniCheckStatus.ERROR); + const [, row] = insertSpy.mock.calls[0] as [string, Record]; + expect(row.ipni_status).toBe("error"); // Piece-fetch path itself succeeded — only the validation pipeline failed. 
- expect(entity.commpValid).toBe(true); - expect(entity.carParseable).toBeNull(); + expect(row.commp_valid).toBe(true); + expect(row.car_parseable).toBeNull(); }); it("emits car_parseable=false with skipped IPNI/block-fetch when bytes don't parse as CAR", async () => { @@ -418,7 +375,7 @@ describe("AnonRetrievalService", () => { blockFetchEndpoint: null, }; - const { service, saveSpy } = makeService({ + const { service, insertSpy } = makeService({ pieceResult: okPiece(Buffer.from("not-a-car")), piece: INDEXED_PIECE, carResult, @@ -426,19 +383,19 @@ describe("AnonRetrievalService", () => { await service.performForProvider(SP_ADDRESS); - const entity = saveSpy.mock.calls[0]?.[0] as AnonRetrieval; - expect(entity.carParseable).toBe(false); + const [, row] = insertSpy.mock.calls[0] as [string, Record]; + expect(row.car_parseable).toBe(false); // car_block_count and block_fetch_sampled_count are gated on carParseable // so an unparseable CAR doesn't emit a misleading 0. - expect(entity.carBlockCount).toBeNull(); - expect(entity.blockFetchSampledCount).toBeNull(); - expect(entity.blockFetchEndpoint).toBeNull(); - expect(entity.blockFetchValid).toBeNull(); - expect(entity.blockFetchFailedCount).toBeNull(); - expect(entity.ipniStatus).toBe(IpniCheckStatus.SKIPPED); - expect(entity.ipniVerifyMs).toBeNull(); - expect(entity.ipniVerifiedCidsCount).toBeNull(); - expect(entity.ipniUnverifiedCidsCount).toBeNull(); + expect(row.car_block_count).toBeNull(); + expect(row.block_fetch_sampled_count).toBeNull(); + expect(row.block_fetch_endpoint).toBeNull(); + expect(row.block_fetch_valid).toBeNull(); + expect(row.block_fetch_failed_count).toBeNull(); + expect(row.ipni_status).toBe("skipped"); + expect(row.ipni_verify_ms).toBeNull(); + expect(row.ipni_verified_cids_count).toBeNull(); + expect(row.ipni_unverified_cids_count).toBeNull(); }); }); }); diff --git a/apps/backend/src/retrieval-anon/anon-retrieval.service.ts b/apps/backend/src/retrieval-anon/anon-retrieval.service.ts index 
d8298776..5343d59a 100644 --- a/apps/backend/src/retrieval-anon/anon-retrieval.service.ts +++ b/apps/backend/src/retrieval-anon/anon-retrieval.service.ts @@ -4,9 +4,8 @@ import { InjectRepository } from "@nestjs/typeorm"; import type { Repository } from "typeorm"; import { ClickhouseService } from "../clickhouse/clickhouse.service.js"; import { type ProviderJobContext, toStructuredError } from "../common/logging.js"; -import { AnonRetrieval } from "../database/entities/anon-retrieval.entity.js"; import { StorageProvider } from "../database/entities/storage-provider.entity.js"; -import { IpniCheckStatus, PieceFetchStatus, ServiceType } from "../database/types.js"; +import { RetrievalStatus, ServiceType } from "../database/types.js"; import { buildCheckMetricLabels } from "../metrics-prometheus/check-metric-labels.js"; import { AnonRetrievalCheckMetrics } from "../metrics-prometheus/check-metrics.service.js"; import { WalletSdkService } from "../wallet-sdk/wallet-sdk.service.js"; @@ -30,8 +29,6 @@ export class AnonRetrievalService { private readonly clickhouseService: ClickhouseService, @InjectRepository(StorageProvider) private readonly spRepository: Repository, - @InjectRepository(AnonRetrieval) - private readonly anonRetrievalRepository: Repository, ) {} async performForProvider(spAddress: string, signal?: AbortSignal, logContext?: ProviderJobContext): Promise { @@ -140,75 +137,80 @@ export class AnonRetrievalService { pieceResult.success ? "success" : pieceResult.aborted ? "failure.aborted" : "failure.http", ); } finally { - // Always persist a row — even on abort or unexpected error — so we never - // lose the evidence (ttfb, bytes, response code) we already collected. + // Always emit a ClickHouse row — even on abort or unexpected error — so + // we never lose the evidence (ttfb, bytes, response code) we already + // collected. const finalPieceResult = pieceResult ?? 
buildAbortedPlaceholder(piece.pieceCid, signal?.reason); - const providerInfo = this.walletSdkService.getProviderInfo(spAddress); - const spBaseUrl = providerInfo?.pdp.serviceURL.replace(/\/$/, "") ?? spAddress; - const retrievalEndpoint = `${spBaseUrl}/piece/${piece.pieceCid}`; - const pieceFetchStatus = finalPieceResult.success ? PieceFetchStatus.SUCCESS : PieceFetchStatus.FAILED; - const ipniStatus: IpniCheckStatus = !validatedCarPiece - ? IpniCheckStatus.SKIPPED - : carResult - ? ipniStatusFromResult(carResult) - : IpniCheckStatus.ERROR; + const retrievalId = randomUUID(); - const entity: AnonRetrieval = { - id: randomUUID(), - createdAt: startedAt, - startedAt, - probeLocation: this.clickhouseService.probeLocation, - spAddress, - spId: provider?.providerId ?? null, - spName: provider?.name ?? null, - pieceCid: piece.pieceCid, - dataSetId: BigInt(piece.dataSetId), - pieceId: BigInt(piece.pieceId), - rawSize: BigInt(piece.rawSize), - withIpfsIndexing: piece.withIPFSIndexing, - ipfsRootCid: piece.ipfsRootCid, - serviceType: ServiceType.DIRECT_SP, - retrievalEndpoint, - pieceFetchStatus, - httpResponseCode: finalPieceResult.statusCode > 0 ? finalPieceResult.statusCode : null, - firstByteMs: finalPieceResult.ttfbMs > 0 ? finalPieceResult.ttfbMs : null, - lastByteMs: finalPieceResult.latencyMs > 0 ? finalPieceResult.latencyMs : null, - bytesRetrieved: finalPieceResult.bytesReceived > 0 ? BigInt(finalPieceResult.bytesReceived) : null, - throughputBps: finalPieceResult.throughputBps > 0 ? BigInt(Math.round(finalPieceResult.throughputBps)) : null, - commpValid: finalPieceResult.success ? finalPieceResult.commPValid : null, - carParseable: carResult ? carResult.carParseable : null, - carBlockCount: carResult?.carParseable ? carResult.blockCount : null, - blockFetchEndpoint: carResult?.blockFetchEndpoint ?? null, - blockFetchValid: carResult ? carResult.blockFetchValid : null, - blockFetchSampledCount: carResult?.carParseable ? 
carResult.sampledCidCount : null, - blockFetchFailedCount: carResult?.blockFetchFailedCount ?? null, - ipniStatus, - ipniVerifyMs: carResult?.ipniVerifyMs ?? null, - ipniVerifiedCidsCount: carResult?.ipniVerifiedCidsCount ?? null, - ipniUnverifiedCidsCount: carResult?.ipniUnverifiedCidsCount ?? null, - errorMessage: finalPieceResult.errorMessage ?? null, - }; + if (this.clickhouseService.enabled) { + const providerInfo = this.walletSdkService.getProviderInfo(spAddress); + const spBaseUrl = providerInfo?.pdp.serviceURL.replace(/\/$/, "") ?? spAddress; + const pieceFetchStatus = finalPieceResult.success ? RetrievalStatus.SUCCESS : RetrievalStatus.FAILED; + const ipniStatus = !validatedCarPiece ? "skipped" : carResult ? ipniStatusFromResult(carResult) : "error"; - try { - await this.anonRetrievalRepository.save(entity); - } catch (error) { - this.logger.warn({ + try { + this.clickhouseService.insert(ANON_RETRIEVAL_CHECKS_TABLE, { + timestamp: startedAt.getTime(), + probe_location: this.clickhouseService.probeLocation, + sp_address: spAddress, + sp_id: provider?.providerId != null ? Number(provider.providerId) : null, + sp_name: provider?.name ?? null, + retrieval_id: retrievalId, + piece_cid: piece.pieceCid, + data_set_id: piece.dataSetId, + piece_id: piece.pieceId, + raw_size: piece.rawSize, + with_ipfs_indexing: piece.withIPFSIndexing, + ipfs_root_cid: piece.ipfsRootCid, + service_type: ServiceType.DIRECT_SP, + retrieval_endpoint: `${spBaseUrl}/piece/${piece.pieceCid}`, + piece_fetch_status: pieceFetchStatus, + http_response_code: finalPieceResult.statusCode > 0 ? finalPieceResult.statusCode : null, + first_byte_ms: finalPieceResult.ttfbMs > 0 ? finalPieceResult.ttfbMs : null, + last_byte_ms: finalPieceResult.latencyMs > 0 ? finalPieceResult.latencyMs : null, + bytes_retrieved: finalPieceResult.bytesReceived > 0 ? finalPieceResult.bytesReceived : null, + throughput_bps: finalPieceResult.throughputBps > 0 ? 
Math.round(finalPieceResult.throughputBps) : null, + commp_valid: finalPieceResult.success ? finalPieceResult.commPValid : null, + car_parseable: carResult ? carResult.carParseable : null, + car_block_count: carResult?.carParseable ? carResult?.blockCount : null, + block_fetch_endpoint: carResult?.blockFetchEndpoint ?? null, + block_fetch_valid: carResult ? carResult.blockFetchValid : null, + block_fetch_sampled_count: carResult?.carParseable ? carResult?.sampledCidCount : null, + block_fetch_failed_count: carResult?.blockFetchFailedCount ?? null, + ipni_status: ipniStatus, + ipni_verify_ms: carResult?.ipniVerifyMs ?? null, + ipni_verified_cids_count: carResult?.ipniVerifiedCidsCount ?? null, + ipni_unverified_cids_count: carResult?.ipniUnverifiedCidsCount ?? null, + error_message: finalPieceResult.errorMessage ?? null, + }); + } catch (error) { + // ClickhouseService.insert is buffered/non-throwing in normal operation, but + // guard against unexpected runtime errors so we don't break the probe cycle. 
+ this.logger.warn({ + ...logContext, + event: "anon_retrieval_clickhouse_insert_failed", + message: "Failed to enqueue anonymous retrieval row to ClickHouse", + pieceCid: piece.pieceCid, + spAddress, + error: toStructuredError(error), + }); + } + } else { + this.logger.debug({ ...logContext, - event: "anon_retrieval_save_failed", - message: "Failed to persist anonymous retrieval row to Postgres", + event: "anon_retrieval_clickhouse_disabled", + message: "ClickHouse disabled — anon retrieval row not emitted", pieceCid: piece.pieceCid, spAddress, - error: toStructuredError(error), }); } - this.clickhouseService.insert(ANON_RETRIEVAL_CHECKS_TABLE, toClickhouseRow(entity)); - this.logger.log({ ...logContext, event: "anon_retrieval_completed", message: "Anonymous retrieval test completed", - retrievalId: entity.id, + retrievalId, pieceCid: piece.pieceCid, spAddress, success: finalPieceResult.success, @@ -224,53 +226,9 @@ export class AnonRetrievalService { } } -function ipniStatusFromResult(result: CarValidationResult): IpniCheckStatus { - switch (result.ipniValid) { - case null: - return IpniCheckStatus.SKIPPED; - case true: - return IpniCheckStatus.VALID; - case false: - return IpniCheckStatus.INVALID; - default: - throw new Error(`Unexpected IPNI validation result: ${result.ipniValid}`); - } -} - -/** - * Project an AnonRetrieval entity to the chartable subset stored in ClickHouse. - * High-cardinality identifiers (piece_cid, data_set_id, piece_id, ipfs_root_cid), - * URLs (retrieval_endpoint, block_fetch_endpoint), and free-text columns - * (error_message) are intentionally dropped — they live only in Postgres. - */ -function toClickhouseRow(entity: AnonRetrieval): Record { - return { - timestamp: entity.startedAt.getTime(), - probe_location: entity.probeLocation, - sp_address: entity.spAddress, - sp_id: entity.spId != null ? 
Number(entity.spId) : null, - sp_name: entity.spName, - retrieval_id: entity.id, - raw_size: Number(entity.rawSize), - with_ipfs_indexing: entity.withIpfsIndexing, - service_type: entity.serviceType, - piece_fetch_status: entity.pieceFetchStatus, - http_response_code: entity.httpResponseCode, - first_byte_ms: entity.firstByteMs, - last_byte_ms: entity.lastByteMs, - bytes_retrieved: entity.bytesRetrieved != null ? Number(entity.bytesRetrieved) : null, - throughput_bps: entity.throughputBps != null ? Number(entity.throughputBps) : null, - commp_valid: entity.commpValid, - car_parseable: entity.carParseable, - car_block_count: entity.carBlockCount, - block_fetch_valid: entity.blockFetchValid, - block_fetch_sampled_count: entity.blockFetchSampledCount, - block_fetch_failed_count: entity.blockFetchFailedCount, - ipni_status: entity.ipniStatus, - ipni_verify_ms: entity.ipniVerifyMs, - ipni_verified_cids_count: entity.ipniVerifiedCidsCount, - ipni_unverified_cids_count: entity.ipniUnverifiedCidsCount, - }; +function ipniStatusFromResult(result: CarValidationResult): "valid" | "invalid" | "skipped" { + if (result.ipniValid === null) return "skipped"; + return result.ipniValid ? 
"valid" : "invalid"; } function buildAbortedPlaceholder(pieceCid: string, reason: unknown): PieceRetrievalResult { diff --git a/apps/backend/src/retrieval-anon/retrieval-anon.module.ts b/apps/backend/src/retrieval-anon/retrieval-anon.module.ts index 4e9e38df..c05dcb5f 100644 --- a/apps/backend/src/retrieval-anon/retrieval-anon.module.ts +++ b/apps/backend/src/retrieval-anon/retrieval-anon.module.ts @@ -1,7 +1,6 @@ import { Module } from "@nestjs/common"; import { ConfigModule } from "@nestjs/config"; import { TypeOrmModule } from "@nestjs/typeorm"; -import { AnonRetrieval } from "../database/entities/anon-retrieval.entity.js"; import { StorageProvider } from "../database/entities/storage-provider.entity.js"; import { HttpClientModule } from "../http-client/http-client.module.js"; import { IpniModule } from "../ipni/ipni.module.js"; @@ -15,7 +14,7 @@ import { PieceRetrievalService } from "./piece-retrieval.service.js"; @Module({ imports: [ ConfigModule, - TypeOrmModule.forFeature([AnonRetrieval, StorageProvider]), + TypeOrmModule.forFeature([StorageProvider]), SubgraphModule, WalletSdkModule, HttpClientModule, From d4f7d802f93a3c48cec49bc9f145bc28c2815ea3 Mon Sep 17 00:00:00 2001 From: Dennis Trautwein Date: Mon, 4 May 2026 08:29:28 +0200 Subject: [PATCH 12/28] refactor(retrieval-anon): introduce IpniCheckStatus enum and drop redundant clickhouse-enabled gate - Replace string literals ("valid"|"invalid"|"skipped"|"error") with IpniCheckStatus enum in anon-retrieval.service.ts - Drop the `if (clickhouseService.enabled)` wrapper around the insert call; ClickhouseService.insert is already a no-op when disabled, matching the pattern used by other retrieval flows - Fix outdated ipni_status schema comment to include the 'error' value --- .../src/clickhouse/clickhouse.schema.ts | 2 +- apps/backend/src/database/types.ts | 7 + .../anon-retrieval.service.spec.ts | 23 +-- .../retrieval-anon/anon-retrieval.service.ts | 133 +++++++++--------- 4 files changed, 74 insertions(+), 
91 deletions(-) diff --git a/apps/backend/src/clickhouse/clickhouse.schema.ts b/apps/backend/src/clickhouse/clickhouse.schema.ts index e30f6151..05684154 100644 --- a/apps/backend/src/clickhouse/clickhouse.schema.ts +++ b/apps/backend/src/clickhouse/clickhouse.schema.ts @@ -97,7 +97,7 @@ export function buildMigrations(database: string): string[] { block_fetch_sampled_count Nullable(UInt32), -- number of blocks sampled and probed via /ipfs/?format=raw block_fetch_failed_count Nullable(UInt32), -- number of sampled blocks that failed (non-2xx, hash mismatch, unsupported codec, or transport error) - ipni_status LowCardinality(String), -- 'valid' | 'invalid' | 'skipped' (mirrors data_storage_checks naming) + ipni_status LowCardinality(String), -- 'valid' | 'invalid' | 'skipped' | 'error' ipni_verify_ms Nullable(Float64), -- IPNI verification duration; null when skipped ipni_verified_cids_count Nullable(UInt32), -- CIDs confirmed findable via IPNI ipni_unverified_cids_count Nullable(UInt32), -- CIDs checked but not findable diff --git a/apps/backend/src/database/types.ts b/apps/backend/src/database/types.ts index 46fd5d28..c56b355a 100644 --- a/apps/backend/src/database/types.ts +++ b/apps/backend/src/database/types.ts @@ -28,6 +28,13 @@ export enum IpniStatus { FAILED = "failed", } +export enum IpniCheckStatus { + VALID = "valid", + INVALID = "invalid", + SKIPPED = "skipped", + ERROR = "error", +} + /** * Metadata schema for deal storage and retrieval */ diff --git a/apps/backend/src/retrieval-anon/anon-retrieval.service.spec.ts b/apps/backend/src/retrieval-anon/anon-retrieval.service.spec.ts index b5f17c57..c82eed76 100644 --- a/apps/backend/src/retrieval-anon/anon-retrieval.service.spec.ts +++ b/apps/backend/src/retrieval-anon/anon-retrieval.service.spec.ts @@ -35,7 +35,6 @@ function makeProvider(): StorageProvider { function makeService(opts: { pieceResult: PieceRetrievalResult; fetchPieceImpl?: (signal?: AbortSignal) => Promise; - clickhouseEnabled?: boolean; 
piece?: AnonPiece | null; carResult?: CarValidationResult; validateCarImpl?: () => Promise; @@ -51,7 +50,7 @@ function makeService(opts: { const insertSpy = vi.fn(); const clickhouseService = { insert: insertSpy, - enabled: opts.clickhouseEnabled ?? true, + enabled: true, probeLocation: "test-location", } as unknown as ClickhouseService; @@ -223,26 +222,6 @@ describe("AnonRetrievalService", () => { expect(row.piece_fetch_status).toBe(RetrievalStatus.FAILED); }); - it("skips ClickHouse insert when ClickHouse is disabled", async () => { - const ok: PieceRetrievalResult = { - success: true, - pieceCid: PIECE.pieceCid, - bytesReceived: 1024, - pieceBytes: null, - latencyMs: 100, - ttfbMs: 10, - throughputBps: 10240, - statusCode: 200, - commPValid: true, - }; - - const { service, insertSpy } = makeService({ pieceResult: ok, clickhouseEnabled: false }); - - await service.performForProvider(SP_ADDRESS); - - expect(insertSpy).not.toHaveBeenCalled(); - }); - describe("with IPFS indexing", () => { const INDEXED_PIECE: AnonPiece = { ...PIECE, diff --git a/apps/backend/src/retrieval-anon/anon-retrieval.service.ts b/apps/backend/src/retrieval-anon/anon-retrieval.service.ts index 5343d59a..c1d08c0e 100644 --- a/apps/backend/src/retrieval-anon/anon-retrieval.service.ts +++ b/apps/backend/src/retrieval-anon/anon-retrieval.service.ts @@ -5,7 +5,7 @@ import type { Repository } from "typeorm"; import { ClickhouseService } from "../clickhouse/clickhouse.service.js"; import { type ProviderJobContext, toStructuredError } from "../common/logging.js"; import { StorageProvider } from "../database/entities/storage-provider.entity.js"; -import { RetrievalStatus, ServiceType } from "../database/types.js"; +import { IpniCheckStatus, RetrievalStatus, ServiceType } from "../database/types.js"; import { buildCheckMetricLabels } from "../metrics-prometheus/check-metric-labels.js"; import { AnonRetrievalCheckMetrics } from "../metrics-prometheus/check-metrics.service.js"; import { WalletSdkService 
} from "../wallet-sdk/wallet-sdk.service.js"; @@ -108,13 +108,17 @@ export class AnonRetrievalService { this.metrics.recordIpniStatus(labels, ipniStatusFromResult(carResult)); this.metrics.recordBlockFetchStatus( labels, - carResult.blockFetchValid === null ? "skipped" : carResult.blockFetchValid ? "valid" : "invalid", + carResult.blockFetchValid === null + ? IpniCheckStatus.SKIPPED + : carResult.blockFetchValid + ? IpniCheckStatus.VALID + : IpniCheckStatus.INVALID, ); } catch (error) { // Validation was attempted on a successful piece retrieval but threw. this.metrics.recordCarParseStatus(labels, false); - this.metrics.recordIpniStatus(labels, "error"); - this.metrics.recordBlockFetchStatus(labels, "error"); + this.metrics.recordIpniStatus(labels, IpniCheckStatus.ERROR); + this.metrics.recordBlockFetchStatus(labels, IpniCheckStatus.ERROR); this.logger.warn({ ...logContext, event: "anon_retrieval_car_validation_failed", @@ -126,8 +130,8 @@ export class AnonRetrievalService { } } else if (!pieceResult.success) { // Piece retrieval failed — IPNI and block fetch were skipped - this.metrics.recordIpniStatus(labels, "skipped"); - this.metrics.recordBlockFetchStatus(labels, "skipped"); + this.metrics.recordIpniStatus(labels, IpniCheckStatus.SKIPPED); + this.metrics.recordBlockFetchStatus(labels, IpniCheckStatus.SKIPPED); } // Overall check duration and status @@ -139,70 +143,63 @@ export class AnonRetrievalService { } finally { // Always emit a ClickHouse row — even on abort or unexpected error — so // we never lose the evidence (ttfb, bytes, response code) we already - // collected. + // collected. ClickhouseService.insert is a no-op when disabled. const finalPieceResult = pieceResult ?? buildAbortedPlaceholder(piece.pieceCid, signal?.reason); const retrievalId = randomUUID(); - - if (this.clickhouseService.enabled) { - const providerInfo = this.walletSdkService.getProviderInfo(spAddress); - const spBaseUrl = providerInfo?.pdp.serviceURL.replace(/\/$/, "") ?? 
spAddress; - const pieceFetchStatus = finalPieceResult.success ? RetrievalStatus.SUCCESS : RetrievalStatus.FAILED; - const ipniStatus = !validatedCarPiece ? "skipped" : carResult ? ipniStatusFromResult(carResult) : "error"; - - try { - this.clickhouseService.insert(ANON_RETRIEVAL_CHECKS_TABLE, { - timestamp: startedAt.getTime(), - probe_location: this.clickhouseService.probeLocation, - sp_address: spAddress, - sp_id: provider?.providerId != null ? Number(provider.providerId) : null, - sp_name: provider?.name ?? null, - retrieval_id: retrievalId, - piece_cid: piece.pieceCid, - data_set_id: piece.dataSetId, - piece_id: piece.pieceId, - raw_size: piece.rawSize, - with_ipfs_indexing: piece.withIPFSIndexing, - ipfs_root_cid: piece.ipfsRootCid, - service_type: ServiceType.DIRECT_SP, - retrieval_endpoint: `${spBaseUrl}/piece/${piece.pieceCid}`, - piece_fetch_status: pieceFetchStatus, - http_response_code: finalPieceResult.statusCode > 0 ? finalPieceResult.statusCode : null, - first_byte_ms: finalPieceResult.ttfbMs > 0 ? finalPieceResult.ttfbMs : null, - last_byte_ms: finalPieceResult.latencyMs > 0 ? finalPieceResult.latencyMs : null, - bytes_retrieved: finalPieceResult.bytesReceived > 0 ? finalPieceResult.bytesReceived : null, - throughput_bps: finalPieceResult.throughputBps > 0 ? Math.round(finalPieceResult.throughputBps) : null, - commp_valid: finalPieceResult.success ? finalPieceResult.commPValid : null, - car_parseable: carResult ? carResult.carParseable : null, - car_block_count: carResult?.carParseable ? carResult?.blockCount : null, - block_fetch_endpoint: carResult?.blockFetchEndpoint ?? null, - block_fetch_valid: carResult ? carResult.blockFetchValid : null, - block_fetch_sampled_count: carResult?.carParseable ? carResult?.sampledCidCount : null, - block_fetch_failed_count: carResult?.blockFetchFailedCount ?? null, - ipni_status: ipniStatus, - ipni_verify_ms: carResult?.ipniVerifyMs ?? null, - ipni_verified_cids_count: carResult?.ipniVerifiedCidsCount ?? 
null, - ipni_unverified_cids_count: carResult?.ipniUnverifiedCidsCount ?? null, - error_message: finalPieceResult.errorMessage ?? null, - }); - } catch (error) { - // ClickhouseService.insert is buffered/non-throwing in normal operation, but - // guard against unexpected runtime errors so we don't break the probe cycle. - this.logger.warn({ - ...logContext, - event: "anon_retrieval_clickhouse_insert_failed", - message: "Failed to enqueue anonymous retrieval row to ClickHouse", - pieceCid: piece.pieceCid, - spAddress, - error: toStructuredError(error), - }); - } - } else { - this.logger.debug({ + const providerInfo = this.walletSdkService.getProviderInfo(spAddress); + const spBaseUrl = providerInfo?.pdp.serviceURL.replace(/\/$/, "") ?? spAddress; + const pieceFetchStatus = finalPieceResult.success ? RetrievalStatus.SUCCESS : RetrievalStatus.FAILED; + const ipniStatus: IpniCheckStatus = !validatedCarPiece + ? IpniCheckStatus.SKIPPED + : carResult + ? ipniStatusFromResult(carResult) + : IpniCheckStatus.ERROR; + + try { + this.clickhouseService.insert(ANON_RETRIEVAL_CHECKS_TABLE, { + timestamp: startedAt.getTime(), + probe_location: this.clickhouseService.probeLocation, + sp_address: spAddress, + sp_id: provider?.providerId != null ? Number(provider.providerId) : null, + sp_name: provider?.name ?? null, + retrieval_id: retrievalId, + piece_cid: piece.pieceCid, + data_set_id: piece.dataSetId, + piece_id: piece.pieceId, + raw_size: piece.rawSize, + with_ipfs_indexing: piece.withIPFSIndexing, + ipfs_root_cid: piece.ipfsRootCid, + service_type: ServiceType.DIRECT_SP, + retrieval_endpoint: `${spBaseUrl}/piece/${piece.pieceCid}`, + piece_fetch_status: pieceFetchStatus, + http_response_code: finalPieceResult.statusCode > 0 ? finalPieceResult.statusCode : null, + first_byte_ms: finalPieceResult.ttfbMs > 0 ? finalPieceResult.ttfbMs : null, + last_byte_ms: finalPieceResult.latencyMs > 0 ? finalPieceResult.latencyMs : null, + bytes_retrieved: finalPieceResult.bytesReceived > 0 ? 
finalPieceResult.bytesReceived : null, + throughput_bps: finalPieceResult.throughputBps > 0 ? Math.round(finalPieceResult.throughputBps) : null, + commp_valid: finalPieceResult.success ? finalPieceResult.commPValid : null, + car_parseable: carResult ? carResult.carParseable : null, + car_block_count: carResult?.carParseable ? carResult?.blockCount : null, + block_fetch_endpoint: carResult?.blockFetchEndpoint ?? null, + block_fetch_valid: carResult ? carResult.blockFetchValid : null, + block_fetch_sampled_count: carResult?.carParseable ? carResult?.sampledCidCount : null, + block_fetch_failed_count: carResult?.blockFetchFailedCount ?? null, + ipni_status: ipniStatus, + ipni_verify_ms: carResult?.ipniVerifyMs ?? null, + ipni_verified_cids_count: carResult?.ipniVerifiedCidsCount ?? null, + ipni_unverified_cids_count: carResult?.ipniUnverifiedCidsCount ?? null, + error_message: finalPieceResult.errorMessage ?? null, + }); + } catch (error) { + // ClickhouseService.insert is buffered/non-throwing in normal operation, but + // guard against unexpected runtime errors so we don't break the probe cycle. + this.logger.warn({ ...logContext, - event: "anon_retrieval_clickhouse_disabled", - message: "ClickHouse disabled — anon retrieval row not emitted", + event: "anon_retrieval_clickhouse_insert_failed", + message: "Failed to enqueue anonymous retrieval row to ClickHouse", pieceCid: piece.pieceCid, spAddress, + error: toStructuredError(error), }); } @@ -226,9 +223,9 @@ export class AnonRetrievalService { } } -function ipniStatusFromResult(result: CarValidationResult): "valid" | "invalid" | "skipped" { - if (result.ipniValid === null) return "skipped"; - return result.ipniValid ? "valid" : "invalid"; +function ipniStatusFromResult(result: CarValidationResult): IpniCheckStatus { + if (result.ipniValid === null) return IpniCheckStatus.SKIPPED; + return result.ipniValid ? 
IpniCheckStatus.VALID : IpniCheckStatus.INVALID; } function buildAbortedPlaceholder(pieceCid: string, reason: unknown): PieceRetrievalResult { From ab3748a047415581dbc1aa2ed09651ff4f11d80e Mon Sep 17 00:00:00 2001 From: Dennis Trautwein Date: Mon, 4 May 2026 09:00:59 +0200 Subject: [PATCH 13/28] remove(retrieval-anon): dedup window logic --- .../anon-piece-selector.service.spec.ts | 20 ------------ .../anon-piece-selector.service.ts | 31 ------------------- 2 files changed, 51 deletions(-) diff --git a/apps/backend/src/retrieval-anon/anon-piece-selector.service.spec.ts b/apps/backend/src/retrieval-anon/anon-piece-selector.service.spec.ts index 32d13719..30a04486 100644 --- a/apps/backend/src/retrieval-anon/anon-piece-selector.service.spec.ts +++ b/apps/backend/src/retrieval-anon/anon-piece-selector.service.spec.ts @@ -100,26 +100,6 @@ describe("AnonPieceSelectorService", () => { expect(result?.pieceCid).toBe(liveCid); }); - it("redraws when the first sampled piece was recently selected by this process", async () => { - const staleCid = "baga-stale"; - const freshCid = "baga-fresh"; - - const service = new AnonPieceSelectorService(subgraphService, makeConfigService()); - - // Prime the in-memory ring buffer by first selecting `staleCid`. - sampleAnonPiece.mockResolvedValueOnce(makePiece({ pieceCid: staleCid })); - const first = await service.selectPieceForProvider(SP_ADDRESS); - expect(first?.pieceCid).toBe(staleCid); - - // Now the second selection should skip `staleCid` and use `freshCid`. - sampleAnonPiece - .mockResolvedValueOnce(makePiece({ pieceCid: staleCid })) - .mockResolvedValueOnce(makePiece({ pieceCid: freshCid })); - const second = await service.selectPieceForProvider(SP_ADDRESS); - - expect(second?.pieceCid).toBe(freshCid); - }); - it("falls back to the opposite pool when the preferred one is empty", async () => { // First pool call returns nothing twice (both attempts), second pool succeeds. 
const fresh = makePiece({ pieceCid: "baga-other-pool" }); diff --git a/apps/backend/src/retrieval-anon/anon-piece-selector.service.ts b/apps/backend/src/retrieval-anon/anon-piece-selector.service.ts index 342a4780..0ee51fc7 100644 --- a/apps/backend/src/retrieval-anon/anon-piece-selector.service.ts +++ b/apps/backend/src/retrieval-anon/anon-piece-selector.service.ts @@ -7,11 +7,6 @@ import { SubgraphService } from "../subgraph/subgraph.service.js"; import type { AnonCandidatePiece } from "../subgraph/types.js"; import type { AnonPiece } from "./types.js"; -/** - * Number of most-recently-tested piece CIDs to exclude from re-selection. - */ -const RECENT_DEDUP_WINDOW = 500; - /** * Piece size buckets, in raw (unpadded) bytes. Weighted sampling across * these buckets keeps tests meaningful for bandwidth measurement without @@ -47,10 +42,6 @@ const IPFS_INDEXED_SAMPLE_RATE = 0.8; export class AnonPieceSelectorService { private readonly logger = new Logger(AnonPieceSelectorService.name); - /** Bounded FIFO of recently-selected piece CIDs. Process-local; lost on restart. */ - private readonly recentlyTested = new Set(); - private readonly recentlyTestedQueue: string[] = []; - constructor( private readonly subgraphService: SubgraphService, private readonly configService: ConfigService, @@ -91,7 +82,6 @@ export class AnonPieceSelectorService { }); if (piece) { - this.rememberRecent(piece.pieceCid); this.logger.log({ event: "anon_piece_selected", message: "Selected anonymous piece for retrieval test", @@ -158,10 +148,6 @@ export class AnonPieceSelectorService { continue; } - if (this.recentlyTested.has(piece.pieceCid)) { - continue; - } - return piece; } @@ -179,23 +165,6 @@ export class AnonPieceSelectorService { } return "medium"; } - - /** Push a CID into the bounded FIFO; evict the oldest when at capacity. 
*/ - private rememberRecent(pieceCid: string): void { - if (this.recentlyTested.has(pieceCid)) { - return; - } - - this.recentlyTested.add(pieceCid); - this.recentlyTestedQueue.push(pieceCid); - - while (this.recentlyTestedQueue.length > RECENT_DEDUP_WINDOW) { - const evicted = this.recentlyTestedQueue.shift(); - if (evicted !== undefined) { - this.recentlyTested.delete(evicted); - } - } - } } /** Uniform-random 32-byte sort key as `0x`-prefixed hex. */ From beffac7be083ae84e56bfba5818a48257d4b4922 Mon Sep 17 00:00:00 2001 From: Dennis Trautwein Date: Mon, 4 May 2026 09:09:36 +0200 Subject: [PATCH 14/28] revert(ipni): sequential block CID verification Context: https://github.com/filecoin-project/filecoin-pin/issues/417 --- .../src/clickhouse/clickhouse.schema.ts | 4 +- .../src/ipni/ipni-verification.service.ts | 122 +++++++----------- .../anon-retrieval.service.spec.ts | 16 +-- .../retrieval-anon/anon-retrieval.service.ts | 2 - .../retrieval-anon/car-validation.service.ts | 12 +- apps/backend/src/retrieval-anon/types.ts | 2 - 6 files changed, 53 insertions(+), 105 deletions(-) diff --git a/apps/backend/src/clickhouse/clickhouse.schema.ts b/apps/backend/src/clickhouse/clickhouse.schema.ts index 05684154..e8612056 100644 --- a/apps/backend/src/clickhouse/clickhouse.schema.ts +++ b/apps/backend/src/clickhouse/clickhouse.schema.ts @@ -97,10 +97,8 @@ export function buildMigrations(database: string): string[] { block_fetch_sampled_count Nullable(UInt32), -- number of blocks sampled and probed via /ipfs/?format=raw block_fetch_failed_count Nullable(UInt32), -- number of sampled blocks that failed (non-2xx, hash mismatch, unsupported codec, or transport error) - ipni_status LowCardinality(String), -- 'valid' | 'invalid' | 'skipped' | 'error' + ipni_status LowCardinality(String), -- 'valid' | 'invalid' | 'skipped' | 'error' — all-or-nothing across the root CID and the sampled child CIDs (filecoin-pin verifies them as a single batch) ipni_verify_ms Nullable(Float64), -- 
IPNI verification duration; null when skipped - ipni_verified_cids_count Nullable(UInt32), -- CIDs confirmed findable via IPNI - ipni_unverified_cids_count Nullable(UInt32), -- CIDs checked but not findable error_message Nullable(String) -- failure reason; null on success ) ENGINE MergeTree() diff --git a/apps/backend/src/ipni/ipni-verification.service.ts b/apps/backend/src/ipni/ipni-verification.service.ts index 51fcc8e0..3d7d52f9 100644 --- a/apps/backend/src/ipni/ipni-verification.service.ts +++ b/apps/backend/src/ipni/ipni-verification.service.ts @@ -3,7 +3,7 @@ import { PDPProvider } from "filecoin-pin"; import { waitForIpniProviderResults } from "filecoin-pin/core/utils"; import { CID } from "multiformats/cid"; import type { StorageProvider } from "../database/entities/storage-provider.entity.js"; -import type { FailedCID, IPNIVerificationResult } from "../deal-addons/strategies/ipni.types.js"; +import type { IPNIVerificationResult } from "../deal-addons/strategies/ipni.types.js"; export type IpniVerificationInput = { rootCid: CID; @@ -44,6 +44,7 @@ export class IpniVerificationService { const expectedProviders = [this.buildExpectedProviderInfo(storageProvider as StorageProviderWithUrl)]; const timeoutSignal = AbortSignal.timeout(timeoutMs); const verificationSignal = signal ? 
AbortSignal.any([signal, timeoutSignal]) : timeoutSignal; + let failureReason = "IPNI did not return expected provider results via filecoin-pin"; this.logger.log({ event: "ipni_verification_started", @@ -60,69 +61,56 @@ export class IpniVerificationService { }); const ipniVerificationStartTime = Date.now(); - const cidsToValidate: { cid: CID; isRoot: boolean }[] = [ - { cid: rootCid, isRoot: true }, - ...blockCids.map((cid) => ({ cid, isRoot: false })), - ]; - let verified = 0; - const failedCIDs: FailedCID[] = []; - let rootCIDVerified = false; - - // waitForIpniProviderResults is all-or-nothing per call (throws on first failure), - // so we invoke it once per CID to get accurate per-CID verified/unverified counts. - // The shared verificationSignal bounds total wall-clock time across all CIDs. - for (const { cid, isRoot } of cidsToValidate) { + const ipniValidated = await waitForIpniProviderResults(rootCid, { + childBlocks: blockCids, + maxAttempts, + delayMs, + expectedProviders, + signal: verificationSignal, + }).catch((error) => { if (signal?.aborted) { signal.throwIfAborted(); } - if (verificationSignal.aborted) { - failedCIDs.push({ cid: cid.toString(), reason: `IPNI verification timed out after ${timeoutMs}ms` }); - continue; - } - - try { - await waitForIpniProviderResults(cid, { - maxAttempts, - delayMs, - expectedProviders, - signal: verificationSignal, - }); - verified += 1; - if (isRoot) rootCIDVerified = true; - } catch (error) { - if (signal?.aborted) { - signal.throwIfAborted(); - } - - const reason = verificationSignal.aborted - ? `IPNI verification timed out after ${timeoutMs}ms` - : error instanceof Error - ? 
error.message - : String(error); - - failedCIDs.push({ cid: cid.toString(), reason }); - - this.logger.warn({ - event: "ipni_cid_verification_failed", - message: "IPNI verification failed for CID", - cid: cid.toString(), - isRoot, + failureReason = `IPNI verification timed out after ${timeoutMs}ms`; + this.logger.error({ + event: "ipni_verification_timed_out", + message: failureReason, + rootCID: rootCid.toString(), providerAddress: storageProvider.address, providerId: storageProvider.providerId, providerName: storageProvider.name, serviceUrl: storageProvider.serviceUrl, - failureReason: reason, + blockCIDCount: blockCids.length, + timeoutMs, + pollIntervalMs: delayMs, + maxAttempts, }); + return false; } - } + const errorMessage = error instanceof Error ? error.message : String(error); + failureReason = errorMessage; + this.logger.error({ + event: "ipni_verification_failed", + message: "IPNI verification failed", + rootCID: rootCid.toString(), + providerAddress: storageProvider.address, + providerId: storageProvider.providerId, + providerName: storageProvider.name, + serviceUrl: storageProvider.serviceUrl, + blockCIDCount: blockCids.length, + timeoutMs, + pollIntervalMs: delayMs, + maxAttempts, + failureReason, + }); + return false; + }); const ipniVerificationDurationMs = Date.now() - ipniVerificationStartTime; - const total = cidsToValidate.length; - const unverified = total - verified; - if (verified === total) { + if (ipniValidated) { this.logger.log({ event: "ipni_verification_succeeded", message: "IPNI verification succeeded", @@ -133,32 +121,22 @@ export class IpniVerificationService { verifyDurationMs: ipniVerificationDurationMs, blockCIDCount: blockCids.length, }); - } else { - this.logger.error({ - event: verificationSignal.aborted ? 
"ipni_verification_timed_out" : "ipni_verification_failed", - message: "IPNI verification did not fully succeed", - rootCID: rootCid.toString(), - providerAddress: storageProvider.address, - providerId: storageProvider.providerId, - providerName: storageProvider.name, - serviceUrl: storageProvider.serviceUrl, - blockCIDCount: blockCids.length, - timeoutMs, - pollIntervalMs: delayMs, - maxAttempts, - verified, - unverified, - total, - }); } return { - verified: verified, - unverified: unverified, - total: total, - rootCIDVerified: rootCIDVerified, + verified: ipniValidated ? 1 : 0, + unverified: ipniValidated ? 0 : 1, + total: 1, + rootCIDVerified: ipniValidated, durationMs: ipniVerificationDurationMs, - failedCIDs: failedCIDs, + failedCIDs: ipniValidated + ? [] + : [ + { + cid: rootCid.toString(), + reason: failureReason, + }, + ], verifiedAt: new Date().toISOString(), }; } diff --git a/apps/backend/src/retrieval-anon/anon-retrieval.service.spec.ts b/apps/backend/src/retrieval-anon/anon-retrieval.service.spec.ts index c82eed76..adc75920 100644 --- a/apps/backend/src/retrieval-anon/anon-retrieval.service.spec.ts +++ b/apps/backend/src/retrieval-anon/anon-retrieval.service.spec.ts @@ -162,8 +162,6 @@ describe("AnonRetrievalService", () => { expect(row.block_fetch_failed_count).toBeNull(); expect(row.ipni_status).toBe("skipped"); expect(row.ipni_verify_ms).toBeNull(); - expect(row.ipni_verified_cids_count).toBeNull(); - expect(row.ipni_unverified_cids_count).toBeNull(); }); it("still emits a row when the signal aborts before fetchPiece runs", async () => { @@ -250,8 +248,6 @@ describe("AnonRetrievalService", () => { sampledCidCount: 5, ipniValid: true, ipniVerifyMs: 137, - ipniVerifiedCidsCount: 6, - ipniUnverifiedCidsCount: 0, blockFetchValid: true, blockFetchFailedCount: 0, blockFetchEndpoint: "https://sp.test/ipfs/", @@ -277,19 +273,15 @@ describe("AnonRetrievalService", () => { expect(row.block_fetch_failed_count).toBe(0); expect(row.ipni_status).toBe("valid"); 
expect(row.ipni_verify_ms).toBe(137); - expect(row.ipni_verified_cids_count).toBe(6); - expect(row.ipni_unverified_cids_count).toBe(0); }); - it("distinguishes IPNI invalid from block-fetch failures with explicit counts", async () => { + it("distinguishes IPNI invalid from block-fetch failures", async () => { const carResult: CarValidationResult = { carParseable: true, blockCount: 100, sampledCidCount: 5, ipniValid: false, ipniVerifyMs: 250, - ipniVerifiedCidsCount: 0, - ipniUnverifiedCidsCount: 6, blockFetchValid: false, blockFetchFailedCount: 2, blockFetchEndpoint: "https://sp.test/ipfs/", @@ -309,8 +301,6 @@ describe("AnonRetrievalService", () => { expect(row.piece_fetch_status).toBe(RetrievalStatus.SUCCESS); expect(row.car_parseable).toBe(true); expect(row.ipni_status).toBe("invalid"); - expect(row.ipni_verified_cids_count).toBe(0); - expect(row.ipni_unverified_cids_count).toBe(6); expect(row.block_fetch_valid).toBe(false); expect(row.block_fetch_sampled_count).toBe(5); expect(row.block_fetch_failed_count).toBe(2); @@ -347,8 +337,6 @@ describe("AnonRetrievalService", () => { sampledCidCount: 0, ipniValid: null, ipniVerifyMs: null, - ipniVerifiedCidsCount: null, - ipniUnverifiedCidsCount: null, blockFetchValid: null, blockFetchFailedCount: null, blockFetchEndpoint: null, @@ -373,8 +361,6 @@ describe("AnonRetrievalService", () => { expect(row.block_fetch_failed_count).toBeNull(); expect(row.ipni_status).toBe("skipped"); expect(row.ipni_verify_ms).toBeNull(); - expect(row.ipni_verified_cids_count).toBeNull(); - expect(row.ipni_unverified_cids_count).toBeNull(); }); }); }); diff --git a/apps/backend/src/retrieval-anon/anon-retrieval.service.ts b/apps/backend/src/retrieval-anon/anon-retrieval.service.ts index c1d08c0e..25b34e82 100644 --- a/apps/backend/src/retrieval-anon/anon-retrieval.service.ts +++ b/apps/backend/src/retrieval-anon/anon-retrieval.service.ts @@ -186,8 +186,6 @@ export class AnonRetrievalService { block_fetch_failed_count: 
carResult?.blockFetchFailedCount ?? null, ipni_status: ipniStatus, ipni_verify_ms: carResult?.ipniVerifyMs ?? null, - ipni_verified_cids_count: carResult?.ipniVerifiedCidsCount ?? null, - ipni_unverified_cids_count: carResult?.ipniUnverifiedCidsCount ?? null, error_message: finalPieceResult.errorMessage ?? null, }); } catch (error) { diff --git a/apps/backend/src/retrieval-anon/car-validation.service.ts b/apps/backend/src/retrieval-anon/car-validation.service.ts index 27ec2744..c3a6c717 100644 --- a/apps/backend/src/retrieval-anon/car-validation.service.ts +++ b/apps/backend/src/retrieval-anon/car-validation.service.ts @@ -63,8 +63,6 @@ export class CarValidationService { sampledCidCount: 0, ipniValid: null, ipniVerifyMs: null, - ipniVerifiedCidsCount: null, - ipniUnverifiedCidsCount: null, blockFetchValid: null, blockFetchFailedCount: null, blockFetchEndpoint: null, @@ -77,8 +75,6 @@ export class CarValidationService { sampledCidCount: 0, ipniValid: null, ipniVerifyMs: null, - ipniVerifiedCidsCount: null, - ipniUnverifiedCidsCount: null, blockFetchValid: null, blockFetchFailedCount: null, blockFetchEndpoint: null, @@ -99,8 +95,6 @@ export class CarValidationService { sampledCidCount: sampledBlocks.length, ipniValid: ipni.valid, ipniVerifyMs: ipni.durationMs, - ipniVerifiedCidsCount: ipni.verifiedCount, - ipniUnverifiedCidsCount: ipni.unverifiedCount, blockFetchValid: blockFetchResult.valid, blockFetchFailedCount: blockFetchResult.failedCount, blockFetchEndpoint: blockFetchResult.endpoint, @@ -129,8 +123,6 @@ export class CarValidationService { ): Promise<{ valid: boolean; durationMs: number | null; - verifiedCount: number | null; - unverifiedCount: number | null; }> { const timeouts = this.configService.get("timeouts", { infer: true }); let rootCid: CID; @@ -144,7 +136,7 @@ export class CarValidationService { providerAddress: provider.address, error: toStructuredError(error), }); - return { valid: false, durationMs: null, verifiedCount: null, unverifiedCount: null 
}; + return { valid: false, durationMs: null }; } const result = await this.ipniVerificationService.verify({ @@ -159,8 +151,6 @@ export class CarValidationService { return { valid: result.rootCIDVerified, durationMs: result.durationMs, - verifiedCount: result.verified, - unverifiedCount: result.unverified, }; } diff --git a/apps/backend/src/retrieval-anon/types.ts b/apps/backend/src/retrieval-anon/types.ts index 3ba2b9f9..9013a5ea 100644 --- a/apps/backend/src/retrieval-anon/types.ts +++ b/apps/backend/src/retrieval-anon/types.ts @@ -31,8 +31,6 @@ export type CarValidationResult = { sampledCidCount: number; ipniValid: boolean | null; ipniVerifyMs: number | null; - ipniVerifiedCidsCount: number | null; - ipniUnverifiedCidsCount: number | null; blockFetchValid: boolean | null; blockFetchFailedCount: number | null; blockFetchEndpoint: string | null; From f26744b8dfc661e779fae511cdc56a9985942e2c Mon Sep 17 00:00:00 2001 From: Dennis Trautwein Date: Mon, 4 May 2026 11:18:50 +0200 Subject: [PATCH 15/28] docs(retrieval-anon): flow description and metrics definitions --- .../anon-piece-selector.service.ts | 6 +- docs/checks/README.md | 3 +- docs/checks/anon-retrievals.md | 145 ++++++++++++++++++ docs/checks/events-and-metrics.md | 20 +++ 4 files changed, 172 insertions(+), 2 deletions(-) create mode 100644 docs/checks/anon-retrievals.md diff --git a/apps/backend/src/retrieval-anon/anon-piece-selector.service.ts b/apps/backend/src/retrieval-anon/anon-piece-selector.service.ts index 0ee51fc7..d354a222 100644 --- a/apps/backend/src/retrieval-anon/anon-piece-selector.service.ts +++ b/apps/backend/src/retrieval-anon/anon-piece-selector.service.ts @@ -18,13 +18,15 @@ type SizeRange = { min: bigint; max: bigint }; const MIB = 1024n * 1024n; // All downloads are buffered in-memory, so we need to keep piece sizes reasonable +// When changing these values, also update ./docs/checks/anon-retrievals.md#piece-selection const SIZE_BUCKETS: Record = { small: { min: 1n * MIB, max: 20n * 
MIB - 1n }, medium: { min: 20n * MIB, max: 100n * MIB - 1n }, large: { min: 100n * MIB, max: 500n * MIB - 1n }, }; -/** Weights for choosing a bucket per selection. Must sum to 1. */ +// Weights for choosing a bucket per selection. Must sum to 1. +// When changing these values, also update ./docs/checks/anon-retrievals.md#piece-selection const BUCKET_WEIGHTS: Record = { small: 0.2, medium: 0.5, @@ -35,6 +37,8 @@ const BUCKET_WEIGHTS: Record = { * Probability the primary draw targets the withIPFSIndexing pool. * The rest of the time we sample across all FWSS pieces, so SPs can't * optimise only their CAR corpus. + * + * When changing this value, also update ./docs/checks/anon-retrievals.md#piece-selection */ const IPFS_INDEXED_SAMPLE_RATE = 0.8; diff --git a/docs/checks/README.md b/docs/checks/README.md index 74b1a872..136349ee 100644 --- a/docs/checks/README.md +++ b/docs/checks/README.md @@ -4,6 +4,7 @@ The files are: - [production-configuration-and-approval-methodology.md](./production-configuration-and-approval-methodology.md): Defines the production configuration and approval methodology. - [data-storage.md](./data-storage.md): Defines the "data storage check" and how it is calculated. - [retrievals.md](./retrievals.md): Defines the "retrieval check" and how it is calculated. +- [anon-retrievals.md](./anon-retrievals.md): Defines the "anonymous retrieval check" (sampled public pieces, not dealbot-uploaded) and how it is calculated. - [data-retention.md](./data-retention.md): Defines the "data retention check" and how it is calculated. - [events-and-metrics.md](./events-and-metrics.md): Defines the events and metrics that are used to assess SP performance. @@ -14,7 +15,7 @@ DealBot creates synthetic traffic for SPs in the onchain SP registry and monitor ## Terminology ### Check -A "check" refers to a task type that dealbot performs on a SP. We currently have [Data Storage](./data-storage.md) and [Retrieval](./retrievals.md) checks. 
+A "check" refers to a task type that dealbot performs on an SP. We currently have [Data Storage](./data-storage.md), [Retrieval](./retrievals.md), [Anonymous Retrieval](./anon-retrievals.md), and [Data Retention](./data-retention.md) checks. ### Deal This is synonym for "Data Storage Check". This is covered in the [data-storage.md](./data-storage.md). diff --git a/docs/checks/anon-retrievals.md b/docs/checks/anon-retrievals.md new file mode 100644 index 00000000..0a303462 --- /dev/null +++ b/docs/checks/anon-retrievals.md @@ -0,0 +1,145 @@ +# Anonymous Retrieval Check + +This document is the **source of truth** for how dealbot's Anonymous Retrieval check works. + +Source code links throughout this document point to the current implementation. + +For event and metric definitions to be used by the dashboard, see [Dealbot Events & Metrics](./events-and-metrics.md). + +## Overview + +The Anonymous Retrieval check (sometimes referred to internally as [retrieval++](https://github.com/FilOzone/dealbot/pull/427)) tests publicly discoverable pieces on a storage provider (pieces that were *not* uploaded by dealbot). The intent is to measure SP retrievability against real-world tenant data, not just dealbot's own corpus. + +This is distinct from the [Retrieval check](./retrievals.md), which exercises pieces dealbot itself uploaded as part of a [Data Storage check](./data-storage.md). The Anonymous Retrieval check answers a different question: does the SP serve arbitrary pieces from its broader public corpus, with the same correctness and performance properties as dealbot's controlled pieces? + +### Definition of Successful Retrieval + +A successful anonymous retrieval requires: + +1. **Piece fetch** — `GET {spBaseUrl}/piece/{pieceCid}` returns HTTP 2xx and the response bytes hash to the declared CommP (piece CID). + +If the piece advertises IPFS indexing (`withIPFSIndexing = true` and a non-null `ipfsRootCid`), three additional dimensions are validated *independently*. 
Importantly, they do not gate the overall `piece_fetch_status`, and each is recorded as its own outcome column / metric: + +2. **CAR parseable:** the fetched bytes parse as a CAR file. +3. **IPNI:** the SP is advertised as a provider for the root CID and a sample of child CIDs via filecoinpin.contact. +4. **Block fetch:** a sample of CIDs from the parsed CAR is re-fetched via `{spBaseUrl}/ipfs/{cid}?format=raw` and each response is hash-verified against its declared CID. + +A piece without IPFS indexing is exercised only at step (1). + +Operational timeouts exist to prevent jobs from running indefinitely. If the job exceeds `ANON_RETRIEVAL_JOB_TIMEOUT_SECONDS`, it is aborted; a row is still emitted so that partial metrics (TTFB, bytes, response code) are not lost. + +## Piece Selection + +Unlike the [Retrieval check](./retrievals.md#piece-selection), dealbot does not retrieve from its own deals. Pieces are sampled from the on-chain subgraph of all FWSS-served pieces for the SP under test. + +Selection strategy (per scheduled job, per SP): + +1. **Pick a size bucket** by weighted random: + - `small` (1–20 MiB) — 20% + - `medium` (20–100 MiB) — 50% + - `large` (100–500 MiB) — 30% +2. **Pick a pool**: + - `indexed` (IPFS-indexed pieces) — 80% + - `any` (all FWSS pieces) — 20% +3. **Generate a uniform-random `sampleKey`** and query the subgraph for the smallest `Root.sampleKey ≥ $sampleKey` matching the SP, payer, size range, and pool filters. +4. **Drop the candidate** if `pdpPaymentEndEpoch` has passed. +5. **Fall back** through: (same bucket, opposite pool) → (any bucket, indexed) → (any bucket, any). + +The 80/20 split for `indexed` vs `any` exists so that SPs cannot optimize only their CAR corpus and still appear healthy on this check. + +> [!NOTE] +> The bucket sizes were chosen such that the whole file will still fit into memory. In the future we may implement a streaming verification and parsing. 
+ +Source: [`anon-piece-selector.service.ts`](../../apps/backend/src/retrieval-anon/anon-piece-selector.service.ts) + +## What Happens Each Cycle + +```mermaid +flowchart TD + Select["Sample anonymous piece for SP from subgraph"] --> Fetch["GET /piece/{pieceCid}"] + Fetch --> CommP["Hash bytes → verify CommP"] + CommP --> HasIpfs{"piece.withIPFSIndexing
and ipfsRootCid?"} + HasIpfs -- "no" --> Record["Persist row + metrics"] + HasIpfs -- "yes" --> ParseCar["Parse bytes as CAR"] + ParseCar --> SampleBlocks["Pick N random CIDs
(ANON_RETRIEVAL_BLOCK_SAMPLE_COUNT)"] + SampleBlocks --> Ipni["IPNI: verify SP advertises root + sampled CIDs"] + SampleBlocks --> BlockFetch["GET /ipfs/{cid}?format=raw for each sampled CID"] + BlockFetch --> HashCheck["Hash-verify each response against its CID"] + Ipni --> Record + HashCheck --> Record +``` + +### Piece Fetch + +- **URL:** `{spBaseUrl}/piece/{pieceCid}` (HTTP/2) +- **Buffered in memory** — piece sizes are capped at 500 MiB by selection. +- **Validates CommP** — the CommP of the response bytes must match `pieceCid`. + +Source: [`piece-retrieval.service.ts`](../../apps/backend/src/retrieval-anon/piece-retrieval.service.ts) + +### CAR Validation (only when piece advertises IPFS indexing) + +When the selected piece has `withIPFSIndexing = true` and a non-null `ipfsRootCid`, the fetched bytes are parsed as a CAR and a random sample of `ANON_RETRIEVAL_BLOCK_SAMPLE_COUNT` CIDs is exercised: + +- **IPNI check:** `IpniVerificationService.verify(rootCid, sampledCids, sp)` polls filecoinpin.contact until each CID resolves to the SP under test, the timeout fires, or `IPNI_VERIFICATION_TIMEOUT_MS` is reached. +- **Block fetch check:** for each sampled CID, fetch `{spBaseUrl}/ipfs/{cid}?format=raw` and hash-verify the response against the CID. Non-2xx, hash mismatch, unsupported codec, or transport errors all count as a single failed block. + +Source: [`car-validation.service.ts`](../../apps/backend/src/retrieval-anon/car-validation.service.ts) + +## What Gets Asserted + +| # | Assertion | How It's Checked | Retries | Relevant Metric | Implemented? 
| +|---|-----------|------------------|:---:|------------------|:---:| +| 1 | SP serves the piece | `GET /piece/{pieceCid}` returns HTTP 2xx | 0 | [`anonPieceRetrievalLastByteMs`](./events-and-metrics.md#anonPieceRetrievalLastByteMs) | Yes | +| 2 | Bytes match the declared CommP | Hash of response bytes equals `pieceCid` | 0 | [`anonRetrievalStatus`](./events-and-metrics.md#anonRetrievalStatus) | Yes | +| 3 | Bytes parse as a CAR (IPFS-indexed pieces only) | `@ipld/car` parses the response | 0 | [`anonCarParseStatus`](./events-and-metrics.md#anonCarParseStatus) | Yes | +| 4 | SP is advertised on IPNI for root + sampled CIDs | filecoinpin.contact returns provider records | polling until timeout | [`anonIpniStatus`](./events-and-metrics.md#anonIpniStatus) | Yes | +| 5 | Sampled blocks fetch + hash-verify | `/ipfs/{cid}?format=raw` for each sample | 0 | [`anonBlockFetchStatus`](./events-and-metrics.md#anonBlockFetchStatus) | Yes | + +## Result Recording + +Each anonymous retrieval attempt writes one row to the `anon_retrieval_checks` ClickHouse table. The row is emitted **even on abort or unexpected error** so that the partial evidence (TTFB, bytes, response code) is preserved. + +The DDL and column-level comments in [`clickhouse.schema.ts`](../../apps/backend/src/clickhouse/clickhouse.schema.ts) are authoritative. The summary below is for orientation. 
+ +| Column | Meaning | +|--------|---------| +| `timestamp` | When the check started (ms UTC) | +| `probe_location` | Dealbot probe location (`DEALBOT_PROBE_LOCATION`) | +| `sp_address`, `sp_id`, `sp_name` | SP identity | +| `retrieval_id` | Per-event UUID; correlates row to logs and Prometheus | +| `piece_cid`, `data_set_id`, `piece_id`, `raw_size` | Sampled piece identity | +| `with_ipfs_indexing`, `ipfs_root_cid` | Whether the piece advertises IPNI metadata | +| `service_type` | Always `direct_sp` today | +| `retrieval_endpoint` | URL probed for piece fetch | +| `piece_fetch_status` | `success` or `failed` — outcome of `/piece/{cid}` (HTTP 2xx **and** CommP match). CAR/IPNI/block-fetch outcomes live in their own columns and do **not** flip this status. | +| `http_response_code` | Raw HTTP status; null on transport failure | +| `first_byte_ms`, `last_byte_ms`, `bytes_retrieved`, `throughput_bps` | Piece-fetch performance | +| `commp_valid` | Null when retrieval failed before CommP could be hashed | +| `car_parseable`, `car_block_count` | Null when CAR validation was skipped (no IPFS indexing or piece fetch failed) | +| `block_fetch_endpoint`, `block_fetch_valid`, `block_fetch_sampled_count`, `block_fetch_failed_count` | Block-fetch outcomes; null when skipped | +| `ipni_status` | `valid` \| `invalid` \| `skipped` \| `error` — all-or-nothing across the root CID and the sampled child CIDs | +| `ipni_verify_ms` | IPNI verification duration; null when the IPNI check was skipped | +| `error_message` | Failure reason; null on success | + +Source: [`anon-retrieval.service.ts`](../../apps/backend/src/retrieval-anon/anon-retrieval.service.ts) + +## Metrics Recorded + +Anonymous-retrieval Prometheus metric definitions live in [Dealbot Events & Metrics](./events-and-metrics.md). All anon-retrieval metrics carry `checkType=anon_retrieval`. 
+ +## Configuration + +Key environment variables that control anonymous retrieval testing: + +| Variable | Description | +|----------|-------------| +| `RETRIEVALS_ANON_PER_SP_PER_HOUR` | Anonymous retrieval rate per SP. Falls back to `RETRIEVALS_PER_SP_PER_HOUR` when unset. | +| `ANON_RETRIEVAL_JOB_TIMEOUT_SECONDS` | Max end-to-end anon retrieval job runtime before forced abort (default 360s). | +| `ANON_RETRIEVAL_BLOCK_SAMPLE_COUNT` | Number of CIDs sampled from the parsed CAR for IPNI + block-fetch verification (default 5, max 50). | +| `IPNI_VERIFICATION_TIMEOUT_MS` | Max time to wait for IPNI provider verification (shared with the Retrieval check). | +| `IPNI_VERIFICATION_POLLING_MS` | Poll interval between IPNI verification attempts (shared). | +| `CONNECT_TIMEOUT_MS` | Connection/header timeout for HTTP requests. | +| `HTTP2_REQUEST_TIMEOUT_MS` | Total timeout for HTTP/2 retrieval requests. | + +See also: [`docs/environment-variables.md`](../environment-variables.md) for the full configuration reference. diff --git a/docs/checks/events-and-metrics.md b/docs/checks/events-and-metrics.md index 6c461f7f..f5d89b23 100644 --- a/docs/checks/events-and-metrics.md +++ b/docs/checks/events-and-metrics.md @@ -4,6 +4,16 @@ This document is the intended **source of truth** for the events emitted by deal > **Note on "events":** the entries in the [Event List](#event-list) are named **timing markers** used to define metric Timer Starts/Ends — they are not all emitted as discrete Prometheus events or log lines. Each marker is anchored in code (as a timestamp variable, log line, or status transition) and used to compute the metrics in the [Metrics](#metrics) section. +## Anonymous Retrieval Event Model + +The [Anonymous Retrieval check](./anon-retrievals.md) is a single-shot flow per piece: select → fetch piece → (optional) parse CAR + IPNI + block fetch → write one ClickHouse row. + +It is not modeled as a sequence of named lifecycle events. 
Instead it emits: + +- **Outcome metrics** when each step completes — see the [time](#time-related-metrics) and [status](#status-count-related-metrics) metric tables for `anonPieceRetrievalFirstByteMs`, `anonRetrievalCheckMs`, `anonRetrievalStatus`, `anonCarParseStatus`, `anonIpniStatus`, `anonBlockFetchStatus`, and friends. +- **One row per attempt** in the `anon_retrieval_checks` [ClickHouse table](#clickhouse-tables), emitted even on abort or unexpected error. +- **Structured log lines** (`anon_retrieval_started`, `anon_retrieval_completed`, `anon_retrieval_no_piece`, `anon_retrieval_car_validation_failed`, `anon_retrieval_clickhouse_insert_failed`) carrying a `retrievalId` so each row can be joined back to log evidence. + ## Data Storage Event Model Below are the sequence of events for a [Data Storage check](./data-storage.md). The Data Storage flow is used because it encapsulates a [Retrieval check](./retrievals.md) as well. @@ -87,6 +97,10 @@ sequenceDiagram | `dataStorageCheckMs` | Data Storage | [`uploadToSpStart`](#uploadToSpStart) | [`ipfsRetrievalIntegrityChecked`](#ipfsRetrievalIntegrityChecked) | Duration of a Data Storage check | | | `retrievalCheckMs` | Retrieval | Retrieval check start | [`ipfsRetrievalIntegrityChecked`](#ipfsRetrievalIntegrityChecked) | Duration of a Retrieval check | | | `dataSetCreationMs` | Data-Set Creation | Data-set creation uploadToSpStart | Data-set creation pieceConfirmed | Duration of one data-set creation with confirmed piece (all using `createDataSetWithPiece`) | [`deal.service.ts`](../../apps/backend/src/deal/deal.service.ts) | +| `anonPieceRetrievalFirstByteMs` | Anonymous Retrieval | Piece fetch start | First byte received from `/piece/{pieceCid}` | Time to first byte for anonymous piece retrievals | [`anon-retrieval.service.ts`](../../apps/backend/src/retrieval-anon/anon-retrieval.service.ts) | +| `anonPieceRetrievalLastByteMs` | Anonymous Retrieval | Piece fetch start | Last byte received from `/piece/{pieceCid}` | 
Total time to retrieve an anonymous piece | [`anon-retrieval.service.ts`](../../apps/backend/src/retrieval-anon/anon-retrieval.service.ts) | +| `anonPieceRetrievalThroughputBps` | Anonymous Retrieval | n/a | n/a | `(bytesRetrieved / anonPieceRetrievalLastByteMs) * 1000` | [`anon-retrieval.service.ts`](../../apps/backend/src/retrieval-anon/anon-retrieval.service.ts) | +| `anonRetrievalCheckMs` | Anonymous Retrieval | Anon retrieval check start | After CAR/IPNI/block-fetch validation completes (or on abort) | End-to-end anonymous retrieval check duration | [`anon-retrieval.service.ts`](../../apps/backend/src/retrieval-anon/anon-retrieval.service.ts) | ### Status Count Related Metrics @@ -106,6 +120,11 @@ sequenceDiagram | `dataSetCreationStatus` | Data-Set Creation | Not tied to an [event above](#event-list) but rather to data-set creation start (`pending`) and completion (`success`/`failure.*`) | `pending`, `success`, `failure.timedout`, `failure.other` | [`deal.service.ts`](../../apps/backend/src/deal/deal.service.ts) | | `dataSetChallengeStatus` | Data Retention | Emitted on each [Data Retention Check](./data-retention.md) poll when a provider's confirmed proving-period totals advance (strictly positive deltas). Unit: **challenges** (period delta × `CHALLENGES_PER_PROVING_PERIOD = 5`). | `success` (challenges in successfully-proven periods), `failure` (challenges in faulted periods) | [`data-retention.service.ts`](../../apps/backend/src/data-retention/data-retention.service.ts) | | `pdp_provider_estimated_overdue_periods` | Data Retention | Emitted on every [Data Retention Check](./data-retention.md) poll for every successfully processed provider. 
| Gauge value in proving periods (non-negative integer) | [`data-retention.service.ts`](../../apps/backend/src/data-retention/data-retention.service.ts) | +| `anonRetrievalStatus` | Anonymous Retrieval | After piece fetch completes (or on abort) | `success`, `failure.http`, `failure.aborted`, `failure.no_piece` | [`anon-retrieval.service.ts`](../../apps/backend/src/retrieval-anon/anon-retrieval.service.ts) | +| `anonPieceHttpResponseCode` | Anonymous Retrieval | After piece fetch completes | `200`, `500`, `2xxSuccess`, `4xxClientError`, `5xxServerError`, `otherHttpStatusCodes`, `failure` (same classifier as [`ipfsRetrievalHttpResponseCode`](#ipfsRetrievalHttpResponseCode)) | [`anon-retrieval.service.ts`](../../apps/backend/src/retrieval-anon/anon-retrieval.service.ts) | +| `anonCarParseStatus` | Anonymous Retrieval | After CAR validation runs (skipped when piece fetch failed or piece is not IPFS-indexed) | `parseable`, `not_parseable` | [`anon-retrieval.service.ts`](../../apps/backend/src/retrieval-anon/anon-retrieval.service.ts) | +| `anonIpniStatus` | Anonymous Retrieval | After CAR validation runs, **or** when piece fetch failed (records `skipped`) | `valid`, `invalid`, `skipped`, `error` | [`anon-retrieval.service.ts`](../../apps/backend/src/retrieval-anon/anon-retrieval.service.ts) | +| `anonBlockFetchStatus` | Anonymous Retrieval | After block-fetch sampling runs, **or** when piece fetch failed (records `skipped`) | `valid`, `invalid`, `skipped`, `error` | [`anon-retrieval.service.ts`](../../apps/backend/src/retrieval-anon/anon-retrieval.service.ts) | ## ClickHouse Tables @@ -115,6 +134,7 @@ When `CLICKHOUSE_URL` is configured, dealbot writes one row per check result to - **`data_storage_checks`** — one row written each time a deal is saved (on every status transition). Populated by [`deal.service.ts`](../../apps/backend/src/deal/deal.service.ts). - **`retrieval_checks`** — one row per retrieval attempt. 
Populated by [`retrieval.service.ts`](../../apps/backend/src/retrieval/retrieval.service.ts). +- **`anon_retrieval_checks`** — one row per [Anonymous Retrieval check](./anon-retrievals.md) attempt; emitted even on abort or unexpected error. Populated by [`anon-retrieval.service.ts`](../../apps/backend/src/retrieval-anon/anon-retrieval.service.ts). See [Anonymous Retrieval § Result Recording](./anon-retrievals.md#result-recording) for column-level meanings. - **`data_retention_challenges`** — one row per provider per poll cycle. Populated by [`data-retention.service.ts`](../../apps/backend/src/data-retention/data-retention.service.ts). All tables share the primary key `(probe_location, sp_address, timestamp)`: From 5cee3ee85975342302fe8b1e418e8758c723aaf1 Mon Sep 17 00:00:00 2001 From: Dennis Trautwein Date: Mon, 4 May 2026 12:05:14 +0200 Subject: [PATCH 16/28] docs: add missing anonymous retrieval env vars --- docs/environment-variables.md | 40 ++++++++++++++++++++++++++++++++++- 1 file changed, 39 insertions(+), 1 deletion(-) diff --git a/docs/environment-variables.md b/docs/environment-variables.md index 2f25943c..e2b23735 100644 --- a/docs/environment-variables.md +++ b/docs/environment-variables.md @@ -11,7 +11,7 @@ This document provides a comprehensive guide to all environment variables used b | [Blockchain](#blockchain-configuration) | `NETWORK`, `RPC_URL`, `WALLET_ADDRESS`, `WALLET_PRIVATE_KEY`, `SESSION_KEY_PRIVATE_KEY`, `CHECK_DATASET_CREATION_FEES`, `USE_ONLY_APPROVED_PROVIDERS`, `SUBGRAPH_ENDPOINT` | | [Dataset Versioning](#dataset-versioning) | `DEALBOT_DATASET_VERSION` | | [Scheduling](#scheduling-configuration) | `PROVIDERS_REFRESH_INTERVAL_SECONDS`, `DATA_RETENTION_POLL_INTERVAL_SECONDS`, `DEALBOT_MAINTENANCE_WINDOWS_UTC`, `DEALBOT_MAINTENANCE_WINDOW_MINUTES` | -| [Jobs (pg-boss)](#jobs-pg-boss) | `DEALBOT_PGBOSS_SCHEDULER_ENABLED`, `DEALBOT_PGBOSS_POOL_MAX`, `DEALS_PER_SP_PER_HOUR`, `DATASET_CREATIONS_PER_SP_PER_HOUR`, `RETRIEVALS_PER_SP_PER_HOUR`, 
`JOB_SCHEDULER_POLL_SECONDS`, `JOB_WORKER_POLL_SECONDS`, `PG_BOSS_LOCAL_CONCURRENCY`, `JOB_CATCHUP_MAX_ENQUEUE`, `JOB_SCHEDULE_PHASE_SECONDS`, `JOB_ENQUEUE_JITTER_SECONDS`, `DEAL_JOB_TIMEOUT_SECONDS`, `RETRIEVAL_JOB_TIMEOUT_SECONDS`, `ANON_RETRIEVAL_JOB_TIMEOUT_SECONDS`, `IPFS_BLOCK_FETCH_CONCURRENCY` | +| [Jobs (pg-boss)](#jobs-pg-boss) | `DEALBOT_PGBOSS_SCHEDULER_ENABLED`, `DEALBOT_PGBOSS_POOL_MAX`, `DEALS_PER_SP_PER_HOUR`, `DATASET_CREATIONS_PER_SP_PER_HOUR`, `RETRIEVALS_PER_SP_PER_HOUR`, `RETRIEVALS_ANON_PER_SP_PER_HOUR`, `JOB_SCHEDULER_POLL_SECONDS`, `JOB_WORKER_POLL_SECONDS`, `PG_BOSS_LOCAL_CONCURRENCY`, `JOB_CATCHUP_MAX_ENQUEUE`, `JOB_SCHEDULE_PHASE_SECONDS`, `JOB_ENQUEUE_JITTER_SECONDS`, `DEAL_JOB_TIMEOUT_SECONDS`, `RETRIEVAL_JOB_TIMEOUT_SECONDS`, `ANON_RETRIEVAL_JOB_TIMEOUT_SECONDS`, `ANON_RETRIEVAL_BLOCK_SAMPLE_COUNT`, `IPFS_BLOCK_FETCH_CONCURRENCY` | | [Dataset](#dataset-configuration) | `DEALBOT_LOCAL_DATASETS_PATH`, `RANDOM_PIECE_SIZES` | | [ClickHouse](#clickhouse-configuration) | `CLICKHOUSE_URL`, `CLICKHOUSE_BATCH_SIZE`, `CLICKHOUSE_FLUSH_INTERVAL_MS`, `DEALBOT_PROBE_LOCATION` | | [Timeouts](#timeout-configuration) | `CONNECT_TIMEOUT_MS`, `HTTP_REQUEST_TIMEOUT_MS`, `HTTP2_REQUEST_TIMEOUT_MS`, `IPNI_VERIFICATION_TIMEOUT_MS`, `IPNI_VERIFICATION_POLLING_MS` | @@ -622,6 +622,19 @@ rate-based (per hour) and persisted in Postgres so restarts do not reset timing. --- +### `RETRIEVALS_ANON_PER_SP_PER_HOUR` + +- **Type**: `number` +- **Required**: No +- **Default**: Falls back to `RETRIEVALS_PER_SP_PER_HOUR`, which itself defaults to `2` +- **Limits**: `0.001` – `20` + +**Role**: Target [anonymous retrieval](./checks/anon-retrievals.md) check rate per storage provider. Anonymous retrievals fetch arbitrary FWSS pieces sampled from the on-chain subgraph (not pieces dealbot uploaded), so this rate controls coverage of the SP's broader public corpus independently of the dealbot-owned [retrieval check](./checks/retrievals.md) rate. 
+ +**Notes**: Fractional values are supported. For example, `0.5` means one anon retrieval every 2 hours per storage provider. + +--- + ### `DATASET_CREATIONS_PER_SP_PER_HOUR` - **Type**: `number` @@ -806,6 +819,31 @@ Use this to stagger multiple dealbot deployments that are not sharing a database **Note**: This is independent of HTTP-level timeouts (`CONNECT_TIMEOUT_MS`, `HTTP2_REQUEST_TIMEOUT_MS`). The job timeout covers the end-to-end execution of an Anon Retrieval Check (piece selection, download, CommP validation, CAR/IPNI validation). +--- + +### `ANON_RETRIEVAL_BLOCK_SAMPLE_COUNT` + +- **Type**: `number` (integer) +- **Required**: No +- **Default**: `5` +- **Minimum**: `1` +- **Maximum**: `50` +- **Enforced**: Yes (config validation) + +**Role**: Number of CIDs randomly sampled from the parsed CAR for IPNI verification and block-fetch validation during an [anonymous retrieval check](./checks/anon-retrievals.md). Only applies to pieces with IPFS indexing enabled — pieces without an `ipfsRootCid` skip CAR validation entirely. + +For each sampled CID, dealbot: + +1. Confirms via filecoinpin.contact that the SP is advertised as a provider for the CID. +2. Re-fetches the block via `{spBaseUrl}/ipfs/{cid}?format=raw` and hash-verifies the response. + +**When to update**: + +- Increase for stronger statistical confidence that the SP serves the entire DAG correctly (more IPNI queries + per-block fetches per check) +- Decrease to reduce per-check load on the SP and on filecoinpin.contact + +**Note**: A higher sample count multiplies both IPNI traffic and block-fetch traffic per check. The IPNI step is all-or-nothing across the root CID and the sampled child CIDs — see [Anonymous Retrieval § CAR Validation](./checks/anon-retrievals.md#car-validation-only-when-piece-advertises-ipfs-indexing). 
+ --- ### `IPFS_BLOCK_FETCH_CONCURRENCY` From 95a2dff643b032ea02878251a0f9986a9a12f825 Mon Sep 17 00:00:00 2001 From: Dennis Trautwein Date: Mon, 4 May 2026 12:07:11 +0200 Subject: [PATCH 17/28] docs: fix obsolete reference to the pdp-explorer-owned subgraph --- .../production-configuration-and-approval-methodology.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/checks/production-configuration-and-approval-methodology.md b/docs/checks/production-configuration-and-approval-methodology.md index 3d956aa4..6b2859aa 100644 --- a/docs/checks/production-configuration-and-approval-methodology.md +++ b/docs/checks/production-configuration-and-approval-methodology.md @@ -40,8 +40,8 @@ Relevant parameters include: | Parameter | Value | Notes | |-----------|-------|-------| -| [`SUBGRAPH_ENDPOINT`](../environment-variables.md#subgraph_endpoint) | TODO: fill this in | Uses the subgraph from [pdp-explorer](https://github.com/FilOzone/pdp-explorer). | -| [`MIN_NUM_DATASETS_FOR_CHECKS`](../environment-variables.md#dataset-configuration) | 15 | Ensure there are enough datasets with pieces being added so that statistical significance for [Data Retention Fault Rate](#data-retention-fault-rate) can be achieved quicker. Note that on mainnet each dataset incurs 5 challenges[^1] per daily proof[^2]. With this many datasets, an SP can be approved for data retention after a faultless ~7 days even if the SP doesn't have other datasets. | +| [`SUBGRAPH_ENDPOINT`](../environment-variables.md#subgraph_endpoint) | - | Points at a Goldsky deployment of the dealbot-owned subgraph in [`apps/subgraph/`](../../apps/subgraph/) (package `@dealbot/subgraph`). | +| [`MIN_NUM_DATASETS_FOR_CHECKS`](../environment-variables.md#dataset-configuration) | 15 | Ensure there are enough datasets with pieces being added so that statistical significance for [Data Retention Fault Rate](#data-retention-fault-rate) can be achieved quicker. 
Note that on mainnet each dataset incurs 5 challenges[^1] per daily proof[^2]. With this many datasets, an SP can be approved for data retention after a faultless ~7 days even if the SP doesn't have other datasets. | See [How are data retention statistics/thresholds calculated?](#how-are-data-retention-statisticsthresholds-calculated) for more details. From cff31713aa28d97ce4ba41135c6e73d95ca2a17f Mon Sep 17 00:00:00 2001 From: Dennis Trautwein Date: Mon, 4 May 2026 12:30:56 +0200 Subject: [PATCH 18/28] improve: clarity around piece fetch status and commp validation --- apps/backend/src/clickhouse/clickhouse.schema.ts | 2 +- .../backend/src/retrieval-anon/anon-retrieval.service.ts | 9 ++++++++- docs/checks/events-and-metrics.md | 6 +++--- 3 files changed, 12 insertions(+), 5 deletions(-) diff --git a/apps/backend/src/clickhouse/clickhouse.schema.ts b/apps/backend/src/clickhouse/clickhouse.schema.ts index e8612056..b27ba0e2 100644 --- a/apps/backend/src/clickhouse/clickhouse.schema.ts +++ b/apps/backend/src/clickhouse/clickhouse.schema.ts @@ -82,7 +82,7 @@ export function buildMigrations(database: string): string[] { service_type LowCardinality(String), -- 'direct_sp' (only mode for anon retrievals today) retrieval_endpoint String, -- URL probed (e.g. {spBaseUrl}/piece/{pieceCid}) - piece_fetch_status LowCardinality(String), -- 'success' | 'failed' — outcome of GET /piece/ (HTTP 2xx AND CommP match). CAR/IPNI/block-fetch outcomes live in their own columns. + piece_fetch_status LowCardinality(String), -- 'success' | 'failed' — HTTP transport outcome of GET /piece/ (HTTP 2xx). CommP validity, CAR/IPNI/block-fetch outcomes live in their own columns. 
http_response_code Nullable(UInt16), -- raw HTTP status; null on transport failure first_byte_ms Nullable(Float64), -- time to first response byte last_byte_ms Nullable(Float64), -- time to last response byte diff --git a/apps/backend/src/retrieval-anon/anon-retrieval.service.ts b/apps/backend/src/retrieval-anon/anon-retrieval.service.ts index 25b34e82..eddc88f0 100644 --- a/apps/backend/src/retrieval-anon/anon-retrieval.service.ts +++ b/apps/backend/src/retrieval-anon/anon-retrieval.service.ts @@ -136,9 +136,16 @@ export class AnonRetrievalService { // Overall check duration and status this.metrics.observeCheckDuration(labels, Date.now() - checkStart); + const pieceServedCorrectly = pieceResult.success && pieceResult.commPValid; this.metrics.recordStatus( labels, - pieceResult.success ? "success" : pieceResult.aborted ? "failure.aborted" : "failure.http", + pieceServedCorrectly + ? "success" + : pieceResult.aborted + ? "failure.aborted" + : pieceResult.success + ? "failure.commp" + : "failure.http", ); } finally { // Always emit a ClickHouse row — even on abort or unexpected error — so diff --git a/docs/checks/events-and-metrics.md b/docs/checks/events-and-metrics.md index f5d89b23..fba8b003 100644 --- a/docs/checks/events-and-metrics.md +++ b/docs/checks/events-and-metrics.md @@ -118,9 +118,9 @@ sequenceDiagram | `ipfsRetrievalHttpResponseCode` | Data Storage, Retrieval | [`ipfsRetrievalLastByteReceived`](#ipfsRetrievalLastByteReceived) | `200`, `500`, `2xxSuccess`, `4xxClientError`, `5xxServerError`, `otherHttpStatusCodes`, `failure` | [`retrieval.service.ts`](../../apps/backend/src/retrieval/retrieval.service.ts) | | `retrievalStatus` | Data Storage, Retrieval | [`ipfsRetrievalIntegrityChecked`](#ipfsRetrievalIntegrityChecked) | `success`, `failure.timedout`, `failure.other` from [Data Storage Sub-status meanings](./data-storage.md#sub-status-meanings). 
 | |
 | `dataSetCreationStatus` | Data-Set Creation | Not tied to an [event above](#event-list) but rather to data-set creation start (`pending`) and completion (`success`/`failure.*`) | `pending`, `success`, `failure.timedout`, `failure.other` | [`deal.service.ts`](../../apps/backend/src/deal/deal.service.ts) |
-| `dataSetChallengeStatus` | Data Retention | Emitted on each [Data Retention Check](./data-retention.md) poll when a provider's confirmed proving-period totals advance (strictly positive deltas). Unit: **challenges** (period delta × `CHALLENGES_PER_PROVING_PERIOD = 5`). | `success` (challenges in successfully-proven periods), `failure` (challenges in faulted periods) | [`data-retention.service.ts`](../../apps/backend/src/data-retention/data-retention.service.ts) |
-| `pdp_provider_estimated_overdue_periods` | Data Retention | Emitted on every [Data Retention Check](./data-retention.md) poll for every successfully processed provider. | Gauge value in proving periods (non-negative integer) | [`data-retention.service.ts`](../../apps/backend/src/data-retention/data-retention.service.ts) |
-| `anonRetrievalStatus` | Anonymous Retrieval | After piece fetch completes (or on abort) | `success`, `failure.http`, `failure.aborted`, `failure.no_piece` | [`anon-retrieval.service.ts`](../../apps/backend/src/retrieval-anon/anon-retrieval.service.ts) |
+| `dataSetChallengeStatus` | Data Retention | Emitted on each [Data Retention Check](./data-retention.md) poll when a provider's confirmed proving-period totals advance (strictly positive deltas). Unit: **challenges** (period delta × `CHALLENGES_PER_PROVING_PERIOD = 5`). | `success` (challenges in successfully-proven periods), `failure` (challenges in faulted periods) | [`data-retention.service.ts`](../../apps/backend/src/data-retention/data-retention.service.ts) |
+| `pdp_provider_estimated_overdue_periods` | Data Retention | Emitted on every [Data Retention Check](./data-retention.md) poll for every successfully processed provider. | Gauge value in proving periods (non-negative integer) | [`data-retention.service.ts`](../../apps/backend/src/data-retention/data-retention.service.ts) |
+| `anonRetrievalStatus` | Anonymous Retrieval | After piece fetch completes (or on abort) | `success` 
(HTTP 2xx **and** CommP matches), `failure.http`, `failure.commp` (HTTP 2xx but bytes hashed to a different CID), `failure.aborted`, `failure.no_piece`. | [`anon-retrieval.service.ts`](../../apps/backend/src/retrieval-anon/anon-retrieval.service.ts) | | `anonPieceHttpResponseCode` | Anonymous Retrieval | After piece fetch completes | `200`, `500`, `2xxSuccess`, `4xxClientError`, `5xxServerError`, `otherHttpStatusCodes`, `failure` (same classifier as [`ipfsRetrievalHttpResponseCode`](#ipfsRetrievalHttpResponseCode)) | [`anon-retrieval.service.ts`](../../apps/backend/src/retrieval-anon/anon-retrieval.service.ts) | | `anonCarParseStatus` | Anonymous Retrieval | After CAR validation runs (skipped when piece fetch failed or piece is not IPFS-indexed) | `parseable`, `not_parseable` | [`anon-retrieval.service.ts`](../../apps/backend/src/retrieval-anon/anon-retrieval.service.ts) | | `anonIpniStatus` | Anonymous Retrieval | After CAR validation runs, **or** when piece fetch failed (records `skipped`) | `valid`, `invalid`, `skipped`, `error` | [`anon-retrieval.service.ts`](../../apps/backend/src/retrieval-anon/anon-retrieval.service.ts) | From 3c2a69899944ca5d4aa8acfe6d8a95e26e2c454e Mon Sep 17 00:00:00 2001 From: Dennis Trautwein Date: Tue, 5 May 2026 09:04:48 +0200 Subject: [PATCH 19/28] refactor: let two subgraph endpoints coexist --- apps/backend/.env.example | 5 +- apps/backend/README.md | 3 +- apps/backend/src/config/app.config.ts | 13 +- .../data-retention/data-retention.module.ts | 4 +- .../data-retention.service.spec.ts | 184 ++--- .../data-retention/data-retention.service.ts | 16 +- .../src/pdp-subgraph/pdp-subgraph.module.ts | 8 + .../pdp-subgraph/pdp-subgraph.service.spec.ts | 694 ++++++++++++++++++ .../src/pdp-subgraph/pdp-subgraph.service.ts | 306 ++++++++ apps/backend/src/pdp-subgraph/queries.ts | 24 + apps/backend/src/pdp-subgraph/types.spec.ts | 245 +++++++ apps/backend/src/pdp-subgraph/types.ts | 151 ++++ apps/backend/src/subgraph/subgraph.service.ts | 15 + 
.../src/wallet-sdk/wallet-sdk.service.spec.ts | 2 +- docs/checks/data-retention.md | 10 +- ...-configuration-and-approval-methodology.md | 2 +- docs/environment-variables.md | 26 +- .../local/backend-configmap-local.yaml | 1 + 18 files changed, 1595 insertions(+), 114 deletions(-) create mode 100644 apps/backend/src/pdp-subgraph/pdp-subgraph.module.ts create mode 100644 apps/backend/src/pdp-subgraph/pdp-subgraph.service.spec.ts create mode 100644 apps/backend/src/pdp-subgraph/pdp-subgraph.service.ts create mode 100644 apps/backend/src/pdp-subgraph/queries.ts create mode 100644 apps/backend/src/pdp-subgraph/types.spec.ts create mode 100644 apps/backend/src/pdp-subgraph/types.ts diff --git a/apps/backend/.env.example b/apps/backend/.env.example index 26469c52..30556e7a 100644 --- a/apps/backend/.env.example +++ b/apps/backend/.env.example @@ -23,7 +23,10 @@ WALLET_ADDRESS=0x0000000000000000000000000000000000000000 WALLET_PRIVATE_KEY=your_private_key_here CHECK_DATASET_CREATION_FEES=true USE_ONLY_APPROVED_PROVIDERS=true -# Point at the dealbot-owned subgraph on Goldsky (see apps/subgraph/README.md). +# Upstream pdp-explorer subgraph — drives the data-retention / overdue-periods path. +PDP_SUBGRAPH_ENDPOINT=https://api.thegraph.com/subgraphs/filecoin/pdp +# Dealbot-owned subgraph on Goldsky (see apps/subgraph/README.md) — drives only +# the new anonymous-retrieval candidate-piece query for now. SUBGRAPH_ENDPOINT=https://api.goldsky.com/api/public//subgraphs/dealbot-subgraph//gn # Minimum number of datasets per SP (default: 1). When > 1, a separate data_set_creation job provisions extra datasets. diff --git a/apps/backend/README.md b/apps/backend/README.md index 4805080f..e4dafd6e 100644 --- a/apps/backend/README.md +++ b/apps/backend/README.md @@ -104,7 +104,8 @@ All configuration is done via environment variables in `.env`. 
| `CHECK_DATASET_CREATION_FEES` | Check fees before dataset creation | `true` | | `ENABLE_IPNI_TESTING` | IPNI testing mode (`disabled`/`random`/`always`) | `always` | | `USE_ONLY_APPROVED_PROVIDERS` | Only use approved storage providers | `true` | -| `SUBGRAPH_ENDPOINT` | Subgraph GraphQL endpoint for PDP proof-set/data-retention and anon-retrieval queries | `https://api.goldsky.com/api/public//subgraphs/dealbot-subgraph//gn` | +| `PDP_SUBGRAPH_ENDPOINT` | PDP subgraph API endpoint for PDP proof-set/data-retention | `https://api.thegraph.com/subgraphs/filecoin/pdp` | +| `SUBGRAPH_ENDPOINT` | Subgraph GraphQL endpoint for anon-retrieval queries | `https://api.goldsky.com/api/public//subgraphs/dealbot-subgraph//gn` | ### Scheduling Configuration (pg-boss) diff --git a/apps/backend/src/config/app.config.ts b/apps/backend/src/config/app.config.ts index 4e49e4d8..7906be8c 100644 --- a/apps/backend/src/config/app.config.ts +++ b/apps/backend/src/config/app.config.ts @@ -56,6 +56,15 @@ export const configValidationSchema = Joi.object({ USE_ONLY_APPROVED_PROVIDERS: Joi.boolean().default(true), DEALBOT_DATASET_VERSION: Joi.string().optional(), MIN_NUM_DATASETS_FOR_CHECKS: Joi.number().integer().min(1).default(1), + // Two subgraph endpoints coexist intentionally to limit blast radius while we + // migrate off the upstream pdp-explorer subgraph: + // - PDP_SUBGRAPH_ENDPOINT drives the established overdue-periods / data + // retention path against the existing pdp-explorer subgraph. + // - SUBGRAPH_ENDPOINT drives only the new anonymous-retrieval candidate + // piece query against the dealbot-owned subgraph. + // Once the dealbot-owned subgraph has soaked in production we can drop + // PDP_SUBGRAPH_ENDPOINT and route everything through SUBGRAPH_ENDPOINT. 
+ PDP_SUBGRAPH_ENDPOINT: Joi.string().uri().optional().allow(""), SUBGRAPH_ENDPOINT: Joi.string().uri().optional().allow(""), // Scheduling @@ -177,7 +186,8 @@ export interface IBlockchainConfig { useOnlyApprovedProviders: boolean; dealbotDataSetVersion?: string; minNumDataSetsForChecks: number; - subgraphEndpoint?: string; + pdpSubgraphEndpoint?: string; + subgraphEndpoint?: string; // Endpoint of the dealbot-owned subgraph. Eventually replaces `pdpSubgraphEndpoint` } export interface ISchedulingConfig { @@ -437,6 +447,7 @@ export function loadConfig(): IConfig { useOnlyApprovedProviders: process.env.USE_ONLY_APPROVED_PROVIDERS !== "false", dealbotDataSetVersion: process.env.DEALBOT_DATASET_VERSION, minNumDataSetsForChecks: Number.parseInt(process.env.MIN_NUM_DATASETS_FOR_CHECKS || "1", 10), + pdpSubgraphEndpoint: process.env.PDP_SUBGRAPH_ENDPOINT || "", subgraphEndpoint: process.env.SUBGRAPH_ENDPOINT || "", }, scheduling: { diff --git a/apps/backend/src/data-retention/data-retention.module.ts b/apps/backend/src/data-retention/data-retention.module.ts index f0aec1ec..f459570a 100644 --- a/apps/backend/src/data-retention/data-retention.module.ts +++ b/apps/backend/src/data-retention/data-retention.module.ts @@ -2,12 +2,12 @@ import { Module } from "@nestjs/common"; import { TypeOrmModule } from "@nestjs/typeorm"; import { DataRetentionBaseline } from "../database/entities/data-retention-baseline.entity.js"; import { StorageProvider } from "../database/entities/storage-provider.entity.js"; -import { SubgraphModule } from "../subgraph/subgraph.module.js"; +import { PdpSubgraphModule } from "../pdp-subgraph/pdp-subgraph.module.js"; import { WalletSdkModule } from "../wallet-sdk/wallet-sdk.module.js"; import { DataRetentionService } from "./data-retention.service.js"; @Module({ - imports: [WalletSdkModule, SubgraphModule, TypeOrmModule.forFeature([DataRetentionBaseline, StorageProvider])], + imports: [WalletSdkModule, PdpSubgraphModule, 
TypeOrmModule.forFeature([DataRetentionBaseline, StorageProvider])], providers: [DataRetentionService], exports: [DataRetentionService], }) diff --git a/apps/backend/src/data-retention/data-retention.service.spec.ts b/apps/backend/src/data-retention/data-retention.service.spec.ts index d2d539cf..3fde29e8 100644 --- a/apps/backend/src/data-retention/data-retention.service.spec.ts +++ b/apps/backend/src/data-retention/data-retention.service.spec.ts @@ -7,8 +7,8 @@ import type { IConfig } from "../config/app.config.js"; import type { DataRetentionBaseline } from "../database/entities/data-retention-baseline.entity.js"; import { StorageProvider } from "../database/entities/storage-provider.entity.js"; import { buildCheckMetricLabels } from "../metrics-prometheus/check-metric-labels.js"; -import type { SubgraphService } from "../subgraph/subgraph.service.js"; -import type { ProviderDataSetResponse } from "../subgraph/types.js"; +import type { PDPSubgraphService } from "../pdp-subgraph/pdp-subgraph.service.js"; +import type { ProviderDataSetResponse } from "../pdp-subgraph/types.js"; import type { WalletSdkService } from "../wallet-sdk/wallet-sdk.service.js"; import { DataRetentionService } from "./data-retention.service.js"; @@ -42,7 +42,7 @@ describe("DataRetentionService", () => { let walletSdkServiceMock: { getTestingProviders: ReturnType; }; - let subgraphServiceMock: { + let pdpSubgraphServiceMock: { fetchSubgraphMeta: ReturnType; fetchProvidersWithDatasets: ReturnType; }; @@ -69,7 +69,7 @@ describe("DataRetentionService", () => { configServiceMock = { get: vi.fn((key: keyof IConfig) => { if (key === "blockchain") { - return { subgraphEndpoint: "https://example.com/subgraph" }; + return { pdpSubgraphEndpoint: "https://example.com/subgraph" }; } if (key === "spBlocklists") { return { ids: new Set(), addresses: new Set() }; @@ -95,7 +95,7 @@ describe("DataRetentionService", () => { ]), }; - subgraphServiceMock = { + pdpSubgraphServiceMock = { fetchSubgraphMeta: 
vi.fn().mockResolvedValue({ _meta: { block: { @@ -146,7 +146,7 @@ describe("DataRetentionService", () => { service = new DataRetentionService( configServiceMock, walletSdkServiceMock as unknown as WalletSdkService, - subgraphServiceMock as unknown as SubgraphService, + pdpSubgraphServiceMock as unknown as PDPSubgraphService, mockBaselineRepository as unknown as Repository, mockSPRepository as unknown as Repository, counterMock as unknown as Counter, @@ -155,15 +155,15 @@ describe("DataRetentionService", () => { ); }); - it("returns early when subgraphEndpoint is empty", async () => { + it("returns early when pdpSubgraphEndpoint is empty", async () => { (configServiceMock.get as ReturnType).mockReturnValue({ - subgraphEndpoint: "", + pdpSubgraphEndpoint: "", }); await service.pollDataRetention(); - expect(subgraphServiceMock.fetchSubgraphMeta).not.toHaveBeenCalled(); - expect(subgraphServiceMock.fetchProvidersWithDatasets).not.toHaveBeenCalled(); + expect(pdpSubgraphServiceMock.fetchSubgraphMeta).not.toHaveBeenCalled(); + expect(pdpSubgraphServiceMock.fetchProvidersWithDatasets).not.toHaveBeenCalled(); }); it("returns early when no testing providers configured", async () => { @@ -171,31 +171,31 @@ describe("DataRetentionService", () => { await service.pollDataRetention(); - expect(subgraphServiceMock.fetchProvidersWithDatasets).not.toHaveBeenCalled(); + expect(pdpSubgraphServiceMock.fetchProvidersWithDatasets).not.toHaveBeenCalled(); }); it("returns early when all providers are blocked for data-retention", async () => { (configServiceMock.get as ReturnType).mockImplementation((key: string) => { - if (key === "blockchain") return { subgraphEndpoint: "https://example.com/subgraph" }; + if (key === "blockchain") return { pdpSubgraphEndpoint: "https://example.com/subgraph" }; if (key === "spBlocklists") return { ids: new Set(), addresses: new Set([PROVIDER_A, PROVIDER_B]) }; }); await service.pollDataRetention(); - 
expect(subgraphServiceMock.fetchProvidersWithDatasets).not.toHaveBeenCalled(); + expect(pdpSubgraphServiceMock.fetchProvidersWithDatasets).not.toHaveBeenCalled(); }); it("excludes blocked providers from data-retention polling while retaining unblocked ones", async () => { (configServiceMock.get as ReturnType).mockImplementation((key: string) => { - if (key === "blockchain") return { subgraphEndpoint: "https://example.com/subgraph" }; + if (key === "blockchain") return { pdpSubgraphEndpoint: "https://example.com/subgraph" }; if (key === "spBlocklists") return { ids: new Set(), addresses: new Set([PROVIDER_A]) }; }); - subgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([makeProvider({ address: PROVIDER_B })]); + pdpSubgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([makeProvider({ address: PROVIDER_B })]); await service.pollDataRetention(); const allAddressesPolled: string[] = ( - subgraphServiceMock.fetchProvidersWithDatasets.mock.calls as [{ addresses: string[] }][] + pdpSubgraphServiceMock.fetchProvidersWithDatasets.mock.calls as [{ addresses: string[] }][] ).flatMap(([{ addresses }]) => addresses); expect(allAddressesPolled).toContain(PROVIDER_B.toLowerCase()); expect(allAddressesPolled).not.toContain(PROVIDER_A.toLowerCase()); @@ -206,16 +206,16 @@ describe("DataRetentionService", () => { await service.pollDataRetention(); - expect(subgraphServiceMock.fetchProvidersWithDatasets).not.toHaveBeenCalled(); + expect(pdpSubgraphServiceMock.fetchProvidersWithDatasets).not.toHaveBeenCalled(); }); it("sets baseline on first poll without emitting counters (fresh deploy / new provider)", async () => { - subgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([makeProvider()]); + pdpSubgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([makeProvider()]); await service.pollDataRetention(); - expect(subgraphServiceMock.fetchSubgraphMeta).toHaveBeenCalled(); - 
expect(subgraphServiceMock.fetchProvidersWithDatasets).toHaveBeenCalledWith({ + expect(pdpSubgraphServiceMock.fetchSubgraphMeta).toHaveBeenCalled(); + expect(pdpSubgraphServiceMock.fetchProvidersWithDatasets).toHaveBeenCalledWith({ blockNumber: 1200, addresses: [PROVIDER_A, PROVIDER_B], }); @@ -239,20 +239,20 @@ describe("DataRetentionService", () => { it("computes deltas correctly on consecutive polls", async () => { // First poll: blockNumber=1200 - subgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([makeProvider()]); + pdpSubgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([makeProvider()]); await service.pollDataRetention(); const firstCallCount = counterMock.labels.mock.calls.length; // Second poll: blockNumber=1300, provider totals changed - subgraphServiceMock.fetchSubgraphMeta.mockResolvedValueOnce({ + pdpSubgraphServiceMock.fetchSubgraphMeta.mockResolvedValueOnce({ _meta: { block: { number: 1300, }, }, }); - subgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([ + pdpSubgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([ makeProvider({ totalFaultedPeriods: 12n, totalProvingPeriods: 105n, @@ -266,7 +266,7 @@ describe("DataRetentionService", () => { }); it("does not increment counters when deltas are zero", async () => { - subgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValue([makeProvider()]); + pdpSubgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValue([makeProvider()]); // First poll await service.pollDataRetention(); @@ -288,7 +288,7 @@ describe("DataRetentionService", () => { const providerA = makeProvider({ address: PROVIDER_A, totalFaultedPeriods: 5n }); const providerB = makeProvider({ address: PROVIDER_B, totalFaultedPeriods: 20n }); - subgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([providerA, providerB]); + pdpSubgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([providerA, providerB]); await service.pollDataRetention(); @@ 
-310,7 +310,7 @@ describe("DataRetentionService", () => { ]); const provider = makeProvider(); - subgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([provider]); + pdpSubgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([provider]); await service.pollDataRetention(); @@ -333,7 +333,7 @@ describe("DataRetentionService", () => { }); it("handles empty providers array without errors", async () => { - subgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([]); + pdpSubgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([]); await service.pollDataRetention(); @@ -347,7 +347,7 @@ describe("DataRetentionService", () => { ]); const provider = makeProvider(); - subgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([provider]); + pdpSubgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([provider]); await service.pollDataRetention(); @@ -370,7 +370,7 @@ describe("DataRetentionService", () => { }); it("catches and logs errors without rethrowing", async () => { - subgraphServiceMock.fetchProvidersWithDatasets.mockRejectedValueOnce(new Error("subgraph down")); + pdpSubgraphServiceMock.fetchProvidersWithDatasets.mockRejectedValueOnce(new Error("subgraph down")); // Should not throw await expect(service.pollDataRetention()).resolves.toBeUndefined(); @@ -378,14 +378,14 @@ describe("DataRetentionService", () => { it("resets baseline on negative deltas without incrementing counters", async () => { // First poll: high values - subgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([ + pdpSubgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([ makeProvider({ totalFaultedPeriods: 100n, totalProvingPeriods: 200n }), ]); await service.pollDataRetention(); counterMock.labels.mockClear(); // Second poll: lower values (e.g., chain reorg or subgraph correction) - subgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([ + 
pdpSubgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([ makeProvider({ totalFaultedPeriods: 50n, totalProvingPeriods: 100n }), ]); await service.pollDataRetention(); @@ -394,7 +394,7 @@ describe("DataRetentionService", () => { expect(counterMock.labels).not.toHaveBeenCalled(); // Third poll: values increase from new baseline - subgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([ + pdpSubgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([ makeProvider({ totalFaultedPeriods: 52n, totalProvingPeriods: 105n }), ]); await service.pollDataRetention(); @@ -412,7 +412,7 @@ describe("DataRetentionService", () => { { providerAddress: PROVIDER_A, faultedPeriods: "0", successPeriods: "0", lastBlockNumber: "1000" }, ]); - subgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([ + pdpSubgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([ makeProvider({ totalFaultedPeriods: largeValue, totalProvingPeriods: largeValue * 2n }), ]); @@ -436,7 +436,7 @@ describe("DataRetentionService", () => { { providerAddress: PROVIDER_A, faultedPeriods: "0", successPeriods: "0", lastBlockNumber: "1000" }, ]); - subgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([ + pdpSubgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([ makeProvider({ totalFaultedPeriods: maxSafeInt, totalProvingPeriods: maxSafeInt * 2n }), ]); @@ -456,7 +456,7 @@ describe("DataRetentionService", () => { totalFaultedPeriods: 5n, totalProvingPeriods: 50n, }); - subgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([provider]); + pdpSubgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([provider]); await service.pollDataRetention(); @@ -475,18 +475,18 @@ describe("DataRetentionService", () => { })); walletSdkServiceMock.getTestingProviders.mockReturnValueOnce(manyProviders); - subgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValue([]); + 
pdpSubgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValue([]); await service.pollDataRetention(); // Should be called twice: once for first 50, once for remaining 25 - expect(subgraphServiceMock.fetchProvidersWithDatasets).toHaveBeenCalledTimes(2); - expect(subgraphServiceMock.fetchProvidersWithDatasets).toHaveBeenNthCalledWith(1, { + expect(pdpSubgraphServiceMock.fetchProvidersWithDatasets).toHaveBeenCalledTimes(2); + expect(pdpSubgraphServiceMock.fetchProvidersWithDatasets).toHaveBeenNthCalledWith(1, { addresses: expect.arrayContaining([expect.any(String)]), blockNumber: 1200, }); - expect(subgraphServiceMock.fetchProvidersWithDatasets.mock.calls[0][0].addresses).toHaveLength(50); - expect(subgraphServiceMock.fetchProvidersWithDatasets.mock.calls[1][0].addresses).toHaveLength(25); + expect(pdpSubgraphServiceMock.fetchProvidersWithDatasets.mock.calls[0][0].addresses).toHaveLength(50); + expect(pdpSubgraphServiceMock.fetchProvidersWithDatasets.mock.calls[1][0].addresses).toHaveLength(25); }); it("continues processing next batch if one batch fails", async () => { @@ -499,20 +499,20 @@ describe("DataRetentionService", () => { walletSdkServiceMock.getTestingProviders.mockReturnValueOnce(manyProviders); // First batch fails, second succeeds - subgraphServiceMock.fetchProvidersWithDatasets + pdpSubgraphServiceMock.fetchProvidersWithDatasets .mockRejectedValueOnce(new Error("Subgraph timeout")) .mockResolvedValueOnce([]); await service.pollDataRetention(); // Both batches should be attempted - expect(subgraphServiceMock.fetchProvidersWithDatasets).toHaveBeenCalledTimes(2); + expect(pdpSubgraphServiceMock.fetchProvidersWithDatasets).toHaveBeenCalledTimes(2); }); it("logs error and skips counter update when provider not found in cache but returned from subgraph", async () => { // Provider C not in cache const PROVIDER_C = "0x1234567890123456789012345678901234567890"; - subgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([makeProvider({ address: 
PROVIDER_C })]); + pdpSubgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([makeProvider({ address: PROVIDER_C })]); await service.pollDataRetention(); @@ -523,7 +523,7 @@ describe("DataRetentionService", () => { describe("cleanupStaleProviders", () => { it("does not cleanup when no stale providers exist", async () => { // First poll establishes baseline for both providers - subgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([ + pdpSubgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([ makeProvider({ address: PROVIDER_A }), makeProvider({ address: PROVIDER_B }), ]); @@ -536,7 +536,7 @@ describe("DataRetentionService", () => { it("successfully cleans up stale provider with valid database entry", async () => { // First poll: establish baseline for PROVIDER_A - subgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([makeProvider({ address: PROVIDER_A })]); + pdpSubgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([makeProvider({ address: PROVIDER_A })]); await service.pollDataRetention(); // Second poll: PROVIDER_A removed from active list, only PROVIDER_B active @@ -558,7 +558,7 @@ describe("DataRetentionService", () => { }, ]); - subgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([makeProvider({ address: PROVIDER_B })]); + pdpSubgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([makeProvider({ address: PROVIDER_B })]); await service.pollDataRetention(); @@ -589,7 +589,7 @@ describe("DataRetentionService", () => { it("skips cleanup entirely when database fetch fails", async () => { // First poll: establish baseline - subgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([makeProvider({ address: PROVIDER_A })]); + pdpSubgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([makeProvider({ address: PROVIDER_A })]); await service.pollDataRetention(); // Second poll: provider removed, but DB fails @@ -604,7 +604,7 @@ 
describe("DataRetentionService", () => { mockSPRepository.find.mockRejectedValueOnce(new Error("Database connection failed")); - subgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([makeProvider({ address: PROVIDER_B })]); + pdpSubgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([makeProvider({ address: PROVIDER_B })]); await service.pollDataRetention(); @@ -624,7 +624,7 @@ describe("DataRetentionService", () => { }, ]); - subgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([ + pdpSubgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([ makeProvider({ address: PROVIDER_A, totalFaultedPeriods: 12n, totalProvingPeriods: 105n }), ]); @@ -637,7 +637,7 @@ describe("DataRetentionService", () => { it("retains baseline when provider not found in database", async () => { // First poll: establish baseline - subgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([makeProvider({ address: PROVIDER_A })]); + pdpSubgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([makeProvider({ address: PROVIDER_A })]); await service.pollDataRetention(); // Second poll: provider removed from active list @@ -653,7 +653,7 @@ describe("DataRetentionService", () => { // Database returns empty array (provider not found) mockSPRepository.find.mockResolvedValueOnce([]); - subgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([makeProvider({ address: PROVIDER_B })]); + pdpSubgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([makeProvider({ address: PROVIDER_B })]); await service.pollDataRetention(); @@ -670,7 +670,7 @@ describe("DataRetentionService", () => { }, ]); - subgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([ + pdpSubgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([ makeProvider({ address: PROVIDER_A, totalFaultedPeriods: 12n, totalProvingPeriods: 105n }), ]); @@ -683,7 +683,7 @@ describe("DataRetentionService", () => { it("retains 
baseline when provider has null providerId", async () => { // First poll: establish baseline - subgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([makeProvider({ address: PROVIDER_A })]); + pdpSubgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([makeProvider({ address: PROVIDER_A })]); await service.pollDataRetention(); // Second poll: provider removed @@ -706,7 +706,7 @@ describe("DataRetentionService", () => { }, ]); - subgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([makeProvider({ address: PROVIDER_B })]); + pdpSubgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([makeProvider({ address: PROVIDER_B })]); await service.pollDataRetention(); @@ -716,7 +716,7 @@ describe("DataRetentionService", () => { it("retains baseline when counter removal throws error", async () => { // First poll: establish baseline - subgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([makeProvider({ address: PROVIDER_A })]); + pdpSubgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([makeProvider({ address: PROVIDER_A })]); await service.pollDataRetention(); // Second poll: provider removed @@ -743,7 +743,7 @@ describe("DataRetentionService", () => { throw new Error("Counter removal failed"); }); - subgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([makeProvider({ address: PROVIDER_B })]); + pdpSubgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([makeProvider({ address: PROVIDER_B })]); await service.pollDataRetention(); @@ -760,7 +760,7 @@ describe("DataRetentionService", () => { }, ]); - subgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([ + pdpSubgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([ makeProvider({ address: PROVIDER_A, totalFaultedPeriods: 12n, totalProvingPeriods: 110n }), ]); @@ -781,7 +781,7 @@ describe("DataRetentionService", () => { { id: 3, serviceProvider: PROVIDER_C, name: "Provider C", 
isApproved: true }, ]); - subgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([ + pdpSubgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([ makeProvider({ address: PROVIDER_A }), makeProvider({ address: PROVIDER_B }), makeProvider({ address: PROVIDER_C }), @@ -799,7 +799,7 @@ describe("DataRetentionService", () => { { address: PROVIDER_C, name: "Provider C", providerId: 3, isApproved: true }, ]); - subgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([makeProvider({ address: PROVIDER_A })]); + pdpSubgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([makeProvider({ address: PROVIDER_A })]); await service.pollDataRetention(); @@ -815,7 +815,7 @@ describe("DataRetentionService", () => { it("skips cleanup when processing errors occurred", async () => { // First poll: establish baseline - subgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([makeProvider({ address: PROVIDER_A })]); + pdpSubgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([makeProvider({ address: PROVIDER_A })]); await service.pollDataRetention(); // Second poll: provider removed, but processing has errors @@ -824,7 +824,7 @@ describe("DataRetentionService", () => { ]); // Simulate processing error - subgraphServiceMock.fetchProvidersWithDatasets.mockRejectedValueOnce(new Error("Processing failed")); + pdpSubgraphServiceMock.fetchProvidersWithDatasets.mockRejectedValueOnce(new Error("Processing failed")); await service.pollDataRetention(); @@ -841,7 +841,7 @@ describe("DataRetentionService", () => { { id: 1, serviceProvider: PROVIDER_MIXED_CASE, name: "Provider A", isApproved: true }, ]); - subgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([ + pdpSubgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([ makeProvider({ address: PROVIDER_MIXED_CASE.toLowerCase() as `0x${string}` }), ]); @@ -861,7 +861,7 @@ describe("DataRetentionService", () => { }, ]); - 
subgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([makeProvider({ address: PROVIDER_B })]); + pdpSubgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([makeProvider({ address: PROVIDER_B })]); await service.pollDataRetention(); @@ -885,7 +885,7 @@ describe("DataRetentionService", () => { // Subgraph returns same values: totalFaultedPeriods=10, totalProvingPeriods=100 // confirmedTotalSuccess = 100 - 10 = 90 // With DB baseline: faultedDelta = 10 - 10 = 0, successDelta = 90 - 90 = 0 - subgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([makeProvider()]); + pdpSubgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([makeProvider()]); await service.pollDataRetention(); @@ -907,7 +907,7 @@ describe("DataRetentionService", () => { // Subgraph returns: totalFaultedPeriods=10, totalProvingPeriods=100 // confirmedTotalSuccess = 100 - 10 = 90 // faultedDelta = 10 - 8 = 2, successDelta = 90 - 85 = 5 - subgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([makeProvider()]); + pdpSubgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([makeProvider()]); await service.pollDataRetention(); @@ -921,8 +921,8 @@ describe("DataRetentionService", () => { expect(incCalls).toEqual(expect.arrayContaining([[10], [25]])); }); - it("reloads baselines from DB on every poll", async () => { - subgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValue([makeProvider()]); + it("only loads baselines from DB once across multiple polls", async () => { + pdpSubgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValue([makeProvider()]); await service.pollDataRetention(); await service.pollDataRetention(); @@ -932,13 +932,13 @@ describe("DataRetentionService", () => { }); it("does not double-count when poll ownership alternates across worker pods", async () => { - subgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([makeProvider()]); + 
pdpSubgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([makeProvider()]); await service.pollDataRetention(); const secondPod = new DataRetentionService( configServiceMock, walletSdkServiceMock as unknown as WalletSdkService, - subgraphServiceMock as unknown as SubgraphService, + pdpSubgraphServiceMock as unknown as PDPSubgraphService, mockBaselineRepository as unknown as Repository, mockSPRepository as unknown as Repository, counterMock as unknown as Counter, @@ -946,8 +946,8 @@ describe("DataRetentionService", () => { { insert: vi.fn(), probeLocation: "test" } as unknown as ClickhouseService, ); - subgraphServiceMock.fetchSubgraphMeta.mockResolvedValueOnce({ _meta: { block: { number: 1300 } } }); - subgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([ + pdpSubgraphServiceMock.fetchSubgraphMeta.mockResolvedValueOnce({ _meta: { block: { number: 1300 } } }); + pdpSubgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([ makeProvider({ totalFaultedPeriods: 11n, totalProvingPeriods: 102n }), ]); await secondPod.pollDataRetention(); @@ -955,8 +955,8 @@ describe("DataRetentionService", () => { counterMock.labels.mockClear(); counterMock.inc.mockClear(); - subgraphServiceMock.fetchSubgraphMeta.mockResolvedValueOnce({ _meta: { block: { number: 1400 } } }); - subgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([ + pdpSubgraphServiceMock.fetchSubgraphMeta.mockResolvedValueOnce({ _meta: { block: { number: 1400 } } }); + pdpSubgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([ makeProvider({ totalFaultedPeriods: 12n, totalProvingPeriods: 104n }), ]); await service.pollDataRetention(); @@ -972,8 +972,8 @@ describe("DataRetentionService", () => { ]; mockBaselineRepository.upsert.mockRejectedValueOnce(new Error("DB write failed")); - subgraphServiceMock.fetchSubgraphMeta.mockResolvedValueOnce({ _meta: { block: { number: 1300 } } }); - 
subgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([ + pdpSubgraphServiceMock.fetchSubgraphMeta.mockResolvedValueOnce({ _meta: { block: { number: 1300 } } }); + pdpSubgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([ makeProvider({ totalFaultedPeriods: 12n, totalProvingPeriods: 105n }), ]); @@ -981,8 +981,8 @@ describe("DataRetentionService", () => { expect(counterMock.labels).not.toHaveBeenCalled(); - subgraphServiceMock.fetchSubgraphMeta.mockResolvedValueOnce({ _meta: { block: { number: 1400 } } }); - subgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([ + pdpSubgraphServiceMock.fetchSubgraphMeta.mockResolvedValueOnce({ _meta: { block: { number: 1400 } } }); + pdpSubgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([ makeProvider({ totalFaultedPeriods: 12n, totalProvingPeriods: 105n }), ]); @@ -1003,12 +1003,12 @@ describe("DataRetentionService", () => { }, ]); - subgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValue([makeProvider()]); + pdpSubgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValue([makeProvider()]); // First poll: DB load fails, poll bails out to avoid emitting bloated values await service.pollDataRetention(); expect(mockBaselineRepository.find).toHaveBeenCalledTimes(1); - expect(subgraphServiceMock.fetchSubgraphMeta).not.toHaveBeenCalled(); + expect(pdpSubgraphServiceMock.fetchSubgraphMeta).not.toHaveBeenCalled(); expect(counterMock.labels).not.toHaveBeenCalled(); // Second poll: DB load succeeds, baselines restored, normal delta computation @@ -1021,16 +1021,16 @@ describe("DataRetentionService", () => { it("emits real deltas on second poll after fresh deploy baseline-only first poll", async () => { // First poll: fresh deploy, no baselines in DB // Baseline set to: faultedPeriods=10, successPeriods=90 - subgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([makeProvider()]); + 
pdpSubgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([makeProvider()]); await service.pollDataRetention(); counterMock.labels.mockClear(); counterMock.inc.mockClear(); // Second poll: values have increased - subgraphServiceMock.fetchSubgraphMeta.mockResolvedValueOnce({ + pdpSubgraphServiceMock.fetchSubgraphMeta.mockResolvedValueOnce({ _meta: { block: { number: 1300 } }, }); - subgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([ + pdpSubgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([ makeProvider({ totalFaultedPeriods: 12n, totalProvingPeriods: 105n }), ]); @@ -1044,7 +1044,7 @@ describe("DataRetentionService", () => { it("deletes baseline from DB when stale provider is cleaned up", async () => { // First poll: establish baseline for PROVIDER_A - subgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([makeProvider({ address: PROVIDER_A })]); + pdpSubgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([makeProvider({ address: PROVIDER_A })]); await service.pollDataRetention(); // Second poll: PROVIDER_A removed from active list @@ -1056,7 +1056,7 @@ describe("DataRetentionService", () => { { address: PROVIDER_A, name: "Provider A", providerId: 1, isApproved: true }, ]); - subgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([makeProvider({ address: PROVIDER_B })]); + pdpSubgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([makeProvider({ address: PROVIDER_B })]); await service.pollDataRetention(); @@ -1069,7 +1069,7 @@ describe("DataRetentionService", () => { it("emits overdue gauge on first poll (baseline-only)", async () => { // Provider is overdue: currentBlock=1200, // estimatedOverduePeriods = (1200 - 901) / 100 = 2.99 -> 2 - subgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([makeProvider()]); + pdpSubgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([makeProvider()]); await service.pollDataRetention(); @@ 
-1086,7 +1086,7 @@ describe("DataRetentionService", () => { it("emits overdue gauge = 0 when provider is not overdue", async () => { // nextDeadline=2000 > currentBlock=1200 - subgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([makeProvider({ proofSets: [] })]); + pdpSubgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([makeProvider({ proofSets: [] })]); await service.pollDataRetention(); @@ -1095,7 +1095,7 @@ describe("DataRetentionService", () => { it("emits overdue gauge even on negative delta (baseline reset)", async () => { // First poll: high values - subgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([ + pdpSubgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([ makeProvider({ totalFaultedPeriods: 100n, totalProvingPeriods: 200n }), ]); await service.pollDataRetention(); @@ -1103,7 +1103,7 @@ describe("DataRetentionService", () => { gaugeMock.set.mockClear(); // Second poll: lower values (negative delta) but still overdue - subgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([ + pdpSubgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([ makeProvider({ totalFaultedPeriods: 50n, totalProvingPeriods: 100n }), ]); await service.pollDataRetention(); @@ -1115,7 +1115,7 @@ describe("DataRetentionService", () => { it("naturally resets gauge to 0 when subgraph catches up", async () => { // First poll: provider is overdue (currentBlock=1200, nextDeadline=1000) - subgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([makeProvider()]); + pdpSubgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([makeProvider()]); await service.pollDataRetention(); expect(gaugeMock.set).toHaveBeenCalledWith(2); @@ -1124,7 +1124,7 @@ describe("DataRetentionService", () => { gaugeMock.set.mockClear(); // Second poll: subgraph caught up, nextDeadline advanced past currentBlock - subgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([ + 
pdpSubgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([ makeProvider({ totalFaultedPeriods: 12n, totalProvingPeriods: 102n, @@ -1140,7 +1140,7 @@ describe("DataRetentionService", () => { it("removes overdue gauge when stale provider is cleaned up", async () => { // First poll: establish baseline for PROVIDER_A - subgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([makeProvider({ address: PROVIDER_A })]); + pdpSubgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([makeProvider({ address: PROVIDER_A })]); await service.pollDataRetention(); // Second poll: PROVIDER_A removed from active list @@ -1152,7 +1152,7 @@ describe("DataRetentionService", () => { { address: PROVIDER_A, name: "Provider A", providerId: 1, isApproved: true }, ]); - subgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([makeProvider({ address: PROVIDER_B })]); + pdpSubgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValueOnce([makeProvider({ address: PROVIDER_B })]); await service.pollDataRetention(); diff --git a/apps/backend/src/data-retention/data-retention.service.ts b/apps/backend/src/data-retention/data-retention.service.ts index 1422bbfd..c6ece7b5 100644 --- a/apps/backend/src/data-retention/data-retention.service.ts +++ b/apps/backend/src/data-retention/data-retention.service.ts @@ -11,8 +11,8 @@ import { IConfig } from "../config/app.config.js"; import { DataRetentionBaseline } from "../database/entities/data-retention-baseline.entity.js"; import { StorageProvider } from "../database/entities/storage-provider.entity.js"; import { buildCheckMetricLabels, CheckMetricLabels } from "../metrics-prometheus/check-metric-labels.js"; -import { SubgraphService } from "../subgraph/subgraph.service.js"; -import { type ProviderDataSetResponse } from "../subgraph/types.js"; +import { PDPSubgraphService } from "../pdp-subgraph/pdp-subgraph.service.js"; +import { type ProviderDataSetResponse } from "../pdp-subgraph/types.js"; import { 
WalletSdkService } from "../wallet-sdk/wallet-sdk.service.js"; import { type PDPProviderEx } from "../wallet-sdk/wallet-sdk.types.js"; @@ -41,7 +41,7 @@ export class DataRetentionService { constructor( private readonly configService: ConfigService, private readonly walletSdkService: WalletSdkService, - private readonly subgraphService: SubgraphService, + private readonly pdpSubgraphService: PDPSubgraphService, @InjectRepository(DataRetentionBaseline) private readonly baselineRepository: Repository, @InjectRepository(StorageProvider) @@ -59,10 +59,10 @@ export class DataRetentionService { * challenge delta since the last poll. */ async pollDataRetention(): Promise { - const subgraphEndpoint = this.configService.get("blockchain").subgraphEndpoint; - if (!subgraphEndpoint) { + const pdpSubgraphEndpoint = this.configService.get("blockchain").pdpSubgraphEndpoint; + if (!pdpSubgraphEndpoint) { this.logger.warn({ - event: "subgraph_endpoint_not_configured", + event: "pdp_subgraph_endpoint_not_configured", message: "No PDP subgraph endpoint configured", }); return; @@ -75,7 +75,7 @@ export class DataRetentionService { } try { - const subgraphMeta = await this.subgraphService.fetchSubgraphMeta(); + const subgraphMeta = await this.pdpSubgraphService.fetchSubgraphMeta(); const allProviderInfos = this.walletSdkService.getTestingProviders(); const spBlocklists = this.configService.get("spBlocklists"); const providerInfos = allProviderInfos?.filter((p) => !isSpBlocked(spBlocklists, p.serviceProvider, p.id)); @@ -104,7 +104,7 @@ export class DataRetentionService { ); try { - const providersFromSubgraph = await this.subgraphService.fetchProvidersWithDatasets({ + const providersFromSubgraph = await this.pdpSubgraphService.fetchProvidersWithDatasets({ blockNumber, addresses: batchAddresses, }); diff --git a/apps/backend/src/pdp-subgraph/pdp-subgraph.module.ts b/apps/backend/src/pdp-subgraph/pdp-subgraph.module.ts new file mode 100644 index 00000000..6e084fc1 --- /dev/null +++ 
b/apps/backend/src/pdp-subgraph/pdp-subgraph.module.ts @@ -0,0 +1,8 @@ +import { Module } from "@nestjs/common"; +import { PDPSubgraphService } from "./pdp-subgraph.service.js"; + +@Module({ + providers: [PDPSubgraphService], + exports: [PDPSubgraphService], +}) +export class PdpSubgraphModule {} diff --git a/apps/backend/src/pdp-subgraph/pdp-subgraph.service.spec.ts b/apps/backend/src/pdp-subgraph/pdp-subgraph.service.spec.ts new file mode 100644 index 00000000..cd3a1ea8 --- /dev/null +++ b/apps/backend/src/pdp-subgraph/pdp-subgraph.service.spec.ts @@ -0,0 +1,694 @@ +import type { ConfigService } from "@nestjs/config"; +import { afterEach, beforeEach, describe, expect, it, vi } from "vitest"; +import type { IConfig } from "../config/app.config.js"; +import { PDPSubgraphService } from "./pdp-subgraph.service.js"; + +const VALID_ADDRESS = "0xd8da6bf26964af9d7eed9e03e53415d37aa96045" as const; +const SUBGRAPH_ENDPOINT = "https://api.thegraph.com/subgraphs/filecoin/pdp" as const; + +const makeSubgraphResponse = (providers: Record[] = []) => ({ + data: { providers }, +}); + +const makeValidProvider = (overrides: Record = {}) => ({ + address: VALID_ADDRESS, + totalFaultedPeriods: "10", + totalProvingPeriods: "100", + proofSets: [ + { + totalFaultedPeriods: "2", + currentDeadlineCount: "5", + nextDeadline: "1000", + maxProvingPeriod: "100", + }, + ], + ...overrides, +}); + +const makeSubgraphMetaResponse = (blockNumber = 12345) => ({ + data: { + _meta: { + block: { + number: blockNumber, + }, + }, + }, +}); + +describe("PDPSubgraphService", () => { + let service: PDPSubgraphService; + let fetchMock: ReturnType; + + beforeEach(() => { + const configService = { + get: vi.fn((key: keyof IConfig) => { + if (key === "blockchain") { + return { pdpSubgraphEndpoint: SUBGRAPH_ENDPOINT }; + } + return undefined; + }), + } as unknown as ConfigService; + + service = new PDPSubgraphService(configService); + + fetchMock = vi.fn(); + vi.stubGlobal("fetch", fetchMock); + + 
vi.useFakeTimers(); + }); + + afterEach(() => { + vi.restoreAllMocks(); + vi.useRealTimers(); + }); + + describe("fetchProvidersWithDatasets", () => { + it("fetches and returns validated providers with bigint fields", async () => { + fetchMock.mockResolvedValueOnce({ + ok: true, + json: async () => makeSubgraphResponse([makeValidProvider()]), + }); + + const providers = await service.fetchProvidersWithDatasets({ + blockNumber: 5000, + addresses: [VALID_ADDRESS], + }); + + expect(fetchMock).toHaveBeenCalledWith(SUBGRAPH_ENDPOINT, { + method: "POST", + headers: { "Content-Type": "application/json" }, + body: expect.stringContaining('"addresses"'), + }); + + expect(providers).toHaveLength(1); + expect(providers[0].address).toBe(VALID_ADDRESS); + expect(providers[0].totalFaultedPeriods).toBe(10n); + expect(providers[0].totalProvingPeriods).toBe(100n); + expect(providers[0].proofSets[0].maxProvingPeriod).toBe(100n); + }); + + it("returns empty array when no providers exist", async () => { + fetchMock.mockResolvedValueOnce({ + ok: true, + json: async () => makeSubgraphResponse([]), + }); + + const providers = await service.fetchProvidersWithDatasets({ + blockNumber: 5000, + addresses: [VALID_ADDRESS], + }); + expect(providers).toEqual([]); + }); + + it("returns empty array when addresses array is empty", async () => { + const providers = await service.fetchProvidersWithDatasets({ + blockNumber: 5000, + addresses: [], + }); + + expect(providers).toEqual([]); + expect(fetchMock).not.toHaveBeenCalled(); + }); + + it("throws on HTTP error response", async () => { + fetchMock.mockResolvedValue({ + ok: false, + status: 500, + }); + + const promise = service.fetchProvidersWithDatasets({ + blockNumber: 5000, + addresses: [VALID_ADDRESS], + }); + + // This stops Node.js from throwing an Unhandled Rejection during fast-forward. 
+ promise.catch(() => {}); + + await vi.runAllTimersAsync(); + + await expect(promise).rejects.toThrow("Failed to fetch provider data after 3 attempts"); + expect(fetchMock).toHaveBeenCalledTimes(3); + }); + + it("throws on GraphQL errors in response", async () => { + fetchMock.mockResolvedValueOnce({ + ok: true, + json: async () => ({ + data: null, + errors: [{ message: "Query failed" }], + }), + }); + + const promise = service.fetchProvidersWithDatasets({ + blockNumber: 5000, + addresses: [VALID_ADDRESS], + }); + promise.catch(() => {}); + + await vi.runAllTimersAsync(); + + // Now await the final promise to catch the expected error + await expect(promise).rejects.toThrow("Failed to fetch provider data after 3 attempts"); + expect(fetchMock).toHaveBeenCalledTimes(3); + }); + + it("throws on network failure", async () => { + fetchMock.mockRejectedValueOnce(new Error("Network error")); + + const promise = service.fetchProvidersWithDatasets({ + blockNumber: 5000, + addresses: [VALID_ADDRESS], + }); + promise.catch(() => {}); + + await vi.runAllTimersAsync(); + + // Now await the final promise to catch the expected error + await expect(promise).rejects.toThrow("Failed to fetch provider data after 3 attempts"); + expect(fetchMock).toHaveBeenCalledTimes(3); // Initial + 2 retries = 3 total + }); + + it("throws immediately on validation error without retrying", async () => { + fetchMock.mockResolvedValueOnce({ + ok: true, + json: async () => ({ + data: { providers: [{ address: "invalid" }] }, + }), + }); + + await expect( + service.fetchProvidersWithDatasets({ + blockNumber: 5000, + addresses: [VALID_ADDRESS], + }), + ).rejects.toThrow("Data validation failed"); + + // Should only be called once - no retries for validation errors + expect(fetchMock).toHaveBeenCalledTimes(1); + }); + + it("throws immediately when response data is missing required fields", async () => { + fetchMock.mockResolvedValueOnce({ + ok: true, + json: async () => ({ + data: { providers: [{ address: 
VALID_ADDRESS }] }, // Missing required fields + }), + }); + + await expect( + service.fetchProvidersWithDatasets({ + blockNumber: 5000, + addresses: [VALID_ADDRESS], + }), + ).rejects.toThrow("Data validation failed"); + + // Should only be called once - no retries for validation errors + expect(fetchMock).toHaveBeenCalledTimes(1); + }); + + it("sends blockNumber as string in the GraphQL variables", async () => { + fetchMock.mockResolvedValueOnce({ + ok: true, + json: async () => makeSubgraphResponse([makeValidProvider()]), + }); + + await service.fetchProvidersWithDatasets({ + blockNumber: 12345, + addresses: [VALID_ADDRESS], + }); + + const body = JSON.parse(fetchMock.mock.calls[0][1].body); + expect(body.variables.blockNumber).toBe("12345"); + }); + + it("retries network errors but not validation errors", async () => { + // First attempt: network error (should retry) + fetchMock.mockRejectedValueOnce(new Error("Network timeout")); + + // Second attempt: succeeds but validation fails (should not retry) + fetchMock.mockResolvedValueOnce({ + ok: true, + json: async () => ({ + data: { providers: [{ address: "invalid" }] }, + }), + }); + + const promise = service.fetchProvidersWithDatasets({ + blockNumber: 5000, + addresses: [VALID_ADDRESS], + }); + promise.catch(() => {}); + + await vi.runAllTimersAsync(); + + // Now await the final promise to catch the expected error + await expect(promise).rejects.toThrow("Data validation failed"); + + // Should be called twice: initial network error + 1 retry that fails validation + expect(fetchMock).toHaveBeenCalledTimes(2); + }); + + it("sends addresses array in the GraphQL variables", async () => { + fetchMock.mockResolvedValueOnce({ + ok: true, + json: async () => makeSubgraphResponse([makeValidProvider()]), + }); + + const addresses = [VALID_ADDRESS, "0xAb5801a7D398351b8bE11C439e05C5B3259aeC9B"]; + await service.fetchProvidersWithDatasets({ + blockNumber: 5000, + addresses, + }); + + const body = 
JSON.parse(fetchMock.mock.calls[0][1].body); + expect(body.variables.addresses).toEqual(addresses); + }); + + it("batches large address lists into chunks of MAX_PROVIDERS_PER_QUERY", async () => { + // Create 150 addresses (should be split into 2 batches: 100 + 50) + const addresses = Array.from({ length: 150 }, (_, i) => `0x${i.toString().padStart(40, "0")}`); + + fetchMock.mockResolvedValue({ + ok: true, + json: async () => makeSubgraphResponse([]), + }); + + await service.fetchProvidersWithDatasets({ + blockNumber: 5000, + addresses, + }); + + // Should make 2 requests + expect(fetchMock).toHaveBeenCalledTimes(2); + }); + + it("retries failed requests with exponential backoff", async () => { + // Fail on first attempt, succeed on second attempt (1 retry) + fetchMock.mockRejectedValueOnce(new Error("Network timeout")).mockResolvedValueOnce({ + ok: true, + json: async () => makeSubgraphResponse([makeValidProvider()]), + }); + + const promise = service.fetchProvidersWithDatasets({ + blockNumber: 5000, + addresses: [VALID_ADDRESS], + }); + + await vi.runAllTimersAsync(); + + // Now await the final promise to resolve + const providers = await promise; + + expect(fetchMock).toHaveBeenCalledTimes(2); // Initial attempt + 1 retry + expect(providers).toHaveLength(1); + }); + + it("processes batches with concurrency control", async () => { + // Create 120 addresses (split into 2 batches of 100 and 20, processed under the concurrency limit) + const addresses = Array.from({ length: 120 }, (_, i) => `0x${i.toString().padStart(40, "0")}`); + + let concurrentCalls = 0; + let maxConcurrentCalls = 0; + + fetchMock.mockImplementation(async () => { + concurrentCalls++; + maxConcurrentCalls = Math.max(maxConcurrentCalls, concurrentCalls); + await new Promise((resolve) => setTimeout(resolve, 10)); + concurrentCalls--; + return { + ok: true, + json: async () => makeSubgraphResponse([]), + }; + }); + + const fetchPromise = service.fetchProvidersWithDatasets({ + blockNumber: 5000, +
addresses, + }); + + await vi.runAllTimersAsync(); + + await fetchPromise; + + // Should respect MAX_CONCURRENT_REQUESTS (50) + expect(maxConcurrentCalls).toBeLessThanOrEqual(50); + expect(fetchMock).toHaveBeenCalledTimes(2); + }); + }); + + describe("fetchSubgraphMeta", () => { + it("fetches and returns subgraph metadata with block number", async () => { + fetchMock.mockResolvedValueOnce({ + ok: true, + json: async () => makeSubgraphMetaResponse(12345), + }); + + const meta = await service.fetchSubgraphMeta(); + + expect(fetchMock).toHaveBeenCalledWith(SUBGRAPH_ENDPOINT, { + method: "POST", + headers: { "Content-Type": "application/json" }, + body: expect.stringContaining("GetSubgraphMeta"), + }); + + expect(meta).toEqual({ + _meta: { + block: { + number: 12345, + }, + }, + }); + }); + + it("throws when PDP subgraph endpoint is not configured", async () => { + const configService = { + get: vi.fn(() => ({ pdpSubgraphEndpoint: "" })), + } as unknown as ConfigService; + + const serviceWithoutEndpoint = new PDPSubgraphService(configService); + + await expect(serviceWithoutEndpoint.fetchSubgraphMeta()).rejects.toThrow("No PDP subgraph endpoint configured"); + }); + + it("throws on HTTP error response", async () => { + fetchMock.mockResolvedValueOnce({ + ok: false, + status: 500, + statusText: "Internal Server Error", + }); + + const promise = service.fetchSubgraphMeta(); + promise.catch(() => {}); + + await vi.runAllTimersAsync(); + + // Now await the final promise to catch the expected error + await expect(promise).rejects.toThrow("Failed to fetch subgraph metadata after 3 attempts"); + }); + + it("throws on GraphQL errors in response", async () => { + fetchMock.mockResolvedValueOnce({ + ok: true, + json: async () => ({ + errors: [{ message: "Query timeout" }], + }), + }); + + const promise = service.fetchSubgraphMeta(); + promise.catch(() => {}); + + await vi.runAllTimersAsync(); + + // Now await the final promise to catch the expected error + await 
expect(promise).rejects.toThrow("Failed to fetch subgraph metadata after 3 attempts"); + }); + + it("throws on validation failure without retry", async () => { + fetchMock.mockResolvedValueOnce({ + ok: true, + json: async () => ({ + data: { + _meta: { + block: { + number: "not-a-number", // Invalid - should be number + }, + }, + }, + }), + }); + + await expect(service.fetchSubgraphMeta()).rejects.toThrow("Data validation failed"); + expect(fetchMock).toHaveBeenCalledTimes(1); // Should not retry validation errors + }); + + it("throws on missing required fields", async () => { + fetchMock.mockResolvedValueOnce({ + ok: true, + json: async () => ({ + data: { + _meta: { + block: { + number: undefined, // missing required field + }, + }, + }, + }), + }); + + await expect(service.fetchSubgraphMeta()).rejects.toThrow("Data validation failed"); + expect(fetchMock).toHaveBeenCalledTimes(1); + }); + + it("retries on network failures with exponential backoff", async () => { + fetchMock.mockRejectedValueOnce(new Error("Network timeout")).mockResolvedValueOnce({ + ok: true, + json: async () => makeSubgraphMetaResponse(12345), + }); + + const promise = service.fetchSubgraphMeta(); + + await vi.runAllTimersAsync(); + + // Now await the second promise to resolve + const meta = await promise; + + expect(fetchMock).toHaveBeenCalledTimes(2); // Initial + 1 retry + expect(meta._meta.block.number).toBe(12345); + }); + + it("throws after MAX_RETRIES attempts on persistent network errors", async () => { + fetchMock.mockRejectedValue(new Error("Network timeout")); + + const promise = service.fetchSubgraphMeta(); + promise.catch(() => {}); + + await vi.runAllTimersAsync(); + + // Now await the final promise to catch the expected error + await expect(promise).rejects.toThrow("Failed to fetch subgraph metadata after 3 attempts"); + expect(fetchMock).toHaveBeenCalledTimes(3); + }); + }); + + describe("enforceRateLimit (sliding window)", () => { + it("allows requests when under the rate 
limit", async () => { + fetchMock.mockResolvedValue({ + ok: true, + json: async () => makeSubgraphMetaResponse(12345), + }); + + const startTime = Date.now(); + + // Make 5 requests - should all go through immediately + const promises = Array.from({ length: 5 }, () => service.fetchSubgraphMeta()); + + await Promise.all(promises); + + const endTime = Date.now(); + const elapsed = endTime - startTime; + + // Should complete quickly (no waiting) + expect(elapsed).toBeLessThan(100); + expect(fetchMock).toHaveBeenCalledTimes(5); + }); + + it("enforces rate limit when exceeding MAX_CONCURRENT_REQUESTS", async () => { + fetchMock.mockResolvedValue({ + ok: true, + json: async () => makeSubgraphMetaResponse(12345), + }); + + // Fill up the rate limit window with 50 requests + const initialPromises = Array.from({ length: 50 }, () => service.fetchSubgraphMeta()); + await Promise.all(initialPromises); + + fetchMock.mockClear(); + + // Try to make one more request - should wait for oldest to expire + const promise = service.fetchSubgraphMeta(); + + // Advance past the 10 second window + buffer + await vi.advanceTimersByTimeAsync(10010); + await promise; + + expect(fetchMock).toHaveBeenCalledTimes(1); + }); + + it("throws error when requestCount exceeds MAX_CONCURRENT_REQUESTS", async () => { + // Access private method via type assertion for testing + const enforceRateLimit = (service as any).enforceRateLimit.bind(service); + + await expect(enforceRateLimit(51)).rejects.toThrow("Cannot request 51 items; exceeds rate limit window of 50"); + }); + + it("correctly calculates wait time for multiple required slots", async () => { + fetchMock.mockResolvedValue({ + ok: true, + json: async () => makeSubgraphMetaResponse(12345), + }); + + // Fill 48 slots + const initialPromises = Array.from({ length: 48 }, () => service.fetchSubgraphMeta()); + await vi.runAllTimersAsync(); + await Promise.all(initialPromises); + + fetchMock.mockClear(); + + // Request 5 more slots (need 3 to free up: 5 
- 2 available = 3) + // Should wait for the 3rd oldest timestamp to expire + const enforceRateLimit = (service as any).enforceRateLimit.bind(service); + const promise = enforceRateLimit(5); + + // The 3rd request should expire at ~10 seconds + await vi.advanceTimersByTimeAsync(10010); + await promise; + + // Verify slots were reserved + // After 10s, the first 48 expired, so we should only have the 5 new ones + const timestamps = (service as any).requestTimestamps; + expect(timestamps.length).toBe(5); // Only the 5 new slots remain + }); + + it("handles sliding window correctly as old requests expire", async () => { + fetchMock.mockResolvedValue({ + ok: true, + json: async () => makeSubgraphMetaResponse(12345), + }); + + // Make 30 requests at t=0 + const batch1 = Array.from({ length: 30 }, () => service.fetchSubgraphMeta()); + await vi.runAllTimersAsync(); + await Promise.all(batch1); + + // Advance 5 seconds + await vi.advanceTimersByTimeAsync(5000); + + // Make 20 more requests at t=5000 + const batch2 = Array.from({ length: 20 }, () => service.fetchSubgraphMeta()); + await vi.runAllTimersAsync(); + await Promise.all(batch2); + + // Now at t=5000, we have 50 requests in the window + // Advance to t=10100 - first 30 should expire + await vi.advanceTimersByTimeAsync(5100); + + fetchMock.mockClear(); + + // Should be able to make 30 more requests immediately + const batch3 = Array.from({ length: 30 }, () => service.fetchSubgraphMeta()); + await vi.runAllTimersAsync(); + await Promise.all(batch3); + + expect(fetchMock).toHaveBeenCalledTimes(30); + }); + + it("adds 10ms buffer to prevent timing edge cases", async () => { + fetchMock.mockResolvedValue({ + ok: true, + json: async () => makeSubgraphMetaResponse(12345), + }); + + // Fill the window + const initialPromises = Array.from({ length: 50 }, () => service.fetchSubgraphMeta()); + await vi.runAllTimersAsync(); + await Promise.all(initialPromises); + + fetchMock.mockClear(); + + const promise = 
service.fetchSubgraphMeta(); + + // Advance past the window + buffer + await vi.advanceTimersByTimeAsync(10010); + await promise; + + expect(fetchMock).toHaveBeenCalledTimes(1); + }); + + it("recursively waits when multiple batches need to expire", async () => { + fetchMock.mockResolvedValue({ + ok: true, + json: async () => makeSubgraphMetaResponse(12345), + }); + + // Fill window with 50 requests + const batch1 = Array.from({ length: 50 }, () => service.fetchSubgraphMeta()); + await vi.runAllTimersAsync(); + await Promise.all(batch1); + + // Advance 5 seconds + await vi.advanceTimersByTimeAsync(5000); + + fetchMock.mockClear(); + + // Try to request 30 slots (need to wait for 30 to expire) + const enforceRateLimit = (service as any).enforceRateLimit.bind(service); + const promise = enforceRateLimit(30); + + // First recursion: wait for 30th oldest to expire (~10s from start) + await vi.advanceTimersByTimeAsync(5010); + + // Should recursively check and complete + await promise; + + const timestamps = (service as any).requestTimestamps; + // After 10s from start, all 50 initial requests expired, only 30 new ones remain + expect(timestamps.length).toBe(30); // Only the 30 new slots + }); + + it("reserves slots immediately to prevent race conditions", async () => { + fetchMock.mockResolvedValue({ + ok: true, + json: async () => makeSubgraphMetaResponse(12345), + }); + + // Fill 47 slots + const initial = Array.from({ length: 47 }, () => service.fetchSubgraphMeta()); + await vi.runAllTimersAsync(); + await Promise.all(initial); + + // Now we have 3 available slots + const enforceRateLimit = (service as any).enforceRateLimit.bind(service); + + // Request 3 slots - should succeed immediately + await enforceRateLimit(3); + + const timestamps = (service as any).requestTimestamps; + expect(timestamps.length).toBe(50); // 47 + 3 = 50 (full) + + // Try to request 1 more - should need to wait + const promise = enforceRateLimit(1); + + // Advance time to free up a slot + 
await vi.advanceTimersByTimeAsync(10010); + await promise; + + // After waiting, the old slots expired and new one was added + const finalTimestamps = (service as any).requestTimestamps; + expect(finalTimestamps.length).toBe(1); // Only the new request remains + }); + + it("filters out expired timestamps from the sliding window", async () => { + fetchMock.mockResolvedValue({ + ok: true, + json: async () => makeSubgraphMetaResponse(12345), + }); + + // Make 20 requests + const batch1 = Array.from({ length: 20 }, () => service.fetchSubgraphMeta()); + await vi.runAllTimersAsync(); + await Promise.all(batch1); + + // Advance past the window + await vi.advanceTimersByTimeAsync(11000); + + fetchMock.mockClear(); + + // Make another request - should have full window available + await service.fetchSubgraphMeta(); + + const timestamps = (service as any).requestTimestamps; + // Should only have 1 timestamp (the new one), old ones filtered out + expect(timestamps.length).toBe(1); + }); + }); +}); diff --git a/apps/backend/src/pdp-subgraph/pdp-subgraph.service.ts b/apps/backend/src/pdp-subgraph/pdp-subgraph.service.ts new file mode 100644 index 00000000..aedd8bce --- /dev/null +++ b/apps/backend/src/pdp-subgraph/pdp-subgraph.service.ts @@ -0,0 +1,306 @@ +import { Injectable, Logger } from "@nestjs/common"; +import { ConfigService } from "@nestjs/config"; +import { toStructuredError } from "../common/logging.js"; +import type { IBlockchainConfig, IConfig } from "../config/app.config.js"; +import { Queries } from "./queries.js"; +import type { GraphQLResponse, ProviderDataSetResponse, ProvidersWithDataSetsOptions, SubgraphMeta } from "./types.js"; +import { validateProviderDataSetResponse, validateSubgraphMetaResponse } from "./types.js"; + +/** + * Error thrown when data validation fails. + * These errors should not be retried as they indicate schema/data issues. 
+ */ +class ValidationError extends Error { + constructor(message: string) { + super(message); + this.name = "ValidationError"; + if (Error.captureStackTrace) { + Error.captureStackTrace(this, ValidationError); + } + } +} + +@Injectable() +export class PDPSubgraphService { + private readonly logger: Logger = new Logger(PDPSubgraphService.name); + private readonly blockchainConfig: IBlockchainConfig; + + private static readonly MAX_PROVIDERS_PER_QUERY = 100; + private static readonly MAX_CONCURRENT_REQUESTS = 50; + private static readonly RATE_LIMIT_WINDOW_MS = 10000; + private static readonly MAX_RETRIES = 3; + private static readonly INITIAL_RETRY_DELAY_MS = 1000; + + private requestTimestamps: number[] = []; + + constructor(private readonly configService: ConfigService) { + this.blockchainConfig = this.configService.get("blockchain"); + } + + /** + * Fetch subgraph metadata including the latest indexed block number + * + * @param attempt - Current retry attempt number (default: 1) + * @returns Subgraph metadata with block number + * @throws Error if endpoint is not configured or after MAX_RETRIES attempts + */ + async fetchSubgraphMeta(attempt: number = 1): Promise { + if (!this.blockchainConfig.pdpSubgraphEndpoint) { + throw new Error("No PDP subgraph endpoint configured"); + } + + try { + await this.enforceRateLimit(); + + const response = await fetch(this.blockchainConfig.pdpSubgraphEndpoint, { + method: "POST", + headers: { + "Content-Type": "application/json", + }, + body: JSON.stringify({ + query: Queries.GET_SUBGRAPH_META, + }), + }); + + if (!response.ok) { + throw new Error(`HTTP ${response.status}: ${response.statusText}`); + } + + const result = (await response.json()) as GraphQLResponse; + + if (result.errors) { + const errorMessage = result.errors?.[0]?.message || "Unknown GraphQL error"; + throw new Error(`GraphQL error: ${errorMessage}`); + } + let validated: SubgraphMeta; + try { + validated = validateSubgraphMetaResponse(result.data); + } catch 
(validationError) { + const errorMessage = validationError instanceof Error ? validationError.message : "Unknown validation error"; + throw new ValidationError(`Data validation failed: ${errorMessage}`); + } + + return validated; + } catch (error) { + const errorMessage = error instanceof Error ? error.message : "Unknown error"; + + // No need to retry on validation errors - they indicate schema/data issues, not transient failures + if (error instanceof ValidationError) { + this.logger.error({ + event: "subgraph_meta_validation_failed", + message: "Subgraph data validation failed", + error: toStructuredError(error), + }); + throw error; + } + + // Retry on network/HTTP errors + if (attempt < PDPSubgraphService.MAX_RETRIES) { + const delay = PDPSubgraphService.INITIAL_RETRY_DELAY_MS * (1 << (attempt - 1)); + this.logger.warn({ + event: "subgraph_meta_request_retry", + message: "Subgraph meta request failed. Retrying...", + attempt, + maxRetries: PDPSubgraphService.MAX_RETRIES, + retryDelayMs: delay, + error: toStructuredError(error), + }); + await new Promise((resolve) => setTimeout(resolve, delay)); + return this.fetchSubgraphMeta(attempt + 1); + } + + this.logger.error({ + event: "subgraph_meta_request_failed", + message: "Subgraph meta request failed after maximum retries", + maxRetries: PDPSubgraphService.MAX_RETRIES, + error: toStructuredError(error), + }); + throw new Error( + `Failed to fetch subgraph metadata after ${PDPSubgraphService.MAX_RETRIES} attempts: ${errorMessage}`, + ); + } + } + + /** + * Fetch provider-level totals from subgraph with batching, pagination, and rate limiting + * + * @param options - Options containing block number and provider addresses + * @returns Array of providers with their data sets currently proving + */ + async fetchProvidersWithDatasets( + options: ProvidersWithDataSetsOptions, + ): Promise { + const { blockNumber, addresses } = options; + + if (addresses.length === 0) { + return []; + } + + if (addresses.length <= 
PDPSubgraphService.MAX_PROVIDERS_PER_QUERY) { + return this.fetchWithRetry(blockNumber, addresses); + } + + return this.fetchMultipleBatchesWithRateLimit(blockNumber, addresses); + } + + /** + * Fetch multiple batches with rate limiting and concurrency control + */ + private async fetchMultipleBatchesWithRateLimit( + blockNumber: number, + addresses: string[], + ): Promise { + const batches: string[][] = []; + for (let i = 0; i < addresses.length; i += PDPSubgraphService.MAX_PROVIDERS_PER_QUERY) { + const addressesLimit = Math.min(addresses.length, i + PDPSubgraphService.MAX_PROVIDERS_PER_QUERY); + batches.push(addresses.slice(i, addressesLimit)); + } + + const allProviders: ProviderDataSetResponse["providers"] = []; + + for (let i = 0; i < batches.length; i += PDPSubgraphService.MAX_CONCURRENT_REQUESTS) { + const batchGroup = batches.slice(i, i + PDPSubgraphService.MAX_CONCURRENT_REQUESTS); + + const results = await Promise.all(batchGroup.map((batch) => this.fetchWithRetry(blockNumber, batch))); + + allProviders.push(...results.flat()); + } + + return allProviders; + } + + /** + * Fetch with exponential backoff retry. + * The initial request counts as the first attempt. + */ + private async fetchWithRetry( + blockNumber: number, + addresses: string[], + attempt: number = 1, + ): Promise { + if (!this.blockchainConfig.pdpSubgraphEndpoint) { + throw new Error("No PDP subgraph endpoint configured"); + } + + const variables = { + blockNumber: blockNumber.toString(), + addresses, + }; + + try { + await this.enforceRateLimit(); + + const response = await fetch(this.blockchainConfig.pdpSubgraphEndpoint, { + method: "POST", + headers: { + "Content-Type": "application/json", + }, + body: JSON.stringify({ + query: Queries.GET_PROVIDERS_WITH_DATASETS, + variables, + }), + }); + + if (!response.ok) { + throw new Error(`HTTP ${response.status}: ${response.statusText}`); + } + + const result = (await response.json()) as GraphQLResponse; + + if (result.errors) { + const
errorMessage = result.errors?.[0]?.message || "Unknown GraphQL error"; + throw new Error(`GraphQL error: ${errorMessage}`); + } + + let validated: ProviderDataSetResponse; + try { + validated = validateProviderDataSetResponse(result.data); + } catch (validationError) { + const errorMessage = validationError instanceof Error ? validationError.message : "Unknown validation error"; + throw new ValidationError(`Data validation failed: ${errorMessage}`); + } + + return validated.providers; + } catch (error) { + const errorMessage = error instanceof Error ? error.message : "Unknown error"; + + // No need to retry on validation errors - they indicate schema/data issues, not transient failures + if (error instanceof ValidationError) { + this.logger.error({ + event: "subgraph_provider_data_validation_failed", + message: "Subgraph data validation failed", + error: toStructuredError(error), + }); + throw error; + } + + // Retry on network/HTTP errors + if (attempt < PDPSubgraphService.MAX_RETRIES) { + const delay = PDPSubgraphService.INITIAL_RETRY_DELAY_MS * (1 << (attempt - 1)); + this.logger.warn({ + event: "subgraph_provider_request_retry", + message: "Subgraph provider request failed. 
Retrying...", + attempt, + maxRetries: PDPSubgraphService.MAX_RETRIES, + retryDelayMs: delay, + addressCount: addresses.length, + error: toStructuredError(error), + }); + await new Promise((resolve) => setTimeout(resolve, delay)); + return this.fetchWithRetry(blockNumber, addresses, attempt + 1); + } + + this.logger.error({ + event: "subgraph_provider_request_failed", + message: "Subgraph provider request failed after maximum retries", + maxRetries: PDPSubgraphService.MAX_RETRIES, + blockNumber, + addressCount: addresses.length, + error: toStructuredError(error), + }); + throw new Error( + `Failed to fetch provider data after ${PDPSubgraphService.MAX_RETRIES} attempts: ${errorMessage}`, + ); + } + } + + /** + * Enforce rate limiting: max 50 requests per 10 seconds + * This rate limit is applied by Goldsky on their public endpoints + * Read more here: https://docs.goldsky.com/subgraphs/graphql-endpoints#public-endpoints + */ + private async enforceRateLimit(requestCount: number = 1): Promise { + if (requestCount > PDPSubgraphService.MAX_CONCURRENT_REQUESTS) { + throw new Error( + `Cannot request ${requestCount} items; exceeds rate limit window of ${PDPSubgraphService.MAX_CONCURRENT_REQUESTS}`, + ); + } + + const now = Date.now(); + const windowStart = now - PDPSubgraphService.RATE_LIMIT_WINDOW_MS; + + this.requestTimestamps = this.requestTimestamps.filter((timestamp) => timestamp > windowStart); + + const availableSlots = PDPSubgraphService.MAX_CONCURRENT_REQUESTS - this.requestTimestamps.length; + + if (requestCount > availableSlots) { + const requiredSlots = requestCount - availableSlots; + + const index = Math.min(this.requestTimestamps.length, requiredSlots) - 1; + const oldestTimestamp = this.requestTimestamps[index] || now; + + // wait time with 10ms buffer + const waitTime = oldestTimestamp + PDPSubgraphService.RATE_LIMIT_WINDOW_MS - now + 10; + + if (waitTime > 0) { + await new Promise((resolve) => setTimeout(resolve, waitTime)); + return 
this.enforceRateLimit(requestCount); + } + } + + // Reserve the slots NOW + for (let i = 0; i < requestCount; i++) { + this.requestTimestamps.push(Date.now()); + } + } +} diff --git a/apps/backend/src/pdp-subgraph/queries.ts b/apps/backend/src/pdp-subgraph/queries.ts new file mode 100644 index 00000000..a21a3991 --- /dev/null +++ b/apps/backend/src/pdp-subgraph/queries.ts @@ -0,0 +1,24 @@ +export const Queries = { + GET_PROVIDERS_WITH_DATASETS: ` + query GetProvidersWithDataSet($addresses: [Bytes!], $blockNumber: BigInt!) { + providers(where: {address_in: $addresses}) { + address + totalFaultedPeriods + totalProvingPeriods + proofSets (where: {nextDeadline_lt: $blockNumber, status: PROVING}) { + nextDeadline + maxProvingPeriod + } + } + } + `, + GET_SUBGRAPH_META: ` + query GetSubgraphMeta { + _meta { + block { + number + } + } + } + `, +} as const; diff --git a/apps/backend/src/pdp-subgraph/types.spec.ts b/apps/backend/src/pdp-subgraph/types.spec.ts new file mode 100644 index 00000000..02e6eee0 --- /dev/null +++ b/apps/backend/src/pdp-subgraph/types.spec.ts @@ -0,0 +1,245 @@ +import { describe, expect, it } from "vitest"; +import { validateProviderDataSetResponse, validateSubgraphMetaResponse } from "./types.js"; + +// Subgraph stores addresses in lowercase +const VALID_ADDRESS = "0xd8da6bf26964af9d7eed9e03e53415d37aa96045" as const; + +const makeValidProvider = (overrides: Record = {}) => ({ + address: VALID_ADDRESS, + totalFaultedPeriods: "10", + totalProvingPeriods: "100", + proofSets: [ + { + nextDeadline: "1000", + maxProvingPeriod: "100", + }, + ], + ...overrides, +}); + +const makeValidResponse = (providers = [makeValidProvider()]) => ({ + providers, +}); + +describe("validateProviderDataSetResponse", () => { + it("validates and transforms a well-formed response", () => { + const result = validateProviderDataSetResponse(makeValidResponse()); + + expect(result.providers).toHaveLength(1); + const provider = result.providers[0]; + 
expect(provider.address).toBe(VALID_ADDRESS); + expect(provider.totalFaultedPeriods).toBe(10n); + expect(provider.totalProvingPeriods).toBe(100n); + + const proofSet = provider.proofSets[0]; + expect(proofSet.nextDeadline).toBe(1000n); + expect(proofSet.maxProvingPeriod).toBe(100n); + }); + + it("converts string numbers to bigint", () => { + const result = validateProviderDataSetResponse( + makeValidResponse([ + makeValidProvider({ + totalFaultedPeriods: "999999999999999999", + totalProvingPeriods: "1000000000000000000", + }), + ]), + ); + + expect(typeof result.providers[0].totalFaultedPeriods).toBe("bigint"); + expect(result.providers[0].totalFaultedPeriods).toBe(999999999999999999n); + expect(result.providers[0].totalProvingPeriods).toBe(1000000000000000000n); + }); + + it("accepts an empty providers array", () => { + const result = validateProviderDataSetResponse({ providers: [] }); + expect(result.providers).toEqual([]); + }); + + it("accepts a provider with empty proofSets", () => { + const result = validateProviderDataSetResponse(makeValidResponse([makeValidProvider({ proofSets: [] })])); + expect(result.providers[0].proofSets).toEqual([]); + }); + + it("preserves unknown fields (schema uses .unknown(true))", () => { + const result = validateProviderDataSetResponse(makeValidResponse([makeValidProvider({ extraField: "hello" })])); + expect((result.providers[0] as Record).extraField).toBe("hello"); + }); + + it("throws on missing providers field", () => { + expect(() => validateProviderDataSetResponse({})).toThrow("Invalid provider dataset response format"); + }); + + it("throws on null input", () => { + expect(() => validateProviderDataSetResponse(null)).toThrow("Invalid provider dataset response format"); + }); + + it("throws on missing required provider fields", () => { + expect(() => + validateProviderDataSetResponse({ + providers: [{ address: VALID_ADDRESS }], + }), + ).toThrow("Invalid provider dataset response format"); + }); + + it("throws on invalid 
Ethereum address", () => { + expect(() => + validateProviderDataSetResponse(makeValidResponse([makeValidProvider({ address: "not-an-address" })])), + ).toThrow("Invalid provider dataset response format"); + }); + + it("throws on non-numeric string for bigint fields", () => { + expect(() => + validateProviderDataSetResponse(makeValidResponse([makeValidProvider({ totalFaultedPeriods: "abc" })])), + ).toThrow("Invalid provider dataset response format"); + }); + + it("throws on negative number string for bigint fields", () => { + expect(() => + validateProviderDataSetResponse(makeValidResponse([makeValidProvider({ totalFaultedPeriods: "-1" })])), + ).toThrow("Invalid provider dataset response format"); + }); + + it("throws on missing proofSet fields", () => { + expect(() => + validateProviderDataSetResponse( + makeValidResponse([ + makeValidProvider({ + proofSets: [{ totalFaultedPeriods: "1" }], + }), + ]), + ), + ).toThrow("Invalid provider dataset response format"); + }); + + it("validates multiple providers in a single response", () => { + const provider1 = makeValidProvider({ address: VALID_ADDRESS, totalFaultedPeriods: "5" }); + const provider2 = makeValidProvider({ + address: "0xAb5801a7D398351b8bE11C439e05C5B3259aeC9B", + totalFaultedPeriods: "15", + }); + + const result = validateProviderDataSetResponse(makeValidResponse([provider1, provider2])); + + expect(result.providers).toHaveLength(2); + expect(result.providers[0].totalFaultedPeriods).toBe(5n); + expect(result.providers[1].totalFaultedPeriods).toBe(15n); + }); + + it("handles zero values correctly", () => { + const result = validateProviderDataSetResponse( + makeValidResponse([ + makeValidProvider({ + totalFaultedPeriods: "0", + totalProvingPeriods: "0", + proofSets: [ + { + nextDeadline: "0", + maxProvingPeriod: "0", + }, + ], + }), + ]), + ); + + expect(result.providers[0].totalFaultedPeriods).toBe(0n); + expect(result.providers[0].totalProvingPeriods).toBe(0n); + 
expect(result.providers[0].proofSets[0].maxProvingPeriod).toBe(0n); + }); +}); + +describe("validateSubgraphMetaResponse", () => { + it("validates a well-formed subgraph meta response", () => { + const input = { + _meta: { + block: { + number: 12345, + }, + }, + }; + + const result = validateSubgraphMetaResponse(input); + + expect(result._meta.block.number).toBe(12345); + }); + + it("accepts large block numbers", () => { + const input = { + _meta: { + block: { + number: 999999999, + }, + }, + }; + + const result = validateSubgraphMetaResponse(input); + + expect(result._meta.block.number).toBe(999999999); + }); + + it("accepts numeric strings block number", () => { + const result = validateSubgraphMetaResponse({ + _meta: { + block: { + number: "12345", + }, + }, + }); + + expect(result._meta.block.number).toBe(12345); + }); + + it("throws on missing _meta field", () => { + expect(() => validateSubgraphMetaResponse({})).toThrow("Invalid subgraph meta response format"); + }); + + it("throws on missing block field", () => { + expect(() => + validateSubgraphMetaResponse({ + _meta: {}, + }), + ).toThrow("Invalid subgraph meta response format"); + }); + + it("throws on missing number field", () => { + expect(() => + validateSubgraphMetaResponse({ + _meta: { + block: {}, + }, + }), + ).toThrow("Invalid subgraph meta response format"); + }); + + it("throws on null input", () => { + expect(() => validateSubgraphMetaResponse(null)).toThrow("Invalid subgraph meta response format"); + }); + + it("throws on undefined input", () => { + expect(() => validateSubgraphMetaResponse(undefined)).toThrow("Invalid subgraph meta response format"); + }); + + it("throws on negative block number", () => { + expect(() => + validateSubgraphMetaResponse({ + _meta: { + block: { + number: -1, + }, + }, + }), + ).toThrow("Invalid subgraph meta response format"); + }); + + it("throws on floating point block number", () => { + expect(() => + validateSubgraphMetaResponse({ + _meta: { + block: { + 
number: 123.45, + }, + }, + }), + ).toThrow("Invalid subgraph meta response format"); + }); +}); diff --git a/apps/backend/src/pdp-subgraph/types.ts b/apps/backend/src/pdp-subgraph/types.ts new file mode 100644 index 00000000..ad8dcdc4 --- /dev/null +++ b/apps/backend/src/pdp-subgraph/types.ts @@ -0,0 +1,151 @@ +import Joi from "joi"; +import { Hex, isAddress } from "viem"; + +// ----------------------------------------- +// Types +// ----------------------------------------- + +/** The response from the subgraph GraphQL query */ +export type GraphQLResponse = { + /** The data from the query */ + data?: unknown; + /** The errors from the query */ + errors?: { message: string }[]; +}; + +/** + * Options for fetching providers with data sets + */ +export type ProvidersWithDataSetsOptions = { + addresses: string[]; + blockNumber: number; +}; + +/** + * Validated response from the PDP subgraph meta query. + */ +export type SubgraphMeta = { + _meta: { + block: { + number: number; + }; + }; +}; + +/** + * A single proof set within a provider, representing deadline-related proving data. + * All numeric fields are bigints converted from the subgraph string representation. + */ +export type DataSet = { + nextDeadline: bigint; + maxProvingPeriod: bigint; +}; + +/** + * Validated and transformed response from the PDP subgraph providers query. + * Numeric fields are converted from subgraph string representation to bigint. + */ +export type ProviderDataSetResponse = { + providers: { + address: Hex; + totalFaultedPeriods: bigint; + totalProvingPeriods: bigint; + proofSets: DataSet[]; + }[]; +}; + +// ----------------------------------------- +// Joi Custom Schema Converters +// ----------------------------------------- + +/** Joi custom validator that converts a numeric string to bigint. 
*/ +const toBigInt = (value: unknown, helpers: Joi.CustomHelpers) => { + try { + return BigInt(value as string); + } catch { + return helpers.error("any.invalid", { + message: "Invalid bigint value", + }); + } +}; + +/** Joi custom validator to validate an Ethereum address and normalize to lowercase. */ +const toEthereumAddress = (value: unknown, helpers: Joi.CustomHelpers) => { + if (!isAddress(value as string)) { + return helpers.error("any.invalid", { message: "Invalid Ethereum address" }); + } + + // Normalize to lowercase for consistent key lookups + return (value as string).toLowerCase() as Hex; +}; + +// ----------------------------------------- +// Joi Schemas +// ----------------------------------------- + +const metaSchema = Joi.object({ + _meta: Joi.object({ + block: Joi.object({ + number: Joi.number().integer().positive().required(), + }) + .unknown(true) + .required(), + }) + .unknown(true) + .required(), +}) + .unknown(true) + .required(); + +const dataSetSchema = Joi.object({ + nextDeadline: Joi.string().pattern(/^\d+$/).required().custom(toBigInt), + maxProvingPeriod: Joi.string().pattern(/^\d+$/).required().custom(toBigInt), +}).unknown(true); + +const providerDataSetResponseSchema = Joi.object({ + providers: Joi.array() + .items( + Joi.object({ + address: Joi.string().required().custom(toEthereumAddress), + totalFaultedPeriods: Joi.string().pattern(/^\d+$/).required().custom(toBigInt), + totalProvingPeriods: Joi.string().pattern(/^\d+$/).required().custom(toBigInt), + proofSets: Joi.array().items(dataSetSchema).required(), + }).unknown(true), + ) + .required(), +}) + .unknown(true) + .required(); + +// ----------------------------------------- +// Validator Functions +// ----------------------------------------- + +/** + * Validates a raw subgraph meta response into SubgraphMeta. 
+ * + * @param value - The raw parsed JSON from the subgraph + * @throws Error if validation fails + */ +export function validateSubgraphMetaResponse(value: unknown): SubgraphMeta { + const { error, value: validated } = metaSchema.validate(value, { abortEarly: false }); + if (error) { + throw new Error(`Invalid subgraph meta response format: ${error.message}`); + } + return validated as SubgraphMeta; +} + +/** + * Validates and transforms a raw subgraph response into ProviderDataSetResponse. + * Converts string fields to bigint. + * + * @param value - The raw parsed JSON from the subgraph + * @throws Error if validation fails + */ +export function validateProviderDataSetResponse(value: unknown): ProviderDataSetResponse { + const { error, value: validated } = providerDataSetResponseSchema.validate(value, { abortEarly: false }); + if (error) { + throw new Error(`Invalid provider dataset response format: ${error.message}`); + } + return validated as ProviderDataSetResponse; +} diff --git a/apps/backend/src/subgraph/subgraph.service.ts b/apps/backend/src/subgraph/subgraph.service.ts index 3067532c..97472c3c 100644 --- a/apps/backend/src/subgraph/subgraph.service.ts +++ b/apps/backend/src/subgraph/subgraph.service.ts @@ -51,6 +51,21 @@ class ValidationError extends Error { } } +/** + * Client for the dealbot-owned subgraph (driven by `SUBGRAPH_ENDPOINT`). + * + * Functionally a superset of `PDPSubgraphService`: it exposes the same + * `fetchSubgraphMeta` / `fetchProvidersWithDatasets` surface plus the new + * `sampleAnonPiece` query used by anonymous retrievals. + * + * The two services intentionally coexist while we migrate off the upstream + * pdp-explorer subgraph: `PDPSubgraphService` continues to drive the + * established data-retention path against `PDP_SUBGRAPH_ENDPOINT`, and + * `SubgraphService` is scoped to the new anonymous-retrieval flow only. 
+ * Once the dealbot-owned subgraph has soaked in production, this service + * should become the single drop-in replacement for `PDPSubgraphService` + * and `PDP_SUBGRAPH_ENDPOINT` can be retired. + */ @Injectable() export class SubgraphService { private readonly logger: Logger = new Logger(SubgraphService.name); diff --git a/apps/backend/src/wallet-sdk/wallet-sdk.service.spec.ts b/apps/backend/src/wallet-sdk/wallet-sdk.service.spec.ts index 195db19f..d6613a31 100644 --- a/apps/backend/src/wallet-sdk/wallet-sdk.service.spec.ts +++ b/apps/backend/src/wallet-sdk/wallet-sdk.service.spec.ts @@ -18,7 +18,7 @@ const baseConfig: IBlockchainConfig = { checkDatasetCreationFees: false, useOnlyApprovedProviders: false, minNumDataSetsForChecks: 1, - subgraphEndpoint: "https://api.thegraph.com/subgraphs/filecoin/pdp", + pdpSubgraphEndpoint: "https://api.thegraph.com/subgraphs/filecoin/pdp", }; const makeProvider = (overrides: Partial): PDPProviderEx => diff --git a/docs/checks/data-retention.md b/docs/checks/data-retention.md index 4eb7a912..605753e7 100644 --- a/docs/checks/data-retention.md +++ b/docs/checks/data-retention.md @@ -27,7 +27,7 @@ Dealbot polls The Graph API endpoint for PDP (Proof of Data Possession) data at **Subgraph repository**: [FilOzone/pdp-explorer](https://github.com/FilOzone/pdp-explorer/blob/main/subgraph/src/pdp-verifier.ts) -**Subgraph endpoint**: Configured via `SUBGRAPH_ENDPOINT` environment variable (see [environment-variables.md](../environment-variables.md#subgraph_endpoint)) +**Subgraph endpoint**: Configured via `PDP_SUBGRAPH_ENDPOINT` environment variable (see [environment-variables.md](../environment-variables.md#pdp_subgraph_endpoint)) > **Note**: The production subgraph URL is currently being finalized [here](https://github.com/FilOzone/pdp-explorer/pull/86). 
@@ -48,7 +48,7 @@ From `GET_PROVIDERS_WITH_DATASETS` query for each provider: > **Note**: The subgraph query uses the field name `proofSets`, but this refers to "dataSets" in the current codebase. The terminology was updated from "proof set" to "data set" but the subgraph schema retains the old naming. -Source: [`subgraph.service.ts` (`fetchSubgraphMeta`, `fetchProvidersWithDatasets`)](../../apps/backend/src/subgraph/subgraph.service.ts) +Source: [`pdp-subgraph.service.ts` (`fetchSubgraphMeta`, `fetchProvidersWithDatasets`)](../../apps/backend/src/pdp-subgraph/pdp-subgraph.service.ts) ### 2. Compute Challenge Totals and Overdue Estimates @@ -170,7 +170,7 @@ The PDP subgraph service enforces Goldsky's public endpoint rate limits: Rate limiting is enforced client-side to prevent 429 errors. -Source: [`subgraph.service.ts` (`enforceRateLimit`)](../../apps/backend/src/subgraph/subgraph.service.ts) +Source: [`pdp-subgraph.service.ts` (`enforceRateLimit`)](../../apps/backend/src/pdp-subgraph/pdp-subgraph.service.ts) ## Metrics Recorded @@ -210,11 +210,11 @@ Key environment variables that control data retention check behavior: | Variable | Required | Default | Description | | ----------------------- | -------- | ------------ | ------------------------------------------------------------------------------------------------ | -| `SUBGRAPH_ENDPOINT` | No | Empty string | The Graph API endpoint for PDP subgraph queries. When empty, data retention checks are disabled. | +| `PDP_SUBGRAPH_ENDPOINT` | No | Empty string | The Graph API endpoint for PDP subgraph queries. When empty, data retention checks are disabled. | Source: [`app.config.ts`](../../apps/backend/src/config/app.config.ts) -See also: [`environment-variables.md`](../environment-variables.md#subgraph_endpoint) for the full configuration reference. +See also: [`environment-variables.md`](../environment-variables.md#pdp_subgraph_endpoint) for the full configuration reference. 
## Error Handling diff --git a/docs/checks/production-configuration-and-approval-methodology.md b/docs/checks/production-configuration-and-approval-methodology.md index 6b2859aa..2e89a45d 100644 --- a/docs/checks/production-configuration-and-approval-methodology.md +++ b/docs/checks/production-configuration-and-approval-methodology.md @@ -40,7 +40,7 @@ Relevant parameters include: | Parameter | Value | Notes | |-----------|-------|-------| -| [`SUBGRAPH_ENDPOINT`](../environment-variables.md#subgraph_endpoint) | - | Points at a Goldsky deployment of the dealbot-owned subgraph in [`apps/subgraph/`](../../apps/subgraph/) (package `@dealbot/subgraph`). | +| [`PDP_SUBGRAPH_ENDPOINT`](../environment-variables.md#pdp_subgraph_endpoint) | TODO: fill this in | Uses the subgraph from [pdp-explorer](https://github.com/FilOzone/pdp-explorer). | | [`MIN_NUM_DATASETS_FOR_CHECKS`](../environment-variables.md#dataset-configuration) | 15 | Ensure there are enough datasets with pieces being added so that statistical significance for [Data Retention Fault Rate](#data-retention-fault-rate) can be achieved quicker. Note that on mainnet each dataset incurs 5 challenges[^1] per daily proof[^2]. With this many datasets, an SP can be approved for data retention after a faultless ~7 days even if the SP doesn't have other datasets. | See [How are data retention statistics/thresholds calculated?](#how-are-data-retention-statisticsthresholds-calculated) for more details. 
diff --git a/docs/environment-variables.md b/docs/environment-variables.md index e2b23735..91e28abc 100644 --- a/docs/environment-variables.md +++ b/docs/environment-variables.md @@ -8,7 +8,7 @@ This document provides a comprehensive guide to all environment variables used b | ----------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------ | | [Application](#application-configuration) | `NODE_ENV`, `DEALBOT_PORT`, `DEALBOT_HOST`, `DEALBOT_RUN_MODE`, `DEALBOT_METRICS_PORT`, `DEALBOT_METRICS_HOST`, `DEALBOT_ALLOWED_ORIGINS`, `ENABLE_DEV_MODE` | | [Database](#database-configuration) | `DATABASE_HOST`, `DATABASE_PORT`, `DATABASE_POOL_MAX`, `DATABASE_USER`, `DATABASE_PASSWORD`, `DATABASE_NAME` | -| [Blockchain](#blockchain-configuration) | `NETWORK`, `RPC_URL`, `WALLET_ADDRESS`, `WALLET_PRIVATE_KEY`, `SESSION_KEY_PRIVATE_KEY`, `CHECK_DATASET_CREATION_FEES`, `USE_ONLY_APPROVED_PROVIDERS`, `SUBGRAPH_ENDPOINT` | +| [Blockchain](#blockchain-configuration) | `NETWORK`, `RPC_URL`, `WALLET_ADDRESS`, `WALLET_PRIVATE_KEY`, `SESSION_KEY_PRIVATE_KEY`, `CHECK_DATASET_CREATION_FEES`, `USE_ONLY_APPROVED_PROVIDERS`, `PDP_SUBGRAPH_ENDPOINT`, `SUBGRAPH_ENDPOINT` | | [Dataset Versioning](#dataset-versioning) | `DEALBOT_DATASET_VERSION` | | [Scheduling](#scheduling-configuration) | `PROVIDERS_REFRESH_INTERVAL_SECONDS`, `DATA_RETENTION_POLL_INTERVAL_SECONDS`, `DEALBOT_MAINTENANCE_WINDOWS_UTC`, `DEALBOT_MAINTENANCE_WINDOW_MINUTES` | | [Jobs (pg-boss)](#jobs-pg-boss) | `DEALBOT_PGBOSS_SCHEDULER_ENABLED`, `DEALBOT_PGBOSS_POOL_MAX`, `DEALS_PER_SP_PER_HOUR`, `DATASET_CREATIONS_PER_SP_PER_HOUR`, `RETRIEVALS_PER_SP_PER_HOUR`, `RETRIEVALS_ANON_PER_SP_PER_HOUR`, `JOB_SCHEDULER_POLL_SECONDS`, `JOB_WORKER_POLL_SECONDS`, `PG_BOSS_LOCAL_CONCURRENCY`, `JOB_CATCHUP_MAX_ENQUEUE`, `JOB_SCHEDULE_PHASE_SECONDS`, `JOB_ENQUEUE_JITTER_SECONDS`, `DEAL_JOB_TIMEOUT_SECONDS`, 
`RETRIEVAL_JOB_TIMEOUT_SECONDS`, `ANON_RETRIEVAL_JOB_TIMEOUT_SECONDS`, `ANON_RETRIEVAL_BLOCK_SAMPLE_COUNT`, `IPFS_BLOCK_FETCH_CONCURRENCY` | @@ -425,13 +425,35 @@ Session keys are scoped (only storage operations, not deposits or withdrawals) a --- +### `PDP_SUBGRAPH_ENDPOINT` + +- **Type**: `string` (URL) +- **Required**: No +- **Default**: Empty string (feature disabled) + +**Role**: The Graph API endpoint for the upstream pdp-explorer subgraph. Drives the data-retention overdue-periods metric. + +This variable is kept distinct from [`SUBGRAPH_ENDPOINT`](#subgraph_endpoint) so the dealbot-owned subgraph can be rolled out incrementally — only the new anonymous-retrieval flow points at the new endpoint while the established data-retention path stays on the upstream subgraph. + +**When to update**: + +- When switching between different Graph API endpoints for the pdp-explorer subgraph. + +**Example**: + +```bash +PDP_SUBGRAPH_ENDPOINT=https://api.thegraph.com/subgraphs/filecoin/pdp +``` + +--- + ### `SUBGRAPH_ENDPOINT` - **Type**: `string` (URL) - **Required**: No - **Default**: Empty string (feature disabled) -**Role**: The Graph API endpoint for querying PDP (Proof of Data Possession) subgraph data. Drives the overdue-periods metric and the anonymous-retrieval candidate-piece query. +**Role**: The Graph API endpoint for the dealbot-owned subgraph. Currently drives only the [anonymous-retrieval](./checks/anon-retrievals.md) candidate-piece query. Once the dealbot-owned subgraph has soaked in production it is intended to replace [`PDP_SUBGRAPH_ENDPOINT`](#pdp_subgraph_endpoint). The dealbot-owned subgraph lives at `apps/subgraph/` (package `@dealbot/subgraph`) and is deployed to Goldsky. Point this variable at one of those slots; the exact slugs are documented in `apps/subgraph/README.md`. 
diff --git a/kustomize/overlays/local/backend-configmap-local.yaml b/kustomize/overlays/local/backend-configmap-local.yaml index b4febf61..52918aa2 100644 --- a/kustomize/overlays/local/backend-configmap-local.yaml +++ b/kustomize/overlays/local/backend-configmap-local.yaml @@ -26,6 +26,7 @@ data: PG_BOSS_LOCAL_CONCURRENCY: "3" JOB_WORKER_POLL_SECONDS: "60" RANDOM_PIECE_SIZES: "10485760" + PDP_SUBGRAPH_ENDPOINT: "https://api.goldsky.com/api/public/project_cmdfaaxeuz6us01u359yjdctw/subgraphs/pdp-explorer/calibration311a/gn" SUBGRAPH_ENDPOINT: "https://api.goldsky.com/api/public/project_cmdfaaxeuz6us01u359yjdctw/subgraphs/pdp-explorer/calibration311a/gn" JOB_SCHEDULER_POLL_SECONDS: "60" CLICKHOUSE_URL: "http://default:@dealbot-clickhouse:8123/dealbot" From d82222f530489c9a054e7166a2a61fcc86bbec5c Mon Sep 17 00:00:00 2001 From: Dennis Trautwein Date: Tue, 5 May 2026 09:17:25 +0200 Subject: [PATCH 20/28] refactor: reduce pr diff noise --- apps/backend/src/jobs/jobs.module.ts | 2 +- apps/backend/src/jobs/jobs.service.spec.ts | 52 +++++++++++----------- apps/backend/src/jobs/jobs.service.ts | 3 +- 3 files changed, 29 insertions(+), 28 deletions(-) diff --git a/apps/backend/src/jobs/jobs.module.ts b/apps/backend/src/jobs/jobs.module.ts index 69f1edb1..fb708e09 100644 --- a/apps/backend/src/jobs/jobs.module.ts +++ b/apps/backend/src/jobs/jobs.module.ts @@ -18,10 +18,10 @@ import { JobScheduleRepository } from "./repositories/job-schedule.repository.js TypeOrmModule.forFeature([StorageProvider, JobScheduleState]), DealModule, RetrievalModule, - RetrievalAnonModule, WalletSdkModule, DataRetentionModule, PieceCleanupModule, + RetrievalAnonModule, ], providers: [JobsService, JobScheduleRepository], }) diff --git a/apps/backend/src/jobs/jobs.service.spec.ts b/apps/backend/src/jobs/jobs.service.spec.ts index c20d0890..8983c723 100644 --- a/apps/backend/src/jobs/jobs.service.spec.ts +++ b/apps/backend/src/jobs/jobs.service.spec.ts @@ -52,10 +52,10 @@ describe("JobsService 
schedule rows", () => { jobScheduleRepository: JobsServiceDeps[2]; dealService: JobsServiceDeps[3]; retrievalService: JobsServiceDeps[4]; - anonRetrievalService: JobsServiceDeps[5]; - walletSdkService: JobsServiceDeps[6]; - dataRetentionService: JobsServiceDeps[7]; - pieceCleanupService: JobsServiceDeps[8]; + walletSdkService: JobsServiceDeps[5]; + dataRetentionService: JobsServiceDeps[6]; + pieceCleanupService: JobsServiceDeps[7]; + anonRetrievalService: JobsServiceDeps[8]; jobsQueuedGauge: JobsServiceDeps[9]; jobsRetryScheduledGauge: JobsServiceDeps[10]; oldestQueuedAgeGauge: JobsServiceDeps[11]; @@ -160,10 +160,10 @@ describe("JobsService schedule rows", () => { overrides.jobScheduleRepository ?? (jobScheduleRepositoryMock as unknown as JobsServiceDeps[2]), overrides.dealService ?? ({} as JobsServiceDeps[3]), overrides.retrievalService ?? ({} as JobsServiceDeps[4]), - overrides.anonRetrievalService ?? ({} as JobsServiceDeps[5]), - overrides.walletSdkService ?? ({} as JobsServiceDeps[6]), - overrides.dataRetentionService ?? (dataRetentionServiceMock as unknown as JobsServiceDeps[7]), - overrides.pieceCleanupService ?? ({} as JobsServiceDeps[8]), + overrides.walletSdkService ?? ({} as JobsServiceDeps[5]), + overrides.dataRetentionService ?? (dataRetentionServiceMock as unknown as JobsServiceDeps[6]), + overrides.pieceCleanupService ?? ({} as JobsServiceDeps[7]), + overrides.anonRetrievalService ?? ({} as JobsServiceDeps[8]), overrides.jobsQueuedGauge ?? metricsMocks.jobsQueuedGauge, overrides.jobsRetryScheduledGauge ?? metricsMocks.jobsRetryScheduledGauge, overrides.oldestQueuedAgeGauge ?? 
metricsMocks.oldestQueuedAgeGauge, @@ -287,7 +287,7 @@ describe("JobsService schedule rows", () => { service = buildService({ configService, dealService: dealService as unknown as ConstructorParameters[3], - walletSdkService: walletSdkService as unknown as ConstructorParameters[6], + walletSdkService: walletSdkService as unknown as ConstructorParameters[5], }); // Trigger the timeout immediately by using fake timers @@ -346,7 +346,7 @@ describe("JobsService schedule rows", () => { service = buildService({ configService, retrievalService: retrievalService as unknown as ConstructorParameters[4], - walletSdkService: walletSdkService as unknown as ConstructorParameters[6], + walletSdkService: walletSdkService as unknown as ConstructorParameters[5], }); vi.useFakeTimers(); @@ -385,7 +385,7 @@ describe("JobsService schedule rows", () => { service = buildService({ retrievalService: retrievalService as unknown as ConstructorParameters[4], - walletSdkService: walletSdkService as unknown as ConstructorParameters[6], + walletSdkService: walletSdkService as unknown as ConstructorParameters[5], }); await callPrivate(service, "handleRetrievalJob", { @@ -425,7 +425,7 @@ describe("JobsService schedule rows", () => { service = buildService({ retrievalService: retrievalService as unknown as ConstructorParameters[4], - walletSdkService: walletSdkService as unknown as ConstructorParameters[6], + walletSdkService: walletSdkService as unknown as ConstructorParameters[5], }); await expect( @@ -928,7 +928,7 @@ describe("JobsService schedule rows", () => { service = buildService({ dealService: dealService as unknown as ConstructorParameters[3], - walletSdkService: walletSdkService as unknown as ConstructorParameters[6], + walletSdkService: walletSdkService as unknown as ConstructorParameters[5], }); await callPrivate(service, "handleDealJob", { @@ -967,8 +967,8 @@ describe("JobsService schedule rows", () => { service = buildService({ dealService: dealService as unknown as 
ConstructorParameters[3], - walletSdkService: walletSdkService as unknown as ConstructorParameters[6], - pieceCleanupService: pieceCleanupService as unknown as JobsServiceDeps[8], + walletSdkService: walletSdkService as unknown as ConstructorParameters[5], + pieceCleanupService: pieceCleanupService as unknown as JobsServiceDeps[7], }); await callPrivate(service, "handleDealJob", { @@ -1000,7 +1000,7 @@ describe("JobsService schedule rows", () => { service = buildService({ dealService: dealService as unknown as ConstructorParameters[3], - walletSdkService: walletSdkService as unknown as ConstructorParameters[6], + walletSdkService: walletSdkService as unknown as ConstructorParameters[5], }); await callPrivate(service, "handleDealJob", { @@ -1029,7 +1029,7 @@ describe("JobsService schedule rows", () => { service = buildService({ dealService: dealService as unknown as ConstructorParameters[3], - walletSdkService: walletSdkService as unknown as ConstructorParameters[6], + walletSdkService: walletSdkService as unknown as ConstructorParameters[5], }); await callPrivate(service, "handleDataSetCreationJob", { @@ -1071,7 +1071,7 @@ describe("JobsService schedule rows", () => { service = buildService({ configService, dealService: dealService as unknown as ConstructorParameters[3], - walletSdkService: walletSdkService as unknown as ConstructorParameters[6], + walletSdkService: walletSdkService as unknown as ConstructorParameters[5], }); await callPrivate(service, "handleDataSetCreationJob", { @@ -1112,7 +1112,7 @@ describe("JobsService schedule rows", () => { service = buildService({ configService, dealService: dealService as unknown as ConstructorParameters[3], - walletSdkService: walletSdkService as unknown as ConstructorParameters[6], + walletSdkService: walletSdkService as unknown as ConstructorParameters[5], }); await callPrivate(service, "handleDataSetCreationJob", { @@ -1157,7 +1157,7 @@ describe("JobsService schedule rows", () => { service = buildService({ 
configService, dealService: dealService as unknown as ConstructorParameters[3], - walletSdkService: walletSdkService as unknown as ConstructorParameters[6], + walletSdkService: walletSdkService as unknown as ConstructorParameters[5], }); await callPrivate(service, "handleDataSetCreationJob", { @@ -1330,7 +1330,7 @@ describe("JobsService schedule rows", () => { service = buildService({ dealService: dealService as unknown as JobsServiceDeps[3], - walletSdkService: walletSdkService as unknown as JobsServiceDeps[6], + walletSdkService: walletSdkService as unknown as JobsServiceDeps[5], }); await callPrivate(service, "handleDealJob", { @@ -1354,7 +1354,7 @@ describe("JobsService schedule rows", () => { service = buildService({ retrievalService: retrievalService as unknown as JobsServiceDeps[4], - walletSdkService: walletSdkService as unknown as JobsServiceDeps[6], + walletSdkService: walletSdkService as unknown as JobsServiceDeps[5], }); await callPrivate(service, "handleRetrievalJob", { @@ -1383,7 +1383,7 @@ describe("JobsService schedule rows", () => { service = buildService({ dealService: dealService as unknown as JobsServiceDeps[3], - walletSdkService: walletSdkService as unknown as JobsServiceDeps[6], + walletSdkService: walletSdkService as unknown as JobsServiceDeps[5], }); await callPrivate(service, "handleDataSetCreationJob", { @@ -1425,7 +1425,7 @@ describe("JobsService schedule rows", () => { intervalSeconds: 60, service: buildService({ dealService: dealService as unknown as JobsServiceDeps[3], - walletSdkService: walletSdkService as unknown as JobsServiceDeps[6], + walletSdkService: walletSdkService as unknown as JobsServiceDeps[5], }), expectCheckNotRun: () => expect(dealService.createDealForProvider).not.toHaveBeenCalled(), }, @@ -1435,7 +1435,7 @@ describe("JobsService schedule rows", () => { intervalSeconds: 60, service: buildService({ retrievalService: retrievalService as unknown as JobsServiceDeps[4], - walletSdkService: walletSdkService as unknown as 
JobsServiceDeps[6], + walletSdkService: walletSdkService as unknown as JobsServiceDeps[5], }), expectCheckNotRun: () => expect(retrievalService.performRandomRetrievalForProvider).not.toHaveBeenCalled(), }, @@ -1445,7 +1445,7 @@ describe("JobsService schedule rows", () => { intervalSeconds: 3600, service: buildService({ dealService: dataSetDealService as unknown as JobsServiceDeps[3], - walletSdkService: walletSdkService as unknown as JobsServiceDeps[6], + walletSdkService: walletSdkService as unknown as JobsServiceDeps[5], }), expectCheckNotRun: () => expect(dataSetDealService.createDataSetWithPiece).not.toHaveBeenCalled(), }, diff --git a/apps/backend/src/jobs/jobs.service.ts b/apps/backend/src/jobs/jobs.service.ts index b070de5a..e09cf42c 100644 --- a/apps/backend/src/jobs/jobs.service.ts +++ b/apps/backend/src/jobs/jobs.service.ts @@ -72,10 +72,11 @@ export class JobsService implements OnModuleInit, OnApplicationShutdown { private readonly jobScheduleRepository: JobScheduleRepository, private readonly dealService: DealService, private readonly retrievalService: RetrievalService, - private readonly anonRetrievalService: AnonRetrievalService, private readonly walletSdkService: WalletSdkService, private readonly dataRetentionService: DataRetentionService, private readonly pieceCleanupService: PieceCleanupService, + private readonly anonRetrievalService: AnonRetrievalService, + @InjectMetric("jobs_queued") private readonly jobsQueuedGauge: Gauge, @InjectMetric("jobs_retry_scheduled") From 527283fcc8f082f8af2920fa8367f84e4f87cdb8 Mon Sep 17 00:00:00 2001 From: Dennis Trautwein Date: Tue, 5 May 2026 09:33:25 +0200 Subject: [PATCH 21/28] remove: residual references to a pdp subgraph in the subgraph module --- apps/backend/src/subgraph/subgraph.service.spec.ts | 6 +++--- apps/backend/src/subgraph/subgraph.service.ts | 8 ++++---- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/apps/backend/src/subgraph/subgraph.service.spec.ts 
b/apps/backend/src/subgraph/subgraph.service.spec.ts index 8703b2c5..64f28435 100644 --- a/apps/backend/src/subgraph/subgraph.service.spec.ts +++ b/apps/backend/src/subgraph/subgraph.service.spec.ts @@ -397,14 +397,14 @@ describe("SubgraphService", () => { }); }); - it("throws when PDP subgraph endpoint is not configured", async () => { + it("throws when subgraph endpoint is not configured", async () => { const configService = { get: vi.fn(() => ({ subgraphEndpoint: "" })), } as unknown as ConfigService; const serviceWithoutEndpoint = new SubgraphService(configService); - await expect(serviceWithoutEndpoint.fetchSubgraphMeta()).rejects.toThrow("No PDP subgraph endpoint configured"); + await expect(serviceWithoutEndpoint.fetchSubgraphMeta()).rejects.toThrow("No subgraph endpoint configured"); }); it("throws on HTTP error response", async () => { @@ -740,7 +740,7 @@ describe("SubgraphService", () => { const noEndpointService = new SubgraphService(noEndpointConfig); await expect(noEndpointService.sampleAnonPiece(defaultSampleParams)).rejects.toThrow( - "No PDP subgraph endpoint configured", + "No subgraph endpoint configured", ); expect(fetchMock).not.toHaveBeenCalled(); }); diff --git a/apps/backend/src/subgraph/subgraph.service.ts b/apps/backend/src/subgraph/subgraph.service.ts index 97472c3c..3d4e8370 100644 --- a/apps/backend/src/subgraph/subgraph.service.ts +++ b/apps/backend/src/subgraph/subgraph.service.ts @@ -133,9 +133,9 @@ export class SubgraphService { // candidate pool (which silently no-ops every anon retrieval job). 
this.logger.error({ event: "subgraph_endpoint_not_configured", - message: "Cannot sample anonymous piece — no PDP subgraph endpoint configured", + message: "Cannot sample anonymous piece — no subgraph endpoint configured", }); - throw new Error("No PDP subgraph endpoint configured"); + throw new Error("No subgraph endpoint configured"); } const query = buildSampleAnonPieceQuery(params.pool); @@ -194,7 +194,7 @@ export class SubgraphService { attempt: number = 1, ): Promise { if (!this.blockchainConfig.subgraphEndpoint) { - throw new Error("No PDP subgraph endpoint configured"); + throw new Error("No subgraph endpoint configured"); } try { @@ -297,7 +297,7 @@ export class SubgraphService { attempt: number = 1, ): Promise { if (!this.blockchainConfig.subgraphEndpoint) { - throw new Error("No PDP subgraph endpoint configured"); + throw new Error("No subgraph endpoint configured"); } const variables = { From 8dfb3ca9f2508cb24dea95f22d5380d65643d51c Mon Sep 17 00:00:00 2001 From: Dennis Trautwein Date: Fri, 15 May 2026 21:10:42 +0200 Subject: [PATCH 22/28] Apply suggestion from @BigLep Co-authored-by: Steve Loeppky --- docs/checks/anon-retrievals.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/checks/anon-retrievals.md b/docs/checks/anon-retrievals.md index 0a303462..2c15f3ed 100644 --- a/docs/checks/anon-retrievals.md +++ b/docs/checks/anon-retrievals.md @@ -30,7 +30,7 @@ Operational timeouts exist to prevent jobs from running indefinitely. If the job ## Piece Selection -Unlike the [Retrieval check](./retrievals.md#piece-selection), dealbot does not retrieve from its own deals. Pieces are sampled from the on-chain subgraph of all FWSS-served pieces for the SP under test. +Unlike the [Retrieval check](./retrievals.md#piece-selection), dealbot does not retrieve from its own deals. Pieces are sampled from the [on-chain subgraph](../../src/subgraph) of all FWSS-served pieces for the SP under test. 
Selection strategy (per scheduled job, per SP): @@ -59,7 +59,7 @@ flowchart TD Select["Sample anonymous piece for SP from subgraph"] --> Fetch["GET /piece/{pieceCid}"] Fetch --> CommP["Hash bytes → verify CommP"] CommP --> HasIpfs{"piece.withIPFSIndexing
and ipfsRootCid?"} - HasIpfs -- "no" --> Record["Persist row + metrics"] + HasIpfs -- "no" --> Record["Persist ClickHouse row + emit Prometheus metrics"] HasIpfs -- "yes" --> ParseCar["Parse bytes as CAR"] ParseCar --> SampleBlocks["Pick N random CIDs
(ANON_RETRIEVAL_BLOCK_SAMPLE_COUNT)"] SampleBlocks --> Ipni["IPNI: verify SP advertises root + sampled CIDs"] From b8a2621ce5747d2d351066147d4d3487dbd56169 Mon Sep 17 00:00:00 2001 From: Dennis Trautwein Date: Fri, 15 May 2026 22:01:13 +0200 Subject: [PATCH 23/28] chore: align pnpm-lock.yaml with main --- pnpm-lock.yaml | 36 ++++++++++++++++++------------------ 1 file changed, 18 insertions(+), 18 deletions(-) diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index 8089b756..0495aa11 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -1513,24 +1513,24 @@ packages: engines: {node: ^14.18.0 || >=16.10.0, npm: '>=5.10.0'} hasBin: true - '@oclif/core@4.10.6': - resolution: {integrity: sha512-ySCOYnPKZE3KACT1V9It99hWG9b8E5MpagbRdWxPNRO3beMqmbr4SLUQoFtZ9XRtW++kks1ZVwZOdpnR8rpb9A==} + '@oclif/core@4.10.5': + resolution: {integrity: sha512-qcdCF7NrdWPfme6Kr34wwljRCXbCVpL1WVxiNy0Ep6vbWKjxAjFQwuhqkoyL0yjI+KdwtLcOCGn5z2yzdijc8w==} engines: {node: '>=18.0.0'} '@oclif/core@4.5.5': resolution: {integrity: sha512-iQzlaJQgPeUXrtrX71OzDwxPikQ7c2FhNd8U8rBB7BCtj2XYfmzBT/Hmbc+g9OKDIG/JkbJT0fXaWMMBrhi+1A==} engines: {node: '>=18.0.0'} - '@oclif/plugin-autocomplete@3.2.46': - resolution: {integrity: sha512-TFvuD6JlmqEVsEvMqunyj3cyCz/l2Q4MqCjp/XtlSLS9x3xTlam7PGlqWi4WAhxl/K8CtpYqVlMYFEnlLTHspw==} + '@oclif/plugin-autocomplete@3.2.45': + resolution: {integrity: sha512-ENrUg8rbVCjh40uvi3MC9kGbiUoEf11nyqE59RBzegeeLpRXNo/Zp27L9j1tUmPEqGgfS2/wvHPihNzkpK1FDw==} engines: {node: '>=18.0.0'} - '@oclif/plugin-not-found@3.2.81': - resolution: {integrity: sha512-M88tLONBH36hLAbkFbmCo1hoZPSdU5l8Px1xEIlIgSmGMam+CoAzx4kGqpLbokgfpaHeP8/Jx3QJ18u9ef/2Qw==} + '@oclif/plugin-not-found@3.2.80': + resolution: {integrity: sha512-yTLjWvR1r/Rd/cO2LxHdMCDoL5sQhBYRUcOMCmxZtWVWhx4rAZ8KVUPDVsb+SvjJDV5ADTDBgt1H52fFx7YWqg==} engines: {node: '>=18.0.0'} - '@oclif/plugin-warn-if-update-available@3.1.61': - resolution: {integrity: sha512-4XcrTxcCs+brR/eZ0BPeuiREiH3USlJiaHbUqPhnIBuyxhhUSYVd8ZO6s5MQN7AXJq4SMQ+B5zLaHq+ep/afIw==} 
+ '@oclif/plugin-warn-if-update-available@3.1.60': + resolution: {integrity: sha512-cRKBZm14IuA6G8W84dfd3iXj3BTAoxQ5o3pUE8DKEQ4n/tVha20t5nkVeD+ISC68e0Fuw5koTMvRwXb1lJSnzg==} engines: {node: '>=18.0.0'} '@open-draft/deferred-promise@2.2.0': @@ -7599,9 +7599,9 @@ snapshots: dependencies: '@float-capital/float-subgraph-uncrashable': 0.0.0-internal-testing.5 '@oclif/core': 4.5.5 - '@oclif/plugin-autocomplete': 3.2.46 - '@oclif/plugin-not-found': 3.2.81(@types/node@25.2.3) - '@oclif/plugin-warn-if-update-available': 3.1.61 + '@oclif/plugin-autocomplete': 3.2.45 + '@oclif/plugin-not-found': 3.2.80(@types/node@25.6.2) + '@oclif/plugin-warn-if-update-available': 3.1.60 '@pinax/graph-networks-registry': 0.7.1 '@whatwg-node/fetch': 0.10.13 assemblyscript: 0.19.23 @@ -8937,7 +8937,7 @@ snapshots: dependencies: consola: 3.4.2 - '@oclif/core@4.10.6': + '@oclif/core@4.10.5': dependencies: ansi-escapes: 4.3.2 ansis: 3.17.0 @@ -8979,7 +8979,7 @@ snapshots: wordwrap: 1.0.0 wrap-ansi: 7.0.0 - '@oclif/plugin-autocomplete@3.2.46': + '@oclif/plugin-autocomplete@3.2.45': dependencies: '@oclif/core': 4.5.5 ansis: 3.17.0 @@ -8988,16 +8988,16 @@ snapshots: transitivePeerDependencies: - supports-color - '@oclif/plugin-not-found@3.2.81(@types/node@25.2.3)': + '@oclif/plugin-not-found@3.2.80(@types/node@25.6.2)': dependencies: - '@inquirer/prompts': 7.10.1(@types/node@25.2.3) - '@oclif/core': 4.10.6 + '@inquirer/prompts': 7.10.1(@types/node@25.6.2) + '@oclif/core': 4.10.5 ansis: 3.17.0 fast-levenshtein: 3.0.0 transitivePeerDependencies: - '@types/node' - '@oclif/plugin-warn-if-update-available@3.1.61': + '@oclif/plugin-warn-if-update-available@3.1.60': dependencies: '@oclif/core': 4.5.5 ansis: 3.17.0 @@ -11779,7 +11779,7 @@ snapshots: dependencies: foreground-child: 3.3.1 jackspeak: 4.2.3 - minimatch: 10.2.5 + minimatch: 10.2.4 minipass: 7.1.2 package-json-from-dist: 1.0.1 path-scurry: 2.0.1 From 70af7c07ad194a71b8c82f05e391e386daba1827 Mon Sep 17 00:00:00 2001 From: Dennis Trautwein Date: 
Fri, 15 May 2026 22:16:25 +0200 Subject: [PATCH 24/28] fix: wrong reference to an old maximum anon retrieval piece size --- apps/backend/.env.example | 2 +- apps/backend/src/config/app.config.ts | 4 ++-- docs/environment-variables.md | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/apps/backend/.env.example b/apps/backend/.env.example index 30556e7a..807de908 100644 --- a/apps/backend/.env.example +++ b/apps/backend/.env.example @@ -67,7 +67,7 @@ JOB_SCHEDULE_PHASE_SECONDS=0 JOB_ENQUEUE_JITTER_SECONDS=0 DEAL_JOB_TIMEOUT_SECONDS=360 # 6m: Max runtime for deal jobs (TODO: reduce default to 3m) RETRIEVAL_JOB_TIMEOUT_SECONDS=60 # 1m: Max runtime for retrieval jobs (TODO: reduce default to 30s) -ANON_RETRIEVAL_JOB_TIMEOUT_SECONDS=360 # 6m: Max runtime for anon retrieval jobs (pieces up to ~70 MiB) +ANON_RETRIEVAL_JOB_TIMEOUT_SECONDS=360 # 6m: Max runtime for anon retrieval jobs (pieces up to ~500 MiB) IPFS_BLOCK_FETCH_CONCURRENCY=6 # Parallel block fetches when validating IPFS DAGs DEALBOT_PGBOSS_POOL_MAX=1 DEALBOT_PGBOSS_SCHEDULER_ENABLED=true diff --git a/apps/backend/src/config/app.config.ts b/apps/backend/src/config/app.config.ts index 7906be8c..49b55606 100644 --- a/apps/backend/src/config/app.config.ts +++ b/apps/backend/src/config/app.config.ts @@ -101,7 +101,7 @@ export const configValidationSchema = Joi.object({ JOB_ENQUEUE_JITTER_SECONDS: Joi.number().min(0).default(0), DEAL_JOB_TIMEOUT_SECONDS: Joi.number().min(120).default(360), // 6 minutes max runtime for data storage jobs (TODO: reduce default to 3 minutes) RETRIEVAL_JOB_TIMEOUT_SECONDS: Joi.number().min(60).default(60), // 1 minute max runtime for retrieval jobs (TODO: reduce default to 30 seconds) - ANON_RETRIEVAL_JOB_TIMEOUT_SECONDS: Joi.number().min(60).default(360), // 6 minutes max runtime for anon retrieval jobs (pieces can be up to ~70 MiB) + ANON_RETRIEVAL_JOB_TIMEOUT_SECONDS: Joi.number().min(60).default(360), // 6 minutes max runtime for anon retrieval jobs (pieces can be up 
to 500 MiB) DATA_SET_CREATION_JOB_TIMEOUT_SECONDS: Joi.number().min(60).default(300), // 5 minutes max runtime for dataset creation jobs IPFS_BLOCK_FETCH_CONCURRENCY: Joi.number().integer().min(1).max(32).default(6), ANON_RETRIEVAL_BLOCK_SAMPLE_COUNT: Joi.number().integer().min(1).max(50).default(5), @@ -281,7 +281,7 @@ export interface IJobsConfig { /** * Maximum runtime (seconds) for anonymous retrieval jobs before forced abort. * - * Anonymous retrievals fetch arbitrary pieces (up to ~70 MiB), so this is + * Anonymous retrievals fetch arbitrary pieces (up to ~500 MiB), so this is * typically larger than `retrievalJobTimeoutSeconds`. Uses AbortController * to actively cancel job execution while still persisting partial metrics. */ diff --git a/docs/environment-variables.md b/docs/environment-variables.md index 91e28abc..547170ac 100644 --- a/docs/environment-variables.md +++ b/docs/environment-variables.md @@ -832,7 +832,7 @@ Use this to stagger multiple dealbot deployments that are not sharing a database - **Minimum**: `60` - **Enforced**: Yes (config validation) -**Role**: Maximum runtime for anonymous retrieval jobs before forced abort. Anonymous retrievals fetch arbitrary pieces (up to ~70 MiB) that were not produced by the dealbot, so this is typically larger than `RETRIEVAL_JOB_TIMEOUT_SECONDS`. When the timeout trips, partial metrics (`ttfb_ms`, `bytes_retrieved`, `response_code`) are still persisted so the abort is not silently lost. +**Role**: Maximum runtime for anonymous retrieval jobs before forced abort. Anonymous retrievals fetch arbitrary pieces (up to ~500 MiB) that were not produced by the dealbot, so this is typically larger than `RETRIEVAL_JOB_TIMEOUT_SECONDS`. When the timeout trips, partial metrics (`ttfb_ms`, `bytes_retrieved`, `response_code`) are still persisted so the abort is not silently lost. 
**When to update**: From b003d78250412cecf36a86fa5f0f78f60876cc47 Mon Sep 17 00:00:00 2001 From: Dennis Trautwein Date: Fri, 15 May 2026 22:17:16 +0200 Subject: [PATCH 25/28] docs: improve anon retrieval documentation --- docs/checks/events-and-metrics.md | 2 +- docs/environment-variables.md | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/docs/checks/events-and-metrics.md b/docs/checks/events-and-metrics.md index fba8b003..9c8a5ae0 100644 --- a/docs/checks/events-and-metrics.md +++ b/docs/checks/events-and-metrics.md @@ -4,7 +4,7 @@ This document is the intended **source of truth** for the events emitted by deal > **Note on "events":** the entries in the [Event List](#event-list) are named **timing markers** used to define metric Timer Starts/Ends — they are not all emitted as discrete Prometheus events or log lines. Each marker is anchored in code (as a timestamp variable, log line, or status transition) and used to compute the metrics in the [Metrics](#metrics) section. -## Anonymous Retrieval Event Model +## Anonymous Retrieval Event Model The [Anonymous Retrieval check](./anon-retrievals.md) is a single-shot flow per piece: select → fetch piece → (optional) parse CAR + IPNI + block fetch → write one ClickHouse row. diff --git a/docs/environment-variables.md b/docs/environment-variables.md index 547170ac..72fadca0 100644 --- a/docs/environment-variables.md +++ b/docs/environment-variables.md @@ -431,9 +431,9 @@ Session keys are scoped (only storage operations, not deposits or withdrawals) a - **Required**: No - **Default**: Empty string (feature disabled) -**Role**: The Graph API endpoint for the upstream pdp-explorer subgraph. Drives the data-retention overdue-periods metric. +**Role**: The Graph API endpoint for querying PDP (Proof of Data Possession) subgraph data. This endpoint is used to retrieve data retention info for provider data.
-This variable is kept distinct from [`SUBGRAPH_ENDPOINT`](#subgraph_endpoint) so the dealbot-owned subgraph can be rolled out incrementally — only the new anonymous-retrieval flow points at the new endpoint while the established data-retention path stays on the upstream subgraph. +This variable is kept distinct from [`SUBGRAPH_ENDPOINT`](#subgraph_endpoint) so the [dealbot-owned subgraph](../apps/subgraph) can be rolled out incrementally. Only the newer [anonymous-retrieval check](./checks/anon-retrievals.md) points at the new endpoint while the established [data-retention check](./checks/data-retention.md) stays on the upstream subgraph. **When to update**: @@ -455,7 +455,7 @@ PDP_SUBGRAPH_ENDPOINT=https://api.thegraph.com/subgraphs/filecoin/pdp **Role**: The Graph API endpoint for the dealbot-owned subgraph. Currently drives only the [anonymous-retrieval](./checks/anon-retrievals.md) candidate-piece query. Once the dealbot-owned subgraph has soaked in production it is intended to replace [`PDP_SUBGRAPH_ENDPOINT`](#pdp_subgraph_endpoint). -The dealbot-owned subgraph lives at `apps/subgraph/` (package `@dealbot/subgraph`) and is deployed to Goldsky. Point this variable at one of those slots; the exact slugs are documented in `apps/subgraph/README.md`. +The dealbot-owned subgraph lives at [`apps/subgraph/`](../apps/subgraph) (package `@dealbot/subgraph`) and is deployed to [Goldsky](https://goldsky.com).
**When to update**: From 21b4f2d5045dc6261b915c7e14c75521ddb83d89 Mon Sep 17 00:00:00 2001 From: Dennis Trautwein Date: Fri, 15 May 2026 22:54:51 +0200 Subject: [PATCH 26/28] docs: fix accidental changes to untouched event descriptions --- docs/checks/events-and-metrics.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/checks/events-and-metrics.md b/docs/checks/events-and-metrics.md index 9c8a5ae0..1e9d8583 100644 --- a/docs/checks/events-and-metrics.md +++ b/docs/checks/events-and-metrics.md @@ -118,8 +118,8 @@ sequenceDiagram | `ipfsRetrievalHttpResponseCode` | Data Storage, Retrieval | [`ipfsRetrievalLastByteReceived`](#ipfsRetrievalLastByteReceived) | `200`, `500`, `2xxSuccess`, `4xxClientError`, `5xxServerError`, `otherHttpStatusCodes`, `failure` | [`retrieval.service.ts`](../../apps/backend/src/retrieval/retrieval.service.ts) | | `retrievalStatus` | Data Storage, Retrieval | [`ipfsRetrievalIntegrityChecked`](#ipfsRetrievalIntegrityChecked) | `success`, `failure.timedout`, `failure.other` from [Data Storage Sub-status meanings](./data-storage.md#sub-status-meanings). 
| | | `dataSetCreationStatus` | Data-Set Creation | Not tied to an [event above](#event-list) but rather to data-set creation start (`pending`) and completion (`success`/`failure.*`) | `pending`, `success`, `failure.timedout`, `failure.other` | [`deal.service.ts`](../../apps/backend/src/deal/deal.service.ts) | -| `dataSetChallengeStatus` | Data Retention | Not tied to an [event above](#event-list) but rather to the periodic chain-checking done in the [Data Retention Check](./data-retention.md) | `success`, `failure` | [`data-retention.service.ts`](../../apps/backend/src/data-retention/data-retention.service.ts) | -| `pdp_provider_overdue_periods` | Data Retention | Emitted on every poll | Gauge value (estimated overdue periods) | [`data-retention.service.ts`](../../apps/backend/src/data-retention/data-retention.service.ts) | +| `dataSetChallengeStatus` | Data Retention | Emitted on each [Data Retention Check](./data-retention.md) poll when a provider's confirmed proving-period totals advance (strictly positive deltas). Unit: **challenges** (period delta × `CHALLENGES_PER_PROVING_PERIOD = 5`). | `success` (challenges in successfully-proven periods), `failure` (challenges in faulted periods) | [`data-retention.service.ts`](../../apps/backend/src/data-retention/data-retention.service.ts) | +| `pdp_provider_estimated_overdue_periods` | Data Retention | Emitted on every [Data Retention Check](./data-retention.md) poll for every successfully processed provider. | Gauge value in proving periods (non-negative integer) | [`data-retention.service.ts`](../../apps/backend/src/data-retention/data-retention.service.ts) | | `anonRetrievalStatus` | Anonymous Retrieval | After piece fetch completes (or on abort) | `success` (HTTP 2xx **and** CommP matches), `failure.http`, `failure.commp` (HTTP 2xx but bytes hashed to a different CID), `failure.aborted`, `failure.no_piece`. 
| [`anon-retrieval.service.ts`](../../apps/backend/src/retrieval-anon/anon-retrieval.service.ts) | | `anonPieceHttpResponseCode` | Anonymous Retrieval | After piece fetch completes | `200`, `500`, `2xxSuccess`, `4xxClientError`, `5xxServerError`, `otherHttpStatusCodes`, `failure` (same classifier as [`ipfsRetrievalHttpResponseCode`](#ipfsRetrievalHttpResponseCode)) | [`anon-retrieval.service.ts`](../../apps/backend/src/retrieval-anon/anon-retrieval.service.ts) | | `anonCarParseStatus` | Anonymous Retrieval | After CAR validation runs (skipped when piece fetch failed or piece is not IPFS-indexed) | `parseable`, `not_parseable` | [`anon-retrieval.service.ts`](../../apps/backend/src/retrieval-anon/anon-retrieval.service.ts) | From a4f0b38fdb789de01d1dbff7e8977434320c0008 Mon Sep 17 00:00:00 2001 From: Dennis Trautwein Date: Fri, 15 May 2026 22:56:32 +0200 Subject: [PATCH 27/28] rename: metric anonRetrievalStatus to anonPieceRetrievalStatus https://github.com/FilOzone/dealbot/pull/487/changes#r3245245410 --- apps/backend/src/metrics-prometheus/check-metrics.service.ts | 2 +- .../src/metrics-prometheus/metrics-prometheus.module.ts | 4 ++-- docs/checks/anon-retrievals.md | 2 +- docs/checks/events-and-metrics.md | 4 ++-- 4 files changed, 6 insertions(+), 6 deletions(-) diff --git a/apps/backend/src/metrics-prometheus/check-metrics.service.ts b/apps/backend/src/metrics-prometheus/check-metrics.service.ts index 8d4be313..76a8ee31 100644 --- a/apps/backend/src/metrics-prometheus/check-metrics.service.ts +++ b/apps/backend/src/metrics-prometheus/check-metrics.service.ts @@ -260,7 +260,7 @@ export class AnonRetrievalCheckMetrics { private readonly throughputBps: Histogram, @InjectMetric("anonRetrievalCheckMs") private readonly checkMs: Histogram, - @InjectMetric("anonRetrievalStatus") + @InjectMetric("anonPieceRetrievalStatus") private readonly statusCounter: Counter, @InjectMetric("anonPieceHttpResponseCode") private readonly httpResponseCounter: Counter, diff --git 
a/apps/backend/src/metrics-prometheus/metrics-prometheus.module.ts b/apps/backend/src/metrics-prometheus/metrics-prometheus.module.ts index 45f728b6..4ebeb01a 100644 --- a/apps/backend/src/metrics-prometheus/metrics-prometheus.module.ts +++ b/apps/backend/src/metrics-prometheus/metrics-prometheus.module.ts @@ -234,8 +234,8 @@ const metricProviders = [ buckets: [100, 500, 1000, 2000, 5000, 10000, 30000, 60000, 120000, 300000, 600000], }), makeCounterProvider({ - name: "anonRetrievalStatus", - help: "Anonymous retrieval overall outcome", + name: "anonPieceRetrievalStatus", + help: "Anonymous piece retrieval overall outcome", labelNames: ["checkType", "providerId", "providerName", "providerStatus", "value"] as const, }), makeCounterProvider({ diff --git a/docs/checks/anon-retrievals.md b/docs/checks/anon-retrievals.md index 2c15f3ed..c3b69610 100644 --- a/docs/checks/anon-retrievals.md +++ b/docs/checks/anon-retrievals.md @@ -91,7 +91,7 @@ Source: [`car-validation.service.ts`](../../apps/backend/src/retrieval-anon/car- | # | Assertion | How It's Checked | Retries | Relevant Metric | Implemented? 
| |---|-----------|------------------|:---:|------------------|:---:| | 1 | SP serves the piece | `GET /piece/{pieceCid}` returns HTTP 2xx | 0 | [`anonPieceRetrievalLastByteMs`](./events-and-metrics.md#anonPieceRetrievalLastByteMs) | Yes | -| 2 | Bytes match the declared CommP | Hash of response bytes equals `pieceCid` | 0 | [`anonRetrievalStatus`](./events-and-metrics.md#anonRetrievalStatus) | Yes | +| 2 | Bytes match the declared CommP | Hash of response bytes equals `pieceCid` | 0 | [`anonPieceRetrievalStatus`](./events-and-metrics.md#anonPieceRetrievalStatus) | Yes | | 3 | Bytes parse as a CAR (IPFS-indexed pieces only) | `@ipld/car` parses the response | 0 | [`anonCarParseStatus`](./events-and-metrics.md#anonCarParseStatus) | Yes | | 4 | SP is advertised on IPNI for root + sampled CIDs | filecoinpin.contact returns provider records | polling until timeout | [`anonIpniStatus`](./events-and-metrics.md#anonIpniStatus) | Yes | | 5 | Sampled blocks fetch + hash-verify | `/ipfs/{cid}?format=raw` for each sample | 0 | [`anonBlockFetchStatus`](./events-and-metrics.md#anonBlockFetchStatus) | Yes | diff --git a/docs/checks/events-and-metrics.md b/docs/checks/events-and-metrics.md index 1e9d8583..2421242c 100644 --- a/docs/checks/events-and-metrics.md +++ b/docs/checks/events-and-metrics.md @@ -10,7 +10,7 @@ The [Anonymous Retrieval check](./anon-retrievals.md) is a single-shot flow per It is not modeled as a sequence of named lifecycle events. Instead it emits: -- **Outcome metrics** when each step completes — see the [time](#time-related-metrics) and [status](#status-count-related-metrics) metric tables for `anonPieceRetrievalFirstByteMs`, `anonRetrievalCheckMs`, `anonRetrievalStatus`, `anonCarParseStatus`, `anonIpniStatus`, `anonBlockFetchStatus`, and friends. 
+- **Outcome metrics** when each step completes — see the [time](#time-related-metrics) and [status](#status-count-related-metrics) metric tables for `anonPieceRetrievalFirstByteMs`, `anonRetrievalCheckMs`, `anonPieceRetrievalStatus`, `anonCarParseStatus`, `anonIpniStatus`, `anonBlockFetchStatus`, and friends. - **One row per attempt** in the `anon_retrieval_checks` [ClickHouse table](#clickhouse-tables), emitted even on abort or unexpected error. - **Structured log lines** (`anon_retrieval_started`, `anon_retrieval_completed`, `anon_retrieval_no_piece`, `anon_retrieval_car_validation_failed`, `anon_retrieval_clickhouse_insert_failed`) carrying a `retrievalId` so each row can be joined back to log evidence. @@ -120,7 +120,7 @@ sequenceDiagram | `dataSetCreationStatus` | Data-Set Creation | Not tied to an [event above](#event-list) but rather to data-set creation start (`pending`) and completion (`success`/`failure.*`) | `pending`, `success`, `failure.timedout`, `failure.other` | [`deal.service.ts`](../../apps/backend/src/deal/deal.service.ts) | | `dataSetChallengeStatus` | Data Retention | Emitted on each [Data Retention Check](./data-retention.md) poll when a provider's confirmed proving-period totals advance (strictly positive deltas). Unit: **challenges** (period delta × `CHALLENGES_PER_PROVING_PERIOD = 5`). | `success` (challenges in successfully-proven periods), `failure` (challenges in faulted periods) | [`data-retention.service.ts`](../../apps/backend/src/data-retention/data-retention.service.ts) | | `pdp_provider_estimated_overdue_periods` | Data Retention | Emitted on every [Data Retention Check](./data-retention.md) poll for every successfully processed provider. 
| Gauge value in proving periods (non-negative integer) | [`data-retention.service.ts`](../../apps/backend/src/data-retention/data-retention.service.ts) | -| `anonRetrievalStatus` | Anonymous Retrieval | After piece fetch completes (or on abort) | `success` (HTTP 2xx **and** CommP matches), `failure.http`, `failure.commp` (HTTP 2xx but bytes hashed to a different CID), `failure.aborted`, `failure.no_piece`. | [`anon-retrieval.service.ts`](../../apps/backend/src/retrieval-anon/anon-retrieval.service.ts) | +| `anonPieceRetrievalStatus` | Anonymous Retrieval | After piece fetch completes (or on abort) | `success` (HTTP 2xx **and** CommP matches), `failure.http`, `failure.commp` (HTTP 2xx but bytes hashed to a different CID), `failure.aborted`, `failure.no_piece`. | [`anon-retrieval.service.ts`](../../apps/backend/src/retrieval-anon/anon-retrieval.service.ts) | | `anonPieceHttpResponseCode` | Anonymous Retrieval | After piece fetch completes | `200`, `500`, `2xxSuccess`, `4xxClientError`, `5xxServerError`, `otherHttpStatusCodes`, `failure` (same classifier as [`ipfsRetrievalHttpResponseCode`](#ipfsRetrievalHttpResponseCode)) | [`anon-retrieval.service.ts`](../../apps/backend/src/retrieval-anon/anon-retrieval.service.ts) | | `anonCarParseStatus` | Anonymous Retrieval | After CAR validation runs (skipped when piece fetch failed or piece is not IPFS-indexed) | `parseable`, `not_parseable` | [`anon-retrieval.service.ts`](../../apps/backend/src/retrieval-anon/anon-retrieval.service.ts) | | `anonIpniStatus` | Anonymous Retrieval | After CAR validation runs, **or** when piece fetch failed (records `skipped`) | `valid`, `invalid`, `skipped`, `error` | [`anon-retrieval.service.ts`](../../apps/backend/src/retrieval-anon/anon-retrieval.service.ts) | From 1a32373e29e7868c39f19d68ded25e3d238b9858 Mon Sep 17 00:00:00 2001 From: Dennis Trautwein Date: Fri, 15 May 2026 22:58:07 +0200 Subject: [PATCH 28/28] fix: interpret abort signal as timed out for metric --- 
apps/backend/src/retrieval-anon/anon-retrieval.service.ts | 2 +- docs/checks/events-and-metrics.md | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/apps/backend/src/retrieval-anon/anon-retrieval.service.ts b/apps/backend/src/retrieval-anon/anon-retrieval.service.ts index eddc88f0..a74c2bf0 100644 --- a/apps/backend/src/retrieval-anon/anon-retrieval.service.ts +++ b/apps/backend/src/retrieval-anon/anon-retrieval.service.ts @@ -142,7 +142,7 @@ export class AnonRetrievalService { pieceServedCorrectly ? "success" : pieceResult.aborted - ? "failure.aborted" + ? "failure.timedout" : pieceResult.success ? "failure.commp" : "failure.http", diff --git a/docs/checks/events-and-metrics.md b/docs/checks/events-and-metrics.md index 2421242c..37761e89 100644 --- a/docs/checks/events-and-metrics.md +++ b/docs/checks/events-and-metrics.md @@ -120,7 +120,7 @@ sequenceDiagram | `dataSetCreationStatus` | Data-Set Creation | Not tied to an [event above](#event-list) but rather to data-set creation start (`pending`) and completion (`success`/`failure.*`) | `pending`, `success`, `failure.timedout`, `failure.other` | [`deal.service.ts`](../../apps/backend/src/deal/deal.service.ts) | | `dataSetChallengeStatus` | Data Retention | Emitted on each [Data Retention Check](./data-retention.md) poll when a provider's confirmed proving-period totals advance (strictly positive deltas). Unit: **challenges** (period delta × `CHALLENGES_PER_PROVING_PERIOD = 5`). | `success` (challenges in successfully-proven periods), `failure` (challenges in faulted periods) | [`data-retention.service.ts`](../../apps/backend/src/data-retention/data-retention.service.ts) | | `pdp_provider_estimated_overdue_periods` | Data Retention | Emitted on every [Data Retention Check](./data-retention.md) poll for every successfully processed provider. 
| Gauge value in proving periods (non-negative integer) | [`data-retention.service.ts`](../../apps/backend/src/data-retention/data-retention.service.ts) | -| `anonPieceRetrievalStatus` | Anonymous Retrieval | After piece fetch completes (or on abort) | `success` (HTTP 2xx **and** CommP matches), `failure.http`, `failure.commp` (HTTP 2xx but bytes hashed to a different CID), `failure.aborted`, `failure.no_piece`. | [`anon-retrieval.service.ts`](../../apps/backend/src/retrieval-anon/anon-retrieval.service.ts) | +| `anonPieceRetrievalStatus` | Anonymous Retrieval | After piece fetch completes (or on abort) | `success` (HTTP 2xx **and** CommP matches), `failure.http`, `failure.commp` (HTTP 2xx but bytes hashed to a different CID), `failure.timedout`, `failure.no_piece`. | [`anon-retrieval.service.ts`](../../apps/backend/src/retrieval-anon/anon-retrieval.service.ts) | | `anonPieceHttpResponseCode` | Anonymous Retrieval | After piece fetch completes | `200`, `500`, `2xxSuccess`, `4xxClientError`, `5xxServerError`, `otherHttpStatusCodes`, `failure` (same classifier as [`ipfsRetrievalHttpResponseCode`](#ipfsRetrievalHttpResponseCode)) | [`anon-retrieval.service.ts`](../../apps/backend/src/retrieval-anon/anon-retrieval.service.ts) | | `anonCarParseStatus` | Anonymous Retrieval | After CAR validation runs (skipped when piece fetch failed or piece is not IPFS-indexed) | `parseable`, `not_parseable` | [`anon-retrieval.service.ts`](../../apps/backend/src/retrieval-anon/anon-retrieval.service.ts) | | `anonIpniStatus` | Anonymous Retrieval | After CAR validation runs, **or** when piece fetch failed (records `skipped`) | `valid`, `invalid`, `skipped`, `error` | [`anon-retrieval.service.ts`](../../apps/backend/src/retrieval-anon/anon-retrieval.service.ts) |