diff --git a/.gitignore b/.gitignore index fc72832b..cbf7f9d7 100644 --- a/.gitignore +++ b/.gitignore @@ -21,3 +21,5 @@ coverage/ # per-package lockfiles are stray apps/*/pnpm-lock.yaml !pnpm-lock.yaml + +.tool-versions diff --git a/apps/backend/.env.example b/apps/backend/.env.example index 6815a66f..807de908 100644 --- a/apps/backend/.env.example +++ b/apps/backend/.env.example @@ -23,7 +23,11 @@ WALLET_ADDRESS=0x0000000000000000000000000000000000000000 WALLET_PRIVATE_KEY=your_private_key_here CHECK_DATASET_CREATION_FEES=true USE_ONLY_APPROVED_PROVIDERS=true +# Upstream pdp-explorer subgraph — drives the data-retention / overdue-periods path. PDP_SUBGRAPH_ENDPOINT=https://api.thegraph.com/subgraphs/filecoin/pdp +# Dealbot-owned subgraph on Goldsky (see apps/subgraph/README.md) — drives only +# the new anonymous-retrieval candidate-piece query for now. +SUBGRAPH_ENDPOINT=https://api.goldsky.com/api/public//subgraphs/dealbot-subgraph//gn # Minimum number of datasets per SP (default: 1). When > 1, a separate data_set_creation job provisions extra datasets. 
MIN_NUM_DATASETS_FOR_CHECKS=1 @@ -52,6 +56,9 @@ DEALBOT_MAINTENANCE_WINDOW_MINUTES=20 DEALS_PER_SP_PER_HOUR=2 DATASET_CREATIONS_PER_SP_PER_HOUR=1 RETRIEVALS_PER_SP_PER_HOUR=1 +RETRIEVALS_ANON_PER_SP_PER_HOUR= +ANON_RETRIEVAL_BLOCK_SAMPLE_COUNT=5 +METRICS_PER_HOUR=2 PG_BOSS_LOCAL_CONCURRENCY=20 JOB_SCHEDULER_POLL_SECONDS=300 JOB_WORKER_POLL_SECONDS=60 @@ -60,6 +67,7 @@ JOB_SCHEDULE_PHASE_SECONDS=0 JOB_ENQUEUE_JITTER_SECONDS=0 DEAL_JOB_TIMEOUT_SECONDS=360 # 6m: Max runtime for deal jobs (TODO: reduce default to 3m) RETRIEVAL_JOB_TIMEOUT_SECONDS=60 # 1m: Max runtime for retrieval jobs (TODO: reduce default to 30s) +ANON_RETRIEVAL_JOB_TIMEOUT_SECONDS=360 # 6m: Max runtime for anon retrieval jobs (pieces up to ~500 MiB) IPFS_BLOCK_FETCH_CONCURRENCY=6 # Parallel block fetches when validating IPFS DAGs DEALBOT_PGBOSS_POOL_MAX=1 DEALBOT_PGBOSS_SCHEDULER_ENABLED=true @@ -73,9 +81,13 @@ PROXY_LIST=http://username:password@host:port,http://username:password@host:port PROXY_LOCATIONS=l1,l2 # Timeout Configuration (in milliseconds) -CONNECT_TIMEOUT_MS=10000 # 10s: Initial connection timeout -HTTP_REQUEST_TIMEOUT_MS=240000 # 4m: Total transfer timeout for HTTP/1.1 (10MiB @ 170KB/s + overhead) -HTTP2_REQUEST_TIMEOUT_MS=240000 # 4m: Total transfer timeout for HTTP/2 (10MiB @ 170KB/s + overhead) +CONNECT_TIMEOUT_MS=10000 # 10s: Connection + response-headers timeout (scoped to the header phase only) +# HTTP_REQUEST_TIMEOUT_MS and HTTP2_REQUEST_TIMEOUT_MS default to the longest job timeout above +# (max of DEAL_/RETRIEVAL_/ANON_RETRIEVAL_/DATA_SET_CREATION_/MAX_PIECE_CLEANUP_ * 1000 ms) so the +# HTTP-level ceiling never pre-empts a job-scoped AbortSignal. Only override when you have a non-job +# caller of HttpClientService that needs a specific deadline. 
+# HTTP_REQUEST_TIMEOUT_MS=360000 +# HTTP2_REQUEST_TIMEOUT_MS=360000 # SP Blocklists configuration # BLOCKED_SP_IDS=1234,5678 diff --git a/apps/backend/README.md b/apps/backend/README.md index 19ee970a..e4dafd6e 100644 --- a/apps/backend/README.md +++ b/apps/backend/README.md @@ -105,6 +105,7 @@ All configuration is done via environment variables in `.env`. | `ENABLE_IPNI_TESTING` | IPNI testing mode (`disabled`/`random`/`always`) | `always` | | `USE_ONLY_APPROVED_PROVIDERS` | Only use approved storage providers | `true` | | `PDP_SUBGRAPH_ENDPOINT` | PDP subgraph API endpoint for PDP proof-set/data-retention | `https://api.thegraph.com/subgraphs/filecoin/pdp` | +| `SUBGRAPH_ENDPOINT` | Subgraph GraphQL endpoint for anon-retrieval queries | `https://api.goldsky.com/api/public//subgraphs/dealbot-subgraph//gn` | ### Scheduling Configuration (pg-boss) diff --git a/apps/backend/src/app.module.ts b/apps/backend/src/app.module.ts index 569ec5e4..0580f339 100644 --- a/apps/backend/src/app.module.ts +++ b/apps/backend/src/app.module.ts @@ -13,6 +13,7 @@ import { JobsModule } from "./jobs/jobs.module.js"; import { MetricsPrometheusModule } from "./metrics-prometheus/metrics-prometheus.module.js"; import { ProvidersModule } from "./providers/providers.module.js"; import { RetrievalModule } from "./retrieval/retrieval.module.js"; +import { RetrievalAnonModule } from "./retrieval-anon/retrieval-anon.module.js"; @Module({ imports: [ @@ -28,6 +29,7 @@ import { RetrievalModule } from "./retrieval/retrieval.module.js"; JobsModule, DealModule, RetrievalModule, + RetrievalAnonModule, DataSourceModule, ProvidersModule, ...(process.env.ENABLE_DEV_MODE === "true" ? 
[DevToolsModule] : []), diff --git a/apps/backend/src/clickhouse/clickhouse.schema.ts b/apps/backend/src/clickhouse/clickhouse.schema.ts index 85d91052..b27ba0e2 100644 --- a/apps/backend/src/clickhouse/clickhouse.schema.ts +++ b/apps/backend/src/clickhouse/clickhouse.schema.ts @@ -62,6 +62,50 @@ export function buildMigrations(database: string): string[] { PARTITION BY toStartOfMonth(timestamp) TTL toDateTime(timestamp) + INTERVAL 1 YEAR`, + `CREATE TABLE IF NOT EXISTS ${database}.anon_retrieval_checks +( + timestamp DateTime64(3, 'UTC'), -- when the check completed + probe_location LowCardinality(String), -- dealbot location + sp_address String, -- storage provider address (lowercased) + sp_id Nullable(UInt64), -- storage provider numeric id + sp_name Nullable(String), -- storage provider name + + retrieval_id UUID, -- per-event correlation id (log/Prometheus join) + + piece_cid String, -- piece CID (v2/CommP) sampled from the subgraph + data_set_id UInt64, -- on-chain data set id + piece_id UInt64, -- on-chain piece id within the data set + raw_size UInt64, -- raw (unpadded) piece size, bytes + with_ipfs_indexing Bool, -- whether the piece advertises IPNI metadata + ipfs_root_cid Nullable(String), -- root CID of the contained DAG; null when not IPFS-indexed + + service_type LowCardinality(String), -- 'direct_sp' (only mode for anon retrievals today) + retrieval_endpoint String, -- URL probed (e.g. {spBaseUrl}/piece/{pieceCid}) + + piece_fetch_status LowCardinality(String), -- 'success' | 'failed' — HTTP transport outcome of GET /piece/ (HTTP 2xx). CommP validity, CAR/IPNI/block-fetch outcomes live in their own columns. 
+ http_response_code Nullable(UInt16), -- raw HTTP status; null on transport failure + first_byte_ms Nullable(Float64), -- time to first response byte + last_byte_ms Nullable(Float64), -- time to last response byte + bytes_retrieved Nullable(UInt64), -- bytes received from /piece/{cid} + throughput_bps Nullable(UInt64), -- effective throughput, bytes per second + + commp_valid Nullable(Bool), -- null when retrieval failed before CommP could be hashed + car_parseable Nullable(Bool), -- null when CAR validation was skipped (no IPFS indexing or piece fetch failed); true if bytes parsed as a CAR + car_block_count Nullable(UInt32), -- total number of blocks observed inside the CAR; null when skipped or unparseable + block_fetch_endpoint Nullable(String), -- gateway base URL probed for block fetch (e.g. {spBaseUrl}/ipfs/); null when skipped + block_fetch_valid Nullable(Bool), -- null when skipped; true if all sampled blocks fetched + hash-verified + block_fetch_sampled_count Nullable(UInt32), -- number of blocks sampled and probed via /ipfs/?format=raw + block_fetch_failed_count Nullable(UInt32), -- number of sampled blocks that failed (non-2xx, hash mismatch, unsupported codec, or transport error) + + ipni_status LowCardinality(String), -- 'valid' | 'invalid' | 'skipped' | 'error' — all-or-nothing across the root CID and the sampled child CIDs (filecoin-pin verifies them as a single batch) + ipni_verify_ms Nullable(Float64), -- IPNI verification duration; null when skipped + + error_message Nullable(String) -- failure reason; null on success +) ENGINE MergeTree() + PRIMARY KEY (probe_location, sp_address, timestamp) + PARTITION BY toStartOfMonth(timestamp) + TTL toDateTime(timestamp) + INTERVAL 1 YEAR`, + `CREATE TABLE IF NOT EXISTS ${database}.data_retention_challenges ( timestamp DateTime64(3, 'UTC'), -- when the poll ran and detected these periods diff --git a/apps/backend/src/config/app.config.ts b/apps/backend/src/config/app.config.ts index b3b32a37..49b55606 
100644 --- a/apps/backend/src/config/app.config.ts +++ b/apps/backend/src/config/app.config.ts @@ -56,7 +56,16 @@ export const configValidationSchema = Joi.object({ USE_ONLY_APPROVED_PROVIDERS: Joi.boolean().default(true), DEALBOT_DATASET_VERSION: Joi.string().optional(), MIN_NUM_DATASETS_FOR_CHECKS: Joi.number().integer().min(1).default(1), + // Two subgraph endpoints coexist intentionally to limit blast radius while we + // migrate off the upstream pdp-explorer subgraph: + // - PDP_SUBGRAPH_ENDPOINT drives the established overdue-periods / data + // retention path against the existing pdp-explorer subgraph. + // - SUBGRAPH_ENDPOINT drives only the new anonymous-retrieval candidate + // piece query against the dealbot-owned subgraph. + // Once the dealbot-owned subgraph has soaked in production we can drop + // PDP_SUBGRAPH_ENDPOINT and route everything through SUBGRAPH_ENDPOINT. PDP_SUBGRAPH_ENDPOINT: Joi.string().uri().optional().allow(""), + SUBGRAPH_ENDPOINT: Joi.string().uri().optional().allow(""), // Scheduling PROVIDERS_REFRESH_INTERVAL_SECONDS: Joi.number().default(4 * 3600), @@ -80,6 +89,7 @@ export const configValidationSchema = Joi.object({ DEALS_PER_SP_PER_HOUR: Joi.number().min(0.001).max(20).default(4), DATASET_CREATIONS_PER_SP_PER_HOUR: Joi.number().min(0.001).max(20).default(1), RETRIEVALS_PER_SP_PER_HOUR: Joi.number().min(0.001).max(20).default(2), + RETRIEVALS_ANON_PER_SP_PER_HOUR: Joi.number().min(0.001).max(20).optional(), // Polling interval for pg-boss scheduler (lower = more responsive, higher = less DB chatter). 
JOB_SCHEDULER_POLL_SECONDS: Joi.number().min(60).default(300), JOB_WORKER_POLL_SECONDS: Joi.number().min(5).default(60), @@ -91,8 +101,10 @@ export const configValidationSchema = Joi.object({ JOB_ENQUEUE_JITTER_SECONDS: Joi.number().min(0).default(0), DEAL_JOB_TIMEOUT_SECONDS: Joi.number().min(120).default(360), // 6 minutes max runtime for data storage jobs (TODO: reduce default to 3 minutes) RETRIEVAL_JOB_TIMEOUT_SECONDS: Joi.number().min(60).default(60), // 1 minute max runtime for retrieval jobs (TODO: reduce default to 30 seconds) + ANON_RETRIEVAL_JOB_TIMEOUT_SECONDS: Joi.number().min(60).default(360), // 6 minutes max runtime for anon retrieval jobs (pieces can be up to 500 MiB) DATA_SET_CREATION_JOB_TIMEOUT_SECONDS: Joi.number().min(60).default(300), // 5 minutes max runtime for dataset creation jobs IPFS_BLOCK_FETCH_CONCURRENCY: Joi.number().integer().min(1).max(32).default(6), + ANON_RETRIEVAL_BLOCK_SAMPLE_COUNT: Joi.number().integer().min(1).max(50).default(5), // Piece Cleanup MAX_DATASET_STORAGE_SIZE_BYTES: Joi.number() @@ -131,8 +143,9 @@ export const configValidationSchema = Joi.object({ // Timeouts (in milliseconds) CONNECT_TIMEOUT_MS: Joi.number().min(1000).default(10000), // 10 seconds to establish connection/receive headers - HTTP_REQUEST_TIMEOUT_MS: Joi.number().min(1000).default(240000), // 4 minutes total for HTTP requests (10MiB @ 170KB/s + overhead) - HTTP2_REQUEST_TIMEOUT_MS: Joi.number().min(1000).default(240000), // 4 minutes total for HTTP/2 requests (10MiB @ 170KB/s + overhead) + // Defaults intentionally omitted so loadConfig can derive them from the longest job timeout. 
+ HTTP_REQUEST_TIMEOUT_MS: Joi.number().min(1000).optional(), + HTTP2_REQUEST_TIMEOUT_MS: Joi.number().min(1000).optional(), IPNI_VERIFICATION_TIMEOUT_MS: Joi.number().min(1000).default(60000), // 60 seconds max time to wait for IPNI verification IPNI_VERIFICATION_POLLING_MS: Joi.number().min(250).default(2000), // 2 seconds between IPNI verification polls @@ -174,6 +187,7 @@ export interface IBlockchainConfig { dealbotDataSetVersion?: string; minNumDataSetsForChecks: number; pdpSubgraphEndpoint?: string; + subgraphEndpoint?: string; // Endpoint of the dealbot-owned subgraph. Eventually replaces `pdpSubgraphEndpoint` } export interface ISchedulingConfig { @@ -264,6 +278,14 @@ export interface IJobsConfig { * Uses AbortController to actively cancel job execution. */ retrievalJobTimeoutSeconds: number; + /** + * Maximum runtime (seconds) for anonymous retrieval jobs before forced abort. + * + * Anonymous retrievals fetch arbitrary pieces (up to ~500 MiB), so this is + * typically larger than `retrievalJobTimeoutSeconds`. Uses AbortController + * to actively cancel job execution while still persisting partial metrics. + */ + anonRetrievalJobTimeoutSeconds: number; /** * Target number of piece cleanup runs per storage provider per hour. * @@ -278,6 +300,12 @@ export interface IJobsConfig { * Only used when `DEALBOT_JOBS_MODE=pgboss`. */ maxPieceCleanupRuntimeSeconds: number; + + /** + * Target number of anonymous retrieval tests per storage provider per hour. + * Defaults to retrievalsPerSpPerHour when not set. + */ + retrievalsAnonPerSpPerHour: number; } export interface IDatasetConfig { @@ -295,6 +323,10 @@ export interface ITimeoutConfig { export interface IRetrievalConfig { ipfsBlockFetchConcurrency: number; + /** + * Number of CAR blocks to sample for IPNI + block-fetch validation. 
+ */ + anonBlockSampleCount: number; } export interface IPieceCleanupConfig { @@ -336,6 +368,43 @@ export interface IConfig { } export function loadConfig(): IConfig { + const jobTimeoutSeconds = { + deal: Number.parseInt(process.env.DEAL_JOB_TIMEOUT_SECONDS || "360", 10), + retrieval: Number.parseInt(process.env.RETRIEVAL_JOB_TIMEOUT_SECONDS || "60", 10), + anonRetrieval: Number.parseInt(process.env.ANON_RETRIEVAL_JOB_TIMEOUT_SECONDS || "360", 10), + dataSetCreation: Number.parseInt(process.env.DATA_SET_CREATION_JOB_TIMEOUT_SECONDS || "300", 10), + pieceCleanup: Number.parseInt(process.env.MAX_PIECE_CLEANUP_RUNTIME_SECONDS || "300", 10), + }; + + // HTTP-level request timeouts default to the longest job timeout so the + // per-request ceiling never caps below the per-job budget. Any job-scoped + // AbortSignal fires first and is authoritative; the HTTP timer only kicks + // in for callers that do not pass a parent signal. + const longestJobTimeoutMs = Math.max(...Object.values(jobTimeoutSeconds)) * 1000; + + const httpRequestTimeoutMs = Number.parseInt(process.env.HTTP_REQUEST_TIMEOUT_MS || String(longestJobTimeoutMs), 10); + const http2RequestTimeoutMs = Number.parseInt( + process.env.HTTP2_REQUEST_TIMEOUT_MS || String(longestJobTimeoutMs), + 10, + ); + + // Misconfiguration guard: if someone explicitly sets an HTTP timeout below + // the longest job timeout, the HTTP-level timer will abort in-flight work + // before the job signal has a chance to report it. Warn loudly so this is + // caught at boot rather than inferred from short-timeout incidents later. + for (const [name, value] of [ + ["HTTP_REQUEST_TIMEOUT_MS", httpRequestTimeoutMs], + ["HTTP2_REQUEST_TIMEOUT_MS", http2RequestTimeoutMs], + ] as const) { + if (value < longestJobTimeoutMs) { + // eslint-disable-next-line no-console + console.warn( + `[config] ${name}=${value}ms is lower than the longest job timeout (${longestJobTimeoutMs}ms). 
` + + `HTTP requests may abort before the job signal fires, producing short, unexplained timeouts.`, + ); + } + } + return { app: { env: process.env.NODE_ENV || "development", @@ -379,6 +448,7 @@ export function loadConfig(): IConfig { dealbotDataSetVersion: process.env.DEALBOT_DATASET_VERSION, minNumDataSetsForChecks: Number.parseInt(process.env.MIN_NUM_DATASETS_FOR_CHECKS || "1", 10), pdpSubgraphEndpoint: process.env.PDP_SUBGRAPH_ENDPOINT || "", + subgraphEndpoint: process.env.SUBGRAPH_ENDPOINT || "", }, scheduling: { providersRefreshIntervalSeconds: Number.parseInt(process.env.PROVIDERS_REFRESH_INTERVAL_SECONDS || "14400", 10), @@ -401,11 +471,15 @@ export function loadConfig(): IConfig { catchupMaxEnqueue: Number.parseInt(process.env.JOB_CATCHUP_MAX_ENQUEUE || "10", 10), schedulePhaseSeconds: Number.parseInt(process.env.JOB_SCHEDULE_PHASE_SECONDS || "0", 10), enqueueJitterSeconds: Number.parseInt(process.env.JOB_ENQUEUE_JITTER_SECONDS || "0", 10), - dealJobTimeoutSeconds: Number.parseInt(process.env.DEAL_JOB_TIMEOUT_SECONDS || "360", 10), - retrievalJobTimeoutSeconds: Number.parseInt(process.env.RETRIEVAL_JOB_TIMEOUT_SECONDS || "60", 10), - dataSetCreationJobTimeoutSeconds: Number.parseInt(process.env.DATA_SET_CREATION_JOB_TIMEOUT_SECONDS || "300", 10), + dealJobTimeoutSeconds: jobTimeoutSeconds.deal, + retrievalJobTimeoutSeconds: jobTimeoutSeconds.retrieval, + anonRetrievalJobTimeoutSeconds: jobTimeoutSeconds.anonRetrieval, + retrievalsAnonPerSpPerHour: Number.parseFloat( + process.env.RETRIEVALS_ANON_PER_SP_PER_HOUR || process.env.RETRIEVALS_PER_SP_PER_HOUR || "2", + ), + dataSetCreationJobTimeoutSeconds: jobTimeoutSeconds.dataSetCreation, pieceCleanupPerSpPerHour: Number.parseFloat(process.env.JOB_PIECE_CLEANUP_PER_SP_PER_HOUR || String(1 / 24)), - maxPieceCleanupRuntimeSeconds: Number.parseInt(process.env.MAX_PIECE_CLEANUP_RUNTIME_SECONDS || "300", 10), + maxPieceCleanupRuntimeSeconds: jobTimeoutSeconds.pieceCleanup, }, dataset: { localDatasetsPath: 
process.env.DEALBOT_LOCAL_DATASETS_PATH || DEFAULT_LOCAL_DATASETS_PATH, @@ -427,13 +501,14 @@ export function loadConfig(): IConfig { }, timeouts: { connectTimeoutMs: Number.parseInt(process.env.CONNECT_TIMEOUT_MS || "10000", 10), - httpRequestTimeoutMs: Number.parseInt(process.env.HTTP_REQUEST_TIMEOUT_MS || "240000", 10), - http2RequestTimeoutMs: Number.parseInt(process.env.HTTP2_REQUEST_TIMEOUT_MS || "240000", 10), + httpRequestTimeoutMs, + http2RequestTimeoutMs, ipniVerificationTimeoutMs: Number.parseInt(process.env.IPNI_VERIFICATION_TIMEOUT_MS || "60000", 10), ipniVerificationPollingMs: Number.parseInt(process.env.IPNI_VERIFICATION_POLLING_MS || "2000", 10), }, retrieval: { ipfsBlockFetchConcurrency: Number.parseInt(process.env.IPFS_BLOCK_FETCH_CONCURRENCY || "6", 10), + anonBlockSampleCount: Number.parseInt(process.env.ANON_RETRIEVAL_BLOCK_SAMPLE_COUNT || "5", 10), }, clickhouse: { url: process.env.CLICKHOUSE_URL || undefined, diff --git a/apps/backend/src/data-retention/data-retention.service.spec.ts b/apps/backend/src/data-retention/data-retention.service.spec.ts index 87ced66a..3fde29e8 100644 --- a/apps/backend/src/data-retention/data-retention.service.spec.ts +++ b/apps/backend/src/data-retention/data-retention.service.spec.ts @@ -921,7 +921,7 @@ describe("DataRetentionService", () => { expect(incCalls).toEqual(expect.arrayContaining([[10], [25]])); }); - it("reloads baselines from DB on every poll", async () => { + it("only loads baselines from DB once across multiple polls", async () => { pdpSubgraphServiceMock.fetchProvidersWithDatasets.mockResolvedValue([makeProvider()]); await service.pollDataRetention(); diff --git a/apps/backend/src/database/entities/job-schedule-state.entity.ts b/apps/backend/src/database/entities/job-schedule-state.entity.ts index d1758ae9..ebd5254d 100644 --- a/apps/backend/src/database/entities/job-schedule-state.entity.ts +++ b/apps/backend/src/database/entities/job-schedule-state.entity.ts @@ -6,6 +6,7 @@ import { Column, 
CreateDateColumn, Entity, Index, PrimaryGeneratedColumn, Update export type JobType = | "deal" | "retrieval" + | "retrieval_anon" | "data_set_creation" | "metrics" // legacy: no longer scheduled; see RemoveMetricsJobScheduleRows migration. TODO(#457): remove. | "metrics_cleanup" // legacy: no longer scheduled; see RemoveMetricsJobScheduleRows migration. TODO(#457): remove. diff --git a/apps/backend/src/database/types.ts b/apps/backend/src/database/types.ts index 46fd5d28..c56b355a 100644 --- a/apps/backend/src/database/types.ts +++ b/apps/backend/src/database/types.ts @@ -28,6 +28,13 @@ export enum IpniStatus { FAILED = "failed", } +export enum IpniCheckStatus { + VALID = "valid", + INVALID = "invalid", + SKIPPED = "skipped", + ERROR = "error", +} + /** * Metadata schema for deal storage and retrieval */ diff --git a/apps/backend/src/http-client/http-client.service.spec.ts b/apps/backend/src/http-client/http-client.service.spec.ts index 96604139..511910ba 100644 --- a/apps/backend/src/http-client/http-client.service.spec.ts +++ b/apps/backend/src/http-client/http-client.service.spec.ts @@ -64,25 +64,94 @@ describe("HttpClientService", () => { expect(config.timeout).toBe(120000); }); - it("times out HTTP/2 requests using the connection timeout", async () => { + it("passes the configured headersTimeout to undici and translates its error", async () => { const service = await createService(); - if (typeof AbortSignal.timeout !== "function") { - (AbortSignal as any).timeout = () => new AbortController().signal; + let receivedHeadersTimeout: number | undefined; + undiciRequestMock.mockImplementationOnce((_url: string, options: { headersTimeout?: number }) => { + receivedHeadersTimeout = options.headersTimeout; + const err = new Error("Headers Timeout Error") as Error & { code?: string }; + err.name = "HeadersTimeoutError"; + err.code = "UND_ERR_HEADERS_TIMEOUT"; + return Promise.reject(err); + }); + + await expect(service.requestWithMetrics("http://example.com", { 
httpVersion: "2" })).rejects.toThrow( + "HTTP/2 connection/headers timed out after 25ms", + ); + + expect(receivedHeadersTimeout).toBe(25); + }); + + it("keeps the request signal alive after the connect timeout window elapses", async () => { + const service = await createService(); + + // Previously, connectTimeoutMs (25ms) was folded into the request signal, + // so any download lasting longer than 25ms was aborted mid-stream. The + // signal must now stay live until the transfer timeout or parent signal + // fires. + let sawAbortBeforeResolve = false; + undiciRequestMock.mockImplementationOnce(async (_url: string, options: { signal?: AbortSignal }) => { + await new Promise((r) => setTimeout(r, 75)); + sawAbortBeforeResolve = options.signal?.aborted === true; + async function* body() { + yield Buffer.from("ok"); + } + return { statusCode: 200, body: body() }; + }); + + const result = await service.requestWithMetrics("http://example.com", { httpVersion: "2" }); + + expect(sawAbortBeforeResolve).toBe(false); + expect(result.aborted).toBeUndefined(); + expect(result.metrics.statusCode).toBe(200); + }); + + it("returns partial bytes and metrics when HTTP/2 download is aborted after headers", async () => { + const service = await createService(); + + const parentAbort = new AbortController(); + + async function* abortingBody() { + yield Buffer.from("hello"); + yield Buffer.from(" world"); + // Simulate an abort mid-stream after two chunks. 
+ parentAbort.abort(new Error("Anon retrieval job timeout (60s) for sp1")); + throw new Error("aborted"); } - undiciRequestMock.mockImplementationOnce((_url: string, options: { signal?: AbortSignal }) => { - return new Promise((_resolve, reject) => { - options.signal?.addEventListener("abort", () => reject(new Error("aborted")), { once: true }); - }); + undiciRequestMock.mockImplementationOnce(async () => ({ + statusCode: 200, + body: abortingBody(), + })); + + const result = await service.requestWithMetrics("http://example.com/piece", { + httpVersion: "2", + signal: parentAbort.signal, }); - vi.useFakeTimers(); + expect(result.aborted).toBe(true); + expect(result.abortReason).toContain("timeout"); + expect(result.metrics.statusCode).toBe(200); + expect(result.metrics.responseSize).toBe(11); + expect(Buffer.isBuffer(result.data) ? result.data.toString() : "").toBe("hello world"); + }); + + it("rethrows non-abort download errors on HTTP/2", async () => { + const service = await createService(); - const promise = service.requestWithMetrics("http://example.com", { httpVersion: "2" }); - const assertion = expect(promise).rejects.toThrow("HTTP/2 connection/headers timed out after 25ms"); - await vi.advanceTimersByTimeAsync(25); + async function* brokenBody() { + yield Buffer.from("partial"); + throw new Error("network reset"); + } + + undiciRequestMock.mockImplementationOnce(async () => ({ + statusCode: 200, + body: brokenBody(), + })); - await assertion; + await expect(service.requestWithMetrics("http://example.com/piece", { httpVersion: "2" })).rejects.toThrow( + "network reset", + ); }); }); diff --git a/apps/backend/src/http-client/http-client.service.ts b/apps/backend/src/http-client/http-client.service.ts index 48e10e5c..81140162 100644 --- a/apps/backend/src/http-client/http-client.service.ts +++ b/apps/backend/src/http-client/http-client.service.ts @@ -81,12 +81,11 @@ export class HttpClientService { let ttfbTime = 0; let statusCode = 0; - /** - * Dual-timeout 
strategy for HTTP/2 requests: - * 1. AbortSignal.timeout() - Undici's native timeout (10 min default) - * 2. AbortSignal.timeout() for connection/headers (10 sec default) - */ - const { signal, connectTimeoutSignal } = this.buildHttp2Signals(options.signal); + // Dual-timeout strategy for HTTP/2 requests: + // - `headersTimeout` (undici): scopes the connect + response-headers phase. + // - Combined AbortSignal: transfer-timeout ceiling + parent (job) signal. + const transferTimeoutSignal = AbortSignal.timeout(this.http2TimeoutMs); + const signal = options.signal ? anySignal([transferTimeoutSignal, options.signal]) : transferTimeoutSignal; const requestOptions: any = { method, headers: { @@ -94,6 +93,7 @@ export class HttpClientService { ...headers, }, signal, + headersTimeout: this.connectTimeoutMs, }; if (data) { @@ -105,7 +105,8 @@ export class HttpClientService { try { response = await undiciRequest(url, requestOptions); } catch (error) { - if (connectTimeoutSignal.aborted) { + // discern connection error from transfer error + if (isHeadersTimeoutError(error)) { throw new Error(`HTTP/2 connection/headers timed out after ${this.connectTimeoutMs}ms`); } throw error; @@ -115,8 +116,15 @@ export class HttpClientService { statusCode = response.statusCode; const chunks: Buffer[] = []; - for await (const chunk of response.body) { - chunks.push(Buffer.from(chunk)); + let downloadError: unknown; + try { + for await (const chunk of response.body) { + chunks.push(Buffer.from(chunk)); + } + } catch (error) { + // Download-phase failures (e.g. abort signal) fall through so we can + // return the partial buffer + metrics collected so far. 
+ downloadError = error; } const dataBuffer = Buffer.concat(chunks); @@ -133,6 +141,29 @@ export class HttpClientService { httpVersion: "2", }; + if (downloadError !== undefined) { + const aborted = options.signal?.aborted === true || isAbortLikeError(downloadError); + if (!aborted) { + throw downloadError; + } + const abortReason = describeAbortReason(options.signal, downloadError); + this.logger.warn({ + event: "http2_download_aborted", + message: "HTTP/2 download aborted after headers; returning partial data", + url, + bytesReceived: dataBuffer.length, + totalTime: metrics.totalTime, + ttfb: metrics.ttfb, + abortReason, + }); + return { + data: dataBuffer as T, + metrics, + aborted: true, + abortReason, + }; + } + return { data: dataBuffer as T, metrics, @@ -255,24 +286,28 @@ export class HttpClientService { // Fallback for objects/arrays return Buffer.from(JSON.stringify(data)); } +} - private buildHttp2Signals(parentSignal?: AbortSignal): { - signal: AbortSignal; - connectTimeoutSignal: AbortSignal; - } { - const transferTimeoutSignal = AbortSignal.timeout(this.http2TimeoutMs); - const connectTimeoutSignal = AbortSignal.timeout(this.connectTimeoutMs); +function isAbortLikeError(error: unknown): boolean { + if (error instanceof Error) { + return error.name === "AbortError" || error.name === "TimeoutError" || /abort/i.test(error.message); + } + return false; +} - if (parentSignal) { - return { - signal: anySignal([transferTimeoutSignal, connectTimeoutSignal, parentSignal]), - connectTimeoutSignal, - }; - } +/** + * Determines if a given error represents a "Headers Timeout" error. 
+ */ +function isHeadersTimeoutError(error: unknown): boolean { + if (!(error instanceof Error)) return false; + const code = (error as Error & { code?: string }).code; + return error.name === "HeadersTimeoutError" || code === "UND_ERR_HEADERS_TIMEOUT"; +} - return { - signal: anySignal([transferTimeoutSignal, connectTimeoutSignal]), - connectTimeoutSignal, - }; - } +function describeAbortReason(signal: AbortSignal | undefined, fallback: unknown): string { + const reason = signal?.reason; + if (reason instanceof Error && reason.message) return reason.message; + if (typeof reason === "string" && reason.length > 0) return reason; + if (fallback instanceof Error && fallback.message) return fallback.message; + return "aborted"; } diff --git a/apps/backend/src/http-client/types.ts b/apps/backend/src/http-client/types.ts index 7e48ce7d..26892ee6 100644 --- a/apps/backend/src/http-client/types.ts +++ b/apps/backend/src/http-client/types.ts @@ -13,4 +13,6 @@ export interface RequestMetrics { export interface RequestWithMetrics { data: T; metrics: RequestMetrics; + aborted?: boolean; // Set when the request was aborted mid-download after response headers arrived. + abortReason?: string; // Error message when `aborted` is true; human-readable summary of the abort reason. 
} diff --git a/apps/backend/src/jobs/job-queues.ts b/apps/backend/src/jobs/job-queues.ts index 9488ce7b..db475d49 100644 --- a/apps/backend/src/jobs/job-queues.ts +++ b/apps/backend/src/jobs/job-queues.ts @@ -7,3 +7,4 @@ export const LEGACY_DEAL_QUEUE = "deal.run"; export const LEGACY_RETRIEVAL_QUEUE = "retrieval.run"; export const DATA_RETENTION_POLL_QUEUE = "data.retention.poll"; export const PROVIDERS_REFRESH_QUEUE = "providers.refresh"; +export const RETRIEVAL_ANON_QUEUE = "retrieval.anon.run"; diff --git a/apps/backend/src/jobs/jobs.module.ts b/apps/backend/src/jobs/jobs.module.ts index 15ad4d64..fb708e09 100644 --- a/apps/backend/src/jobs/jobs.module.ts +++ b/apps/backend/src/jobs/jobs.module.ts @@ -7,6 +7,7 @@ import { StorageProvider } from "../database/entities/storage-provider.entity.js import { DealModule } from "../deal/deal.module.js"; import { PieceCleanupModule } from "../piece-cleanup/piece-cleanup.module.js"; import { RetrievalModule } from "../retrieval/retrieval.module.js"; +import { RetrievalAnonModule } from "../retrieval-anon/retrieval-anon.module.js"; import { WalletSdkModule } from "../wallet-sdk/wallet-sdk.module.js"; import { JobsService } from "./jobs.service.js"; import { JobScheduleRepository } from "./repositories/job-schedule.repository.js"; @@ -20,6 +21,7 @@ import { JobScheduleRepository } from "./repositories/job-schedule.repository.js WalletSdkModule, DataRetentionModule, PieceCleanupModule, + RetrievalAnonModule, ], providers: [JobsService, JobScheduleRepository], }) diff --git a/apps/backend/src/jobs/jobs.service.spec.ts b/apps/backend/src/jobs/jobs.service.spec.ts index d556f3d6..8983c723 100644 --- a/apps/backend/src/jobs/jobs.service.spec.ts +++ b/apps/backend/src/jobs/jobs.service.spec.ts @@ -30,18 +30,18 @@ describe("JobsService schedule rows", () => { }; let dataRetentionServiceMock: { pollDataRetention: ReturnType }; let metricsMocks: { - jobsQueuedGauge: JobsServiceDeps[8]; - jobsRetryScheduledGauge: JobsServiceDeps[9]; 
- oldestQueuedAgeGauge: JobsServiceDeps[10]; - oldestInFlightAgeGauge: JobsServiceDeps[11]; - jobsInFlightGauge: JobsServiceDeps[12]; - jobsEnqueueAttemptsCounter: JobsServiceDeps[13]; - jobsStartedCounter: JobsServiceDeps[14]; - jobsCompletedCounter: JobsServiceDeps[15]; - jobsPausedGauge: JobsServiceDeps[16]; - jobDuration: JobsServiceDeps[17]; - storageProvidersActive: JobsServiceDeps[18]; - storageProvidersTested: JobsServiceDeps[19]; + jobsQueuedGauge: JobsServiceDeps[9]; + jobsRetryScheduledGauge: JobsServiceDeps[10]; + oldestQueuedAgeGauge: JobsServiceDeps[11]; + oldestInFlightAgeGauge: JobsServiceDeps[12]; + jobsInFlightGauge: JobsServiceDeps[13]; + jobsEnqueueAttemptsCounter: JobsServiceDeps[14]; + jobsStartedCounter: JobsServiceDeps[15]; + jobsCompletedCounter: JobsServiceDeps[16]; + jobsPausedGauge: JobsServiceDeps[17]; + jobDuration: JobsServiceDeps[18]; + storageProvidersActive: JobsServiceDeps[19]; + storageProvidersTested: JobsServiceDeps[20]; }; let baseConfigValues: Partial; let configService: JobsServiceDeps[0]; @@ -55,18 +55,19 @@ describe("JobsService schedule rows", () => { walletSdkService: JobsServiceDeps[5]; dataRetentionService: JobsServiceDeps[6]; pieceCleanupService: JobsServiceDeps[7]; - jobsQueuedGauge: JobsServiceDeps[8]; - jobsRetryScheduledGauge: JobsServiceDeps[9]; - oldestQueuedAgeGauge: JobsServiceDeps[10]; - oldestInFlightAgeGauge: JobsServiceDeps[11]; - jobsInFlightGauge: JobsServiceDeps[12]; - jobsEnqueueAttemptsCounter: JobsServiceDeps[13]; - jobsStartedCounter: JobsServiceDeps[14]; - jobsCompletedCounter: JobsServiceDeps[15]; - jobsPausedGauge: JobsServiceDeps[16]; - jobDuration: JobsServiceDeps[17]; - storageProvidersActive: JobsServiceDeps[18]; - storageProvidersTested: JobsServiceDeps[19]; + anonRetrievalService: JobsServiceDeps[8]; + jobsQueuedGauge: JobsServiceDeps[9]; + jobsRetryScheduledGauge: JobsServiceDeps[10]; + oldestQueuedAgeGauge: JobsServiceDeps[11]; + oldestInFlightAgeGauge: JobsServiceDeps[12]; + 
jobsInFlightGauge: JobsServiceDeps[13]; + jobsEnqueueAttemptsCounter: JobsServiceDeps[14]; + jobsStartedCounter: JobsServiceDeps[15]; + jobsCompletedCounter: JobsServiceDeps[16]; + jobsPausedGauge: JobsServiceDeps[17]; + jobDuration: JobsServiceDeps[18]; + storageProvidersActive: JobsServiceDeps[19]; + storageProvidersTested: JobsServiceDeps[20]; }>, ) => JobsService; @@ -96,18 +97,18 @@ describe("JobsService schedule rows", () => { }; metricsMocks = { - jobsQueuedGauge: { set: vi.fn() } as unknown as JobsServiceDeps[8], - jobsRetryScheduledGauge: { set: vi.fn() } as unknown as JobsServiceDeps[9], - oldestQueuedAgeGauge: { set: vi.fn() } as unknown as JobsServiceDeps[10], - oldestInFlightAgeGauge: { set: vi.fn() } as unknown as JobsServiceDeps[11], - jobsInFlightGauge: { set: vi.fn() } as unknown as JobsServiceDeps[12], - jobsEnqueueAttemptsCounter: { inc: vi.fn() } as unknown as JobsServiceDeps[13], - jobsStartedCounter: { inc: vi.fn() } as unknown as JobsServiceDeps[14], - jobsCompletedCounter: { inc: vi.fn() } as unknown as JobsServiceDeps[15], - jobsPausedGauge: { set: vi.fn() } as unknown as JobsServiceDeps[16], - jobDuration: { observe: vi.fn() } as unknown as JobsServiceDeps[17], - storageProvidersActive: { set: vi.fn() } as unknown as JobsServiceDeps[18], - storageProvidersTested: { set: vi.fn() } as unknown as JobsServiceDeps[19], + jobsQueuedGauge: { set: vi.fn() } as unknown as JobsServiceDeps[9], + jobsRetryScheduledGauge: { set: vi.fn() } as unknown as JobsServiceDeps[10], + oldestQueuedAgeGauge: { set: vi.fn() } as unknown as JobsServiceDeps[11], + oldestInFlightAgeGauge: { set: vi.fn() } as unknown as JobsServiceDeps[12], + jobsInFlightGauge: { set: vi.fn() } as unknown as JobsServiceDeps[13], + jobsEnqueueAttemptsCounter: { inc: vi.fn() } as unknown as JobsServiceDeps[14], + jobsStartedCounter: { inc: vi.fn() } as unknown as JobsServiceDeps[15], + jobsCompletedCounter: { inc: vi.fn() } as unknown as JobsServiceDeps[16], + jobsPausedGauge: { set: 
vi.fn() } as unknown as JobsServiceDeps[17], + jobDuration: { observe: vi.fn() } as unknown as JobsServiceDeps[18], + storageProvidersActive: { set: vi.fn() } as unknown as JobsServiceDeps[19], + storageProvidersTested: { set: vi.fn() } as unknown as JobsServiceDeps[20], }; const emptySpBlocklists: ISpBlocklistConfig = { @@ -133,6 +134,7 @@ describe("JobsService schedule rows", () => { dataSetCreationJobTimeoutSeconds: 300, pieceCleanupPerSpPerHour: 1, maxPieceCleanupRuntimeSeconds: 300, + retrievalsAnonPerSpPerHour: 2, } as IConfig["jobs"], database: { host: "localhost", @@ -161,6 +163,7 @@ describe("JobsService schedule rows", () => { overrides.walletSdkService ?? ({} as JobsServiceDeps[5]), overrides.dataRetentionService ?? (dataRetentionServiceMock as unknown as JobsServiceDeps[6]), overrides.pieceCleanupService ?? ({} as JobsServiceDeps[7]), + overrides.anonRetrievalService ?? ({} as JobsServiceDeps[8]), overrides.jobsQueuedGauge ?? metricsMocks.jobsQueuedGauge, overrides.jobsRetryScheduledGauge ?? metricsMocks.jobsRetryScheduledGauge, overrides.oldestQueuedAgeGauge ?? 
metricsMocks.oldestQueuedAgeGauge, @@ -615,12 +618,13 @@ describe("JobsService schedule rows", () => { // Check upserts for providerB const upsertCalls = jobScheduleRepositoryMock.upsertSchedule.mock.calls; const upsertsForB = upsertCalls.filter((call) => call[1] === providerB.address); - expect(upsertsForB).toHaveLength(4); + expect(upsertsForB).toHaveLength(5); expect(upsertsForB.map((call) => call[0]).sort()).toEqual([ "data_set_creation", "deal", "piece_cleanup", "retrieval", + "retrieval_anon", ]); }); @@ -976,7 +980,7 @@ describe("JobsService schedule rows", () => { expect(dealService.createDealForProvider).toHaveBeenCalledTimes(1); }); - it("deal job maps DealJobTerminatedDataSetError to handler_result=error", async () => { + it("data storage job does not run data-storage check when data-set selection aborts", async () => { const completedCounter = metricsMocks.jobsCompletedCounter as unknown as { inc: ReturnType }; vi.useFakeTimers(); vi.setSystemTime(new Date("2024-01-01T12:00:00Z")); diff --git a/apps/backend/src/jobs/jobs.service.ts b/apps/backend/src/jobs/jobs.service.ts index f8fe1d80..e09cf42c 100644 --- a/apps/backend/src/jobs/jobs.service.ts +++ b/apps/backend/src/jobs/jobs.service.ts @@ -16,18 +16,32 @@ import { StorageProvider } from "../database/entities/storage-provider.entity.js import { DealService } from "../deal/deal.service.js"; import { PieceCleanupService } from "../piece-cleanup/piece-cleanup.service.js"; import { RetrievalService } from "../retrieval/retrieval.service.js"; +import { AnonRetrievalService } from "../retrieval-anon/anon-retrieval.service.js"; import { WalletSdkService } from "../wallet-sdk/wallet-sdk.service.js"; import { provisionNextMissingDataSet } from "./data-set-creation.handler.js"; -import { DATA_RETENTION_POLL_QUEUE, PROVIDERS_REFRESH_QUEUE, SP_WORK_QUEUE } from "./job-queues.js"; +import { + DATA_RETENTION_POLL_QUEUE, + PROVIDERS_REFRESH_QUEUE, + RETRIEVAL_ANON_QUEUE, + SP_WORK_QUEUE, +} from "./job-queues.js"; 
import { JobScheduleRepository } from "./repositories/job-schedule.repository.js"; -type SpJobType = "deal" | "retrieval" | "data_set_creation" | "piece_cleanup"; -const SP_JOB_TYPES: ReadonlySet = new Set(["deal", "retrieval", "data_set_creation", "piece_cleanup"]); +type SpJobType = "deal" | "retrieval" | "data_set_creation" | "retrieval_anon" | "piece_cleanup"; +const SP_JOB_TYPES: ReadonlySet = new Set([ + "deal", + "retrieval", + "retrieval_anon", + "data_set_creation", + "piece_cleanup", +]); + function isSpJobType(jobType: string): jobType is SpJobType { return SP_JOB_TYPES.has(jobType); } type SpJobData = { jobType: SpJobType; spAddress: string; intervalSeconds: number }; +type AnonRetrievalJobData = { spAddress: string; intervalSeconds: number }; type ProvidersRefreshJobData = { intervalSeconds: number }; type SpJob = Job; type DataRetentionJobData = { intervalSeconds: number }; @@ -61,6 +75,8 @@ export class JobsService implements OnModuleInit, OnApplicationShutdown { private readonly walletSdkService: WalletSdkService, private readonly dataRetentionService: DataRetentionService, private readonly pieceCleanupService: PieceCleanupService, + private readonly anonRetrievalService: AnonRetrievalService, + @InjectMetric("jobs_queued") private readonly jobsQueuedGauge: Gauge, @InjectMetric("jobs_retry_scheduled") @@ -258,6 +274,7 @@ export class JobsService implements OnModuleInit, OnApplicationShutdown { await boss.createQueue(SP_WORK_QUEUE, { policy: "singleton" }); await boss.createQueue(PROVIDERS_REFRESH_QUEUE); await boss.createQueue(DATA_RETENTION_POLL_QUEUE); + await boss.createQueue(RETRIEVAL_ANON_QUEUE); } private registerWorkers(): void { @@ -335,6 +352,23 @@ export class JobsService implements OnModuleInit, OnApplicationShutdown { error: toStructuredError(error), }), ); + void this.boss + .work( + RETRIEVAL_ANON_QUEUE, + { batchSize: 1, localConcurrency: spConcurrency, pollingIntervalSeconds: workerPollSeconds }, + async ([job]) => { + if (!job) 
return; + await this.handleAnonRetrievalJob(job); + }, + ) + .catch((error) => + this.logger.error({ + event: "worker_register_failed", + message: "Failed to register worker", + queue: RETRIEVAL_ANON_QUEUE, + error: toStructuredError(error), + }), + ); } private getMaintenanceWindowStatus(now: Date = new Date()) { @@ -587,6 +621,51 @@ export class JobsService implements OnModuleInit, OnApplicationShutdown { }); } + private async handleAnonRetrievalJob(job: Job): Promise { + const data = job.data; + const spAddress = data.spAddress; + + // Create AbortController for job timeout enforcement + const abortController = new AbortController(); + const timeoutSeconds = this.configService.get("jobs").anonRetrievalJobTimeoutSeconds; + const timeoutMs = Math.max(60000, timeoutSeconds * 1000); + const effectiveTimeoutSeconds = Math.round(timeoutMs / 1000); + const abortReason = new Error(`Anon retrieval job timeout (${effectiveTimeoutSeconds}s) for ${spAddress}`); + const timeoutId = setTimeout(() => { + abortController.abort(abortReason); + }, timeoutMs); + + await this.recordJobExecution("retrieval_anon", async () => { + const logContext = await this.resolveProviderJobContext(spAddress, job.id); + try { + await this.anonRetrievalService.performForProvider(spAddress, abortController.signal, logContext); + return "success"; + } catch (error) { + if (abortController.signal.aborted) { + const reason = abortController.signal.reason; + const reasonMessage = reason instanceof Error ? reason.message : String(reason ?? ""); + this.logger.error({ + ...logContext, + event: "anon_retrieval_job_aborted", + message: reasonMessage || "Anon retrieval job aborted after timeout", + timeoutSeconds: effectiveTimeoutSeconds, + error: toStructuredError(reason ?? 
error), + }); + return "aborted"; + } + this.logger.error({ + ...logContext, + event: "anon_retrieval_job_failed", + message: "Anon retrieval job failed", + error: toStructuredError(error), + }); + throw error; + } finally { + clearTimeout(timeoutId); + } + }); + } + private async handleDataRetentionJob(data: DataRetentionJobData): Promise { void data; await this.recordJobExecution("data_retention_poll", async () => { @@ -865,6 +944,7 @@ export class JobsService implements OnModuleInit, OnApplicationShutdown { private getIntervalSecondsForRates(): { dealIntervalSeconds: number; retrievalIntervalSeconds: number; + retrievalAnonIntervalSeconds: number; dataSetCreationIntervalSeconds: number; dataRetentionPollIntervalSeconds: number; providersRefreshIntervalSeconds: number; @@ -885,9 +965,13 @@ export class JobsService implements OnModuleInit, OnApplicationShutdown { const dataRetentionPollIntervalSeconds = scheduling.dataRetentionPollIntervalSeconds; const providersRefreshIntervalSeconds = scheduling.providersRefreshIntervalSeconds; + const retrievalsAnonPerHour = jobsConfig.retrievalsAnonPerSpPerHour; + const retrievalAnonIntervalSeconds = Math.max(1, Math.round(3600 / retrievalsAnonPerHour)); + return { dealIntervalSeconds, retrievalIntervalSeconds, + retrievalAnonIntervalSeconds, dataSetCreationIntervalSeconds, dataRetentionPollIntervalSeconds, providersRefreshIntervalSeconds, @@ -907,6 +991,7 @@ export class JobsService implements OnModuleInit, OnApplicationShutdown { const { dealIntervalSeconds, retrievalIntervalSeconds, + retrievalAnonIntervalSeconds, dataSetCreationIntervalSeconds, dataRetentionPollIntervalSeconds, providersRefreshIntervalSeconds, @@ -924,6 +1009,7 @@ export class JobsService implements OnModuleInit, OnApplicationShutdown { const phaseMs = this.schedulePhaseSeconds() * 1000; const dealStartAt = new Date(now.getTime() + phaseMs); const retrievalStartAt = new Date(now.getTime() + phaseMs); + const retrievalAnonStartAt = new Date(now.getTime() + 
phaseMs); const dataSetCreationStartAt = new Date(now.getTime() + phaseMs); const dataRetentionPollStartAt = new Date(now.getTime() + phaseMs); const providersRefreshStartAt = new Date(now.getTime() + phaseMs); @@ -947,6 +1033,12 @@ export class JobsService implements OnModuleInit, OnApplicationShutdown { for (const address of unblockedAddresses) { await this.jobScheduleRepository.upsertSchedule("deal", address, dealIntervalSeconds, dealStartAt); await this.jobScheduleRepository.upsertSchedule("retrieval", address, retrievalIntervalSeconds, retrievalStartAt); + await this.jobScheduleRepository.upsertSchedule( + "retrieval_anon", + address, + retrievalAnonIntervalSeconds, + retrievalAnonStartAt, + ); if (minDataSets >= 1) { await this.jobScheduleRepository.upsertSchedule( "data_set_creation", @@ -1104,6 +1196,8 @@ export class JobsService implements OnModuleInit, OnApplicationShutdown { return SP_WORK_QUEUE; case "piece_cleanup": return SP_WORK_QUEUE; + case "retrieval_anon": + return RETRIEVAL_ANON_QUEUE; case "data_retention_poll": return DATA_RETENTION_POLL_QUEUE; case "providers_refresh": @@ -1123,6 +1217,7 @@ export class JobsService implements OnModuleInit, OnApplicationShutdown { if ( row.job_type === "deal" || row.job_type === "retrieval" || + row.job_type === "retrieval_anon" || row.job_type === "data_set_creation" || row.job_type === "piece_cleanup" ) { @@ -1195,6 +1290,7 @@ export class JobsService implements OnModuleInit, OnApplicationShutdown { const jobTypes: JobType[] = [ "deal", "retrieval", + "retrieval_anon", "data_set_creation", "piece_cleanup", "data_retention_poll", diff --git a/apps/backend/src/metrics-prometheus/check-metric-labels.ts b/apps/backend/src/metrics-prometheus/check-metric-labels.ts index d8447160..9d776586 100644 --- a/apps/backend/src/metrics-prometheus/check-metric-labels.ts +++ b/apps/backend/src/metrics-prometheus/check-metric-labels.ts @@ -1,4 +1,4 @@ -export type CheckType = "dataStorage" | "retrieval" | "dataRetention" | 
"dataSetCreation"; +export type CheckType = "dataStorage" | "retrieval" | "anon_retrieval" | "dataRetention" | "dataSetCreation"; export type ProviderStatus = "approved" | "unapproved"; export type CheckMetricLabels = { diff --git a/apps/backend/src/metrics-prometheus/check-metrics.service.ts b/apps/backend/src/metrics-prometheus/check-metrics.service.ts index 55975cad..76a8ee31 100644 --- a/apps/backend/src/metrics-prometheus/check-metrics.service.ts +++ b/apps/backend/src/metrics-prometheus/check-metrics.service.ts @@ -248,3 +248,66 @@ export class DataSetCreationCheckMetrics { this.dataSetCreationStatusCounter.inc({ ...labels, value }); } } + +@Injectable() +export class AnonRetrievalCheckMetrics { + constructor( + @InjectMetric("anonPieceRetrievalFirstByteMs") + private readonly firstByteMs: Histogram, + @InjectMetric("anonPieceRetrievalLastByteMs") + private readonly lastByteMs: Histogram, + @InjectMetric("anonPieceRetrievalThroughputBps") + private readonly throughputBps: Histogram, + @InjectMetric("anonRetrievalCheckMs") + private readonly checkMs: Histogram, + @InjectMetric("anonPieceRetrievalStatus") + private readonly statusCounter: Counter, + @InjectMetric("anonPieceHttpResponseCode") + private readonly httpResponseCounter: Counter, + @InjectMetric("anonCarParseStatus") + private readonly carParseCounter: Counter, + @InjectMetric("anonIpniStatus") + private readonly ipniCounter: Counter, + @InjectMetric("anonBlockFetchStatus") + private readonly blockFetchCounter: Counter, + ) {} + + observeFirstByteMs(labels: CheckMetricLabels, value: number | null | undefined): void { + observePositive(this.firstByteMs, labels, value); + } + + observeLastByteMs(labels: CheckMetricLabels, value: number | null | undefined): void { + observePositive(this.lastByteMs, labels, value); + } + + observeThroughput(labels: CheckMetricLabels, value: number | null | undefined): void { + observePositive(this.throughputBps, labels, value); + } + + observeCheckDuration(labels: 
CheckMetricLabels, value: number | null | undefined): void { + observePositive(this.checkMs, labels, value); + } + + recordStatus(labels: CheckMetricLabels, value: string): void { + this.statusCounter.inc({ ...labels, value }); + } + + recordHttpResponseCode(labels: CheckMetricLabels, statusCode: number): void { + this.httpResponseCounter.inc({ + ...labels, + value: classifyHttpResponseCode(statusCode), + }); + } + + recordCarParseStatus(labels: CheckMetricLabels, parseable: boolean): void { + this.carParseCounter.inc({ ...labels, value: parseable ? "parseable" : "not_parseable" }); + } + + recordIpniStatus(labels: CheckMetricLabels, value: "valid" | "invalid" | "skipped" | "error"): void { + this.ipniCounter.inc({ ...labels, value }); + } + + recordBlockFetchStatus(labels: CheckMetricLabels, value: "valid" | "invalid" | "skipped" | "error"): void { + this.blockFetchCounter.inc({ ...labels, value }); + } +} diff --git a/apps/backend/src/metrics-prometheus/metrics-prometheus.module.ts b/apps/backend/src/metrics-prometheus/metrics-prometheus.module.ts index 18bda30d..4ebeb01a 100644 --- a/apps/backend/src/metrics-prometheus/metrics-prometheus.module.ts +++ b/apps/backend/src/metrics-prometheus/metrics-prometheus.module.ts @@ -8,6 +8,7 @@ import { } from "@willsoto/nestjs-prometheus"; import { WalletSdkModule } from "../wallet-sdk/wallet-sdk.module.js"; import { + AnonRetrievalCheckMetrics, DataSetCreationCheckMetrics, DataStorageCheckMetrics, DiscoverabilityCheckMetrics, @@ -207,6 +208,56 @@ const metricProviders = [ help: "Estimated number of unrecorded overdue proving periods per provider. 
Resets to 0 when the subgraph catches up.", labelNames: ["checkType", "providerId", "providerName", "providerStatus"] as const, }), + // Anonymous Retrieval Metrics + makeHistogramProvider({ + name: "anonPieceRetrievalFirstByteMs", + help: "Time to first byte for anonymous piece retrievals via /piece/{cid} (ms)", + labelNames: ["checkType", "providerId", "providerName", "providerStatus"] as const, + buckets: [1, 5, 10, 50, 100, 250, 500, 1000, 2000, 5000, 10000, 30000], + }), + makeHistogramProvider({ + name: "anonPieceRetrievalLastByteMs", + help: "Total time to retrieve an anonymous piece via /piece/{cid} (ms)", + labelNames: ["checkType", "providerId", "providerName", "providerStatus"] as const, + buckets: [1, 5, 10, 50, 100, 250, 500, 1000, 2000, 5000, 10000, 30000, 60000, 120000, 300000], + }), + makeHistogramProvider({ + name: "anonPieceRetrievalThroughputBps", + help: "Throughput for anonymous piece retrievals (bytes/s)", + labelNames: ["checkType", "providerId", "providerName", "providerStatus"] as const, + buckets: throughputBuckets, + }), + makeHistogramProvider({ + name: "anonRetrievalCheckMs", + help: "End-to-end anonymous retrieval check duration (ms)", + labelNames: ["checkType", "providerId", "providerName", "providerStatus"] as const, + buckets: [100, 500, 1000, 2000, 5000, 10000, 30000, 60000, 120000, 300000, 600000], + }), + makeCounterProvider({ + name: "anonPieceRetrievalStatus", + help: "Anonymous piece retrieval overall outcome", + labelNames: ["checkType", "providerId", "providerName", "providerStatus", "value"] as const, + }), + makeCounterProvider({ + name: "anonPieceHttpResponseCode", + help: "HTTP response codes for anonymous piece retrieval requests", + labelNames: ["checkType", "providerId", "providerName", "providerStatus", "value"] as const, + }), + makeCounterProvider({ + name: "anonCarParseStatus", + help: "Anonymous retrieval CAR parse outcomes (parseable / not_parseable)", + labelNames: ["checkType", "providerId", "providerName", 
"providerStatus", "value"] as const, + }), + makeCounterProvider({ + name: "anonIpniStatus", + help: "Anonymous retrieval IPNI check outcomes (valid / invalid / skipped / error)", + labelNames: ["checkType", "providerId", "providerName", "providerStatus", "value"] as const, + }), + makeCounterProvider({ + name: "anonBlockFetchStatus", + help: "Anonymous retrieval block fetch validation outcomes (valid / invalid / skipped / error)", + labelNames: ["checkType", "providerId", "providerName", "providerStatus", "value"] as const, + }), // Storage provider metrics: absolute counts, independent of query filters. makeGaugeProvider({ name: "storage_providers_active", @@ -333,6 +384,7 @@ const metricProviders = [ RetrievalCheckMetrics, DiscoverabilityCheckMetrics, DataSetCreationCheckMetrics, + AnonRetrievalCheckMetrics, WalletBalanceCollector, // HTTP metrics interceptor { @@ -347,6 +399,7 @@ const metricProviders = [ RetrievalCheckMetrics, DiscoverabilityCheckMetrics, DataSetCreationCheckMetrics, + AnonRetrievalCheckMetrics, WalletBalanceCollector, ], }) diff --git a/apps/backend/src/retrieval-anon/anon-piece-selector.service.spec.ts b/apps/backend/src/retrieval-anon/anon-piece-selector.service.spec.ts new file mode 100644 index 00000000..30a04486 --- /dev/null +++ b/apps/backend/src/retrieval-anon/anon-piece-selector.service.spec.ts @@ -0,0 +1,153 @@ +import type { ConfigService } from "@nestjs/config"; +import { beforeEach, describe, expect, it, vi } from "vitest"; +import type { IConfig } from "../config/app.config.js"; +import type { SampleAnonPieceParams, SubgraphService } from "../subgraph/subgraph.service.js"; +import type { AnonCandidatePiece } from "../subgraph/types.js"; +import { AnonPieceSelectorService } from "./anon-piece-selector.service.js"; + +const SP_ADDRESS = "0xAaAaAAaAaaaAaAAAAaaaaAAaaAaaaAAaaaaa1111"; +const DEALBOT_PAYER = "0xBbBBBbBBbbbBbBBBBBbbbbbBBbbBbbbBBbbbb2222"; + +const makePiece = (overrides: Partial = {}): AnonCandidatePiece => ({ + pieceCid: 
`baga6ea4seaqpiece${Math.random().toString(36).slice(2, 10)}`, + pieceId: "1", + dataSetId: "42", + rawSize: "1048576", + withIPFSIndexing: true, + ipfsRootCid: "bafyroot", + indexedAtBlock: 12345, + pdpPaymentEndEpoch: null, + ...overrides, +}); + +const makeConfigService = (): ConfigService => + ({ + get: vi.fn((key: string) => { + if (key === "blockchain") { + return { walletAddress: DEALBOT_PAYER }; + } + return undefined; + }), + }) as unknown as ConfigService; + +describe("AnonPieceSelectorService", () => { + let subgraphService: SubgraphService; + let sampleAnonPiece: ReturnType; + + beforeEach(() => { + sampleAnonPiece = vi.fn(); + subgraphService = { sampleAnonPiece } as unknown as SubgraphService; + }); + + it("returns null when every fallback attempt yields no piece", async () => { + sampleAnonPiece.mockResolvedValue(null); + const service = new AnonPieceSelectorService(subgraphService, makeConfigService()); + + const result = await service.selectPieceForProvider(SP_ADDRESS); + + expect(result).toBeNull(); + expect(sampleAnonPiece).toHaveBeenCalled(); + }); + + it("returns the sampled piece with SP address lowercased", async () => { + sampleAnonPiece.mockResolvedValueOnce(makePiece({ pieceCid: "baga-the-one" })); + const service = new AnonPieceSelectorService(subgraphService, makeConfigService()); + + const result = await service.selectPieceForProvider(SP_ADDRESS); + + expect(result).not.toBeNull(); + expect(result?.pieceCid).toBe("baga-the-one"); + expect(result?.serviceProvider).toBe(SP_ADDRESS.toLowerCase()); + }); + + it("passes the dealbot payer address to sampleAnonPiece for exclusion", async () => { + sampleAnonPiece.mockResolvedValueOnce(makePiece()); + const service = new AnonPieceSelectorService(subgraphService, makeConfigService()); + + await service.selectPieceForProvider(SP_ADDRESS); + + const call = sampleAnonPiece.mock.calls[0][0] as SampleAnonPieceParams; + expect(call.payer).toBe(DEALBOT_PAYER); + 
expect(call.serviceProvider).toBe(SP_ADDRESS); + }); + + it("redraws when the first sampled piece's payment has already terminated", async () => { + const staleCid = "baga-terminated"; + const freshCid = "baga-live"; + sampleAnonPiece + .mockResolvedValueOnce(makePiece({ pieceCid: staleCid, pdpPaymentEndEpoch: 100n, indexedAtBlock: 200 })) + .mockResolvedValueOnce(makePiece({ pieceCid: freshCid, pdpPaymentEndEpoch: null })); + + const service = new AnonPieceSelectorService(subgraphService, makeConfigService()); + const result = await service.selectPieceForProvider(SP_ADDRESS); + + expect(result?.pieceCid).toBe(freshCid); + }); + + it("treats payment-end exactly equal to current epoch as terminated (boundary)", async () => { + // pdpPaymentEndEpoch === indexedAtBlock should be rejected (<=, not <). + // This guards against an off-by-one regression where pieces in the final + // payment epoch silently slip through. + const boundaryCid = "baga-boundary"; + const liveCid = "baga-still-live"; + sampleAnonPiece + .mockResolvedValueOnce(makePiece({ pieceCid: boundaryCid, pdpPaymentEndEpoch: 200n, indexedAtBlock: 200 })) + .mockResolvedValueOnce(makePiece({ pieceCid: liveCid, pdpPaymentEndEpoch: 201n, indexedAtBlock: 200 })); + + const service = new AnonPieceSelectorService(subgraphService, makeConfigService()); + const result = await service.selectPieceForProvider(SP_ADDRESS); + + expect(result?.pieceCid).toBe(liveCid); + }); + + it("falls back to the opposite pool when the preferred one is empty", async () => { + // First pool call returns nothing twice (both attempts), second pool succeeds. 
+ const fresh = makePiece({ pieceCid: "baga-other-pool" }); + sampleAnonPiece.mockResolvedValueOnce(null).mockResolvedValueOnce(null).mockResolvedValueOnce(fresh); + + const service = new AnonPieceSelectorService(subgraphService, makeConfigService()); + const result = await service.selectPieceForProvider(SP_ADDRESS); + + expect(result?.pieceCid).toBe("baga-other-pool"); + + // The second (fallback) call should target the opposite pool. + const firstCall = sampleAnonPiece.mock.calls[0][0] as SampleAnonPieceParams; + const fallbackCall = sampleAnonPiece.mock.calls[2][0] as SampleAnonPieceParams; + expect(fallbackCall.pool).not.toBe(firstCall.pool); + }); + + it("widens size bucket to 'any' after both pools fail in the primary bucket", async () => { + // 4 empty attempts across (bucket × both pools × 2 draws each) then + // succeed on the first `any` bucket call. + sampleAnonPiece + .mockResolvedValueOnce(null) + .mockResolvedValueOnce(null) + .mockResolvedValueOnce(null) + .mockResolvedValueOnce(null) + .mockResolvedValueOnce(makePiece({ pieceCid: "baga-any-bucket" })); + + const service = new AnonPieceSelectorService(subgraphService, makeConfigService()); + const result = await service.selectPieceForProvider(SP_ADDRESS); + + expect(result?.pieceCid).toBe("baga-any-bucket"); + + // The 5th call (index 4) should be the widened-bucket attempt; its size + // range must dwarf every bucket (the "large" bucket tops out just under 500 MiB): 32 GiB is a conservative lower bound. 
+ const widened = sampleAnonPiece.mock.calls[4][0] as SampleAnonPieceParams; + expect(BigInt(widened.maxSize)).toBeGreaterThanOrEqual(32n * 1024n * 1024n * 1024n); + expect(widened.minSize).toBe("0"); + }); + + it("draws a fresh sampleKey for each subgraph call", async () => { + sampleAnonPiece.mockResolvedValueOnce(null).mockResolvedValueOnce(makePiece()); + + const service = new AnonPieceSelectorService(subgraphService, makeConfigService()); + await service.selectPieceForProvider(SP_ADDRESS); + + const call1 = sampleAnonPiece.mock.calls[0][0] as SampleAnonPieceParams; + const call2 = sampleAnonPiece.mock.calls[1][0] as SampleAnonPieceParams; + expect(call1.sampleKey).toMatch(/^0x[0-9a-f]{64}$/); + expect(call2.sampleKey).toMatch(/^0x[0-9a-f]{64}$/); + expect(call1.sampleKey).not.toBe(call2.sampleKey); + }); +}); diff --git a/apps/backend/src/retrieval-anon/anon-piece-selector.service.ts b/apps/backend/src/retrieval-anon/anon-piece-selector.service.ts new file mode 100644 index 00000000..d354a222 --- /dev/null +++ b/apps/backend/src/retrieval-anon/anon-piece-selector.service.ts @@ -0,0 +1,182 @@ +import { randomBytes } from "node:crypto"; +import { Injectable, Logger } from "@nestjs/common"; +import { ConfigService } from "@nestjs/config"; +import type { IConfig } from "../config/app.config.js"; +import type { AnonPiecePool, SampleAnonPieceParams } from "../subgraph/subgraph.service.js"; +import { SubgraphService } from "../subgraph/subgraph.service.js"; +import type { AnonCandidatePiece } from "../subgraph/types.js"; +import type { AnonPiece } from "./types.js"; + +/** + * Piece size buckets, in raw (unpadded) bytes. Weighted sampling across + * these buckets keeps tests meaningful for bandwidth measurement without + * locking out SPs whose corpus skews small or large. 
+ */ +type SizeBucket = "small" | "medium" | "large"; +type SizeRange = { min: bigint; max: bigint }; + +const MIB = 1024n * 1024n; + +// All downloads are buffered in-memory, so we need to keep piece sizes reasonable +// When changing these values, also update ./docs/checks/anon-retrievals.md#piece-selection +const SIZE_BUCKETS: Record = { + small: { min: 1n * MIB, max: 20n * MIB - 1n }, + medium: { min: 20n * MIB, max: 100n * MIB - 1n }, + large: { min: 100n * MIB, max: 500n * MIB - 1n }, +}; + +// Weights for choosing a bucket per selection. Must sum to 1. +// When changing these values, also update ./docs/checks/anon-retrievals.md#piece-selection +const BUCKET_WEIGHTS: Record = { + small: 0.2, + medium: 0.5, + large: 0.3, +}; + +/** + * Probability the primary draw targets the withIPFSIndexing pool. + * The rest of the time we sample across all FWSS pieces, so SPs can't + * optimise only their CAR corpus. + * + * When changing this value, also update ./docs/checks/anon-retrievals.md#piece-selection + */ +const IPFS_INDEXED_SAMPLE_RATE = 0.8; + +@Injectable() +export class AnonPieceSelectorService { + private readonly logger = new Logger(AnonPieceSelectorService.name); + + constructor( + private readonly subgraphService: SubgraphService, + private readonly configService: ConfigService, + ) {} + + /** + * Select an anonymous piece to test against the given SP. + * + * Strategy: + * 1. Pick a size bucket by weighted random. + * 2. Pick a pool (`indexed` 80% / `any` 20%). + * 3. Generate a uniform-random sampleKey and query the subgraph for the + * smallest `Root.sampleKey ≥ $sampleKey` matching the filters. + * 4. Drop the pick if `pdpPaymentEndEpoch` has passed or it was tested + * recently; redraw once. + * 5. If still empty, fall back through: (same bucket, opposite pool) → + * (any bucket, indexed) → (any bucket, any). 
+ */ + async selectPieceForProvider(spAddress: string): Promise { + const dealbotPayer = this.configService.get("blockchain", { infer: true }).walletAddress; + + const bucket = this.pickBucket(); + const pool: AnonPiecePool = Math.random() < IPFS_INDEXED_SAMPLE_RATE ? "indexed" : "any"; + + const attempts: Array<{ bucket: SizeBucket | "any"; pool: AnonPiecePool }> = [ + { bucket: bucket, pool: pool }, + { bucket: bucket, pool: pool === "indexed" ? "any" : "indexed" }, + { bucket: "any", pool: "indexed" }, + { bucket: "any", pool: "any" }, + ]; + + for (const attempt of attempts) { + const piece = await this.drawPiece({ + spAddress, + dealbotPayer, + bucket: attempt.bucket, + pool: attempt.pool, + }); + + if (piece) { + this.logger.log({ + event: "anon_piece_selected", + message: "Selected anonymous piece for retrieval test", + spAddress, + pieceCid: piece.pieceCid, + dataSetId: piece.dataSetId, + withIPFSIndexing: piece.withIPFSIndexing, + bucket: attempt.bucket, + pool: attempt.pool, + }); + + return { + pieceCid: piece.pieceCid, + dataSetId: piece.dataSetId, + pieceId: piece.pieceId, + serviceProvider: spAddress.toLowerCase(), + withIPFSIndexing: piece.withIPFSIndexing, + ipfsRootCid: piece.ipfsRootCid, + rawSize: piece.rawSize, + }; + } + } + + this.logger.warn({ + event: "anon_no_candidates", + message: "No anonymous piece found after all fallbacks", + spAddress, + }); + + return null; + } + + /** + * Try to draw a piece for one (bucket, pool) combination. Up to two draws + * with fresh sampleKeys, each filtered by dedup + epoch-termination. + */ + private async drawPiece(args: { + spAddress: string; + dealbotPayer: string; + bucket: SizeBucket | "any"; + pool: AnonPiecePool; + }): Promise { + const range = args.bucket === "any" ? 
fullRange() : SIZE_BUCKETS[args.bucket]; + + for (let attempt = 0; attempt < 2; attempt++) { + const params: SampleAnonPieceParams = { + serviceProvider: args.spAddress, + payer: args.dealbotPayer, + sampleKey: randomSampleKey(), + minSize: range.min.toString(), + maxSize: range.max.toString(), + pool: args.pool, + }; + + const piece = await this.subgraphService.sampleAnonPiece(params); + if (!piece) { + continue; + } + + // On Filecoin FEVM the EVM block number IS the chain epoch (one block per + // epoch), so the subgraph's indexedAtBlock is a safe proxy for "now" when + // checking if PDP payment for this piece has already terminated. + if (piece.pdpPaymentEndEpoch != null && piece.pdpPaymentEndEpoch <= BigInt(piece.indexedAtBlock)) { + continue; + } + + return piece; + } + + return null; + } + + private pickBucket(): SizeBucket { + const r = Math.random(); + let acc = 0; + for (const [name, weight] of Object.entries(BUCKET_WEIGHTS) as Array<[SizeBucket, number]>) { + acc += weight; + if (r < acc) { + return name; + } + } + return "medium"; + } +} + +/** Uniform-random 32-byte sort key as `0x`-prefixed hex. */ +function randomSampleKey(): string { + return `0x${randomBytes(32).toString("hex")}`; +} + +/** The full size range (used when bucket fallback is "any"). 
 */
function fullRange(): SizeRange {
  return { min: 0n, max: (1n << 63n) - 1n };
}

// ——— apps/backend/src/retrieval-anon/anon-retrieval.service.spec.ts (new file) ———
import type { Repository } from "typeorm";
import { beforeEach, describe, expect, it, vi } from "vitest";
import type { ClickhouseService } from "../clickhouse/clickhouse.service.js";
import type { StorageProvider } from "../database/entities/storage-provider.entity.js";
import { RetrievalStatus } from "../database/types.js";
import type { AnonRetrievalCheckMetrics } from "../metrics-prometheus/check-metrics.service.js";
import type { WalletSdkService } from "../wallet-sdk/wallet-sdk.service.js";
import type { AnonPieceSelectorService } from "./anon-piece-selector.service.js";
import { AnonRetrievalService } from "./anon-retrieval.service.js";
import type { CarValidationService } from "./car-validation.service.js";
import type { PieceRetrievalService } from "./piece-retrieval.service.js";
import type { AnonPiece, CarValidationResult, PieceRetrievalResult } from "./types.js";

const SP_ADDRESS = "0xaaaa0000000000000000000000000000000000aa";

// Default (non-IPFS-indexed) candidate piece returned by the mocked selector.
const PIECE = {
  pieceCid: "baga6ea4seaqpiece",
  pieceId: "1",
  dataSetId: "42",
  rawSize: "1048576",
  withIPFSIndexing: false,
  ipfsRootCid: null,
  serviceProvider: SP_ADDRESS,
};

function makeProvider(): StorageProvider {
  return {
    address: SP_ADDRESS,
    providerId: 7,
    name: "sp-test",
    isApproved: true,
  } as unknown as StorageProvider;
}

// Builds an AnonRetrievalService wired entirely to vi.fn() doubles and hands
// back the spies the tests assert on. `piece: null` forces "no candidate";
// omitting `piece` uses the default PIECE above.
function makeService(opts: {
  pieceResult: PieceRetrievalResult;
  fetchPieceImpl?: (signal?: AbortSignal) => Promise<PieceRetrievalResult>;
  piece?: AnonPiece | null;
  carResult?: CarValidationResult;
  validateCarImpl?: () => Promise<CarValidationResult>;
}): {
  service: AnonRetrievalService;
  insertSpy: ReturnType<typeof vi.fn>;
  fetchSpy: ReturnType<typeof vi.fn>;
  validateCarSpy: ReturnType<typeof vi.fn>;
  metricsRecordStatusSpy: ReturnType<typeof vi.fn>;
  metricsRecordIpniSpy: ReturnType<typeof vi.fn>;
  metricsRecordBlockFetchSpy: ReturnType<typeof vi.fn>;
} {
  const insertSpy = vi.fn();
  const clickhouseService = {
    insert: insertSpy,
    enabled: true,
    probeLocation: "test-location",
  } as unknown as ClickhouseService;

  const spRepository = {
    findOne: vi.fn(async () => makeProvider()),
  } as unknown as Repository<StorageProvider>;

  const anonPieceSelector = {
    selectPieceForProvider: vi.fn(async () => (opts.piece === null ? null : (opts.piece ?? PIECE))),
  } as unknown as AnonPieceSelectorService;

  const fetchSpy = vi.fn(opts.fetchPieceImpl ?? (async () => opts.pieceResult));
  const pieceRetrievalService = {
    fetchPiece: fetchSpy,
  } as unknown as PieceRetrievalService;

  const validateCarSpy = vi.fn(opts.validateCarImpl ?? (async () => opts.carResult));
  const carValidationService = {
    validateCarPiece: validateCarSpy,
  } as unknown as CarValidationService;

  const walletSdkService = {
    getProviderInfo: vi.fn(() => ({ pdp: { serviceURL: "https://sp.test/" } })),
  } as unknown as WalletSdkService;

  const metricsRecordStatusSpy = vi.fn();
  const metricsRecordIpniSpy = vi.fn();
  const metricsRecordBlockFetchSpy = vi.fn();
  const metrics = {
    observeFirstByteMs: vi.fn(),
    observeLastByteMs: vi.fn(),
    observeThroughput: vi.fn(),
    observeCheckDuration: vi.fn(),
    recordStatus: metricsRecordStatusSpy,
    recordHttpResponseCode: vi.fn(),
    recordCarParseStatus: vi.fn(),
    recordIpniStatus: metricsRecordIpniSpy,
    recordBlockFetchStatus: metricsRecordBlockFetchSpy,
  } as unknown as AnonRetrievalCheckMetrics;

  const service = new AnonRetrievalService(
    anonPieceSelector,
    pieceRetrievalService,
    carValidationService,
    walletSdkService,
    metrics,
    clickhouseService,
    spRepository,
  );

  return {
    service,
    insertSpy,
    fetchSpy,
    validateCarSpy,
    metricsRecordStatusSpy,
    metricsRecordIpniSpy,
    metricsRecordBlockFetchSpy,
  };
}

describe("AnonRetrievalService", () => {
  beforeEach(() => {
    vi.clearAllMocks();
  });

  it("emits a ClickHouse row with partial metrics when fetchPiece returns aborted=true", async () => {
    const partial: PieceRetrievalResult = {
      success: false,
      pieceCid: PIECE.pieceCid,
      bytesReceived: 524288,
      pieceBytes: null,
      latencyMs: 42000,
      ttfbMs: 150,
      throughputBps: 12500,
      statusCode: 200,
      commPValid: false,
      errorMessage: "Anon retrieval job timeout (60s) for sp1",
      aborted: true,
    };

    const { service, insertSpy } = makeService({ pieceResult: partial });

    await service.performForProvider(SP_ADDRESS);

    expect(insertSpy).toHaveBeenCalledTimes(1);
    const [table, row] = insertSpy.mock.calls[0] as [string, Record<string, unknown>];
    expect(table).toBe("anon_retrieval_checks");
    expect(row.piece_fetch_status).toBe(RetrievalStatus.FAILED);
    expect(row.bytes_retrieved).toBe(524288);
    expect(row.first_byte_ms).toBe(150);
    expect(row.last_byte_ms).toBe(42000);
    expect(row.throughput_bps).toBe(12500);
    expect(row.http_response_code).toBe(200);
    expect(row.error_message).toContain("Anon retrieval job timeout");
    expect(row.piece_cid).toBe(PIECE.pieceCid);
    expect(row.sp_address).toBe(SP_ADDRESS);
    expect(row.sp_id).toBe(7);
    expect(row.probe_location).toBe("test-location");
    expect(typeof row.retrieval_id).toBe("string");

    // CAR/IPNI/block-fetch were never run on a non-IPFS-indexed piece — every
    // dimension column should explicitly say "skipped" (ipni_status) or null.
    expect(row.car_parseable).toBeNull();
    expect(row.car_block_count).toBeNull();
    expect(row.block_fetch_endpoint).toBeNull();
    expect(row.block_fetch_valid).toBeNull();
    expect(row.block_fetch_sampled_count).toBeNull();
    expect(row.block_fetch_failed_count).toBeNull();
    expect(row.ipni_status).toBe("skipped");
    expect(row.ipni_verify_ms).toBeNull();
  });

  it("still emits a row when the signal aborts before fetchPiece runs", async () => {
    const ac = new AbortController();
    ac.abort(new Error("Anon retrieval job timeout (60s) for sp1"));

    const never: PieceRetrievalResult = {
      success: false,
      pieceCid: PIECE.pieceCid,
      bytesReceived: 0,
      pieceBytes: null,
      latencyMs: 0,
      ttfbMs: 0,
      throughputBps: 0,
      statusCode: 0,
      commPValid: false,
    };

    const { service, insertSpy, fetchSpy } = makeService({ pieceResult: never });

    await service.performForProvider(SP_ADDRESS, ac.signal);

    expect(fetchSpy).not.toHaveBeenCalled();
    expect(insertSpy).toHaveBeenCalledTimes(1);
    const [, row] = insertSpy.mock.calls[0] as [string, Record<string, unknown>];
    expect(row.piece_fetch_status).toBe(RetrievalStatus.FAILED);
    expect(row.error_message).toContain("Anon retrieval job timeout");
    expect(row.bytes_retrieved).toBeNull();
    expect(row.first_byte_ms).toBeNull();
  });

  it("still emits a row when fetchPiece throws unexpectedly", async () => {
    const never: PieceRetrievalResult = {
      success: false,
      pieceCid: PIECE.pieceCid,
      bytesReceived: 0,
      pieceBytes: null,
      latencyMs: 0,
      ttfbMs: 0,
      throughputBps: 0,
      statusCode: 0,
      commPValid: false,
    };

    const { service, insertSpy } = makeService({
      pieceResult: never,
      fetchPieceImpl: async () => {
        throw new Error("network down");
      },
    });

    await expect(service.performForProvider(SP_ADDRESS)).rejects.toThrow("network down");

    expect(insertSpy).toHaveBeenCalledTimes(1);
    const [, row] = insertSpy.mock.calls[0] as [string, Record<string, unknown>];
    expect(row.piece_fetch_status).toBe(RetrievalStatus.FAILED);
  });

  describe("with IPFS indexing", () => {
    const INDEXED_PIECE: AnonPiece = {
      ...PIECE,
      withIPFSIndexing: true,
      ipfsRootCid: "bafyrootcid",
    };

    // Successful piece-fetch result carrying `bytes` as the CAR payload.
    function okPiece(bytes: Buffer): PieceRetrievalResult {
      return {
        success: true,
        pieceCid: INDEXED_PIECE.pieceCid,
        bytesReceived: bytes.length,
        pieceBytes: bytes,
        latencyMs: 200,
        ttfbMs: 20,
        throughputBps: 51200,
        statusCode: 200,
        commPValid: true,
      };
    }

    it("emits populated CAR/IPNI/block-fetch columns when validation fully succeeds", async () => {
      const carResult: CarValidationResult = {
        carParseable: true,
        blockCount: 42,
        sampledCidCount: 5,
        ipniValid: true,
        ipniVerifyMs: 137,
        blockFetchValid: true,
        blockFetchFailedCount: 0,
        blockFetchEndpoint: "https://sp.test/ipfs/",
      };

      const { service, insertSpy, validateCarSpy } = makeService({
        pieceResult: okPiece(Buffer.from("car-bytes")),
        piece: INDEXED_PIECE,
        carResult,
      });

      await service.performForProvider(SP_ADDRESS);

      expect(validateCarSpy).toHaveBeenCalledTimes(1);
      const [, row] = insertSpy.mock.calls[0] as [string, Record<string, unknown>];
      expect(row.piece_fetch_status).toBe(RetrievalStatus.SUCCESS);
      expect(row.commp_valid).toBe(true);
      expect(row.car_parseable).toBe(true);
      expect(row.car_block_count).toBe(42);
      expect(row.block_fetch_endpoint).toBe("https://sp.test/ipfs/");
      expect(row.block_fetch_valid).toBe(true);
      expect(row.block_fetch_sampled_count).toBe(5);
      expect(row.block_fetch_failed_count).toBe(0);
      expect(row.ipni_status).toBe("valid");
      expect(row.ipni_verify_ms).toBe(137);
    });

    it("distinguishes IPNI invalid from block-fetch failures", async () => {
      const carResult: CarValidationResult = {
        carParseable: true,
        blockCount: 100,
        sampledCidCount: 5,
        ipniValid: false,
        ipniVerifyMs: 250,
        blockFetchValid: false,
        blockFetchFailedCount: 2,
        blockFetchEndpoint: "https://sp.test/ipfs/",
      };

      const { service, insertSpy } = makeService({
        pieceResult: okPiece(Buffer.from("car-bytes")),
        piece: INDEXED_PIECE,
        carResult,
      });

      await service.performForProvider(SP_ADDRESS);

      const [, row] = insertSpy.mock.calls[0] as [string, Record<string, unknown>];
      // The piece-fetch path still succeeded — failures are surfaced as
      // independent dimensions, not folded into piece_fetch_status.
      expect(row.piece_fetch_status).toBe(RetrievalStatus.SUCCESS);
      expect(row.car_parseable).toBe(true);
      expect(row.ipni_status).toBe("invalid");
      expect(row.block_fetch_valid).toBe(false);
      expect(row.block_fetch_sampled_count).toBe(5);
      expect(row.block_fetch_failed_count).toBe(2);
    });

    it("emits ipni_status='error' (not 'skipped') when CAR validation throws on a successful piece", async () => {
      // Distinguishes a real infra outage (e.g. IpniVerificationService down)
      // from a piece that legitimately had no IPFS indexing. Without the
      // distinction, an outage looks like normal non-IPFS volume in dashboards.
      const { service, insertSpy, metricsRecordIpniSpy, metricsRecordBlockFetchSpy } = makeService({
        pieceResult: okPiece(Buffer.from("car-bytes")),
        piece: INDEXED_PIECE,
        validateCarImpl: async () => {
          throw new Error("IpniVerificationService down");
        },
      });

      await service.performForProvider(SP_ADDRESS);

      expect(metricsRecordIpniSpy).toHaveBeenCalledWith(expect.anything(), "error");
      expect(metricsRecordBlockFetchSpy).toHaveBeenCalledWith(expect.anything(), "error");

      const [, row] = insertSpy.mock.calls[0] as [string, Record<string, unknown>];
      expect(row.ipni_status).toBe("error");
      // Piece-fetch path itself succeeded — only the validation pipeline failed.
      expect(row.commp_valid).toBe(true);
      expect(row.car_parseable).toBeNull();
    });

    it("emits car_parseable=false with skipped IPNI/block-fetch when bytes don't parse as CAR", async () => {
      const carResult: CarValidationResult = {
        carParseable: false,
        blockCount: 0,
        sampledCidCount: 0,
        ipniValid: null,
        ipniVerifyMs: null,
        blockFetchValid: null,
        blockFetchFailedCount: null,
        blockFetchEndpoint: null,
      };

      const { service, insertSpy } = makeService({
        pieceResult: okPiece(Buffer.from("not-a-car")),
        piece: INDEXED_PIECE,
        carResult,
      });

      await service.performForProvider(SP_ADDRESS);

      const [, row] = insertSpy.mock.calls[0] as [string, Record<string, unknown>];
      expect(row.car_parseable).toBe(false);
      // car_block_count and block_fetch_sampled_count are gated on carParseable
      // so an unparseable CAR doesn't emit a misleading 0.
      expect(row.car_block_count).toBeNull();
      expect(row.block_fetch_sampled_count).toBeNull();
      expect(row.block_fetch_endpoint).toBeNull();
      expect(row.block_fetch_valid).toBeNull();
      expect(row.block_fetch_failed_count).toBeNull();
      expect(row.ipni_status).toBe("skipped");
      expect(row.ipni_verify_ms).toBeNull();
    });
  });
});

// ——— apps/backend/src/retrieval-anon/anon-retrieval.service.ts (new file) ———
import { randomUUID } from "node:crypto";
import { Injectable, Logger } from "@nestjs/common";
import { InjectRepository } from "@nestjs/typeorm";
import type { Repository } from "typeorm";
import { ClickhouseService } from "../clickhouse/clickhouse.service.js";
import { type ProviderJobContext, toStructuredError } from "../common/logging.js";
import { StorageProvider } from "../database/entities/storage-provider.entity.js";
import { IpniCheckStatus, RetrievalStatus, ServiceType } from "../database/types.js";
import { buildCheckMetricLabels } from "../metrics-prometheus/check-metric-labels.js";
import { AnonRetrievalCheckMetrics } from "../metrics-prometheus/check-metrics.service.js";
import { WalletSdkService } from "../wallet-sdk/wallet-sdk.service.js";
import { AnonPieceSelectorService } from "./anon-piece-selector.service.js";
import { CarValidationService } from "./car-validation.service.js";
import { PieceRetrievalService } from "./piece-retrieval.service.js";
import type { CarValidationResult, PieceRetrievalResult } from "./types.js";

const ANON_RETRIEVAL_CHECKS_TABLE = "anon_retrieval_checks";

@Injectable()
export class AnonRetrievalService {
  private readonly logger = new Logger(AnonRetrievalService.name);

  constructor(
    private readonly anonPieceSelectorService: AnonPieceSelectorService,
    private readonly pieceRetrievalService: PieceRetrievalService,
    private readonly carValidationService: CarValidationService,
    private readonly walletSdkService: WalletSdkService,
    private readonly metrics: AnonRetrievalCheckMetrics,
    private readonly clickhouseService: ClickhouseService,
    @InjectRepository(StorageProvider)
    private readonly spRepository: Repository<StorageProvider>,
  ) {}

  /**
   * Run one anonymous-retrieval check against `spAddress`: select a piece,
   * fetch it, optionally run CAR/IPNI/block-fetch validation, then persist
   * exactly one ClickHouse row and emit Prometheus metrics.
   *
   * @param spAddress  service-provider address under test
   * @param signal     job-scoped abort signal; partial metrics are still persisted on abort
   * @param logContext structured-logging fields propagated onto every log line
   */
  async performForProvider(spAddress: string, signal?: AbortSignal, logContext?: ProviderJobContext): Promise<void> {
    // Build metric labels
    const provider = await this.spRepository.findOne({ where: { address: spAddress } });
    const labels = buildCheckMetricLabels({
      checkType: "anon_retrieval",
      providerId: provider?.providerId,
      providerName: provider?.name,
      providerIsApproved: provider?.isApproved,
    });

    // 1. Select an anonymous piece
    const piece = await this.anonPieceSelectorService.selectPieceForProvider(spAddress);
    if (!piece) {
      this.logger.warn({
        ...logContext,
        event: "anon_retrieval_no_piece",
        message: "No anonymous piece found for SP",
        spAddress,
      });
      this.metrics.recordStatus(labels, "failure.no_piece");
      return;
    }

    this.logger.log({
      ...logContext,
      event: "anon_retrieval_started",
      message: "Starting anonymous retrieval test",
      pieceCid: piece.pieceCid,
      dataSetId: piece.dataSetId,
      pieceId: piece.pieceId,
      withIPFSIndexing: piece.withIPFSIndexing,
      spAddress,
    });

    const checkStart = Date.now();
    const startedAt = new Date();

    let pieceResult: PieceRetrievalResult | null = null;
    let carResult: CarValidationResult | null = null;
    let validatedCarPiece: boolean = false;

    try {
      // 2. Fetch the piece. fetchPiece never throws on abort — it returns a
      // result with partial metrics so we can persist what we have.
      if (signal?.aborted) {
        pieceResult = buildAbortedPlaceholder(piece.pieceCid, signal.reason);
      } else {
        pieceResult = await this.pieceRetrievalService.fetchPiece(spAddress, piece.pieceCid, signal);
      }

      // Emit piece retrieval metrics
      this.metrics.observeFirstByteMs(labels, pieceResult.ttfbMs);
      this.metrics.observeLastByteMs(labels, pieceResult.latencyMs);
      this.metrics.observeThroughput(labels, pieceResult.throughputBps);
      this.metrics.recordHttpResponseCode(labels, pieceResult.statusCode);

      // 3. CAR validation (only if piece was successfully retrieved and has IPFS indexing)
      if (
        pieceResult.success &&
        piece.withIPFSIndexing &&
        piece.ipfsRootCid &&
        pieceResult.pieceBytes &&
        provider &&
        !signal?.aborted
      ) {
        try {
          validatedCarPiece = true;
          carResult = await this.carValidationService.validateCarPiece(
            pieceResult.pieceBytes,
            provider,
            piece.ipfsRootCid,
            signal,
          );
          this.metrics.recordCarParseStatus(labels, carResult.carParseable);
          this.metrics.recordIpniStatus(labels, ipniStatusFromResult(carResult));
          this.metrics.recordBlockFetchStatus(
            labels,
            carResult.blockFetchValid === null
              ? IpniCheckStatus.SKIPPED
              : carResult.blockFetchValid
                ? IpniCheckStatus.VALID
                : IpniCheckStatus.INVALID,
          );
        } catch (error) {
          // Validation was attempted on a successful piece retrieval but threw.
          this.metrics.recordCarParseStatus(labels, false);
          this.metrics.recordIpniStatus(labels, IpniCheckStatus.ERROR);
          this.metrics.recordBlockFetchStatus(labels, IpniCheckStatus.ERROR);
          this.logger.warn({
            ...logContext,
            event: "anon_retrieval_car_validation_failed",
            message: "CAR validation threw an error",
            pieceCid: piece.pieceCid,
            spAddress,
            error: toStructuredError(error),
          });
        }
      } else {
        // FIX: was `else if (!pieceResult.success)`, which recorded SKIPPED
        // only for failed fetches. A *successful* fetch of a non-IPFS-indexed
        // piece recorded no IPNI/block-fetch status at all, while the
        // ClickHouse row for the same check (built in `finally` below) reports
        // ipni_status "skipped" — so the Prometheus series under-counted skips
        // relative to ClickHouse. Record SKIPPED for every check that did not
        // run CAR validation, whatever the reason.
        this.metrics.recordIpniStatus(labels, IpniCheckStatus.SKIPPED);
        this.metrics.recordBlockFetchStatus(labels, IpniCheckStatus.SKIPPED);
      }

      // Overall check duration and status
      this.metrics.observeCheckDuration(labels, Date.now() - checkStart);
      const pieceServedCorrectly = pieceResult.success && pieceResult.commPValid;
      this.metrics.recordStatus(
        labels,
        pieceServedCorrectly
          ? "success"
          : pieceResult.aborted
            ? "failure.timedout"
            : pieceResult.success
              ? "failure.commp"
              : "failure.http",
      );
    } finally {
      // Always emit a ClickHouse row — even on abort or unexpected error — so
      // we never lose the evidence (ttfb, bytes, response code) we already
      // collected. ClickhouseService.insert is a no-op when disabled.
      const finalPieceResult = pieceResult ?? buildAbortedPlaceholder(piece.pieceCid, signal?.reason);
      const retrievalId = randomUUID();
      const providerInfo = this.walletSdkService.getProviderInfo(spAddress);
      const spBaseUrl = providerInfo?.pdp.serviceURL.replace(/\/$/, "") ?? spAddress;
      const pieceFetchStatus = finalPieceResult.success ? RetrievalStatus.SUCCESS : RetrievalStatus.FAILED;
      const ipniStatus: IpniCheckStatus = !validatedCarPiece
        ? IpniCheckStatus.SKIPPED
        : carResult
          ? ipniStatusFromResult(carResult)
          : IpniCheckStatus.ERROR;

      try {
        this.clickhouseService.insert(ANON_RETRIEVAL_CHECKS_TABLE, {
          timestamp: startedAt.getTime(),
          probe_location: this.clickhouseService.probeLocation,
          sp_address: spAddress,
          sp_id: provider?.providerId != null ? Number(provider.providerId) : null,
          sp_name: provider?.name ?? null,
          retrieval_id: retrievalId,
          piece_cid: piece.pieceCid,
          data_set_id: piece.dataSetId,
          piece_id: piece.pieceId,
          raw_size: piece.rawSize,
          with_ipfs_indexing: piece.withIPFSIndexing,
          ipfs_root_cid: piece.ipfsRootCid,
          service_type: ServiceType.DIRECT_SP,
          retrieval_endpoint: `${spBaseUrl}/piece/${piece.pieceCid}`,
          piece_fetch_status: pieceFetchStatus,
          http_response_code: finalPieceResult.statusCode > 0 ? finalPieceResult.statusCode : null,
          first_byte_ms: finalPieceResult.ttfbMs > 0 ? finalPieceResult.ttfbMs : null,
          last_byte_ms: finalPieceResult.latencyMs > 0 ? finalPieceResult.latencyMs : null,
          bytes_retrieved: finalPieceResult.bytesReceived > 0 ? finalPieceResult.bytesReceived : null,
          throughput_bps: finalPieceResult.throughputBps > 0 ? Math.round(finalPieceResult.throughputBps) : null,
          commp_valid: finalPieceResult.success ? finalPieceResult.commPValid : null,
          car_parseable: carResult ? carResult.carParseable : null,
          car_block_count: carResult?.carParseable ? carResult?.blockCount : null,
          block_fetch_endpoint: carResult?.blockFetchEndpoint ?? null,
          block_fetch_valid: carResult ? carResult.blockFetchValid : null,
          block_fetch_sampled_count: carResult?.carParseable ? carResult?.sampledCidCount : null,
          block_fetch_failed_count: carResult?.blockFetchFailedCount ?? null,
          ipni_status: ipniStatus,
          ipni_verify_ms: carResult?.ipniVerifyMs ?? null,
          error_message: finalPieceResult.errorMessage ?? null,
        });
      } catch (error) {
        // ClickhouseService.insert is buffered/non-throwing in normal operation, but
        // guard against unexpected runtime errors so we don't break the probe cycle.
        this.logger.warn({
          ...logContext,
          event: "anon_retrieval_clickhouse_insert_failed",
          message: "Failed to enqueue anonymous retrieval row to ClickHouse",
          pieceCid: piece.pieceCid,
          spAddress,
          error: toStructuredError(error),
        });
      }

      this.logger.log({
        ...logContext,
        event: "anon_retrieval_completed",
        message: "Anonymous retrieval test completed",
        retrievalId,
        pieceCid: piece.pieceCid,
        spAddress,
        success: finalPieceResult.success,
        aborted: finalPieceResult.aborted === true,
        latencyMs: finalPieceResult.latencyMs,
        ttfbMs: finalPieceResult.ttfbMs,
        bytesRetrieved: finalPieceResult.bytesReceived,
        carParseable: carResult?.carParseable,
        ipniValid: carResult?.ipniValid,
        blockFetchValid: carResult?.blockFetchValid,
      });
    }
  }
}

/** Map a CarValidationResult's tri-state ipniValid onto the metric enum. */
function ipniStatusFromResult(result: CarValidationResult): IpniCheckStatus {
  if (result.ipniValid === null) return IpniCheckStatus.SKIPPED;
  return result.ipniValid ? IpniCheckStatus.VALID : IpniCheckStatus.INVALID;
}

/** Synthesize a failed PieceRetrievalResult for a job aborted before/without a fetch. */
function buildAbortedPlaceholder(pieceCid: string, reason: unknown): PieceRetrievalResult {
  const message =
    reason instanceof Error && reason.message ? reason.message : typeof reason === "string" ?
reason : "aborted"; + return { + success: false, + pieceCid, + bytesReceived: 0, + pieceBytes: null, + latencyMs: 0, + ttfbMs: 0, + throughputBps: 0, + statusCode: 0, + commPValid: false, + errorMessage: message, + aborted: true, + }; +} diff --git a/apps/backend/src/retrieval-anon/car-validation.service.ts b/apps/backend/src/retrieval-anon/car-validation.service.ts new file mode 100644 index 00000000..c3a6c717 --- /dev/null +++ b/apps/backend/src/retrieval-anon/car-validation.service.ts @@ -0,0 +1,243 @@ +import { CarReader } from "@ipld/car"; +import * as dagPB from "@ipld/dag-pb"; +import { Injectable, Logger } from "@nestjs/common"; +import { ConfigService } from "@nestjs/config"; +import { create as createBlock } from "multiformats/block"; +import { CID } from "multiformats/cid"; +import * as raw from "multiformats/codecs/raw"; +import { sha256 } from "multiformats/hashes/sha2"; +import { toStructuredError } from "../common/logging.js"; +import type { IConfig } from "../config/app.config.js"; +import type { StorageProvider } from "../database/entities/storage-provider.entity.js"; +import { HttpClientService } from "../http-client/http-client.service.js"; +import { IpniVerificationService } from "../ipni/ipni-verification.service.js"; +import { WalletSdkService } from "../wallet-sdk/wallet-sdk.service.js"; +import type { CarValidationResult } from "./types.js"; + +// UnixFS DAGs use only dag-pb (interior nodes) and raw (leaf data) codecs +const unixfsCodecs: Record unknown }> = { + [dagPB.code]: dagPB, + [raw.code]: raw, +}; + +@Injectable() +export class CarValidationService { + private readonly logger = new Logger(CarValidationService.name); + + constructor( + private readonly configService: ConfigService, + private readonly httpClientService: HttpClientService, + private readonly walletSdkService: WalletSdkService, + private readonly ipniVerificationService: IpniVerificationService, + ) {} + + /** + * Validate an anonymous piece retrieved as a CAR: + * 1. 
parse the CAR, + * 2. sample random blocks, + * 3. confirm the SP is advertised for the root + sampled CIDs via IPNI, + * 4. fetch each sampled block from the SP and hash-verify it. + * + * CAR parse failure is attributed to the client (bad upload), not the SP. + */ + async validateCarPiece( + pieceBytes: Buffer, + provider: StorageProvider, + ipfsRootCid: string, + signal?: AbortSignal, + ): Promise { + let blocks: { cid: CID; bytes: Uint8Array }[]; + try { + blocks = await this.parseCar(pieceBytes); + } catch (error) { + this.logger.debug({ + event: "car_parse_failed", + message: "Failed to parse piece bytes as CAR - client fault, not SP", + spAddress: provider.address, + ipfsRootCid, + error: toStructuredError(error), + }); + return { + carParseable: false, + blockCount: 0, + sampledCidCount: 0, + ipniValid: null, + ipniVerifyMs: null, + blockFetchValid: null, + blockFetchFailedCount: null, + blockFetchEndpoint: null, + }; + } + if (blocks.length === 0) { + return { + carParseable: true, + blockCount: 0, + sampledCidCount: 0, + ipniValid: null, + ipniVerifyMs: null, + blockFetchValid: null, + blockFetchFailedCount: null, + blockFetchEndpoint: null, + errorMessage: "CAR contained no blocks", + }; + } + + const sampleCount = this.configService.get("retrieval", { infer: true }).anonBlockSampleCount; + const shuffled = [...blocks].sort(() => Math.random() - 0.5); + const sampledBlocks = shuffled.slice(0, sampleCount); + + const ipni = await this.checkIpni(provider, ipfsRootCid, sampledBlocks, signal); + const blockFetchResult = await this.checkBlockFetch(sampledBlocks, provider.address, signal); + + return { + carParseable: true, + blockCount: blocks.length, + sampledCidCount: sampledBlocks.length, + ipniValid: ipni.valid, + ipniVerifyMs: ipni.durationMs, + blockFetchValid: blockFetchResult.valid, + blockFetchFailedCount: blockFetchResult.failedCount, + blockFetchEndpoint: blockFetchResult.endpoint, + errorMessage: blockFetchResult.errorMessage, + }; + } + + private 
async parseCar(pieceBytes: Buffer): Promise<{ cid: CID; bytes: Uint8Array }[]> { + const reader = await CarReader.fromBytes(new Uint8Array(pieceBytes)); + const blocks: { cid: CID; bytes: Uint8Array }[] = []; + for await (const block of reader.blocks()) { + blocks.push({ cid: block.cid, bytes: block.bytes }); + } + return blocks; + } + + /** + * Verify via IPNI that the SP is advertised for the root CID and each sampled child CID. + * Delegates to the shared IpniVerificationService which uses filecoin-pin's provider-scoped check. + */ + private async checkIpni( + provider: StorageProvider, + ipfsRootCid: string, + sampledBlocks: ReadonlyArray<{ cid: CID }>, + signal?: AbortSignal, + ): Promise<{ + valid: boolean; + durationMs: number | null; + }> { + const timeouts = this.configService.get("timeouts", { infer: true }); + let rootCid: CID; + try { + rootCid = CID.parse(ipfsRootCid); + } catch (error) { + this.logger.warn({ + event: "ipni_root_cid_invalid", + message: "Failed to parse ipfsRootCID", + ipfsRootCid, + providerAddress: provider.address, + error: toStructuredError(error), + }); + return { valid: false, durationMs: null }; + } + + const result = await this.ipniVerificationService.verify({ + rootCid, + blockCids: sampledBlocks.map((b) => b.cid), + storageProvider: provider, + timeoutMs: timeouts.ipniVerificationTimeoutMs, + pollIntervalMs: timeouts.ipniVerificationPollingMs, + signal, + }); + + return { + valid: result.rootCIDVerified, + durationMs: result.durationMs, + }; + } + + /** + * Fetch each sampled block from the SP endpoint and hash-verify the response + * against the declared CID. Mirrors IpfsBlockRetrievalStrategy's per-block + * verification for the sampled subset (no DAG traversal). 
+ */ + private async checkBlockFetch( + sampledBlocks: ReadonlyArray<{ cid: CID; bytes: Uint8Array }>, + spAddress: string, + signal?: AbortSignal, + ): Promise<{ valid: boolean | null; failedCount: number | null; endpoint: string | null; errorMessage?: string }> { + const providerInfo = this.walletSdkService.getProviderInfo(spAddress); + if (!providerInfo) { + return { + valid: null, + failedCount: null, + endpoint: null, + errorMessage: `Provider info not found for ${spAddress}`, + }; + } + + const spBaseUrl = providerInfo.pdp.serviceURL.replace(/\/$/, ""); + const endpoint = `${spBaseUrl}/ipfs/`; + let failedCount = 0; + + for (const block of sampledBlocks) { + const cidStr = block.cid.toString(); + const blockUrl = `${spBaseUrl}/ipfs/${cidStr}?format=raw`; + + try { + const resp = await this.httpClientService.requestWithMetrics(blockUrl, { + headers: { Accept: "application/vnd.ipld.raw" }, + httpVersion: "2", + signal, + }); + + if (resp.metrics.statusCode < 200 || resp.metrics.statusCode >= 300) { + failedCount += 1; + this.logger.warn({ + event: "block_fetch_non_2xx", + message: "Block fetch returned non-2xx status", + cid: cidStr, + spAddress, + statusCode: resp.metrics.statusCode, + }); + continue; + } + + if (block.cid.multihash.code !== sha256.code) { + this.logger.warn({ + event: "block_unsupported_hash", + message: `Unsupported hash algorithm 0x${block.cid.multihash.code.toString(16)}`, + cid: cidStr, + spAddress, + }); + failedCount += 1; + continue; + } + + const codec = unixfsCodecs[block.cid.code]; + if (!codec) { + this.logger.warn({ + event: "block_unsupported_codec", + message: `Unsupported codec 0x${block.cid.code.toString(16)}`, + cid: cidStr, + spAddress, + }); + failedCount += 1; + continue; + } + + // Hash-verifies and decodes; throws on mismatch + await createBlock({ bytes: resp.data, cid: block.cid, hasher: sha256, codec }); + } catch (error) { + failedCount += 1; + this.logger.warn({ + event: "block_fetch_failed", + message: "Block fetch 
or hash verification failed", + cid: cidStr, + spAddress, + error: toStructuredError(error), + }); + } + } + + return { valid: failedCount === 0, failedCount, endpoint }; + } +} diff --git a/apps/backend/src/retrieval-anon/piece-retrieval.service.ts b/apps/backend/src/retrieval-anon/piece-retrieval.service.ts new file mode 100644 index 00000000..51150661 --- /dev/null +++ b/apps/backend/src/retrieval-anon/piece-retrieval.service.ts @@ -0,0 +1,195 @@ +import { asPieceCID, calculate as calculatePieceCid } from "@filoz/synapse-core/piece"; +import { Injectable, Logger } from "@nestjs/common"; +import { toStructuredError } from "../common/logging.js"; +import { HttpClientService } from "../http-client/http-client.service.js"; +import { WalletSdkService } from "../wallet-sdk/wallet-sdk.service.js"; +import type { PieceRetrievalResult } from "./types.js"; + +@Injectable() +export class PieceRetrievalService { + private readonly logger = new Logger(PieceRetrievalService.name); + + constructor( + private readonly walletSdkService: WalletSdkService, + private readonly httpClientService: HttpClientService, + ) {} + + async fetchPiece(spAddress: string, pieceCid: string, signal?: AbortSignal): Promise { + const providerInfo = this.walletSdkService.getProviderInfo(spAddress); + + if (!providerInfo) { + this.logger.warn({ + event: "provider_info_not_found", + message: "Cannot fetch piece: provider info not found", + spAddress, + pieceCid, + }); + + return { + success: false, + pieceCid, + bytesReceived: 0, + pieceBytes: null, + latencyMs: 0, + ttfbMs: 0, + throughputBps: 0, + statusCode: 0, + commPValid: false, + errorMessage: `Provider info not found for ${spAddress}`, + }; + } + + const baseUrl = providerInfo.pdp.serviceURL.replace(/\/$/, ""); + const url = `${baseUrl}/piece/${pieceCid}`; + + try { + const result = await this.httpClientService.requestWithMetrics(url, { + httpVersion: "2", + signal, + }); + + const { metrics } = result; + const isSuccess = metrics.statusCode 
>= 200 && metrics.statusCode < 300; + const throughputBps = metrics.totalTime > 0 ? metrics.responseSize / (metrics.totalTime / 1000) : 0; + + if (result.aborted) { + this.logger.warn({ + event: "piece_fetch_aborted", + message: "Piece fetch aborted mid-download; returning partial metrics", + url, + pieceCid, + spAddress, + bytesReceived: metrics.responseSize, + ttfbMs: metrics.ttfb, + abortReason: result.abortReason, + }); + + return { + success: false, + pieceCid, + bytesReceived: metrics.responseSize, + pieceBytes: null, + latencyMs: metrics.totalTime, + ttfbMs: metrics.ttfb, + throughputBps, + statusCode: metrics.statusCode, + commPValid: false, + errorMessage: result.abortReason ?? "aborted", + aborted: true, + }; + } + + if (!isSuccess) { + this.logger.warn({ + event: "piece_fetch_non_2xx", + message: "Piece fetch returned non-2xx status", + url, + statusCode: metrics.statusCode, + pieceCid, + spAddress, + }); + + return { + success: false, + pieceCid, + bytesReceived: metrics.responseSize, + pieceBytes: null, + latencyMs: metrics.totalTime, + ttfbMs: metrics.ttfb, + throughputBps, + statusCode: metrics.statusCode, + commPValid: false, + errorMessage: `HTTP ${metrics.statusCode}`, + }; + } + + const pieceBytes = Buffer.isBuffer(result.data) ? 
result.data : Buffer.from(result.data); + const commPValid = await this.validateCommP(pieceBytes, pieceCid); + + this.logger.debug({ + event: "piece_fetch_success", + message: "Piece fetched successfully", + pieceCid, + spAddress, + bytesReceived: metrics.responseSize, + latencyMs: metrics.totalTime, + ttfbMs: metrics.ttfb, + }); + + return { + success: true, + pieceCid, + bytesReceived: metrics.responseSize, + pieceBytes, + latencyMs: metrics.totalTime, + ttfbMs: metrics.ttfb, + throughputBps, + statusCode: metrics.statusCode, + commPValid, + }; + } catch (error) { + const aborted = signal?.aborted === true; + this.logger.warn({ + event: "piece_fetch_failed", + message: "Piece fetch threw an error", + url, + pieceCid, + spAddress, + aborted, + error: toStructuredError(error), + }); + + return { + success: false, + pieceCid, + bytesReceived: 0, + pieceBytes: null, + latencyMs: 0, + ttfbMs: 0, + throughputBps: 0, + statusCode: 0, + commPValid: false, + errorMessage: error instanceof Error ? error.message : String(error), + aborted, + }; + } + } + + /** + * Compute the piece CID (sha2-256-trunc254-padded) of the retrieved bytes and compare + * against the expected CID. Returns false on parse failure, computation failure, or mismatch. 
+ */ + private async validateCommP(bytes: Buffer, pieceCid: string): Promise { + const expected = asPieceCID(pieceCid); + if (!expected) { + this.logger.warn({ + event: "commp_invalid_piece_cid", + message: "Cannot parse expected piece CID for CommP validation", + pieceCid, + }); + return false; + } + + try { + const computed = calculatePieceCid(bytes); + const matches = computed.toString() === expected.toString(); + if (!matches) { + this.logger.warn({ + event: "commp_mismatch", + message: "Piece CID mismatch: SP-returned bytes hash to a different CID", + expected: expected.toString(), + computed: computed.toString(), + }); + } + return matches; + } catch (error) { + this.logger.warn({ + event: "commp_validation_error", + message: "CommP computation threw an error", + pieceCid, + error: toStructuredError(error), + }); + return false; + } + } +} diff --git a/apps/backend/src/retrieval-anon/retrieval-anon.module.ts b/apps/backend/src/retrieval-anon/retrieval-anon.module.ts new file mode 100644 index 00000000..c05dcb5f --- /dev/null +++ b/apps/backend/src/retrieval-anon/retrieval-anon.module.ts @@ -0,0 +1,26 @@ +import { Module } from "@nestjs/common"; +import { ConfigModule } from "@nestjs/config"; +import { TypeOrmModule } from "@nestjs/typeorm"; +import { StorageProvider } from "../database/entities/storage-provider.entity.js"; +import { HttpClientModule } from "../http-client/http-client.module.js"; +import { IpniModule } from "../ipni/ipni.module.js"; +import { SubgraphModule } from "../subgraph/subgraph.module.js"; +import { WalletSdkModule } from "../wallet-sdk/wallet-sdk.module.js"; +import { AnonPieceSelectorService } from "./anon-piece-selector.service.js"; +import { AnonRetrievalService } from "./anon-retrieval.service.js"; +import { CarValidationService } from "./car-validation.service.js"; +import { PieceRetrievalService } from "./piece-retrieval.service.js"; + +@Module({ + imports: [ + ConfigModule, + TypeOrmModule.forFeature([StorageProvider]), + 
    SubgraphModule,
    WalletSdkModule,
    HttpClientModule,
    IpniModule,
  ],
  providers: [AnonPieceSelectorService, PieceRetrievalService, CarValidationService, AnonRetrievalService],
  exports: [AnonRetrievalService],
})
export class RetrievalAnonModule {}
diff --git a/apps/backend/src/retrieval-anon/types.ts b/apps/backend/src/retrieval-anon/types.ts
new file mode 100644
index 00000000..9013a5ea
--- /dev/null
+++ b/apps/backend/src/retrieval-anon/types.ts
/** The result of anonymous piece selection. */
export type AnonPiece = {
  /** Piece CID (CommP) of the sampled piece. */
  pieceCid: string;
  /** Owning data set (proof set) id. */
  dataSetId: string;
  /** Root/piece id within the data set. */
  pieceId: string;
  /** Address of the SP that stores the piece. */
  serviceProvider: string;
  /** Whether the owning data set advertises IPFS indexing. */
  withIPFSIndexing: boolean;
  /** IPFS root CID when indexed; null otherwise. */
  ipfsRootCid: string | null;
  /** Raw piece size in bytes, as a decimal string. */
  rawSize: string;
};

/** Result of piece retrieval. */
export type PieceRetrievalResult = {
  /** True only for a 2xx response; CommP validity is reported separately. */
  success: boolean;
  /** The piece CID that was requested. */
  pieceCid: string;
  /** Bytes received from the SP (may be partial on abort). */
  bytesReceived: number;
  /** Response body on success; null on every failure path. */
  pieceBytes: Buffer | null;
  /** Total transfer time in ms as reported by the HTTP client. */
  latencyMs: number;
  /** Time to first byte in ms. */
  ttfbMs: number;
  /** bytesReceived / elapsed seconds; 0 when elapsed time is 0. */
  throughputBps: number;
  /** HTTP status; 0 when no response was obtained. */
  statusCode: number;
  /** Whether the received bytes hash to the expected piece CID. */
  commPValid: boolean;
  /** Human-readable reason, present on failure. */
  errorMessage?: string;
  /** Set when the fetch was cut short by an AbortSignal. */
  aborted?: boolean;
};

/** Result of CAR validation. */
// NOTE(review): nullable fields appear to mean "check was skipped" — confirm
// against CarValidationService, which is defined outside this chunk.
export type CarValidationResult = {
  /** True when the payload parsed as a CAR file. */
  carParseable: boolean;
  /** Number of blocks parsed out of the CAR. */
  blockCount: number;
  /** Number of block CIDs sampled for follow-up checks. */
  sampledCidCount: number;
  ipniValid: boolean | null;
  ipniVerifyMs: number | null;
  blockFetchValid: boolean | null;
  blockFetchFailedCount: number | null;
  blockFetchEndpoint: string | null;
  errorMessage?: string;
};
diff --git a/apps/backend/src/subgraph/queries.ts b/apps/backend/src/subgraph/queries.ts
new file mode 100644
index 00000000..74802ddf
--- /dev/null
+++ b/apps/backend/src/subgraph/queries.ts
/** Static GraphQL documents used by SubgraphService. */
export const Queries = {
  GET_PROVIDERS_WITH_DATASETS: `
    query GetProvidersWithDataSet($addresses: [Bytes!], $blockNumber: BigInt!)
    {
      providers(where: {address_in: $addresses}) {
        address
        totalFaultedPeriods
        totalProvingPeriods
        proofSets (where: {nextDeadline_lt: $blockNumber, status: PROVING}) {
          nextDeadline
          maxProvingPeriod
        }
      }
    }
  `,
  GET_SUBGRAPH_META: `
    query GetSubgraphMeta {
      _meta {
        block {
          number
        }
      }
    }
  `,
} as const;

/**
 * Build a sampleAnonPiece query scoped to the requested pool. The single
 * piece of query shape that differs is whether the proofSet filter pins
 * `withIPFSIndexing: true`; assembling the fragment here keeps the rest
 * of the query and the returned selection set shared.
 *
 * @param pool - "indexed" restricts to IPFS-indexed proof sets; "any" does not.
 * @returns The GraphQL document as a string (variables are bound by the caller).
 */
export function buildSampleAnonPieceQuery(pool: "indexed" | "any"): string {
  // "any" interpolates an empty string, leaving the proofSet filter unpinned.
  const indexingFilter = pool === "indexed" ? "withIPFSIndexing: true" : "";
  return `
    query SampleAnonPiece(
      $serviceProvider: Bytes!
      $payer: Bytes!
      $sampleKey: Bytes!
      $minSize: BigInt!
      $maxSize: BigInt!
    ) {
      _meta {
        block {
          number
        }
      }
      roots(
        first: 1
        orderBy: sampleKey
        orderDirection: asc
        where: {
          sampleKey_gte: $sampleKey
          removed: false
          rawSize_gte: $minSize
          rawSize_lte: $maxSize
          proofSet_: {
            fwssServiceProvider: $serviceProvider
            fwssPayer_not: $payer
            isActive: true
            ${indexingFilter}
          }
        }
        subgraphError: allow
      ) {
        rootId
        cid
        rawSize
        ipfsRootCID
        proofSet {
          setId
          withIPFSIndexing
          fwssPayer
          pdpPaymentEndEpoch
        }
      }
    }
  `;
}
diff --git a/apps/backend/src/subgraph/subgraph.module.ts b/apps/backend/src/subgraph/subgraph.module.ts
new file mode 100644
index 00000000..7834c39b
--- /dev/null
+++ b/apps/backend/src/subgraph/subgraph.module.ts
import { Module } from "@nestjs/common";
import { SubgraphService } from "./subgraph.service.js";

/** Exposes SubgraphService for injection by other modules. */
@Module({
  providers: [SubgraphService],
  exports: [SubgraphService],
})
export class SubgraphModule {}
diff --git a/apps/backend/src/subgraph/subgraph.service.spec.ts b/apps/backend/src/subgraph/subgraph.service.spec.ts
new file mode 100644
index 00000000..64f28435 --- /dev/null +++ b/apps/backend/src/subgraph/subgraph.service.spec.ts @@ -0,0 +1,851 @@ +import type { ConfigService } from "@nestjs/config"; +import { CID } from "multiformats/cid"; +import { afterEach, beforeEach, describe, expect, it, vi } from "vitest"; +import type { IConfig } from "../config/app.config.js"; +import { SubgraphService } from "./subgraph.service.js"; + +const VALID_ADDRESS = "0xd8da6bf26964af9d7eed9e03e53415d37aa96045" as const; +const SUBGRAPH_ENDPOINT = "https://api.thegraph.com/subgraphs/filecoin/pdp" as const; + +const makeSubgraphResponse = (providers: Record[] = []) => ({ + data: { providers }, +}); + +const makeValidProvider = (overrides: Record = {}) => ({ + address: VALID_ADDRESS, + totalFaultedPeriods: "10", + totalProvingPeriods: "100", + proofSets: [ + { + totalFaultedPeriods: "2", + currentDeadlineCount: "5", + nextDeadline: "1000", + maxProvingPeriod: "100", + }, + ], + ...overrides, +}); + +const makeSubgraphMetaResponse = (blockNumber = 12345) => ({ + data: { + _meta: { + block: { + number: blockNumber, + }, + }, + }, +}); + +const FWSS_SP_ADDRESS = "0xAaaaAAaaaaAAaaaAaAaAaaAaaaAaAaAaaAaaa111"; +const FWSS_PAYER = "0xBBbbBBbbBBbBBbBbbBBbbBBbbbbBbBBbbBBbb222"; +const EXAMPLE_PIECE_CID = "baga6ea4seaqpzwrimvoc4jp4l7mk6knsknf6owsc2ev4krrs2peenl5qelh6u4y"; +const pieceCidHex = `0x${Buffer.from(CID.parse(EXAMPLE_PIECE_CID).bytes).toString("hex")}`; + +const makeSampleRoot = (overrides: Record = {}) => ({ + rootId: "1", + cid: pieceCidHex, + rawSize: "1048576", + ipfsRootCID: "bafyroot", + proofSet: { + setId: "42", + withIPFSIndexing: true, + fwssPayer: FWSS_PAYER.toLowerCase(), + pdpPaymentEndEpoch: null, + }, + ...overrides, +}); + +const makeSampleResponse = (roots: Record[] = [], blockNumber = 12345) => ({ + data: { + _meta: { block: { number: blockNumber } }, + roots, + }, +}); + +const SAMPLE_KEY = "0x0000000000000000000000000000000000000000000000000000000000000001"; +const defaultSampleParams = { + 
serviceProvider: FWSS_SP_ADDRESS, + payer: FWSS_PAYER, + sampleKey: SAMPLE_KEY, + minSize: "0", + maxSize: "1000000000000", + pool: "indexed" as const, +}; + +describe("SubgraphService", () => { + let service: SubgraphService; + let fetchMock: ReturnType; + + beforeEach(() => { + const configService = { + get: vi.fn((key: keyof IConfig) => { + if (key === "blockchain") { + return { subgraphEndpoint: SUBGRAPH_ENDPOINT }; + } + return undefined; + }), + } as unknown as ConfigService; + + service = new SubgraphService(configService); + + fetchMock = vi.fn(); + vi.stubGlobal("fetch", fetchMock); + + vi.useFakeTimers(); + }); + + afterEach(() => { + vi.restoreAllMocks(); + vi.useRealTimers(); + }); + + describe("fetchProvidersWithDatasets", () => { + it("fetches and returns validated providers with bigint fields", async () => { + fetchMock.mockResolvedValueOnce({ + ok: true, + json: async () => makeSubgraphResponse([makeValidProvider()]), + }); + + const providers = await service.fetchProvidersWithDatasets({ + blockNumber: 5000, + addresses: [VALID_ADDRESS], + }); + + expect(fetchMock).toHaveBeenCalledWith(SUBGRAPH_ENDPOINT, { + method: "POST", + headers: { "Content-Type": "application/json" }, + body: expect.stringContaining('"addresses"'), + }); + + expect(providers).toHaveLength(1); + expect(providers[0].address).toBe(VALID_ADDRESS); + expect(providers[0].totalFaultedPeriods).toBe(10n); + expect(providers[0].totalProvingPeriods).toBe(100n); + expect(providers[0].proofSets[0].maxProvingPeriod).toBe(100n); + }); + + it("returns empty array when no providers exist", async () => { + fetchMock.mockResolvedValueOnce({ + ok: true, + json: async () => makeSubgraphResponse([]), + }); + + const providers = await service.fetchProvidersWithDatasets({ + blockNumber: 5000, + addresses: [VALID_ADDRESS], + }); + expect(providers).toEqual([]); + }); + + it("returns empty array when addresses array is empty", async () => { + const providers = await service.fetchProvidersWithDatasets({ 
+ blockNumber: 5000, + addresses: [], + }); + + expect(providers).toEqual([]); + expect(fetchMock).not.toHaveBeenCalled(); + }); + + it("throws on HTTP error response", async () => { + fetchMock.mockResolvedValue({ + ok: false, + status: 500, + }); + + const promise = service.fetchProvidersWithDatasets({ + blockNumber: 5000, + addresses: [VALID_ADDRESS], + }); + + // This stops Node.js from throwing an Unhandled Rejection during fast-forward. + promise.catch(() => {}); + + await vi.runAllTimersAsync(); + + await expect(promise).rejects.toThrow("Failed to fetch provider data after 3 attempts"); + expect(fetchMock).toHaveBeenCalledTimes(3); + }); + + it("throws on GraphQL errors in response", async () => { + fetchMock.mockResolvedValueOnce({ + ok: true, + json: async () => ({ + data: null, + errors: [{ message: "Query failed" }], + }), + }); + + const promise = service.fetchProvidersWithDatasets({ + blockNumber: 5000, + addresses: [VALID_ADDRESS], + }); + promise.catch(() => {}); + + await vi.runAllTimersAsync(); + + // Now await the final promise to catch the expected error + await expect(promise).rejects.toThrow("Failed to fetch provider data after 3 attempts"); + expect(fetchMock).toHaveBeenCalledTimes(3); + }); + + it("throws on network failure", async () => { + fetchMock.mockRejectedValueOnce(new Error("Network error")); + + const promise = service.fetchProvidersWithDatasets({ + blockNumber: 5000, + addresses: [VALID_ADDRESS], + }); + promise.catch(() => {}); + + await vi.runAllTimersAsync(); + + // Now await the final promise to catch the expected error + await expect(promise).rejects.toThrow("Failed to fetch provider data after 3 attempts"); + expect(fetchMock).toHaveBeenCalledTimes(3); // Initial + 2 retries = 3 total + }); + + it("throws immediately on validation error without retrying", async () => { + fetchMock.mockResolvedValueOnce({ + ok: true, + json: async () => ({ + data: { providers: [{ address: "invalid" }] }, + }), + }); + + await expect( + 
service.fetchProvidersWithDatasets({ + blockNumber: 5000, + addresses: [VALID_ADDRESS], + }), + ).rejects.toThrow("Data validation failed"); + + // Should only be called once - no retries for validation errors + expect(fetchMock).toHaveBeenCalledTimes(1); + }); + + it("throws immediately when response data is missing required fields", async () => { + fetchMock.mockResolvedValueOnce({ + ok: true, + json: async () => ({ + data: { providers: [{ address: VALID_ADDRESS }] }, // Missing required fields + }), + }); + + await expect( + service.fetchProvidersWithDatasets({ + blockNumber: 5000, + addresses: [VALID_ADDRESS], + }), + ).rejects.toThrow("Data validation failed"); + + // Should only be called once - no retries for validation errors + expect(fetchMock).toHaveBeenCalledTimes(1); + }); + + it("sends blockNumber as string in the GraphQL variables", async () => { + fetchMock.mockResolvedValueOnce({ + ok: true, + json: async () => makeSubgraphResponse([makeValidProvider()]), + }); + + await service.fetchProvidersWithDatasets({ + blockNumber: 12345, + addresses: [VALID_ADDRESS], + }); + + const body = JSON.parse(fetchMock.mock.calls[0][1].body); + expect(body.variables.blockNumber).toBe("12345"); + }); + + it("retries network errors but not validation errors", async () => { + // First attempt: network error (should retry) + fetchMock.mockRejectedValueOnce(new Error("Network timeout")); + + // Second attempt: succeeds but validation fails (should not retry) + fetchMock.mockResolvedValueOnce({ + ok: true, + json: async () => ({ + data: { providers: [{ address: "invalid" }] }, + }), + }); + + const promise = service.fetchProvidersWithDatasets({ + blockNumber: 5000, + addresses: [VALID_ADDRESS], + }); + promise.catch(() => {}); + + await vi.runAllTimersAsync(); + + // Now await the final promise to catch the expected error + await expect(promise).rejects.toThrow("Data validation failed"); + + // Should be called twice: initial network error + 1 retry that fails validation + 
expect(fetchMock).toHaveBeenCalledTimes(2); + }); + + it("sends addresses array in the GraphQL variables", async () => { + fetchMock.mockResolvedValueOnce({ + ok: true, + json: async () => makeSubgraphResponse([makeValidProvider()]), + }); + + const addresses = [VALID_ADDRESS, "0xAb5801a7D398351b8bE11C439e05C5B3259aeC9B"]; + await service.fetchProvidersWithDatasets({ + blockNumber: 5000, + addresses, + }); + + const body = JSON.parse(fetchMock.mock.calls[0][1].body); + expect(body.variables.addresses).toEqual(addresses); + }); + + it("batches large address lists into chunks of MAX_PROVIDERS_PER_QUERY", async () => { + // Create 150 addresses (should be split into 2 batches: 100 + 50) + const addresses = Array.from({ length: 150 }, (_, i) => `0x${i.toString().padStart(40, "0")}`); + + fetchMock.mockResolvedValue({ + ok: true, + json: async () => makeSubgraphResponse([]), + }); + + await service.fetchProvidersWithDatasets({ + blockNumber: 5000, + addresses, + }); + + // Should make 2 requests + expect(fetchMock).toHaveBeenCalledTimes(2); + }); + + it("retries failed requests with exponential backoff", async () => { + // Fail on first attempt, succeed on second attempt (1 retry) + fetchMock.mockRejectedValueOnce(new Error("Network timeout")).mockResolvedValueOnce({ + ok: true, + json: async () => makeSubgraphResponse([makeValidProvider()]), + }); + + const promise = service.fetchProvidersWithDatasets({ + blockNumber: 5000, + addresses: [VALID_ADDRESS], + }); + + await vi.runAllTimersAsync(); + + // Now await the final promise to resolve + const providers = await promise; + + expect(fetchMock).toHaveBeenCalledTimes(2); // Initial attempt + 1 retry + expect(providers).toHaveLength(1); + }); + + it("processes batches with concurrency control", async () => { + // Create 120 addresses (should be 2 batches of 100 each, but processed with concurrency limit) + const addresses = Array.from({ length: 120 }, (_, i) => `0x${i.toString().padStart(40, "0")}`); + + let 
concurrentCalls = 0; + let maxConcurrentCalls = 0; + + fetchMock.mockImplementation(async () => { + concurrentCalls++; + maxConcurrentCalls = Math.max(maxConcurrentCalls, concurrentCalls); + await new Promise((resolve) => setTimeout(resolve, 10)); + concurrentCalls--; + return { + ok: true, + json: async () => makeSubgraphResponse([]), + }; + }); + + const fetchPromise = service.fetchProvidersWithDatasets({ + blockNumber: 5000, + addresses, + }); + + await vi.runAllTimersAsync(); + + await fetchPromise; + + // Should respect MAX_CONCURRENT_REQUESTS (50) + expect(maxConcurrentCalls).toBeLessThanOrEqual(50); + expect(fetchMock).toHaveBeenCalledTimes(2); + }); + }); + + describe("fetchSubgraphMeta", () => { + it("fetches and returns subgraph metadata with block number", async () => { + fetchMock.mockResolvedValueOnce({ + ok: true, + json: async () => makeSubgraphMetaResponse(12345), + }); + + const meta = await service.fetchSubgraphMeta(); + + expect(fetchMock).toHaveBeenCalledWith(SUBGRAPH_ENDPOINT, { + method: "POST", + headers: { "Content-Type": "application/json" }, + body: expect.stringContaining("GetSubgraphMeta"), + }); + + expect(meta).toEqual({ + _meta: { + block: { + number: 12345, + }, + }, + }); + }); + + it("throws when subgraph endpoint is not configured", async () => { + const configService = { + get: vi.fn(() => ({ subgraphEndpoint: "" })), + } as unknown as ConfigService; + + const serviceWithoutEndpoint = new SubgraphService(configService); + + await expect(serviceWithoutEndpoint.fetchSubgraphMeta()).rejects.toThrow("No subgraph endpoint configured"); + }); + + it("throws on HTTP error response", async () => { + fetchMock.mockResolvedValueOnce({ + ok: false, + status: 500, + statusText: "Internal Server Error", + }); + + const promise = service.fetchSubgraphMeta(); + promise.catch(() => {}); + + await vi.runAllTimersAsync(); + + // Now await the final promise to catch the expected error + await expect(promise).rejects.toThrow("Failed to fetch 
subgraph metadata after 3 attempts"); + }); + + it("throws on GraphQL errors in response", async () => { + fetchMock.mockResolvedValueOnce({ + ok: true, + json: async () => ({ + errors: [{ message: "Query timeout" }], + }), + }); + + const promise = service.fetchSubgraphMeta(); + promise.catch(() => {}); + + await vi.runAllTimersAsync(); + + // Now await the final promise to catch the expected error + await expect(promise).rejects.toThrow("Failed to fetch subgraph metadata after 3 attempts"); + }); + + it("throws on validation failure without retry", async () => { + fetchMock.mockResolvedValueOnce({ + ok: true, + json: async () => ({ + data: { + _meta: { + block: { + number: "not-a-number", // Invalid - should be number + }, + }, + }, + }), + }); + + await expect(service.fetchSubgraphMeta()).rejects.toThrow("Data validation failed"); + expect(fetchMock).toHaveBeenCalledTimes(1); // Should not retry validation errors + }); + + it("throws on missing required fields", async () => { + fetchMock.mockResolvedValueOnce({ + ok: true, + json: async () => ({ + data: { + _meta: { + block: { + number: undefined, // missing required field + }, + }, + }, + }), + }); + + await expect(service.fetchSubgraphMeta()).rejects.toThrow("Data validation failed"); + expect(fetchMock).toHaveBeenCalledTimes(1); + }); + + it("retries on network failures with exponential backoff", async () => { + fetchMock.mockRejectedValueOnce(new Error("Network timeout")).mockResolvedValueOnce({ + ok: true, + json: async () => makeSubgraphMetaResponse(12345), + }); + + const promise = service.fetchSubgraphMeta(); + + await vi.runAllTimersAsync(); + + // Now await the second promise to resolve + const meta = await promise; + + expect(fetchMock).toHaveBeenCalledTimes(2); // Initial + 1 retry + expect(meta._meta.block.number).toBe(12345); + }); + + it("throws after MAX_RETRIES attempts on persistent network errors", async () => { + fetchMock.mockRejectedValue(new Error("Network timeout")); + + const promise = 
service.fetchSubgraphMeta(); + promise.catch(() => {}); + + await vi.runAllTimersAsync(); + + // Now await the final promise to catch the expected error + await expect(promise).rejects.toThrow("Failed to fetch subgraph metadata after 3 attempts"); + expect(fetchMock).toHaveBeenCalledTimes(3); + }); + }); + + describe("enforceRateLimit (sliding window)", () => { + it("allows requests when under the rate limit", async () => { + fetchMock.mockResolvedValue({ + ok: true, + json: async () => makeSubgraphMetaResponse(12345), + }); + + const startTime = Date.now(); + + // Make 5 requests - should all go through immediately + const promises = Array.from({ length: 5 }, () => service.fetchSubgraphMeta()); + + await Promise.all(promises); + + const endTime = Date.now(); + const elapsed = endTime - startTime; + + // Should complete quickly (no waiting) + expect(elapsed).toBeLessThan(100); + expect(fetchMock).toHaveBeenCalledTimes(5); + }); + + it("enforces rate limit when exceeding MAX_CONCURRENT_REQUESTS", async () => { + fetchMock.mockResolvedValue({ + ok: true, + json: async () => makeSubgraphMetaResponse(12345), + }); + + // Fill up the rate limit window with 50 requests + const initialPromises = Array.from({ length: 50 }, () => service.fetchSubgraphMeta()); + await Promise.all(initialPromises); + + fetchMock.mockClear(); + + // Try to make one more request - should wait for oldest to expire + const promise = service.fetchSubgraphMeta(); + + // Advance past the 10 second window + buffer + await vi.advanceTimersByTimeAsync(10010); + await promise; + + expect(fetchMock).toHaveBeenCalledTimes(1); + }); + + it("throws error when requestCount exceeds MAX_CONCURRENT_REQUESTS", async () => { + // Access private method via type assertion for testing + const enforceRateLimit = (service as any).enforceRateLimit.bind(service); + + await expect(enforceRateLimit(51)).rejects.toThrow("Cannot request 51 items; exceeds rate limit window of 50"); + }); + + it("correctly calculates wait 
time for multiple required slots", async () => { + fetchMock.mockResolvedValue({ + ok: true, + json: async () => makeSubgraphMetaResponse(12345), + }); + + // Fill 48 slots + const initialPromises = Array.from({ length: 48 }, () => service.fetchSubgraphMeta()); + await vi.runAllTimersAsync(); + await Promise.all(initialPromises); + + fetchMock.mockClear(); + + // Request 5 more slots (need 3 to free up: 5 - 2 available = 3) + // Should wait for the 3rd oldest timestamp to expire + const enforceRateLimit = (service as any).enforceRateLimit.bind(service); + const promise = enforceRateLimit(5); + + // The 3rd request should expire at ~10 seconds + await vi.advanceTimersByTimeAsync(10010); + await promise; + + // Verify slots were reserved + // After 10s, the first 48 expired, so we should only have the 5 new ones + const timestamps = (service as any).requestTimestamps; + expect(timestamps.length).toBe(5); // Only the 5 new slots remain + }); + + it("handles sliding window correctly as old requests expire", async () => { + fetchMock.mockResolvedValue({ + ok: true, + json: async () => makeSubgraphMetaResponse(12345), + }); + + // Make 30 requests at t=0 + const batch1 = Array.from({ length: 30 }, () => service.fetchSubgraphMeta()); + await vi.runAllTimersAsync(); + await Promise.all(batch1); + + // Advance 5 seconds + await vi.advanceTimersByTimeAsync(5000); + + // Make 20 more requests at t=5000 + const batch2 = Array.from({ length: 20 }, () => service.fetchSubgraphMeta()); + await vi.runAllTimersAsync(); + await Promise.all(batch2); + + // Now at t=5000, we have 50 requests in the window + // Advance to t=10100 - first 30 should expire + await vi.advanceTimersByTimeAsync(5100); + + fetchMock.mockClear(); + + // Should be able to make 30 more requests immediately + const batch3 = Array.from({ length: 30 }, () => service.fetchSubgraphMeta()); + await vi.runAllTimersAsync(); + await Promise.all(batch3); + + expect(fetchMock).toHaveBeenCalledTimes(30); + }); + + it("adds 
10ms buffer to prevent timing edge cases", async () => { + fetchMock.mockResolvedValue({ + ok: true, + json: async () => makeSubgraphMetaResponse(12345), + }); + + // Fill the window + const initialPromises = Array.from({ length: 50 }, () => service.fetchSubgraphMeta()); + await vi.runAllTimersAsync(); + await Promise.all(initialPromises); + + fetchMock.mockClear(); + + const promise = service.fetchSubgraphMeta(); + + // Advance past the window + buffer + await vi.advanceTimersByTimeAsync(10010); + await promise; + + expect(fetchMock).toHaveBeenCalledTimes(1); + }); + + it("recursively waits when multiple batches need to expire", async () => { + fetchMock.mockResolvedValue({ + ok: true, + json: async () => makeSubgraphMetaResponse(12345), + }); + + // Fill window with 50 requests + const batch1 = Array.from({ length: 50 }, () => service.fetchSubgraphMeta()); + await vi.runAllTimersAsync(); + await Promise.all(batch1); + + // Advance 5 seconds + await vi.advanceTimersByTimeAsync(5000); + + fetchMock.mockClear(); + + // Try to request 30 slots (need to wait for 30 to expire) + const enforceRateLimit = (service as any).enforceRateLimit.bind(service); + const promise = enforceRateLimit(30); + + // First recursion: wait for 30th oldest to expire (~10s from start) + await vi.advanceTimersByTimeAsync(5010); + + // Should recursively check and complete + await promise; + + const timestamps = (service as any).requestTimestamps; + // After 10s from start, all 50 initial requests expired, only 30 new ones remain + expect(timestamps.length).toBe(30); // Only the 30 new slots + }); + + it("reserves slots immediately to prevent race conditions", async () => { + fetchMock.mockResolvedValue({ + ok: true, + json: async () => makeSubgraphMetaResponse(12345), + }); + + // Fill 47 slots + const initial = Array.from({ length: 47 }, () => service.fetchSubgraphMeta()); + await vi.runAllTimersAsync(); + await Promise.all(initial); + + // Now we have 3 available slots + const 
enforceRateLimit = (service as any).enforceRateLimit.bind(service); + + // Request 3 slots - should succeed immediately + await enforceRateLimit(3); + + const timestamps = (service as any).requestTimestamps; + expect(timestamps.length).toBe(50); // 47 + 3 = 50 (full) + + // Try to request 1 more - should need to wait + const promise = enforceRateLimit(1); + + // Advance time to free up a slot + await vi.advanceTimersByTimeAsync(10010); + await promise; + + // After waiting, the old slots expired and new one was added + const finalTimestamps = (service as any).requestTimestamps; + expect(finalTimestamps.length).toBe(1); // Only the new request remains + }); + + it("filters out expired timestamps from the sliding window", async () => { + fetchMock.mockResolvedValue({ + ok: true, + json: async () => makeSubgraphMetaResponse(12345), + }); + + // Make 20 requests + const batch1 = Array.from({ length: 20 }, () => service.fetchSubgraphMeta()); + await vi.runAllTimersAsync(); + await Promise.all(batch1); + + // Advance past the window + await vi.advanceTimersByTimeAsync(11000); + + fetchMock.mockClear(); + + // Make another request - should have full window available + await service.fetchSubgraphMeta(); + + const timestamps = (service as any).requestTimestamps; + // Should only have 1 timestamp (the new one), old ones filtered out + expect(timestamps.length).toBe(1); + }); + }); + + describe("sampleAnonPiece", () => { + it("throws when endpoint is not configured (distinct from empty result)", async () => { + // Returning null here would make a misconfigured deployment indistinguishable + // from a genuinely empty candidate pool — every anon job would silently + // no-op forever. Fail loudly instead. 
+ const noEndpointConfig = { + get: vi.fn(() => ({ subgraphEndpoint: "" })), + } as unknown as ConfigService; + const noEndpointService = new SubgraphService(noEndpointConfig); + + await expect(noEndpointService.sampleAnonPiece(defaultSampleParams)).rejects.toThrow( + "No subgraph endpoint configured", + ); + expect(fetchMock).not.toHaveBeenCalled(); + }); + + it("returns null when the subgraph yields no matching root", async () => { + fetchMock.mockResolvedValueOnce({ + ok: true, + json: async () => makeSampleResponse([]), + }); + + const piece = await service.sampleAnonPiece(defaultSampleParams); + expect(piece).toBeNull(); + }); + + it("parses the sampled root into a decoded candidate piece", async () => { + fetchMock.mockResolvedValueOnce({ + ok: true, + json: async () => makeSampleResponse([makeSampleRoot()]), + }); + + const piece = await service.sampleAnonPiece(defaultSampleParams); + + expect(piece).toMatchObject({ + pieceCid: EXAMPLE_PIECE_CID, + pieceId: "1", + dataSetId: "42", + rawSize: "1048576", + withIPFSIndexing: true, + ipfsRootCid: "bafyroot", + pdpPaymentEndEpoch: null, + indexedAtBlock: 12345, + }); + }); + + it("returns pdpPaymentEndEpoch as bigint when the dataset is terminating", async () => { + fetchMock.mockResolvedValueOnce({ + ok: true, + json: async () => + makeSampleResponse([ + makeSampleRoot({ + proofSet: { + setId: "42", + withIPFSIndexing: true, + fwssPayer: FWSS_PAYER.toLowerCase(), + pdpPaymentEndEpoch: "5000", + }, + }), + ]), + }); + + const piece = await service.sampleAnonPiece(defaultSampleParams); + expect(piece?.pdpPaymentEndEpoch).toBe(5000n); + }); + + it("lowercases SP and payer addresses before querying", async () => { + fetchMock.mockResolvedValueOnce({ ok: true, json: async () => makeSampleResponse([]) }); + + await service.sampleAnonPiece(defaultSampleParams); + + const [, opts] = fetchMock.mock.calls[0]; + const body = JSON.parse(opts.body as string); + 
expect(body.variables.serviceProvider).toBe(FWSS_SP_ADDRESS.toLowerCase()); + expect(body.variables.payer).toBe(FWSS_PAYER.toLowerCase()); + expect(body.query).toContain("withIPFSIndexing: true"); + }); + + it("uses the any-pool query when pool is 'any'", async () => { + fetchMock.mockResolvedValueOnce({ ok: true, json: async () => makeSampleResponse([]) }); + + await service.sampleAnonPiece({ ...defaultSampleParams, pool: "any" }); + + const [, opts] = fetchMock.mock.calls[0]; + const body = JSON.parse(opts.body as string); + expect(body.query).not.toContain("withIPFSIndexing: true"); + }); + + it("returns null when the sampled root has an undecodable CID", async () => { + fetchMock.mockResolvedValueOnce({ + ok: true, + json: async () => makeSampleResponse([makeSampleRoot({ cid: "0xdeadbeef" })]), + }); + + const piece = await service.sampleAnonPiece(defaultSampleParams); + expect(piece).toBeNull(); + }); + + it("throws after max retries on repeated HTTP errors", async () => { + fetchMock.mockResolvedValue({ ok: false, status: 500, statusText: "Internal Server Error" }); + + const promise = service.sampleAnonPiece(defaultSampleParams); + promise.catch(() => {}); + await vi.runAllTimersAsync(); + + await expect(promise).rejects.toThrow("Failed to fetch subgraph sample_anon_piece_indexed after 3 attempts"); + expect(fetchMock).toHaveBeenCalledTimes(3); + }); + + it("does not retry on schema validation failure", async () => { + fetchMock.mockResolvedValueOnce({ + ok: true, + json: async () => ({ data: { _meta: { block: { number: 1 } } } }), // missing roots + }); + + await expect(service.sampleAnonPiece(defaultSampleParams)).rejects.toThrow(/validation failed/i); + expect(fetchMock).toHaveBeenCalledTimes(1); + }); + }); +}); diff --git a/apps/backend/src/subgraph/subgraph.service.ts b/apps/backend/src/subgraph/subgraph.service.ts new file mode 100644 index 00000000..3d4e8370 --- /dev/null +++ b/apps/backend/src/subgraph/subgraph.service.ts @@ -0,0 +1,422 @@ +import { 
Injectable, Logger } from "@nestjs/common"; +import { ConfigService } from "@nestjs/config"; +import { toStructuredError } from "../common/logging.js"; +import type { IBlockchainConfig, IConfig } from "../config/app.config.js"; +import { buildSampleAnonPieceQuery, Queries } from "./queries.js"; +import type { + AnonCandidatePiece, + GraphQLResponse, + ProviderDataSetResponse, + ProvidersWithDataSetsOptions, + RawSampleAnonPieceResponse, + SubgraphMeta, +} from "./types.js"; +import { + decodePieceCid, + validateProviderDataSetResponse, + validateSampleAnonPieceResponse, + validateSubgraphMetaResponse, +} from "./types.js"; + +/** Pool of pieces to sample from. */ +export type AnonPiecePool = "indexed" | "any"; + +/** Inputs for a single anonymous piece sample query. */ +export type SampleAnonPieceParams = { + /** Service provider address (lowercase hex). */ + serviceProvider: string; + /** Dealbot's own payer address (excluded to keep the sample non-dealbot). */ + payer: string; + /** Uniform-random 32-byte sort key as `0x`-prefixed hex. */ + sampleKey: string; + /** Inclusive lower bound on raw piece size in bytes (decimal string). */ + minSize: string; + /** Inclusive upper bound on raw piece size in bytes (decimal string). */ + maxSize: string; + /** Which pool to sample from. */ + pool: AnonPiecePool; +}; + +/** + * Error thrown when data validation fails. + * These errors should not be retried as they indicate schema/data issues. + */ +class ValidationError extends Error { + constructor(message: string) { + super(message); + this.name = "ValidationError"; + if (Error.captureStackTrace) { + Error.captureStackTrace(this, ValidationError); + } + } +} + +/** + * Client for the dealbot-owned subgraph (driven by `SUBGRAPH_ENDPOINT`). + * + * Functionally a superset of `PDPSubgraphService`: it exposes the same + * `fetchSubgraphMeta` / `fetchProvidersWithDatasets` surface plus the new + * `sampleAnonPiece` query used by anonymous retrievals. 
+ * + * The two services intentionally coexist while we migrate off the upstream + * pdp-explorer subgraph: `PDPSubgraphService` continues to drive the + * established data-retention path against `PDP_SUBGRAPH_ENDPOINT`, and + * `SubgraphService` is scoped to the new anonymous-retrieval flow only. + * Once the dealbot-owned subgraph has soaked in production, this service + * should become the single drop-in replacement for `PDPSubgraphService` + * and `PDP_SUBGRAPH_ENDPOINT` can be retired. + */ +@Injectable() +export class SubgraphService { + private readonly logger: Logger = new Logger(SubgraphService.name); + private readonly blockchainConfig: IBlockchainConfig; + + private static readonly MAX_PROVIDERS_PER_QUERY = 100; + private static readonly MAX_CONCURRENT_REQUESTS = 50; + private static readonly RATE_LIMIT_WINDOW_MS = 10000; + private static readonly MAX_RETRIES = 3; + private static readonly INITIAL_RETRY_DELAY_MS = 1000; + + private requestTimestamps: number[] = []; + + constructor(private readonly configService: ConfigService) { + this.blockchainConfig = this.configService.get("blockchain"); + } + + /** + * Fetch subgraph metadata including the latest indexed block number. 
+ * + * @throws Error if endpoint is not configured or after MAX_RETRIES attempts + */ + async fetchSubgraphMeta(): Promise { + return this.executeQuery("metadata", Queries.GET_SUBGRAPH_META, {}, validateSubgraphMetaResponse); + } + + /** + * Fetch provider-level totals from subgraph with batching, pagination, and rate limiting + * + * @param options - Options containing block number and provider addresses + * @returns Array of providers with their data sets currently proving + */ + async fetchProvidersWithDatasets( + options: ProvidersWithDataSetsOptions, + ): Promise { + const { blockNumber, addresses } = options; + + if (addresses.length === 0) { + return []; + } + + if (addresses.length <= SubgraphService.MAX_PROVIDERS_PER_QUERY) { + return this.fetchWithRetry(blockNumber, addresses); + } + + return this.fetchMultipleBatchesWithRateLimit(blockNumber, addresses); + } + + /** + * Draw a single random anonymous piece for retrieval testing. + * + * Uses the Root.sampleKey (keccak256 of the entity id) to pick the + * smallest key ≥ `params.sampleKey` that matches the filters — a uniform + * random pick when `sampleKey` is generated uniformly. Server-side filters + * cover SP, payer-exclusion, active status, size range, and optionally + * `withIPFSIndexing`. Returns null when no piece matches (callers should + * retry with a fresh sampleKey or relax the pool/bucket). + * + * `pdpPaymentEndEpoch` is returned to the caller for a cheap client-side + * epoch comparison — GraphQL filters on nullable BigInts are awkward. + */ + async sampleAnonPiece(params: SampleAnonPieceParams): Promise { + if (!this.blockchainConfig.subgraphEndpoint) { + // Surface misconfiguration distinctly so it does not look like an empty + // candidate pool (which silently no-ops every anon retrieval job). 
+ this.logger.error({ + event: "subgraph_endpoint_not_configured", + message: "Cannot sample anonymous piece — no subgraph endpoint configured", + }); + throw new Error("No subgraph endpoint configured"); + } + + const query = buildSampleAnonPieceQuery(params.pool); + const variables = { + serviceProvider: params.serviceProvider.toLowerCase(), + payer: params.payer.toLowerCase(), + sampleKey: params.sampleKey, + minSize: params.minSize, + maxSize: params.maxSize, + }; + + const validated = await this.executeQuery( + `sample_anon_piece_${params.pool}`, + query, + variables, + validateSampleAnonPieceResponse, + ); + + const root = validated.roots[0]; + if (!root) { + return null; + } + + try { + return { + pieceCid: decodePieceCid(root.cid), + pieceId: root.rootId, + dataSetId: root.proofSet.setId, + rawSize: root.rawSize, + withIPFSIndexing: root.proofSet.withIPFSIndexing, + ipfsRootCid: root.ipfsRootCID ?? null, + indexedAtBlock: validated._meta.block.number, + pdpPaymentEndEpoch: root.proofSet.pdpPaymentEndEpoch != null ? BigInt(root.proofSet.pdpPaymentEndEpoch) : null, + }; + } catch (error) { + this.logger.warn({ + event: "anon_piece_cid_decode_failed", + message: "Failed to decode piece CID from subgraph data", + dataSetId: root.proofSet.setId, + pieceId: root.rootId, + error: toStructuredError(error), + }); + return null; + } + } + + /** + * Generic single-query helper with retry and rate limiting. Used by queries that + * don't fit the batched provider-fetch shape. 
+ */ + private async executeQuery( + operationName: string, + query: string, + variables: Record, + transform: (data: unknown) => T, + attempt: number = 1, + ): Promise { + if (!this.blockchainConfig.subgraphEndpoint) { + throw new Error("No subgraph endpoint configured"); + } + + try { + await this.enforceRateLimit(); + + const response = await fetch(this.blockchainConfig.subgraphEndpoint, { + method: "POST", + headers: { "Content-Type": "application/json" }, + body: JSON.stringify({ query, variables }), + }); + + if (!response.ok) { + throw new Error(`HTTP ${response.status}: ${response.statusText}`); + } + + const result = (await response.json()) as GraphQLResponse; + + if (result.errors) { + const errorMessage = result.errors?.[0]?.message || "Unknown GraphQL error"; + throw new Error(`GraphQL error: ${errorMessage}`); + } + + try { + return transform(result.data); + } catch (validationError) { + const errorMessage = validationError instanceof Error ? validationError.message : "Unknown validation error"; + throw new ValidationError(`Data validation failed: ${errorMessage}`); + } + } catch (error) { + const errorMessage = error instanceof Error ? error.message : "Unknown error"; + + if (error instanceof ValidationError) { + this.logger.error({ + event: `subgraph_${operationName}_validation_failed`, + message: `Subgraph ${operationName} validation failed`, + error: toStructuredError(error), + }); + throw error; + } + + if (attempt < SubgraphService.MAX_RETRIES) { + const delay = SubgraphService.INITIAL_RETRY_DELAY_MS * (1 << (attempt - 1)); + this.logger.warn({ + event: `subgraph_${operationName}_request_retry`, + message: `Subgraph ${operationName} request failed. 
Retrying...`, + attempt, + maxRetries: SubgraphService.MAX_RETRIES, + retryDelayMs: delay, + error: toStructuredError(error), + }); + await new Promise((resolve) => setTimeout(resolve, delay)); + return this.executeQuery(operationName, query, variables, transform, attempt + 1); + } + + this.logger.error({ + event: `subgraph_${operationName}_request_failed`, + message: `Subgraph ${operationName} request failed after maximum retries`, + maxRetries: SubgraphService.MAX_RETRIES, + error: toStructuredError(error), + }); + throw new Error( + `Failed to fetch subgraph ${operationName} after ${SubgraphService.MAX_RETRIES} attempts: ${errorMessage}`, + ); + } + } + + /** + * Fetch multiple batches with rate limiting and concurrency control + */ + private async fetchMultipleBatchesWithRateLimit( + blockNumber: number, + addresses: string[], + ): Promise { + const batches: string[][] = []; + for (let i = 0; i < addresses.length; i += SubgraphService.MAX_PROVIDERS_PER_QUERY) { + const addressesLimit = Math.min(addresses.length, i + SubgraphService.MAX_PROVIDERS_PER_QUERY); + batches.push(addresses.slice(i, addressesLimit)); + } + + const allProviders: ProviderDataSetResponse["providers"] = []; + + for (let i = 0; i < batches.length; i += SubgraphService.MAX_CONCURRENT_REQUESTS) { + const batchGroup = batches.slice(i, i + SubgraphService.MAX_CONCURRENT_REQUESTS); + + const results = await Promise.all(batchGroup.map((batch) => this.fetchWithRetry(blockNumber, batch))); + + allProviders.push(...results.flat()); + } + + return allProviders; + } + + /** + * Fetch with exponential backoff retry mechanism + * Assuming initial request to be first attempt + */ + private async fetchWithRetry( + blockNumber: number, + addresses: string[], + attempt: number = 1, + ): Promise { + if (!this.blockchainConfig.subgraphEndpoint) { + throw new Error("No subgraph endpoint configured"); + } + + const variables = { + blockNumber: blockNumber.toString(), + addresses, + }; + + try { + await 
this.enforceRateLimit(); + + const response = await fetch(this.blockchainConfig.subgraphEndpoint, { + method: "POST", + headers: { + "Content-Type": "application/json", + }, + body: JSON.stringify({ + query: Queries.GET_PROVIDERS_WITH_DATASETS, + variables, + }), + }); + + if (!response.ok) { + throw new Error(`HTTP ${response.status}: ${response.statusText}`); + } + + const result = (await response.json()) as GraphQLResponse; + + if (result.errors) { + const errorMessage = result.errors?.[0]?.message || "Unknown GraphQL error"; + throw new Error(`GraphQL error: ${errorMessage}`); + } + + let validated: ProviderDataSetResponse; + try { + validated = validateProviderDataSetResponse(result.data); + } catch (validationError) { + const errorMessage = validationError instanceof Error ? validationError.message : "Unknown validation error"; + throw new ValidationError(`Data validation failed: ${errorMessage}`); + } + + return validated.providers; + } catch (error) { + const errorMessage = error instanceof Error ? error.message : "Unknown error"; + + // No need to retry on validation errors - they indicate schema/data issues, not transient failures + if (error instanceof ValidationError) { + this.logger.error({ + event: "subgraph_provider_data_validation_failed", + message: "Subgraph data validation failed", + error: toStructuredError(error), + }); + throw error; + } + + // Retry on network/HTTP errors + if (attempt < SubgraphService.MAX_RETRIES) { + const delay = SubgraphService.INITIAL_RETRY_DELAY_MS * (1 << (attempt - 1)); + this.logger.warn({ + event: "subgraph_provider_request_retry", + message: "Subgraph provider request failed. 
Retrying...", + attempt, + maxRetries: SubgraphService.MAX_RETRIES, + retryDelayMs: delay, + addressCount: addresses.length, + error: toStructuredError(error), + }); + await new Promise((resolve) => setTimeout(resolve, delay)); + return this.fetchWithRetry(blockNumber, addresses, attempt + 1); + } + + this.logger.error({ + event: "subgraph_provider_request_failed", + message: "Subgraph provider request failed after maximum retries", + maxRetries: SubgraphService.MAX_RETRIES, + blockNumber, + addressCount: addresses.length, + error: toStructuredError(error), + }); + throw new Error(`Failed to fetch provider data after ${SubgraphService.MAX_RETRIES} attempts: ${errorMessage}`); + } + } + + /** + * Enforce rate limiting: max 50 requests per 10 seconds + * This rate limit is applied by Goldsky on their public endpoints + * Read more here: https://docs.goldsky.com/subgraphs/graphql-endpoints#public-endpoints + */ + private async enforceRateLimit(requestCount: number = 1): Promise { + if (requestCount > SubgraphService.MAX_CONCURRENT_REQUESTS) { + throw new Error( + `Cannot request ${requestCount} items; exceeds rate limit window of ${SubgraphService.MAX_CONCURRENT_REQUESTS}`, + ); + } + + const now = Date.now(); + const windowStart = now - SubgraphService.RATE_LIMIT_WINDOW_MS; + + this.requestTimestamps = this.requestTimestamps.filter((timestamp) => timestamp > windowStart); + + const availableSlots = SubgraphService.MAX_CONCURRENT_REQUESTS - this.requestTimestamps.length; + + if (requestCount > availableSlots) { + const requiredSlots = requestCount - availableSlots; + + const index = Math.min(this.requestTimestamps.length, requiredSlots) - 1; + const oldestTimestamp = this.requestTimestamps[index] || now; + + // wait time with 10ms buffer + const waitTime = oldestTimestamp + SubgraphService.RATE_LIMIT_WINDOW_MS - now + 10; + + if (waitTime > 0) { + await new Promise((resolve) => setTimeout(resolve, waitTime)); + return this.enforceRateLimit(requestCount); + } + } + + 
// Reserve the slots NOW + for (let i = 0; i < requestCount; i++) { + this.requestTimestamps.push(Date.now()); + } + } +} diff --git a/apps/backend/src/subgraph/types.spec.ts b/apps/backend/src/subgraph/types.spec.ts new file mode 100644 index 00000000..02e6eee0 --- /dev/null +++ b/apps/backend/src/subgraph/types.spec.ts @@ -0,0 +1,245 @@ +import { describe, expect, it } from "vitest"; +import { validateProviderDataSetResponse, validateSubgraphMetaResponse } from "./types.js"; + +// Subgraph stores addresses in lowercase +const VALID_ADDRESS = "0xd8da6bf26964af9d7eed9e03e53415d37aa96045" as const; + +const makeValidProvider = (overrides: Record = {}) => ({ + address: VALID_ADDRESS, + totalFaultedPeriods: "10", + totalProvingPeriods: "100", + proofSets: [ + { + nextDeadline: "1000", + maxProvingPeriod: "100", + }, + ], + ...overrides, +}); + +const makeValidResponse = (providers = [makeValidProvider()]) => ({ + providers, +}); + +describe("validateProviderDataSetResponse", () => { + it("validates and transforms a well-formed response", () => { + const result = validateProviderDataSetResponse(makeValidResponse()); + + expect(result.providers).toHaveLength(1); + const provider = result.providers[0]; + expect(provider.address).toBe(VALID_ADDRESS); + expect(provider.totalFaultedPeriods).toBe(10n); + expect(provider.totalProvingPeriods).toBe(100n); + + const proofSet = provider.proofSets[0]; + expect(proofSet.nextDeadline).toBe(1000n); + expect(proofSet.maxProvingPeriod).toBe(100n); + }); + + it("converts string numbers to bigint", () => { + const result = validateProviderDataSetResponse( + makeValidResponse([ + makeValidProvider({ + totalFaultedPeriods: "999999999999999999", + totalProvingPeriods: "1000000000000000000", + }), + ]), + ); + + expect(typeof result.providers[0].totalFaultedPeriods).toBe("bigint"); + expect(result.providers[0].totalFaultedPeriods).toBe(999999999999999999n); + expect(result.providers[0].totalProvingPeriods).toBe(1000000000000000000n); + }); + 
+ it("accepts an empty providers array", () => { + const result = validateProviderDataSetResponse({ providers: [] }); + expect(result.providers).toEqual([]); + }); + + it("accepts a provider with empty proofSets", () => { + const result = validateProviderDataSetResponse(makeValidResponse([makeValidProvider({ proofSets: [] })])); + expect(result.providers[0].proofSets).toEqual([]); + }); + + it("preserves unknown fields (schema uses .unknown(true))", () => { + const result = validateProviderDataSetResponse(makeValidResponse([makeValidProvider({ extraField: "hello" })])); + expect((result.providers[0] as Record).extraField).toBe("hello"); + }); + + it("throws on missing providers field", () => { + expect(() => validateProviderDataSetResponse({})).toThrow("Invalid provider dataset response format"); + }); + + it("throws on null input", () => { + expect(() => validateProviderDataSetResponse(null)).toThrow("Invalid provider dataset response format"); + }); + + it("throws on missing required provider fields", () => { + expect(() => + validateProviderDataSetResponse({ + providers: [{ address: VALID_ADDRESS }], + }), + ).toThrow("Invalid provider dataset response format"); + }); + + it("throws on invalid Ethereum address", () => { + expect(() => + validateProviderDataSetResponse(makeValidResponse([makeValidProvider({ address: "not-an-address" })])), + ).toThrow("Invalid provider dataset response format"); + }); + + it("throws on non-numeric string for bigint fields", () => { + expect(() => + validateProviderDataSetResponse(makeValidResponse([makeValidProvider({ totalFaultedPeriods: "abc" })])), + ).toThrow("Invalid provider dataset response format"); + }); + + it("throws on negative number string for bigint fields", () => { + expect(() => + validateProviderDataSetResponse(makeValidResponse([makeValidProvider({ totalFaultedPeriods: "-1" })])), + ).toThrow("Invalid provider dataset response format"); + }); + + it("throws on missing proofSet fields", () => { + expect(() => + 
validateProviderDataSetResponse( + makeValidResponse([ + makeValidProvider({ + proofSets: [{ totalFaultedPeriods: "1" }], + }), + ]), + ), + ).toThrow("Invalid provider dataset response format"); + }); + + it("validates multiple providers in a single response", () => { + const provider1 = makeValidProvider({ address: VALID_ADDRESS, totalFaultedPeriods: "5" }); + const provider2 = makeValidProvider({ + address: "0xAb5801a7D398351b8bE11C439e05C5B3259aeC9B", + totalFaultedPeriods: "15", + }); + + const result = validateProviderDataSetResponse(makeValidResponse([provider1, provider2])); + + expect(result.providers).toHaveLength(2); + expect(result.providers[0].totalFaultedPeriods).toBe(5n); + expect(result.providers[1].totalFaultedPeriods).toBe(15n); + }); + + it("handles zero values correctly", () => { + const result = validateProviderDataSetResponse( + makeValidResponse([ + makeValidProvider({ + totalFaultedPeriods: "0", + totalProvingPeriods: "0", + proofSets: [ + { + nextDeadline: "0", + maxProvingPeriod: "0", + }, + ], + }), + ]), + ); + + expect(result.providers[0].totalFaultedPeriods).toBe(0n); + expect(result.providers[0].totalProvingPeriods).toBe(0n); + expect(result.providers[0].proofSets[0].maxProvingPeriod).toBe(0n); + }); +}); + +describe("validateSubgraphMetaResponse", () => { + it("validates a well-formed subgraph meta response", () => { + const input = { + _meta: { + block: { + number: 12345, + }, + }, + }; + + const result = validateSubgraphMetaResponse(input); + + expect(result._meta.block.number).toBe(12345); + }); + + it("accepts large block numbers", () => { + const input = { + _meta: { + block: { + number: 999999999, + }, + }, + }; + + const result = validateSubgraphMetaResponse(input); + + expect(result._meta.block.number).toBe(999999999); + }); + + it("accepts numeric strings block number", () => { + const result = validateSubgraphMetaResponse({ + _meta: { + block: { + number: "12345", + }, + }, + }); + + 
expect(result._meta.block.number).toBe(12345); + }); + + it("throws on missing _meta field", () => { + expect(() => validateSubgraphMetaResponse({})).toThrow("Invalid subgraph meta response format"); + }); + + it("throws on missing block field", () => { + expect(() => + validateSubgraphMetaResponse({ + _meta: {}, + }), + ).toThrow("Invalid subgraph meta response format"); + }); + + it("throws on missing number field", () => { + expect(() => + validateSubgraphMetaResponse({ + _meta: { + block: {}, + }, + }), + ).toThrow("Invalid subgraph meta response format"); + }); + + it("throws on null input", () => { + expect(() => validateSubgraphMetaResponse(null)).toThrow("Invalid subgraph meta response format"); + }); + + it("throws on undefined input", () => { + expect(() => validateSubgraphMetaResponse(undefined)).toThrow("Invalid subgraph meta response format"); + }); + + it("throws on negative block number", () => { + expect(() => + validateSubgraphMetaResponse({ + _meta: { + block: { + number: -1, + }, + }, + }), + ).toThrow("Invalid subgraph meta response format"); + }); + + it("throws on floating point block number", () => { + expect(() => + validateSubgraphMetaResponse({ + _meta: { + block: { + number: 123.45, + }, + }, + }), + ).toThrow("Invalid subgraph meta response format"); + }); +}); diff --git a/apps/backend/src/subgraph/types.ts b/apps/backend/src/subgraph/types.ts new file mode 100644 index 00000000..3a89f360 --- /dev/null +++ b/apps/backend/src/subgraph/types.ts @@ -0,0 +1,252 @@ +import Joi from "joi"; +import { CID } from "multiformats/cid"; +import { Hex, isAddress } from "viem"; + +// ----------------------------------------- +// Types +// ----------------------------------------- + +/** The response from the subgraph GraphQL query */ +export type GraphQLResponse = { + /** The data from the query */ + data?: unknown; + /** The errors from the query */ + errors?: { message: string }[]; +}; + +/** + * Options for fetching providers with data sets + */ 
+export type ProvidersWithDataSetsOptions = { + addresses: string[]; + blockNumber: number; +}; + +/** + * Validated response from the PDP subgraph meta query. + */ +export type SubgraphMeta = { + _meta: { + block: { + number: number; + }; + }; +}; + +/** + * A single proof set within a provider, representing deadline-related proving data. + * All numeric fields are bigints converted from the subgraph string representation. + */ +export type DataSet = { + nextDeadline: bigint; + maxProvingPeriod: bigint; +}; + +/** + * Validated and transformed response from the PDP subgraph providers query. + * Numeric fields are converted from subgraph string representation to bigint. + */ +export type ProviderDataSetResponse = { + providers: { + address: Hex; + totalFaultedPeriods: bigint; + totalProvingPeriods: bigint; + proofSets: DataSet[]; + }[]; +}; + +/** A piece eligible for anonymous retrieval. */ +export type AnonCandidatePiece = { + /** Decoded piece CID string (e.g. "bafk..."). */ + pieceCid: string; + /** On-chain piece ID (rootId) as a decimal string. */ + pieceId: string; + /** On-chain dataset ID (setId) as a decimal string. */ + dataSetId: string; + /** Raw piece size in bytes, as a decimal string. */ + rawSize: string; + /** True iff the parent dataset declared withIPFSIndexing metadata. */ + withIPFSIndexing: boolean; + /** IPFS root CID declared by the client when uploading, or null. */ + ipfsRootCid: string | null; + /** Subgraph-indexed block number at query time. */ + indexedAtBlock: number; + /** pdpPaymentEndEpoch from the parent dataset, or null. */ + pdpPaymentEndEpoch: bigint | null; +}; + +/** + * Validated raw shape of the anonymous piece sampling subgraph response. + * At most one root is returned (`first: 1`). 
+ */ +export type RawSampleAnonPieceResponse = { + _meta: { block: { number: number } }; + roots: Array<{ + rootId: string; + cid: string; + rawSize: string; + ipfsRootCID: string | null; + proofSet: { + setId: string; + withIPFSIndexing: boolean; + fwssPayer: string | null; + pdpPaymentEndEpoch: string | null; + }; + }>; +}; + +// ----------------------------------------- +// Helpers +// ----------------------------------------- + +/** + * Decodes a hex-encoded CID (0x...) into its string representation. + */ +export function decodePieceCid(hexData: string): string { + const bytes = Buffer.from(hexData.slice(2), "hex"); + return CID.decode(new Uint8Array(bytes)).toString(); +} + +// ----------------------------------------- +// Joi Custom Schema Converters +// ----------------------------------------- + +/** Joi custom validator that converts a numeric string to bigint. */ +const toBigInt = (value: unknown, helpers: Joi.CustomHelpers) => { + try { + return BigInt(value as string); + } catch { + return helpers.error("any.invalid", { + message: "Invalid bigint value", + }); + } +}; + +/** Joi custom validator to validate an Ethereum address and normalize to lowercase. 
*/ +const toEthereumAddress = (value: unknown, helpers: Joi.CustomHelpers) => { + if (!isAddress(value as string)) { + return helpers.error("any.invalid", { message: "Invalid Ethereum address" }); + } + + // Normalize to lowercase for consistent key lookups + return (value as string).toLowerCase() as Hex; +}; + +// ----------------------------------------- +// Joi Schemas +// ----------------------------------------- + +const metaSchema = Joi.object({ + _meta: Joi.object({ + block: Joi.object({ + number: Joi.number().integer().positive().required(), + }) + .unknown(true) + .required(), + }) + .unknown(true) + .required(), +}) + .unknown(true) + .required(); + +const dataSetSchema = Joi.object({ + nextDeadline: Joi.string().pattern(/^\d+$/).required().custom(toBigInt), + maxProvingPeriod: Joi.string().pattern(/^\d+$/).required().custom(toBigInt), +}).unknown(true); + +const providerDataSetResponseSchema = Joi.object({ + providers: Joi.array() + .items( + Joi.object({ + address: Joi.string().required().custom(toEthereumAddress), + totalFaultedPeriods: Joi.string().pattern(/^\d+$/).required().custom(toBigInt), + totalProvingPeriods: Joi.string().pattern(/^\d+$/).required().custom(toBigInt), + proofSets: Joi.array().items(dataSetSchema).required(), + }).unknown(true), + ) + .required(), +}) + .unknown(true) + .required(); + +const sampleRootProofSetSchema = Joi.object({ + setId: Joi.string().pattern(/^\d+$/).required(), + withIPFSIndexing: Joi.boolean().required(), + fwssPayer: Joi.string() + .pattern(/^0x[0-9a-fA-F]{40}$/) + .allow(null) + .optional(), + pdpPaymentEndEpoch: Joi.string().pattern(/^\d+$/).allow(null).optional(), +}).unknown(true); + +const sampleRootSchema = Joi.object({ + rootId: Joi.string().pattern(/^\d+$/).required(), + cid: Joi.string() + .pattern(/^0x[0-9a-fA-F]+$/) + .required(), + rawSize: Joi.string().pattern(/^\d+$/).required(), + ipfsRootCID: Joi.string().allow(null).optional(), + proofSet: sampleRootProofSetSchema.required(), 
+}).unknown(true); + +const sampleAnonPieceResponseSchema = Joi.object({ + _meta: Joi.object({ + block: Joi.object({ + number: Joi.number().integer().positive().required(), + }) + .unknown(true) + .required(), + }) + .unknown(true) + .required(), + roots: Joi.array().items(sampleRootSchema).max(1).required(), +}) + .unknown(true) + .required(); + +// ----------------------------------------- +// Validator Functions +// ----------------------------------------- + +/** + * Validates a raw subgraph meta response into SubgraphMeta. + * + * @param value - The raw parsed JSON from the subgraph + * @throws Error if validation fails + */ +export function validateSubgraphMetaResponse(value: unknown): SubgraphMeta { + const { error, value: validated } = metaSchema.validate(value, { abortEarly: false }); + if (error) { + throw new Error(`Invalid subgraph meta response format: ${error.message}`); + } + return validated as SubgraphMeta; +} + +/** + * Validates and transforms a raw subgraph response into ProviderDataSetResponse. + * Converts string fields to bigint. + * + * @param value - The raw parsed JSON from the subgraph + * @throws Error if validation fails + */ +export function validateProviderDataSetResponse(value: unknown): ProviderDataSetResponse { + const { error, value: validated } = providerDataSetResponseSchema.validate(value, { abortEarly: false }); + if (error) { + throw new Error(`Invalid provider dataset response format: ${error.message}`); + } + return validated as ProviderDataSetResponse; +} + +/** + * Validates the raw sampleAnonPiece response from the subgraph. 
+ * + * @throws Error if validation fails + */ +export function validateSampleAnonPieceResponse(value: unknown): RawSampleAnonPieceResponse { + const { error, value: validated } = sampleAnonPieceResponseSchema.validate(value, { abortEarly: false }); + if (error) { + throw new Error(`Invalid sampleAnonPiece response format: ${error.message}`); + } + return validated as RawSampleAnonPieceResponse; +} diff --git a/docs/checks/README.md b/docs/checks/README.md index 74b1a872..136349ee 100644 --- a/docs/checks/README.md +++ b/docs/checks/README.md @@ -4,6 +4,7 @@ The files are: - [production-configuration-and-approval-methodology.md](./production-configuration-and-approval-methodology.md): Defines the production configuration and approval methodology. - [data-storage.md](./data-storage.md): Defines the "data storage check" and how it is calculated. - [retrievals.md](./retrievals.md): Defines the "retrieval check" and how it is calculated. +- [anon-retrievals.md](./anon-retrievals.md): Defines the "anonymous retrieval check" (sampled public pieces, not dealbot-uploaded) and how it is calculated. - [data-retention.md](./data-retention.md): Defines the "data retention check" and how it is calculated. - [events-and-metrics.md](./events-and-metrics.md): Defines the events and metrics that are used to assess SP performance. @@ -14,7 +15,7 @@ DealBot creates synthetic traffic for SPs in the onchain SP registry and monitor ## Terminology ### Check -A "check" refers to a task type that dealbot performs on a SP. We currently have [Data Storage](./data-storage.md) and [Retrieval](./retrievals.md) checks. +A "check" refers to a task type that dealbot performs on an SP. We currently have [Data Storage](./data-storage.md), [Retrieval](./retrievals.md), [Anonymous Retrieval](./anon-retrievals.md), and [Data Retention](./data-retention.md) checks. ### Deal This is synonym for "Data Storage Check". This is covered in the [data-storage.md](./data-storage.md). 
diff --git a/docs/checks/anon-retrievals.md b/docs/checks/anon-retrievals.md new file mode 100644 index 00000000..c3b69610 --- /dev/null +++ b/docs/checks/anon-retrievals.md @@ -0,0 +1,145 @@ +# Anonymous Retrieval Check + +This document is the **source of truth** for how dealbot's Anonymous Retrieval check works. + +Source code links throughout this document point to the current implementation. + +For event and metric definitions to be used by the dashboard, see [Dealbot Events & Metrics](./events-and-metrics.md). + +## Overview + +The Anonymous Retrieval check (sometimes referred to internally as [retrieval++](https://github.com/FilOzone/dealbot/pull/427)) tests publicly discoverable pieces on a storage provider (pieces that were *not* uploaded by dealbot). The intent is to measure SP retrievability against real-world tenant data, not just dealbot's own corpus. + +This is distinct from the [Retrieval check](./retrievals.md), which exercises pieces dealbot itself uploaded as part of a [Data Storage check](./data-storage.md). The Anonymous Retrieval check answers a different question: does the SP serve arbitrary pieces from its broader public corpus, with the same correctness and performance properties as dealbot's controlled pieces? + +### Definition of Successful Retrieval + +A successful anonymous retrieval requires: + +1. **Piece fetch** — `GET {spBaseUrl}/piece/{pieceCid}` returns HTTP 2xx and the response bytes hash to the declared CommP (piece CID). + +If the piece advertises IPFS indexing (`withIPFSIndexing = true` and a non-null `ipfsRootCid`), three additional dimensions are validated *independently*. Importantly, they do not gate the overall `piece_fetch_status`, and each is recorded as its own outcome column / metric: + +2. **CAR parseable:** the fetched bytes parse as a CAR file. +3. **IPNI:** the SP is advertised as a provider for the root CID and a sample of child CIDs via filecoinpin.contact. +4. 
**Block fetch:** a sample of CIDs from the parsed CAR is re-fetched via `{spBaseUrl}/ipfs/{cid}?format=raw` and each response is hash-verified against its declared CID. + +A piece without IPFS indexing is exercised only at step (1). + +Operational timeouts exist to prevent jobs from running indefinitely. If the job exceeds `ANON_RETRIEVAL_JOB_TIMEOUT_SECONDS`, it is aborted; a row is still emitted so that partial metrics (TTFB, bytes, response code) are not lost. + +## Piece Selection + +Unlike the [Retrieval check](./retrievals.md#piece-selection), dealbot does not retrieve from its own deals. Pieces are sampled from the [on-chain subgraph](../../apps/subgraph) of all FWSS-served pieces for the SP under test. + +Selection strategy (per scheduled job, per SP): + +1. **Pick a size bucket** by weighted random: + - `small` (1–20 MiB) — 20% + - `medium` (20–100 MiB) — 50% + - `large` (100–500 MiB) — 30% +2. **Pick a pool**: + - `indexed` (IPFS-indexed pieces) — 80% + - `any` (all FWSS pieces) — 20% +3. **Generate a uniform-random `sampleKey`** and query the subgraph for the smallest `Root.sampleKey ≥ $sampleKey` matching the SP, payer, size range, and pool filters. +4. **Drop the candidate** if `pdpPaymentEndEpoch` has passed. +5. **Fall back** through: (same bucket, opposite pool) → (any bucket, indexed) → (any bucket, any). + +The 80/20 split for `indexed` vs `any` exists so that SPs cannot optimize only their CAR corpus and still appear healthy on this check. + +> [!NOTE] +> The bucket sizes were chosen such that the whole file will still fit into memory. In the future we may implement a streaming verification and parsing.
+ +Source: [`anon-piece-selector.service.ts`](../../apps/backend/src/retrieval-anon/anon-piece-selector.service.ts) + +## What Happens Each Cycle + +```mermaid +flowchart TD + Select["Sample anonymous piece for SP from subgraph"] --> Fetch["GET /piece/{pieceCid}"] + Fetch --> CommP["Hash bytes → verify CommP"] + CommP --> HasIpfs{"piece.withIPFSIndexing
and ipfsRootCid?"] + HasIpfs -- "no" --> Record["Persist ClickHouse row + emit Prometheus metrics"] + HasIpfs -- "yes" --> ParseCar["Parse bytes as CAR"] + ParseCar --> SampleBlocks["Pick N random CIDs
(ANON_RETRIEVAL_BLOCK_SAMPLE_COUNT)"] + SampleBlocks --> Ipni["IPNI: verify SP advertises root + sampled CIDs"] + SampleBlocks --> BlockFetch["GET /ipfs/{cid}?format=raw for each sampled CID"] + BlockFetch --> HashCheck["Hash-verify each response against its CID"] + Ipni --> Record + HashCheck --> Record +``` + +### Piece Fetch + +- **URL:** `{spBaseUrl}/piece/{pieceCid}` (HTTP/2) +- **Buffered in memory** — piece sizes are capped at 500 MiB by selection. +- **Validates CommP** — the CommP of the response bytes must match `pieceCid`. + +Source: [`piece-retrieval.service.ts`](../../apps/backend/src/retrieval-anon/piece-retrieval.service.ts) + +### CAR Validation (only when piece advertises IPFS indexing) + +When the selected piece has `withIPFSIndexing = true` and a non-null `ipfsRootCid`, the fetched bytes are parsed as a CAR and a random sample of `ANON_RETRIEVAL_BLOCK_SAMPLE_COUNT` CIDs is exercised: + +- **IPNI check:** `IpniVerificationService.verify(rootCid, sampledCids, sp)` polls filecoinpin.contact until each CID resolves to the SP under test, the timeout fires, or `IPNI_VERIFICATION_TIMEOUT_MS` is reached. +- **Block fetch check:** for each sampled CID, fetch `{spBaseUrl}/ipfs/{cid}?format=raw` and hash-verify the response against the CID. Non-2xx, hash mismatch, unsupported codec, or transport errors all count as a single failed block. + +Source: [`car-validation.service.ts`](../../apps/backend/src/retrieval-anon/car-validation.service.ts) + +## What Gets Asserted + +| # | Assertion | How It's Checked | Retries | Relevant Metric | Implemented? 
| +|---|-----------|------------------|:---:|------------------|:---:| +| 1 | SP serves the piece | `GET /piece/{pieceCid}` returns HTTP 2xx | 0 | [`anonPieceRetrievalLastByteMs`](./events-and-metrics.md#anonPieceRetrievalLastByteMs) | Yes | +| 2 | Bytes match the declared CommP | Hash of response bytes equals `pieceCid` | 0 | [`anonPieceRetrievalStatus`](./events-and-metrics.md#anonPieceRetrievalStatus) | Yes | +| 3 | Bytes parse as a CAR (IPFS-indexed pieces only) | `@ipld/car` parses the response | 0 | [`anonCarParseStatus`](./events-and-metrics.md#anonCarParseStatus) | Yes | +| 4 | SP is advertised on IPNI for root + sampled CIDs | filecoinpin.contact returns provider records | polling until timeout | [`anonIpniStatus`](./events-and-metrics.md#anonIpniStatus) | Yes | +| 5 | Sampled blocks fetch + hash-verify | `/ipfs/{cid}?format=raw` for each sample | 0 | [`anonBlockFetchStatus`](./events-and-metrics.md#anonBlockFetchStatus) | Yes | + +## Result Recording + +Each anonymous retrieval attempt writes one row to the `anon_retrieval_checks` ClickHouse table. The row is emitted **even on abort or unexpected error** so that the partial evidence (TTFB, bytes, response code) is preserved. + +The DDL and column-level comments in [`clickhouse.schema.ts`](../../apps/backend/src/clickhouse/clickhouse.schema.ts) are authoritative. The summary below is for orientation. 
+ +| Column | Meaning | +|--------|---------| +| `timestamp` | When the check started (ms UTC) | +| `probe_location` | Dealbot probe location (`DEALBOT_PROBE_LOCATION`) | +| `sp_address`, `sp_id`, `sp_name` | SP identity | +| `retrieval_id` | Per-event UUID; correlates row to logs and Prometheus | +| `piece_cid`, `data_set_id`, `piece_id`, `raw_size` | Sampled piece identity | +| `with_ipfs_indexing`, `ipfs_root_cid` | Whether the piece advertises IPNI metadata | +| `service_type` | Always `direct_sp` today | +| `retrieval_endpoint` | URL probed for piece fetch | +| `piece_fetch_status` | `success` or `failed` — outcome of `/piece/{cid}` (HTTP 2xx **and** CommP match). CAR/IPNI/block-fetch outcomes live in their own columns and do **not** flip this status. | +| `http_response_code` | Raw HTTP status; null on transport failure | +| `first_byte_ms`, `last_byte_ms`, `bytes_retrieved`, `throughput_bps` | Piece-fetch performance | +| `commp_valid` | Null when retrieval failed before CommP could be hashed | +| `car_parseable`, `car_block_count` | Null when CAR validation was skipped (no IPFS indexing or piece fetch failed) | +| `block_fetch_endpoint`, `block_fetch_valid`, `block_fetch_sampled_count`, `block_fetch_failed_count` | Block-fetch outcomes; null when skipped | +| `ipni_status` | `valid` \| `invalid` \| `skipped` \| `error` | +| `ipni_verify_ms`, `ipni_verified_cids_count`, `ipni_unverified_cids_count` | IPNI check details | +| `error_message` | Failure reason; null on success | + +Source: [`anon-retrieval.service.ts`](../../apps/backend/src/retrieval-anon/anon-retrieval.service.ts) + +## Metrics Recorded + +Anonymous-retrieval Prometheus metric definitions live in [Dealbot Events & Metrics](./events-and-metrics.md). All anon-retrieval metrics carry `checkType=anon_retrieval`. 
+ +## Configuration + +Key environment variables that control anonymous retrieval testing: + +| Variable | Description | +|----------|-------------| +| `RETRIEVALS_ANON_PER_SP_PER_HOUR` | Anonymous retrieval rate per SP. Falls back to `RETRIEVALS_PER_SP_PER_HOUR` when unset. | +| `ANON_RETRIEVAL_JOB_TIMEOUT_SECONDS` | Max end-to-end anon retrieval job runtime before forced abort (default 360s). | +| `ANON_RETRIEVAL_BLOCK_SAMPLE_COUNT` | Number of CIDs sampled from the parsed CAR for IPNI + block-fetch verification (default 5, max 50). | +| `IPNI_VERIFICATION_TIMEOUT_MS` | Max time to wait for IPNI provider verification (shared with the Retrieval check). | +| `IPNI_VERIFICATION_POLLING_MS` | Poll interval between IPNI verification attempts (shared). | +| `CONNECT_TIMEOUT_MS` | Connection/header timeout for HTTP requests. | +| `HTTP2_REQUEST_TIMEOUT_MS` | Total timeout for HTTP/2 retrieval requests. | + +See also: [`docs/environment-variables.md`](../environment-variables.md) for the full configuration reference. diff --git a/docs/checks/events-and-metrics.md b/docs/checks/events-and-metrics.md index 6c461f7f..37761e89 100644 --- a/docs/checks/events-and-metrics.md +++ b/docs/checks/events-and-metrics.md @@ -6,6 +6,16 @@ This document is the intended **source of truth** for the events emitted by deal -## Data Storage Event Model +## Anonymous Retrieval Event Model +The [Anonymous Retrieval check](./anon-retrievals.md) is a single-shot flow per piece: select → fetch piece → (optional) parse CAR + IPNI + block fetch → write one ClickHouse row. + +It is not modeled as a sequence of named lifecycle events. Instead it emits: + +- **Outcome metrics** when each step completes — see the [time](#time-related-metrics) and [status](#status-count-related-metrics) metric tables for `anonPieceRetrievalFirstByteMs`, `anonRetrievalCheckMs`, `anonPieceRetrievalStatus`, `anonCarParseStatus`, `anonIpniStatus`, `anonBlockFetchStatus`, and friends.
+- **One row per attempt** in the `anon_retrieval_checks` [ClickHouse table](#clickhouse-tables), emitted even on abort or unexpected error. +- **Structured log lines** (`anon_retrieval_started`, `anon_retrieval_completed`, `anon_retrieval_no_piece`, `anon_retrieval_car_validation_failed`, `anon_retrieval_clickhouse_insert_failed`) carrying a `retrievalId` so each row can be joined back to log evidence. + +## Data Storage Event Model + Below are the sequence of events for a [Data Storage check](./data-storage.md). The Data Storage flow is used because it encapsulates a [Retrieval check](./retrievals.md) as well. ### Data Storage Event Timeline @@ -87,6 +97,10 @@ sequenceDiagram | `dataStorageCheckMs` | Data Storage | [`uploadToSpStart`](#uploadToSpStart) | [`ipfsRetrievalIntegrityChecked`](#ipfsRetrievalIntegrityChecked) | Duration of a Data Storage check | | | `retrievalCheckMs` | Retrieval | Retrieval check start | [`ipfsRetrievalIntegrityChecked`](#ipfsRetrievalIntegrityChecked) | Duration of a Retrieval check | | | `dataSetCreationMs` | Data-Set Creation | Data-set creation uploadToSpStart | Data-set creation pieceConfirmed | Duration of one data-set creation with confirmed piece (all using `createDataSetWithPiece`) | [`deal.service.ts`](../../apps/backend/src/deal/deal.service.ts) | +| `anonPieceRetrievalFirstByteMs` | Anonymous Retrieval | Piece fetch start | First byte received from `/piece/{pieceCid}` | Time to first byte for anonymous piece retrievals | [`anon-retrieval.service.ts`](../../apps/backend/src/retrieval-anon/anon-retrieval.service.ts) | +| `anonPieceRetrievalLastByteMs` | Anonymous Retrieval | Piece fetch start | Last byte received from `/piece/{pieceCid}` | Total time to retrieve an anonymous piece | [`anon-retrieval.service.ts`](../../apps/backend/src/retrieval-anon/anon-retrieval.service.ts) | +| `anonPieceRetrievalThroughputBps` | Anonymous Retrieval | n/a | n/a | `(bytesRetrieved / anonPieceRetrievalLastByteMs) * 1000` | 
[`anon-retrieval.service.ts`](../../apps/backend/src/retrieval-anon/anon-retrieval.service.ts) | +| `anonRetrievalCheckMs` | Anonymous Retrieval | Anon retrieval check start | After CAR/IPNI/block-fetch validation completes (or on abort) | End-to-end anonymous retrieval check duration | [`anon-retrieval.service.ts`](../../apps/backend/src/retrieval-anon/anon-retrieval.service.ts) | ### Status Count Related Metrics @@ -106,6 +120,11 @@ sequenceDiagram | `dataSetCreationStatus` | Data-Set Creation | Not tied to an [event above](#event-list) but rather to data-set creation start (`pending`) and completion (`success`/`failure.*`) | `pending`, `success`, `failure.timedout`, `failure.other` | [`deal.service.ts`](../../apps/backend/src/deal/deal.service.ts) | | `dataSetChallengeStatus` | Data Retention | Emitted on each [Data Retention Check](./data-retention.md) poll when a provider's confirmed proving-period totals advance (strictly positive deltas). Unit: **challenges** (period delta × `CHALLENGES_PER_PROVING_PERIOD = 5`). | `success` (challenges in successfully-proven periods), `failure` (challenges in faulted periods) | [`data-retention.service.ts`](../../apps/backend/src/data-retention/data-retention.service.ts) | | `pdp_provider_estimated_overdue_periods` | Data Retention | Emitted on every [Data Retention Check](./data-retention.md) poll for every successfully processed provider. | Gauge value in proving periods (non-negative integer) | [`data-retention.service.ts`](../../apps/backend/src/data-retention/data-retention.service.ts) | +| `anonPieceRetrievalStatus` | Anonymous Retrieval | After piece fetch completes (or on abort) | `success` (HTTP 2xx **and** CommP matches), `failure.http`, `failure.commp` (HTTP 2xx but bytes hashed to a different CID), `failure.timedout`, `failure.no_piece`. 
| [`anon-retrieval.service.ts`](../../apps/backend/src/retrieval-anon/anon-retrieval.service.ts) | +| `anonPieceHttpResponseCode` | Anonymous Retrieval | After piece fetch completes | `200`, `500`, `2xxSuccess`, `4xxClientError`, `5xxServerError`, `otherHttpStatusCodes`, `failure` (same classifier as [`ipfsRetrievalHttpResponseCode`](#ipfsRetrievalHttpResponseCode)) | [`anon-retrieval.service.ts`](../../apps/backend/src/retrieval-anon/anon-retrieval.service.ts) | +| `anonCarParseStatus` | Anonymous Retrieval | After CAR validation runs (skipped when piece fetch failed or piece is not IPFS-indexed) | `parseable`, `not_parseable` | [`anon-retrieval.service.ts`](../../apps/backend/src/retrieval-anon/anon-retrieval.service.ts) | +| `anonIpniStatus` | Anonymous Retrieval | After CAR validation runs, **or** when piece fetch failed (records `skipped`) | `valid`, `invalid`, `skipped`, `error` | [`anon-retrieval.service.ts`](../../apps/backend/src/retrieval-anon/anon-retrieval.service.ts) | +| `anonBlockFetchStatus` | Anonymous Retrieval | After block-fetch sampling runs, **or** when piece fetch failed (records `skipped`) | `valid`, `invalid`, `skipped`, `error` | [`anon-retrieval.service.ts`](../../apps/backend/src/retrieval-anon/anon-retrieval.service.ts) | ## ClickHouse Tables @@ -115,6 +134,7 @@ When `CLICKHOUSE_URL` is configured, dealbot writes one row per check result to - **`data_storage_checks`** — one row written each time a deal is saved (on every status transition). Populated by [`deal.service.ts`](../../apps/backend/src/deal/deal.service.ts). - **`retrieval_checks`** — one row per retrieval attempt. Populated by [`retrieval.service.ts`](../../apps/backend/src/retrieval/retrieval.service.ts). +- **`anon_retrieval_checks`** — one row per [Anonymous Retrieval check](./anon-retrievals.md) attempt; emitted even on abort or unexpected error. Populated by [`anon-retrieval.service.ts`](../../apps/backend/src/retrieval-anon/anon-retrieval.service.ts). 
See [Anonymous Retrieval § Result Recording](./anon-retrievals.md#result-recording) for column-level meanings. - **`data_retention_challenges`** — one row per provider per poll cycle. Populated by [`data-retention.service.ts`](../../apps/backend/src/data-retention/data-retention.service.ts). All tables share the primary key `(probe_location, sp_address, timestamp)`: diff --git a/docs/checks/production-configuration-and-approval-methodology.md b/docs/checks/production-configuration-and-approval-methodology.md index 5566904d..2e89a45d 100644 --- a/docs/checks/production-configuration-and-approval-methodology.md +++ b/docs/checks/production-configuration-and-approval-methodology.md @@ -41,7 +41,7 @@ Relevant parameters include: | Parameter | Value | Notes | |-----------|-------|-------| | [`PDP_SUBGRAPH_ENDPOINT`](../environment-variables.md#pdp_subgraph_endpoint) | TODO: fill this in | Uses the subgraph from [pdp-explorer](https://github.com/FilOzone/pdp-explorer). | -| [`MIN_NUM_DATASETS_FOR_CHECKS`](../environment-variables.md#dataset-configuration) | 15 | Ensure there are enough datasets with pieces being added so that statistical significance for [Data Retention Fault Rate](#data-retention-fault-rate) can be achieved quicker. Note that on mainnet each dataset incurs 5 challenges[^1] per daily proof[^2]. With this many datasets, an SP can be approved for data retention after a faultless ~7 days even if the SP doesn't have other datasets. | +| [`MIN_NUM_DATASETS_FOR_CHECKS`](../environment-variables.md#dataset-configuration) | 15 | Ensure there are enough datasets with pieces being added so that statistical significance for [Data Retention Fault Rate](#data-retention-fault-rate) can be achieved quicker. Note that on mainnet each dataset incurs 5 challenges[^1] per daily proof[^2]. With this many datasets, an SP can be approved for data retention after a faultless ~7 days even if the SP doesn't have other datasets. 
| See [How are data retention statistics/thresholds calculated?](#how-are-data-retention-statisticsthresholds-calculated) for more details. diff --git a/docs/environment-variables.md b/docs/environment-variables.md index 359d86da..72fadca0 100644 --- a/docs/environment-variables.md +++ b/docs/environment-variables.md @@ -8,10 +8,10 @@ This document provides a comprehensive guide to all environment variables used b | ----------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------ | | [Application](#application-configuration) | `NODE_ENV`, `DEALBOT_PORT`, `DEALBOT_HOST`, `DEALBOT_RUN_MODE`, `DEALBOT_METRICS_PORT`, `DEALBOT_METRICS_HOST`, `DEALBOT_ALLOWED_ORIGINS`, `ENABLE_DEV_MODE` | | [Database](#database-configuration) | `DATABASE_HOST`, `DATABASE_PORT`, `DATABASE_POOL_MAX`, `DATABASE_USER`, `DATABASE_PASSWORD`, `DATABASE_NAME` | -| [Blockchain](#blockchain-configuration) | `NETWORK`, `RPC_URL`, `WALLET_ADDRESS`, `WALLET_PRIVATE_KEY`, `SESSION_KEY_PRIVATE_KEY`, `CHECK_DATASET_CREATION_FEES`, `USE_ONLY_APPROVED_PROVIDERS`, `PDP_SUBGRAPH_ENDPOINT` | +| [Blockchain](#blockchain-configuration) | `NETWORK`, `RPC_URL`, `WALLET_ADDRESS`, `WALLET_PRIVATE_KEY`, `SESSION_KEY_PRIVATE_KEY`, `CHECK_DATASET_CREATION_FEES`, `USE_ONLY_APPROVED_PROVIDERS`, `PDP_SUBGRAPH_ENDPOINT`, `SUBGRAPH_ENDPOINT` | | [Dataset Versioning](#dataset-versioning) | `DEALBOT_DATASET_VERSION` | | [Scheduling](#scheduling-configuration) | `PROVIDERS_REFRESH_INTERVAL_SECONDS`, `DATA_RETENTION_POLL_INTERVAL_SECONDS`, `DEALBOT_MAINTENANCE_WINDOWS_UTC`, `DEALBOT_MAINTENANCE_WINDOW_MINUTES` | -| [Jobs (pg-boss)](#jobs-pg-boss) | `DEALBOT_PGBOSS_SCHEDULER_ENABLED`, `DEALBOT_PGBOSS_POOL_MAX`, `DEALS_PER_SP_PER_HOUR`, `DATASET_CREATIONS_PER_SP_PER_HOUR`, `RETRIEVALS_PER_SP_PER_HOUR`, `JOB_SCHEDULER_POLL_SECONDS`, `JOB_WORKER_POLL_SECONDS`, `PG_BOSS_LOCAL_CONCURRENCY`, 
`JOB_CATCHUP_MAX_ENQUEUE`, `JOB_SCHEDULE_PHASE_SECONDS`, `JOB_ENQUEUE_JITTER_SECONDS`, `DEAL_JOB_TIMEOUT_SECONDS`, `RETRIEVAL_JOB_TIMEOUT_SECONDS`, `IPFS_BLOCK_FETCH_CONCURRENCY` | +| [Jobs (pg-boss)](#jobs-pg-boss) | `DEALBOT_PGBOSS_SCHEDULER_ENABLED`, `DEALBOT_PGBOSS_POOL_MAX`, `DEALS_PER_SP_PER_HOUR`, `DATASET_CREATIONS_PER_SP_PER_HOUR`, `RETRIEVALS_PER_SP_PER_HOUR`, `RETRIEVALS_ANON_PER_SP_PER_HOUR`, `JOB_SCHEDULER_POLL_SECONDS`, `JOB_WORKER_POLL_SECONDS`, `PG_BOSS_LOCAL_CONCURRENCY`, `JOB_CATCHUP_MAX_ENQUEUE`, `JOB_SCHEDULE_PHASE_SECONDS`, `JOB_ENQUEUE_JITTER_SECONDS`, `DEAL_JOB_TIMEOUT_SECONDS`, `RETRIEVAL_JOB_TIMEOUT_SECONDS`, `ANON_RETRIEVAL_JOB_TIMEOUT_SECONDS`, `ANON_RETRIEVAL_BLOCK_SAMPLE_COUNT`, `IPFS_BLOCK_FETCH_CONCURRENCY` | | [Dataset](#dataset-configuration) | `DEALBOT_LOCAL_DATASETS_PATH`, `RANDOM_PIECE_SIZES` | | [ClickHouse](#clickhouse-configuration) | `CLICKHOUSE_URL`, `CLICKHOUSE_BATCH_SIZE`, `CLICKHOUSE_FLUSH_INTERVAL_MS`, `DEALBOT_PROBE_LOCATION` | | [Timeouts](#timeout-configuration) | `CONNECT_TIMEOUT_MS`, `HTTP_REQUEST_TIMEOUT_MS`, `HTTP2_REQUEST_TIMEOUT_MS`, `IPNI_VERIFICATION_TIMEOUT_MS`, `IPNI_VERIFICATION_POLLING_MS` | @@ -433,9 +433,11 @@ Session keys are scoped (only storage operations, not deposits or withdrawals) a **Role**: The Graph API endpoint for querying PDP (Proof of Data Possession) subgraph data. This endpoint is used to retrieve data retention info for provider data. +This variable is kept distinct from [`SUBGRAPH_ENDPOINT`](#subgraph_endpoint) so the [dealbot-owned subgraph](../../src/subgraph) can be rolled out incrementally. Only the newer [anonymous-retrieval check](./checks/anon-retrievals.md) points at the new endpoint while the established [data-retention check](./checks/data-retention.md) stays on the upstream subgraph. + **When to update**: -- When switching between different Graph API endpoints +- When switching between different Graph API endpoints for the pdp-explorer subgraph. 
**Example**: @@ -445,6 +447,29 @@ PDP_SUBGRAPH_ENDPOINT=https://api.thegraph.com/subgraphs/filecoin/pdp --- +### `SUBGRAPH_ENDPOINT` + +- **Type**: `string` (URL) +- **Required**: No +- **Default**: Empty string (feature disabled) + +**Role**: The Graph API endpoint for the dealbot-owned subgraph. Currently drives only the [anonymous-retrieval](./checks/anon-retrievals.md) candidate-piece query. Once the dealbot-owned subgraph has soaked in production it is intended to replace [`PDP_SUBGRAPH_ENDPOINT`](#pdp_subgraph_endpoint). + +The dealbot-owned subgraph lives at [`apps/subgraph/`](../apps/subgraph) (package `@dealbot/subgraph`) and is deployed to [Goldsky](https://goldsky.com). + +**When to update**: + +- When swapping between the dealbot-owned subgraph slots on Goldsky (mainnet vs calibnet). +- When deploying a new subgraph version. + +**Example**: + +```bash +SUBGRAPH_ENDPOINT=https://api.goldsky.com/api/public//subgraphs/dealbot-subgraph//gn +``` + +--- + ## Dataset Versioning ### `DEALBOT_DATASET_VERSION` @@ -619,6 +644,19 @@ rate-based (per hour) and persisted in Postgres so restarts do not reset timing. --- +### `RETRIEVALS_ANON_PER_SP_PER_HOUR` + +- **Type**: `number` +- **Required**: No +- **Default**: Falls back to `RETRIEVALS_PER_SP_PER_HOUR`, which itself defaults to `2` +- **Limits**: `0.001` – `20` + +**Role**: Target [anonymous retrieval](./checks/anon-retrievals.md) check rate per storage provider. Anonymous retrievals fetch arbitrary FWSS pieces sampled from the on-chain subgraph (not pieces dealbot uploaded), so this rate controls coverage of the SP's broader public corpus independently of the dealbot-owned [retrieval check](./checks/retrievals.md) rate. + +**Notes**: Fractional values are supported. For example, `0.5` means one anon retrieval every 2 hours per storage provider. 
+ +--- + ### `DATASET_CREATIONS_PER_SP_PER_HOUR` - **Type**: `number` @@ -784,6 +822,50 @@ Use this to stagger multiple dealbot deployments that are not sharing a database **Note**: This is independent of HTTP-level timeouts. The job timeout enforces end-to-end execution time of a Retrieval Check job. +--- + +### `ANON_RETRIEVAL_JOB_TIMEOUT_SECONDS` + +- **Type**: `number` +- **Required**: No +- **Default**: `360` (6 minutes) +- **Minimum**: `60` +- **Enforced**: Yes (config validation) + +**Role**: Maximum runtime for anonymous retrieval jobs before forced abort. Anonymous retrievals fetch arbitrary pieces (up to ~500 MiB) that were not produced by the dealbot, so this is typically larger than `RETRIEVAL_JOB_TIMEOUT_SECONDS`. When the timeout trips, partial metrics (`ttfb_ms`, `bytes_retrieved`, `response_code`) are still persisted so the abort is not silently lost. + +**When to update**: + +- Increase if large pieces are consistently being cut off mid-download +- Decrease to detect and fail stuck retrievals faster + +**Note**: This is independent of HTTP-level timeouts (`CONNECT_TIMEOUT_MS`, `HTTP2_REQUEST_TIMEOUT_MS`). The job timeout covers the end-to-end execution of an Anon Retrieval Check (piece selection, download, CommP validation, CAR/IPNI validation). + +--- + +### `ANON_RETRIEVAL_BLOCK_SAMPLE_COUNT` + +- **Type**: `number` (integer) +- **Required**: No +- **Default**: `5` +- **Minimum**: `1` +- **Maximum**: `50` +- **Enforced**: Yes (config validation) + +**Role**: Number of CIDs randomly sampled from the parsed CAR for IPNI verification and block-fetch validation during an [anonymous retrieval check](./checks/anon-retrievals.md). Only applies to pieces with IPFS indexing enabled — pieces without an `ipfsRootCid` skip CAR validation entirely. + +For each sampled CID, dealbot: + +1. Confirms via filecoinpin.contact that the SP is advertised as a provider for the CID. +2. 
Re-fetches the block via `{spBaseUrl}/ipfs/{cid}?format=raw` and hash-verifies the response. + +**When to update**: + +- Increase for stronger statistical confidence that the SP serves the entire DAG correctly (more IPNI queries + per-block fetches per check) +- Decrease to reduce per-check load on the SP and on filecoinpin.contact + +**Note**: A higher sample count multiplies both IPNI traffic and block-fetch traffic per check. The IPNI step is all-or-nothing across the root CID and the sampled child CIDs — see [Anonymous Retrieval § CAR Validation](./checks/anon-retrievals.md#car-validation-only-when-piece-advertises-ipfs-indexing). + --- ### `IPFS_BLOCK_FETCH_CONCURRENCY` diff --git a/kustomize/overlays/local/backend-configmap-local.yaml b/kustomize/overlays/local/backend-configmap-local.yaml index 9226d24e..52918aa2 100644 --- a/kustomize/overlays/local/backend-configmap-local.yaml +++ b/kustomize/overlays/local/backend-configmap-local.yaml @@ -27,6 +27,7 @@ data: JOB_WORKER_POLL_SECONDS: "60" RANDOM_PIECE_SIZES: "10485760" PDP_SUBGRAPH_ENDPOINT: "https://api.goldsky.com/api/public/project_cmdfaaxeuz6us01u359yjdctw/subgraphs/pdp-explorer/calibration311a/gn" + SUBGRAPH_ENDPOINT: "https://api.goldsky.com/api/public/project_cmdfaaxeuz6us01u359yjdctw/subgraphs/pdp-explorer/calibration311a/gn" JOB_SCHEDULER_POLL_SECONDS: "60" CLICKHOUSE_URL: "http://default:@dealbot-clickhouse:8123/dealbot" DEALBOT_PROBE_LOCATION: "local"