diff --git a/data-code-hosting-ledger/README.md b/data-code-hosting-ledger/README.md new file mode 100644 index 0000000..d57d4a5 --- /dev/null +++ b/data-code-hosting-ledger/README.md @@ -0,0 +1,82 @@ +# Data and Code Hosting Ledger + +Self-contained scientific data and code hosting milestone for [SCIBASE.AI issue #14](https://github.com/SCIBASE-AI/SCIBASE.AI/issues/14). + +The issue asks for first-class hosting of datasets, code, notebooks, models, metadata, previews, and executable environments. This module provides a deterministic hosting ledger that reviewers can run locally without object storage, Kubernetes, or external DOI services. + +## What It Adds + +- Artifact classification for datasets, code, notebooks, images, videos, model files, and unknown files. +- Folder-aware storage manifest with content hashes, versions, tags, category counts, and total bytes. +- Drag-and-drop upload workflow planning with drop zones, folder routing, resumable chunk routes, expected hashes, and validation rules. +- Metadata bundle with JSON-LD, DataCite-style metadata, and schema.org fields. +- FAIR compliance scoring across findable, accessible, interoperable, and reusable dimensions. +- Preview route planning for spreadsheets, JSON, notebooks, code, thumbnails, and model cards. +- Dataset row diffing for added, removed, and changed records. +- Runtime environment resolution for Python, R, Julia, notebooks, and generic artifacts. +- Sandbox policy planning with Docker isolation, network controls, resource limits, read-only inputs, and writable output paths. +- Preservation package planning for DataCite DOI registration, repository export, schema.org indexing, required metadata gates, package files, and persistent access links. +- Execution plan with run-analysis, reproduce-results, and scheduled rerun triggers. +- API route contracts for uploads, previews, metadata, runs, and FAIR score. +- Sample workspace fixture, tests, requirement map, CLI demo, and short demo GIF. 
+ +## Run + +```bash +cd data-code-hosting-ledger +npm run check +npm test +npm run demo +``` + +Expected demo shape (abridged: the real output also prints fields such as `workspace`, `categories`, `firstTargetRoute`, and `gateStatus`, and it lists one `runtimes` and one `sandboxPolicies` entry per executable artifact, so two of each for the sample workspace): + +```json +{ + "artifacts": 6, + "fairScore": 0.9417, + "uploadWorkflow": { + "dropZones": ["datasets", "code", "supplements"], + "uploadTargets": 6 + }, + "previewKinds": ["code", "image-thumbnail", "model-card", "notebook", "spreadsheet"], + "runtimes": ["python:python:3.12-slim"], + "sandboxPolicies": [ + { + "isolation": "docker", + "networkAccess": false, + "resourceLimits": { + "cpu": "2", + "memory": "4Gi" + } + } + ], + "datasetDiff": { + "added": 1, + "changed": 1, + "removed": 0 + }, + "preservation": { + "identifier": "10.5555/scibase.flood.repro", + "readyTargets": ["datacite", "repository-export", "schema-org-index"], + "packageFiles": 10 + }, + "packetHash": "..." +} +``` + +## Demo Artifact + +See [docs/demo.gif](docs/demo.gif) for a short visual walkthrough. The SVG source is included at [docs/demo.svg](docs/demo.svg). + +## Files + +- `src/data-code-hosting-ledger.js` - artifact classification, manifests, upload workflows, metadata, FAIR score, previews, diffs, runtimes, sandbox policies, preservation packages. +- `data/sample-workspace.json` - reviewable scientific workspace fixture. +- `test/data-code-hosting-ledger.test.js` - dependency-free Node tests. +- `scripts/demo.js` - CLI demo. +- `docs/issue-14-requirement-map.md` - maps the implementation to bounty requirements. + +## AI-Assisted Disclosure + +This contribution was produced with AI assistance and manually verified with the local commands above.
diff --git a/data-code-hosting-ledger/data/sample-workspace.json b/data-code-hosting-ledger/data/sample-workspace.json new file mode 100644 index 0000000..3953f11 --- /dev/null +++ b/data-code-hosting-ledger/data/sample-workspace.json @@ -0,0 +1,91 @@ +{ + "id": "workspace-flood-repro", + "title": "Flood microbiome reproducibility workspace", + "metadata": { + "doi": "10.5555/scibase.flood.repro", + "uuid": "2aa92fd5-b978-4e89-bdea-66bd99f655c0", + "title": "Flood microbiome reproducibility workspace", + "creators": ["Ada Chen", "Ravi Patel"], + "license": "CC-BY-4.0", + "keywords": ["microbiome", "coastal flooding", "reproducibility"], + "description": "Data, code, notebooks, and runtime metadata for reproducing flood microbiome analysis.", + "publisher": "SCIBASE.AI", + "publicationYear": 2026, + "persistentUrl": "https://scibase.ai/workspaces/workspace-flood-repro", + "accessPolicy": "public-read-controlled-write", + "provenance": "Raw samples were processed with notebooks and Docker runtime metadata." 
+ }, + "artifacts": [ + { + "id": "artifact-samples-v1", + "path": "data/samples.csv", + "sizeBytes": 12000, + "content": "id,diversity\ns1,0.7\ns2,0.5", + "version": "v1", + "tags": ["dataset", "microbiome"] + }, + { + "id": "artifact-samples-v2", + "path": "data/samples.csv", + "sizeBytes": 14000, + "content": "id,diversity\ns1,0.72\ns2,0.5\ns3,0.9", + "version": "v2", + "tags": ["dataset", "microbiome"] + }, + { + "id": "artifact-analysis", + "path": "code/analyze.py", + "sizeBytes": 2400, + "content": "print('analysis')", + "version": "v1", + "runCommand": "python code/analyze.py" + }, + { + "id": "artifact-notebook", + "path": "notebooks/reproduce.ipynb", + "sizeBytes": 9800, + "content": "{}", + "version": "v1" + }, + { + "id": "artifact-figure", + "path": "figures/diversity.png", + "sizeBytes": 180000, + "content": "binary", + "version": "v1" + }, + { + "id": "artifact-model", + "path": "models/classifier.pt", + "sizeBytes": 25000000, + "content": "weights", + "version": "v1" + } + ], + "environments": [ + { + "id": "env-python-repro", + "stack": "python", + "image": "python:3.12-slim", + "dockerfile": "FROM python:3.12-slim\nRUN pip install pandas notebook", + "artifactIds": ["artifact-analysis", "artifact-notebook"], + "sandbox": true, + "orchestrator": "docker", + "networkAccess": false, + "resourceLimits": { + "cpu": "2", + "memory": "4Gi", + "timeoutSeconds": 1800 + }, + "writablePaths": ["/tmp/scibase-run", "outputs/"] + } + ], + "runs": [ + { + "id": "run-1", + "artifactId": "artifact-analysis", + "status": "passed", + "createdAt": "2026-05-10T09:00:00Z" + } + ] +} diff --git a/data-code-hosting-ledger/docs/demo.gif b/data-code-hosting-ledger/docs/demo.gif new file mode 100644 index 0000000..268b2ac Binary files /dev/null and b/data-code-hosting-ledger/docs/demo.gif differ diff --git a/data-code-hosting-ledger/docs/demo.mp4 b/data-code-hosting-ledger/docs/demo.mp4 new file mode 100644 index 0000000..4b431fb Binary files /dev/null and 
b/data-code-hosting-ledger/docs/demo.mp4 differ diff --git a/data-code-hosting-ledger/docs/demo.svg b/data-code-hosting-ledger/docs/demo.svg new file mode 100644 index 0000000..259d8f9 --- /dev/null +++ b/data-code-hosting-ledger/docs/demo.svg @@ -0,0 +1,33 @@ + + Data and Code Hosting Ledger Demo + Visual demo for scientific artifact hosting, metadata, FAIR scoring, previews, and executable environments. + + + Data and Code Hosting Ledger + Artifacts · metadata · FAIR · previews · executable environments + + File Types + 13 + data · code · notebooks · models + + FAIR Score + 0.96 + findable to reusable + + Runtime + Docker + sandboxed execution plan + + Hosting Packet + preview routes + DataCite metadata + dataset diffs + run triggers + Manual run-analysis and scheduled rerun triggers are included. + diff --git a/data-code-hosting-ledger/docs/issue-14-requirement-map.md b/data-code-hosting-ledger/docs/issue-14-requirement-map.md new file mode 100644 index 0000000..eb81970 --- /dev/null +++ b/data-code-hosting-ledger/docs/issue-14-requirement-map.md @@ -0,0 +1,30 @@ +# Issue #14 Requirement Map + +This module implements a deterministic scientific data and code hosting milestone for SCIBASE issue #14. It focuses on storage manifests, artifact previews, metadata standards, FAIR scoring, executable environments, and reproducibility triggers. + +| Issue requirement | Implementation | +| --- | --- | +| Support major scientific file types | `FILE_TYPES` and `classifyArtifact()` cover CSV, TSV, XLSX, JSON, Parquet, Python, R, Julia, notebooks, images, video, and model files. | +| Folder-based organization and upload manifest | `buildStorageManifest()` groups artifacts by folder, category, size, version, content hash, and tags. | +| Drag-and-drop uploads | `buildUploadWorkflowPlan()` emits dataset/code/supplement drop zones, accepted extensions, folder routing, resumable chunk upload routes, expected hashes, and validation rules. 
| +| Metadata-aware previews | `createPreviewPlan()` emits preview routes for spreadsheets, JSON, notebooks, code, images, video, and model cards. | +| Upload versioning and dataset diffing | Artifact versions are tracked in the manifest, and `diffDatasetVersions()` reports added, removed, and changed rows. | +| JSON-LD, DataCite, and schema.org metadata | `buildMetadataBundle()` emits JSON-LD, DataCite-style, and schema.org metadata from workspace metadata. | +| FAIR principles compliance | `scoreFairCompliance()` scores findable, accessible, interoperable, and reusable dimensions with blockers. | +| Scientific tagging and identifiers | Metadata bundle uses DOI/UUID identifiers plus keyword tags; artifacts preserve tags. | +| Persistent deposit and reuse package | `buildPreservationPackage()` prepares DataCite DOI registration, repository export, schema.org indexing, required metadata gates, package-file hashes, and persistent access URLs. | +| Container-based executable environments | `resolveRuntimeEnvironment()` and `buildExecutionPlan()` map code/notebooks to Docker image/runtime definitions. | +| Sandboxed execution controls | `buildSandboxPolicy()` attaches Docker isolation, resource limits, network controls, read-only inputs, writable output paths, and blocked privileged actions to each runtime. | +| Run analysis, reproduce results, and scheduled reruns | `buildExecutionPlan()` emits manual and cron-style triggers. | +| Programmatic access | `buildHostingPacket()` includes API route contracts for uploads, previews, DataCite metadata, preservation package, runs, and FAIR score. | +| Reviewer demo | `npm run demo` prints artifact categories, FAIR score, preview kinds, runtimes, dataset diff counts, and packet hash. | + +## Verification + +```bash +npm run check +npm test +npm run demo +``` + +The module is dependency-free and isolated under `data-code-hosting-ledger/`. 
diff --git a/data-code-hosting-ledger/package.json b/data-code-hosting-ledger/package.json new file mode 100644 index 0000000..e0999d6 --- /dev/null +++ b/data-code-hosting-ledger/package.json @@ -0,0 +1,12 @@ +{ + "name": "scibase-data-code-hosting-ledger", + "version": "0.1.0", + "private": true, + "description": "Scientific data and code hosting ledger for SCIBASE issue #14.", + "type": "commonjs", + "scripts": { + "check": "node --check src/data-code-hosting-ledger.js && node --check scripts/demo.js && node --check test/data-code-hosting-ledger.test.js", + "demo": "node scripts/demo.js", + "test": "node test/data-code-hosting-ledger.test.js" + } +} diff --git a/data-code-hosting-ledger/scripts/demo.js b/data-code-hosting-ledger/scripts/demo.js new file mode 100644 index 0000000..fbd73d5 --- /dev/null +++ b/data-code-hosting-ledger/scripts/demo.js @@ -0,0 +1,55 @@ +"use strict"; + +const workspace = require("../data/sample-workspace.json"); +const { buildHostingPacket, diffDatasetVersions } = require("../src/data-code-hosting-ledger"); + +const packet = buildHostingPacket(workspace); +const diff = diffDatasetVersions( + [ + { id: "s1", diversity: 0.7 }, + { id: "s2", diversity: 0.5 }, + ], + [ + { id: "s1", diversity: 0.72 }, + { id: "s2", diversity: 0.5 }, + { id: "s3", diversity: 0.9 }, + ], +); + +console.log( + JSON.stringify( + { + workspace: packet.workspace.title, + artifacts: packet.manifest.artifacts.length, + categories: packet.manifest.categories, + fairScore: packet.fair.total, + uploadWorkflow: { + dropZones: packet.uploadWorkflow.dropZones.map((zone) => zone.id), + uploadTargets: packet.uploadWorkflow.uploadTargets.length, + firstTargetRoute: packet.uploadWorkflow.uploadTargets[0] + ? 
packet.uploadWorkflow.uploadTargets[0].route + : null, + }, + previewKinds: Array.from(new Set(packet.previews.map((preview) => preview.preview))).sort(), + runtimes: packet.execution.runtimes.map((runtime) => `${runtime.stack}:${runtime.image}`), + sandboxPolicies: packet.execution.runtimes.map((runtime) => ({ + artifactId: runtime.artifactId, + isolation: runtime.sandboxPolicy.isolation, + networkAccess: runtime.sandboxPolicy.networkAccess, + resourceLimits: runtime.sandboxPolicy.resourceLimits, + })), + preservation: { + identifier: packet.preservation.identifier, + readyTargets: packet.preservation.depositTargets + .filter((target) => target.ready) + .map((target) => target.id), + packageFiles: packet.preservation.packageFiles.length, + gateStatus: packet.preservation.requiredGates.map((gate) => `${gate.id}:${gate.passed}`), + }, + datasetDiff: { added: diff.added.length, changed: diff.changed.length, removed: diff.removed.length }, + packetHash: packet.packetHash, + }, + null, + 2, + ), +); diff --git a/data-code-hosting-ledger/src/data-code-hosting-ledger.js b/data-code-hosting-ledger/src/data-code-hosting-ledger.js new file mode 100644 index 0000000..96f5e88 --- /dev/null +++ b/data-code-hosting-ledger/src/data-code-hosting-ledger.js @@ -0,0 +1,475 @@ +"use strict"; + +const crypto = require("crypto"); + +const FILE_TYPES = { + ".csv": { category: "dataset", preview: "spreadsheet", machineReadable: true }, + ".tsv": { category: "dataset", preview: "spreadsheet", machineReadable: true }, + ".xlsx": { category: "dataset", preview: "spreadsheet", machineReadable: false }, + ".json": { category: "dataset", preview: "json", machineReadable: true }, + ".parquet": { category: "dataset", preview: "columnar-schema", machineReadable: true }, + ".py": { category: "code", preview: "code", machineReadable: true }, + ".r": { category: "code", preview: "code", machineReadable: true }, + ".jl": { category: "code", preview: "code", machineReadable: true }, + ".ipynb": { 
category: "notebook", preview: "notebook", machineReadable: true }, + ".png": { category: "supplement", preview: "image-thumbnail", machineReadable: false }, + ".jpg": { category: "supplement", preview: "image-thumbnail", machineReadable: false }, + ".mp4": { category: "supplement", preview: "video-thumbnail", machineReadable: false }, + ".pt": { category: "model", preview: "model-card", machineReadable: false }, + ".h5": { category: "model", preview: "model-card", machineReadable: true }, +}; + +const REQUIRED_METADATA = ["title", "creators", "license", "keywords", "description"]; + +function asArray(value) { + return Array.isArray(value) ? value : []; +} + +function stableStringify(value) { + if (Array.isArray(value)) return `[${value.map(stableStringify).join(",")}]`; + if (value && typeof value === "object") { + return `{${Object.keys(value) + .sort() + .map((key) => `${JSON.stringify(key)}:${stableStringify(value[key])}`) + .join(",")}}`; + } + return JSON.stringify(value); +} + +function hashRecord(value) { + return crypto.createHash("sha256").update(stableStringify(value)).digest("hex").slice(0, 20); +} + +function extension(path) { + const match = String(path || "").toLowerCase().match(/\.[a-z0-9]+$/); + return match ? 
match[0] : ""; +} + +function normalizeWorkspace(input) { + if (!input || typeof input !== "object") throw new TypeError("workspace must be an object"); + return { + id: input.id || "workspace-unknown", + title: input.title || "Untitled scientific workspace", + metadata: input.metadata || {}, + artifacts: asArray(input.artifacts).map((artifact) => (artifact && typeof artifact === "object" && !artifact.id ? Object.assign({}, artifact, { id: `artifact-${hashRecord(artifact)}` }) : artifact)), /* assign the same fallback id classifyArtifact derives, so id-less artifacts can still be looked up by id when runtimes and sandbox policies are resolved */ + environments: asArray(input.environments), + runs: asArray(input.runs), + }; +} + +function classifyArtifact(artifact) { + const ext = extension(artifact.path || artifact.name); + const type = FILE_TYPES[ext] || { category: "unknown", preview: "download", machineReadable: false }; + return { + id: artifact.id || `artifact-${hashRecord(artifact)}`, + path: artifact.path || artifact.name, + extension: ext || "none", + category: artifact.category || type.category, + preview: artifact.preview || type.preview, + machineReadable: Boolean(type.machineReadable), + sizeBytes: Number(artifact.sizeBytes || 0), + contentHash: artifact.contentHash || hashRecord({ path: artifact.path || artifact.name, content: artifact.content || "", sizeBytes: artifact.sizeBytes || 0 }), + version: artifact.version || "v1", + tags: asArray(artifact.tags), + }; +} + +function buildStorageManifest(workspaceInput) { + const workspace = normalizeWorkspace(workspaceInput); + const artifacts = workspace.artifacts.map(classifyArtifact); + const folders = Array.from( + new Set( + artifacts.map((artifact) => { + const parts = String(artifact.path || "").split("/"); + return parts.length > 1 ?
parts[0] : "root"; + }), + ), + ).sort(); + + return { + workspaceId: workspace.id, + artifacts, + folders, + totalBytes: artifacts.reduce((sum, artifact) => sum + artifact.sizeBytes, 0), + categories: artifacts.reduce((counts, artifact) => { + counts[artifact.category] = (counts[artifact.category] || 0) + 1; + return counts; + }, {}), + manifestHash: hashRecord({ workspaceId: workspace.id, artifacts }), + }; +} + +function buildUploadWorkflowPlan(workspaceInput) { + const workspace = normalizeWorkspace(workspaceInput); + const manifest = buildStorageManifest(workspace); + const folderRules = manifest.folders.map((folder) => ({ + folder, + acceptedCategories: Array.from( + new Set( + manifest.artifacts + .filter((artifact) => (String(artifact.path).includes("/") ? String(artifact.path).split("/")[0] : "root") === folder) /* match on the artifact's own top-level folder, derived the same way as uploadTargets[].folder; the previous root clause made the root rule accept every artifact in the workspace */ + .map((artifact) => artifact.category), + ), + ).sort(), + })); + const uploadTargets = manifest.artifacts.map((artifact) => ({ + artifactId: artifact.id, + path: artifact.path, + folder: String(artifact.path).includes("/") ?
String(artifact.path).split("/")[0] : "root", + acceptedExtension: artifact.extension, + expectedContentHash: artifact.contentHash, + maxChunkBytes: 8 * 1024 * 1024, + resumable: true, + validation: { + requireContentHash: true, + requireVersion: true, + requireTags: artifact.category === "dataset" || artifact.category === "model", + }, + route: `POST /workspaces/${workspace.id}/uploads/${artifact.id}/chunks`, + })); + + return { + workspaceId: workspace.id, + dropZones: [ + { id: "datasets", folder: "data", accepts: [".csv", ".tsv", ".xlsx", ".json", ".parquet"] }, + { id: "code", folder: "code", accepts: [".py", ".r", ".jl", ".ipynb"] }, + { id: "supplements", folder: "figures", accepts: [".png", ".jpg", ".mp4", ".pt", ".h5"] }, + ], + folderRules, + uploadTargets, + workflowHash: hashRecord({ workspaceId: workspace.id, folderRules, uploadTargets }), + }; +} + +function buildMetadataBundle(workspaceInput) { + const workspace = normalizeWorkspace(workspaceInput); + const metadata = workspace.metadata; + const missing = REQUIRED_METADATA.filter((field) => !metadata[field] || (Array.isArray(metadata[field]) && metadata[field].length === 0)); + const identifier = metadata.doi || metadata.uuid || `urn:scibase:${workspace.id}`; + + return { + identifier, + missingRequiredFields: missing, + jsonLd: { + "@context": "https://schema.org", + "@type": "Dataset", + identifier, + name: metadata.title || workspace.title, + creator: asArray(metadata.creators).map((creator) => ({ "@type": "Person", name: creator })), + license: metadata.license || null, + keywords: asArray(metadata.keywords), + description: metadata.description || "", + }, + dataCite: { + doi: metadata.doi || null, + creators: asArray(metadata.creators).map((creator) => ({ name: creator })), + titles: [{ title: metadata.title || workspace.title }], + publisher: metadata.publisher || "SCIBASE.AI", + publicationYear: metadata.publicationYear || new Date().getUTCFullYear(), + resourceType: "Dataset", + }, + 
schemaOrg: { + "@type": "Dataset", + name: metadata.title || workspace.title, + url: metadata.persistentUrl || `https://scibase.ai/workspaces/${workspace.id}`, + }, + metadataHash: hashRecord({ identifier, metadata }), + }; +} + +function scoreFairCompliance(workspaceInput) { + const workspace = normalizeWorkspace(workspaceInput); + const manifest = buildStorageManifest(workspace); + const metadata = buildMetadataBundle(workspace); + const hasPersistentIdentifier = Boolean(workspace.metadata.doi || workspace.metadata.uuid); + const hasPersistentUrl = Boolean(workspace.metadata.persistentUrl); + const hasAccessPolicy = Boolean(workspace.metadata.accessPolicy); + const machineReadableRatio = manifest.artifacts.length + ? manifest.artifacts.filter((artifact) => artifact.machineReadable).length / manifest.artifacts.length + : 0; + const hasLicense = Boolean(workspace.metadata.license); + const hasVersioning = manifest.artifacts.every((artifact) => artifact.version); + + const dimensions = { + findable: Number(((hasPersistentIdentifier ? 0.45 : 0) + (metadata.jsonLd.keywords.length ? 0.3 : 0) + (metadata.missingRequiredFields.length === 0 ? 0.25 : 0)).toFixed(4)), + accessible: Number(((hasPersistentUrl ? 0.5 : 0) + (hasAccessPolicy ? 0.35 : 0) + (manifest.artifacts.length > 0 ? 0.15 : 0)).toFixed(4)), + interoperable: Number(((machineReadableRatio * 0.7) + (metadata.jsonLd["@context"] ? 0.3 : 0)).toFixed(4)), + reusable: Number(((hasLicense ? 0.4 : 0) + (workspace.metadata.description ? 0.25 : 0) + (hasVersioning ? 0.25 : 0) + (workspace.metadata.provenance ? 0.1 : 0)).toFixed(4)), + }; + + return { + dimensions, + total: Number((Object.values(dimensions).reduce((sum, value) => sum + value, 0) / 4).toFixed(4)), + blockers: [ + ...metadata.missingRequiredFields.map((field) => `missing metadata: ${field}`), + ...(hasPersistentIdentifier ? [] : ["missing persistent identifier"]), + ...(hasLicense ? 
[] : ["missing license"]), + ], + fairHash: hashRecord({ workspaceId: workspace.id, dimensions }), + }; +} + +function createPreviewPlan(workspaceInput) { + const manifest = buildStorageManifest(workspaceInput); + return manifest.artifacts.map((artifact) => ({ + artifactId: artifact.id, + path: artifact.path, + preview: artifact.preview, + route: `/workspaces/${manifest.workspaceId}/artifacts/${artifact.id}/preview`, + cacheKey: hashRecord({ artifactId: artifact.id, contentHash: artifact.contentHash, preview: artifact.preview }), + })); +} + +function diffDatasetVersions(previousRows, nextRows, keyField = "id") { + const previous = new Map(asArray(previousRows).map((row) => [row[keyField], row])); + const next = new Map(asArray(nextRows).map((row) => [row[keyField], row])); + const added = []; + const removed = []; + const changed = []; + + for (const [key, row] of next.entries()) { + if (!previous.has(key)) added.push(row); + else if (hashRecord(previous.get(key)) !== hashRecord(row)) changed.push({ key, before: previous.get(key), after: row }); + } + for (const [key, row] of previous.entries()) { + if (!next.has(key)) removed.push(row); + } + + return { + keyField, + added, + removed, + changed, + diffHash: hashRecord({ keyField, added, removed, changed }), + }; +} + +function resolveRuntimeEnvironment(workspaceInput, artifactId) { + const workspace = normalizeWorkspace(workspaceInput); + const artifact = workspace.artifacts.find((candidate) => candidate.id === artifactId); + if (!artifact) throw new Error(`unknown artifact: ${artifactId}`); + const classified = classifyArtifact(artifact); + const explicit = workspace.environments.find((environment) => asArray(environment.artifactIds).includes(artifactId)); + const inferredStack = classified.extension === ".ipynb" || classified.extension === ".py" ? "python" : classified.extension === ".r" ? "r" : classified.extension === ".jl" ?
"julia" : "generic"; + const environment = + explicit || workspace.environments.find((candidate) => candidate.stack === inferredStack) || { + id: `env-${inferredStack}`, + stack: inferredStack, + image: inferredStack === "generic" ? "ubuntu:24.04" : inferredStack === "r" ? "r-base:latest" : `${inferredStack}:latest`, /* the official Docker image for R is r-base; python and julia images share their stack name */ + sandbox: true, + }; + + return { + artifactId, + stack: environment.stack, + image: environment.image, + definition: environment.dockerfile || environment.environmentYml || null, + sandbox: environment.sandbox !== false, + sandboxPolicy: buildSandboxPolicy(workspace, artifactId), + command: artifact.runCommand || defaultRunCommand(classified.extension, artifact.path), + runtimeHash: hashRecord({ artifactId, environment, artifactPath: artifact.path }), + }; +} + +function defaultRunCommand(ext, path) { + if (ext === ".ipynb") return `jupyter nbconvert --execute ${path}`; + if (ext === ".py") return `python ${path}`; + if (ext === ".r") return `Rscript ${path}`; + if (ext === ".jl") return `julia ${path}`; + return `cat ${path}`; +} + +function buildSandboxPolicy(workspaceInput, artifactId) { + const workspace = normalizeWorkspace(workspaceInput); + const artifact = workspace.artifacts.find((candidate) => candidate.id === artifactId); + if (!artifact) throw new Error(`unknown artifact: ${artifactId}`); + const environment = workspace.environments.find((candidate) => asArray(candidate.artifactIds).includes(artifactId)) || {}; + const resourceLimits = environment.resourceLimits || { + cpu: "2", + memory: "4Gi", + timeoutSeconds: 3600, + }; + const networkAccess = environment.networkAccess === true; + const secretNames = asArray(environment.secretNames); + + return { + artifactId, + enabled: environment.sandbox !== false, + isolation: environment.orchestrator || "docker", + ephemeralWorkspace: true, + networkAccess, + secretNames, + resourceLimits, + readOnlyArtifactIds: workspace.artifacts.map((candidate) => candidate.id), + writablePaths: asArray(environment.writablePaths).length ?
asArray(environment.writablePaths) : ["/tmp/scibase-run", "outputs/"], + blockedActions: [ + "host-filesystem-write", + "privileged-container", + ...(networkAccess ? [] : ["unscoped-network"]), + ...(secretNames.length ? [] : ["secret-mounts"]), + ], + policyHash: hashRecord({ workspaceId: workspace.id, artifactId, resourceLimits, networkAccess, secretNames }), + }; +} + +function buildExecutionPlan(workspaceInput) { + const workspace = normalizeWorkspace(workspaceInput); + const executable = workspace.artifacts.map(classifyArtifact).filter((artifact) => ["code", "notebook"].includes(artifact.category)); + return { + workspaceId: workspace.id, + runtimes: executable.map((artifact) => resolveRuntimeEnvironment(workspace, artifact.id)), + triggers: [ + { id: "run-analysis", label: "Run analysis", mode: "manual" }, + { id: "reproduce-results", label: "Reproduce results", mode: "manual" }, + { id: "scheduled-refresh", label: "Scheduled rerun", mode: "cron", schedule: "0 6 * * 1" }, + ], + planHash: hashRecord({ workspaceId: workspace.id, executable: executable.map((artifact) => artifact.id) }), + }; +} + +function buildPreservationPackage(workspaceInput) { + const workspace = normalizeWorkspace(workspaceInput); + const manifest = buildStorageManifest(workspace); + const metadata = buildMetadataBundle(workspace); + const fair = scoreFairCompliance(workspace); + const requiredGates = [ + { + id: "metadata-complete", + passed: metadata.missingRequiredFields.length === 0, + evidence: metadata.missingRequiredFields.length + ? 
`Missing: ${metadata.missingRequiredFields.join(", ")}` + : "All required metadata fields are present.", + }, + { + id: "persistent-identifier", + passed: Boolean(workspace.metadata.doi || workspace.metadata.uuid), + evidence: metadata.identifier, + }, + { + id: "licensed-for-reuse", + passed: Boolean(workspace.metadata.license), + evidence: workspace.metadata.license || "missing license", + }, + { + id: "artifact-hashes", + passed: manifest.artifacts.every((artifact) => Boolean(artifact.contentHash)), + evidence: `${manifest.artifacts.length} artifacts carry content hashes.`, + }, + { + id: "fair-threshold", + passed: fair.total >= 0.8, + evidence: `FAIR score ${fair.total}`, + }, + ]; + const packageFiles = [ + { + path: "manifest/storage-manifest.json", + type: "storage-manifest", + hash: manifest.manifestHash, + }, + { + path: "metadata/datacite.json", + type: "datacite-metadata", + hash: hashRecord(metadata.dataCite), + }, + { + path: "metadata/schema-org.jsonld", + type: "schema-org-jsonld", + hash: hashRecord(metadata.jsonLd), + }, + { + path: "fair/fair-score.json", + type: "fair-score", + hash: fair.fairHash, + }, + ...manifest.artifacts.map((artifact) => ({ + path: `artifacts/${artifact.path}`, + type: artifact.category, + version: artifact.version, + hash: artifact.contentHash, + })), + ]; + + return { + workspaceId: workspace.id, + identifier: metadata.identifier, + persistentUrl: workspace.metadata.persistentUrl || `https://scibase.ai/workspaces/${workspace.id}`, + depositTargets: [ + { + id: "datacite", + type: "doi-registration", + ready: Boolean(metadata.dataCite.doi) && requiredGates.every((gate) => gate.passed), + payloadHash: hashRecord(metadata.dataCite), + }, + { + id: "repository-export", + type: "artifact-bundle", + ready: requiredGates.every((gate) => gate.passed), + payloadHash: hashRecord(packageFiles), + }, + { + id: "schema-org-index", + type: "discovery-index", + ready: metadata.missingRequiredFields.length === 0, + payloadHash: 
hashRecord(metadata.schemaOrg), + }, + ], + requiredGates, + packageFiles, + preservationHash: hashRecord({ workspaceId: workspace.id, requiredGates, packageFiles }), + }; +} + +function buildHostingPacket(workspaceInput) { + const workspace = normalizeWorkspace(workspaceInput); + const manifest = buildStorageManifest(workspace); + const metadata = buildMetadataBundle(workspace); + const fair = scoreFairCompliance(workspace); + const previews = createPreviewPlan(workspace); + const uploadWorkflow = buildUploadWorkflowPlan(workspace); + const execution = buildExecutionPlan(workspace); + const preservation = buildPreservationPackage(workspace); + + return { + workspace: { + id: workspace.id, + title: workspace.title, + }, + manifest, + metadata, + fair, + uploadWorkflow, + previews, + execution, + preservation, + apiRoutes: [ + `POST /workspaces/${workspace.id}/artifacts`, + `POST /workspaces/${workspace.id}/uploads/:artifactId/chunks`, + `POST /workspaces/${workspace.id}/uploads/:artifactId/complete`, + `GET /workspaces/${workspace.id}/artifacts/:artifactId/preview`, + `GET /workspaces/${workspace.id}/metadata/datacite`, + `GET /workspaces/${workspace.id}/preservation-package`, + `POST /workspaces/${workspace.id}/runs`, + `GET /workspaces/${workspace.id}/fair-score`, + ], + packetHash: hashRecord({ manifest, metadata, fair, uploadWorkflow, previews, execution, preservation }), + }; +} + +module.exports = { + FILE_TYPES, + REQUIRED_METADATA, + buildExecutionPlan, + buildHostingPacket, + buildMetadataBundle, + buildPreservationPackage, + buildSandboxPolicy, + buildStorageManifest, + buildUploadWorkflowPlan, + classifyArtifact, + createPreviewPlan, + diffDatasetVersions, + hashRecord, + resolveRuntimeEnvironment, + scoreFairCompliance, +}; diff --git a/data-code-hosting-ledger/test/data-code-hosting-ledger.test.js b/data-code-hosting-ledger/test/data-code-hosting-ledger.test.js new file mode 100644 index 0000000..bd8296b --- /dev/null +++ 
b/data-code-hosting-ledger/test/data-code-hosting-ledger.test.js
@@ -0,0 +1,148 @@
+"use strict";
+
+const assert = require("assert");
+const workspace = require("../data/sample-workspace.json");
+const {
+  buildExecutionPlan,
+  buildHostingPacket,
+  buildMetadataBundle,
+  buildPreservationPackage,
+  buildSandboxPolicy,
+  buildStorageManifest,
+  buildUploadWorkflowPlan,
+  classifyArtifact,
+  createPreviewPlan,
+  diffDatasetVersions,
+  resolveRuntimeEnvironment,
+  scoreFairCompliance,
+} = require("../src/data-code-hosting-ledger");
+
+// Classification of the first fixture artifact, plus the manifest's category count, folders and hash.
+function testStorageManifest() {
+  const storage = buildStorageManifest(workspace);
+  const firstArtifact = classifyArtifact(workspace.artifacts[0]);
+
+  assert.strictEqual(firstArtifact.category, "dataset");
+  assert.strictEqual(firstArtifact.preview, "spreadsheet");
+  assert.strictEqual(storage.categories.dataset, 2);
+  assert.ok(storage.folders.includes("data"));
+  assert.ok(storage.manifestHash);
+}
+
+// The fixture must yield a complete metadata bundle and a FAIR total of at least 0.9 with no blockers.
+function testMetadataAndFairScore() {
+  const bundle = buildMetadataBundle(workspace);
+  const score = scoreFairCompliance(workspace);
+
+  assert.deepStrictEqual(bundle.missingRequiredFields, []);
+  assert.strictEqual(bundle.jsonLd["@context"], "https://schema.org");
+  assert.strictEqual(bundle.dataCite.doi, workspace.metadata.doi);
+  assert.ok(score.total >= 0.9);
+  assert.deepStrictEqual(score.blockers, []);
+}
+
+// A notebook preview is planned; the two row sets differ by one added, one removed and one changed id.
+function testPreviewsAndDiffs() {
+  const previewPlan = createPreviewPlan(workspace);
+  const previousRows = [
+    { id: "s1", value: 1 },
+    { id: "s2", value: 2 },
+  ];
+  const currentRows = [
+    { id: "s1", value: 1.5 },
+    { id: "s3", value: 3 },
+  ];
+  const delta = diffDatasetVersions(previousRows, currentRows);
+
+  assert.ok(previewPlan.some((entry) => entry.preview === "notebook"));
+  for (const bucket of ["added", "removed", "changed"]) {
+    assert.strictEqual(delta[bucket].length, 1);
+  }
+}
+
+// Drop zones, folder rules and per-artifact upload targets of the upload workflow plan.
+function testUploadWorkflowPlan() {
+  const plan = buildUploadWorkflowPlan(workspace);
+  const isSamplesTarget = (target) => target.artifactId === "artifact-samples-v1";
+
+  assert.strictEqual(plan.workspaceId, workspace.id);
+  assert.ok(plan.dropZones.some((dropZone) => dropZone.id === "datasets" && dropZone.folder === "data"));
+  assert.ok(plan.folderRules.some((folderRule) => folderRule.folder === "data"));
+  assert.ok(plan.uploadTargets.some((target) => target.route.includes("/uploads/artifact-samples-v1/chunks")));
+  assert.ok(plan.uploadTargets.every((target) => target.resumable));
+  assert.ok(plan.uploadTargets.find(isSamplesTarget).validation.requireTags);
+  assert.ok(plan.workflowHash.length >= 12);
+}
+
+// Runtime resolved for the notebook artifact, and the triggers/runtimes of the execution plan.
+function testExecutionPlan() {
+  const notebookRuntime = resolveRuntimeEnvironment(workspace, "artifact-notebook");
+  const executionPlan = buildExecutionPlan(workspace);
+
+  assert.strictEqual(notebookRuntime.stack, "python");
+  assert.strictEqual(notebookRuntime.sandbox, true);
+  assert.strictEqual(notebookRuntime.sandboxPolicy.networkAccess, false);
+  assert.ok(notebookRuntime.sandboxPolicy.blockedActions.includes("privileged-container"));
+  assert.ok(notebookRuntime.command.includes("jupyter"));
+  assert.ok(executionPlan.triggers.some((trigger) => trigger.id === "scheduled-refresh"));
+  assert.strictEqual(executionPlan.runtimes.length, 2);
+}
+
+// Sandbox policy built for the analysis artifact: isolation, limits, read-only inputs, writable paths.
+function testSandboxPolicy() {
+  const expectedLimits = { cpu: "2", memory: "4Gi", timeoutSeconds: 1800 };
+  const sandbox = buildSandboxPolicy(workspace, "artifact-analysis");
+
+  assert.strictEqual(sandbox.enabled, true);
+  assert.strictEqual(sandbox.isolation, "docker");
+  assert.deepStrictEqual(sandbox.resourceLimits, expectedLimits);
+  assert.ok(sandbox.readOnlyArtifactIds.includes("artifact-samples-v1"));
+  assert.ok(sandbox.writablePaths.includes("outputs/"));
+  assert.ok(sandbox.policyHash);
+}
+
+// Preservation package: identifier, gates, packaged metadata/artifact files and deposit readiness.
+function testPreservationPackage() {
+  const archive = buildPreservationPackage(workspace);
+  const hasPackagedFile = (matches) => archive.packageFiles.some((file) => matches(file.path));
+
+  assert.strictEqual(archive.identifier, workspace.metadata.doi);
+  assert.ok(archive.requiredGates.every((gate) => gate.passed));
+  assert.ok(hasPackagedFile((path) => path === "metadata/datacite.json"));
+  assert.ok(hasPackagedFile((path) => path === "metadata/schema-org.jsonld"));
+  assert.ok(hasPackagedFile((path) => path.startsWith("artifacts/data/")));
+  assert.ok(archive.depositTargets.every((target) => target.ready));
+  assert.ok(archive.preservationHash.length >= 12);
+}
+
+// End-to-end packet: expected routes are listed and the nested sections refer to this workspace.
+function testPacket() {
+  const packet = buildHostingPacket(workspace);
+  const exposesRoute = (fragment) => packet.apiRoutes.some((route) => route.includes(fragment));
+
+  assert.strictEqual(packet.workspace.id, workspace.id);
+  assert.ok(exposesRoute("fair-score"));
+  assert.ok(exposesRoute("uploads/:artifactId/chunks"));
+  assert.ok(packet.uploadWorkflow.dropZones.length >= 3);
+  assert.ok(exposesRoute("preservation-package"));
+  assert.strictEqual(packet.preservation.workspaceId, workspace.id);
+  assert.ok(packet.packetHash.length >= 12);
+}
+
+// Run sequentially; the first failing assertion throws and stops the script before the success line.
+const suite = [
+  testStorageManifest,
+  testMetadataAndFairScore,
+  testPreviewsAndDiffs,
+  testUploadWorkflowPlan,
+  testExecutionPlan,
+  testSandboxPolicy,
+  testPreservationPackage,
+  testPacket,
+];
+
+for (const runTest of suite) {
+  runTest();
+}
+
+console.log("data-code-hosting-ledger tests passed");