Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
82 changes: 82 additions & 0 deletions data-code-hosting-ledger/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,82 @@
# Data and Code Hosting Ledger

Self-contained scientific data and code hosting milestone for [SCIBASE.AI issue #14](https://github.com/SCIBASE-AI/SCIBASE.AI/issues/14).

The issue asks for first-class hosting of datasets, code, notebooks, models, metadata, previews, and executable environments. This module provides a deterministic hosting ledger that reviewers can run locally without object storage, Kubernetes, or external DOI services.

## What It Adds

- Artifact classification for datasets, code, notebooks, images, videos, model files, and unknown files.
- Folder-aware storage manifest with content hashes, versions, tags, category counts, and total bytes.
- Drag-and-drop upload workflow planning with drop zones, folder routing, resumable chunk routes, expected hashes, and validation rules.
- Metadata bundle with JSON-LD, DataCite-style metadata, and schema.org fields.
- FAIR compliance scoring across findable, accessible, interoperable, and reusable dimensions.
- Preview route planning for spreadsheets, JSON, notebooks, code, thumbnails, and model cards.
- Dataset row diffing for added, removed, and changed records.
- Runtime environment resolution for Python, R, Julia, notebooks, and generic artifacts.
- Sandbox policy planning with Docker isolation, network controls, resource limits, read-only inputs, and writable output paths.
- Preservation package planning for DataCite DOI registration, repository export, schema.org indexing, required metadata gates, package files, and persistent access links.
- Execution plan with run-analysis, reproduce-results, and scheduled rerun triggers.
- API route contracts for uploads, previews, metadata, runs, and FAIR score.
- Sample workspace fixture, tests, requirement map, CLI demo, and short demo GIF.

## Run

```bash
cd data-code-hosting-ledger
npm run check
npm test
npm run demo
```

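Expected demo shape (abridged — the demo also prints `workspace`, `categories`, `uploadWorkflow.firstTargetRoute`, each sandbox policy's `artifactId`, and `preservation.gateStatus`):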

```json
{
"artifacts": 6,
"fairScore": 0.9417,
"uploadWorkflow": {
"dropZones": ["datasets", "code", "supplements"],
"uploadTargets": 6
},
"previewKinds": ["code", "image-thumbnail", "model-card", "notebook", "spreadsheet"],
"runtimes": ["python:python:3.12-slim"],
"sandboxPolicies": [
{
"isolation": "docker",
"networkAccess": false,
"resourceLimits": {
"cpu": "2",
"memory": "4Gi"
}
}
],
"datasetDiff": {
"added": 1,
"changed": 1,
"removed": 0
},
"preservation": {
"identifier": "10.5555/scibase.flood.repro",
"readyTargets": ["datacite", "repository-export", "schema-org-index"],
"packageFiles": 10
},
"packetHash": "..."
}
```

## Demo Artifact

See [docs/demo.gif](docs/demo.gif) for a short visual walkthrough. The SVG source is included at [docs/demo.svg](docs/demo.svg), and an MP4 version is available at [docs/demo.mp4](docs/demo.mp4).

## Files

- `src/data-code-hosting-ledger.js` - artifact classification, manifests, upload workflows, metadata, FAIR score, previews, diffs, runtimes, sandbox policies, preservation packages, execution plans, and the combined hosting packet.
- `data/sample-workspace.json` - reviewable scientific workspace fixture.
- `test/data-code-hosting-ledger.test.js` - dependency-free Node tests.
- `scripts/demo.js` - CLI demo.
- `docs/issue-14-requirement-map.md` - maps the implementation to bounty requirements.

## AI-Assisted Disclosure

This contribution was produced with AI assistance and manually verified with the local commands above.
91 changes: 91 additions & 0 deletions data-code-hosting-ledger/data/sample-workspace.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,91 @@
{
"id": "workspace-flood-repro",
"title": "Flood microbiome reproducibility workspace",
"metadata": {
"doi": "10.5555/scibase.flood.repro",
"uuid": "2aa92fd5-b978-4e89-bdea-66bd99f655c0",
"title": "Flood microbiome reproducibility workspace",
"creators": ["Ada Chen", "Ravi Patel"],
"license": "CC-BY-4.0",
"keywords": ["microbiome", "coastal flooding", "reproducibility"],
"description": "Data, code, notebooks, and runtime metadata for reproducing flood microbiome analysis.",
"publisher": "SCIBASE.AI",
"publicationYear": 2026,
"persistentUrl": "https://scibase.ai/workspaces/workspace-flood-repro",
"accessPolicy": "public-read-controlled-write",
"provenance": "Raw samples were processed with notebooks and Docker runtime metadata."
},
"artifacts": [
{
"id": "artifact-samples-v1",
"path": "data/samples.csv",
"sizeBytes": 12000,
"content": "id,diversity\ns1,0.7\ns2,0.5",
"version": "v1",
"tags": ["dataset", "microbiome"]
},
{
"id": "artifact-samples-v2",
"path": "data/samples.csv",
"sizeBytes": 14000,
"content": "id,diversity\ns1,0.72\ns2,0.5\ns3,0.9",
"version": "v2",
"tags": ["dataset", "microbiome"]
},
{
"id": "artifact-analysis",
"path": "code/analyze.py",
"sizeBytes": 2400,
"content": "print('analysis')",
"version": "v1",
"runCommand": "python code/analyze.py"
},
{
"id": "artifact-notebook",
"path": "notebooks/reproduce.ipynb",
"sizeBytes": 9800,
"content": "{}",
"version": "v1"
},
{
"id": "artifact-figure",
"path": "figures/diversity.png",
"sizeBytes": 180000,
"content": "binary",
"version": "v1"
},
{
"id": "artifact-model",
"path": "models/classifier.pt",
"sizeBytes": 25000000,
"content": "weights",
"version": "v1"
}
],
"environments": [
{
"id": "env-python-repro",
"stack": "python",
"image": "python:3.12-slim",
"dockerfile": "FROM python:3.12-slim\nRUN pip install pandas notebook",
"artifactIds": ["artifact-analysis", "artifact-notebook"],
"sandbox": true,
"orchestrator": "docker",
"networkAccess": false,
"resourceLimits": {
"cpu": "2",
"memory": "4Gi",
"timeoutSeconds": 1800
},
"writablePaths": ["/tmp/scibase-run", "outputs/"]
}
],
"runs": [
{
"id": "run-1",
"artifactId": "artifact-analysis",
"status": "passed",
"createdAt": "2026-05-10T09:00:00Z"
}
]
}
Binary file added data-code-hosting-ledger/docs/demo.gif
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added data-code-hosting-ledger/docs/demo.mp4
Binary file not shown.
33 changes: 33 additions & 0 deletions data-code-hosting-ledger/docs/demo.svg
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
30 changes: 30 additions & 0 deletions data-code-hosting-ledger/docs/issue-14-requirement-map.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
# Issue #14 Requirement Map

This module implements a deterministic scientific data and code hosting milestone for SCIBASE issue #14. It focuses on storage manifests, artifact previews, metadata standards, FAIR scoring, executable environments, and reproducibility triggers.

| Issue requirement | Implementation |
| --- | --- |
| Support major scientific file types | `FILE_TYPES` and `classifyArtifact()` cover CSV, TSV, XLSX, JSON, Parquet, Python, R, Julia, notebooks, images, video, and model files. |
| Folder-based organization and upload manifest | `buildStorageManifest()` groups artifacts by folder, category, size, version, content hash, and tags. |
| Drag-and-drop uploads | `buildUploadWorkflowPlan()` emits dataset/code/supplement drop zones, accepted extensions, folder routing, resumable chunk upload routes, expected hashes, and validation rules. |
| Metadata-aware previews | `createPreviewPlan()` emits preview routes for spreadsheets, JSON, notebooks, code, images, video, and model cards. |
| Upload versioning and dataset diffing | Artifact versions are tracked in the manifest, and `diffDatasetVersions()` reports added, removed, and changed rows. |
| JSON-LD, DataCite, and schema.org metadata | `buildMetadataBundle()` emits JSON-LD, DataCite-style, and schema.org metadata from workspace metadata. |
| FAIR principles compliance | `scoreFairCompliance()` scores findable, accessible, interoperable, and reusable dimensions with blockers. |
| Scientific tagging and identifiers | Metadata bundle uses DOI/UUID identifiers plus keyword tags; artifacts preserve tags. |
| Persistent deposit and reuse package | `buildPreservationPackage()` prepares DataCite DOI registration, repository export, schema.org indexing, required metadata gates, package-file hashes, and persistent access URLs. |
| Container-based executable environments | `resolveRuntimeEnvironment()` and `buildExecutionPlan()` map code/notebooks to Docker image/runtime definitions. |
| Sandboxed execution controls | `buildSandboxPolicy()` attaches Docker isolation, resource limits, network controls, read-only inputs, writable output paths, and blocked privileged actions to each runtime. |
| Run analysis, reproduce results, and scheduled reruns | `buildExecutionPlan()` emits manual and cron-style triggers. |
| Programmatic access | `buildHostingPacket()` includes API route contracts for uploads, previews, DataCite metadata, preservation package, runs, and FAIR score. |
| Reviewer demo | `npm run demo` prints the workspace title, artifact count and categories, FAIR score, upload workflow summary, preview kinds, runtimes, sandbox policies, preservation summary, dataset diff counts, and packet hash. |

## Verification

```bash
npm run check
npm test
npm run demo
```

The module is dependency-free and isolated under `data-code-hosting-ledger/`.
12 changes: 12 additions & 0 deletions data-code-hosting-ledger/package.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
{
"name": "scibase-data-code-hosting-ledger",
"version": "0.1.0",
"private": true,
"description": "Scientific data and code hosting ledger for SCIBASE issue #14.",
"type": "commonjs",
"scripts": {
"check": "node --check src/data-code-hosting-ledger.js && node --check scripts/demo.js && node --check test/data-code-hosting-ledger.test.js",
"demo": "node scripts/demo.js",
"test": "node test/data-code-hosting-ledger.test.js"
}
}
55 changes: 55 additions & 0 deletions data-code-hosting-ledger/scripts/demo.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
"use strict";

const workspace = require("../data/sample-workspace.json");
const { buildHostingPacket, diffDatasetVersions } = require("../src/data-code-hosting-ledger");

// Build the complete hosting packet for the bundled sample workspace.
const packet = buildHostingPacket(workspace);

// Hand-written row snapshots for the diff demo; they match the v1 and v2
// `data/samples.csv` contents in the sample workspace fixture.
const baselineRows = [
  { id: "s1", diversity: 0.7 },
  { id: "s2", diversity: 0.5 },
];
const revisedRows = [
  { id: "s1", diversity: 0.72 },
  { id: "s2", diversity: 0.5 },
  { id: "s3", diversity: 0.9 },
];
const rowDiff = diffDatasetVersions(baselineRows, revisedRows);

// Upload workflow summary: drop zone ids, target count, and the first route (if any).
const firstUploadTarget = packet.uploadWorkflow.uploadTargets[0];
const uploadWorkflowSummary = {
  dropZones: packet.uploadWorkflow.dropZones.map(({ id }) => id),
  uploadTargets: packet.uploadWorkflow.uploadTargets.length,
  firstTargetRoute: firstUploadTarget ? firstUploadTarget.route : null,
};

// Distinct preview kinds, sorted so the printed list is stable.
const previewKinds = [...new Set(packet.previews.map(({ preview }) => preview))].sort();

// One "<stack>:<image>" label and one sandbox policy summary per resolved runtime.
const runtimeLabels = packet.execution.runtimes.map(({ stack, image }) => `${stack}:${image}`);
const sandboxPolicies = packet.execution.runtimes.map(({ artifactId, sandboxPolicy }) => ({
  artifactId,
  isolation: sandboxPolicy.isolation,
  networkAccess: sandboxPolicy.networkAccess,
  resourceLimits: sandboxPolicy.resourceLimits,
}));

// Preservation summary: only deposit targets flagged ready are listed by id.
const readyTargetIds = [];
for (const target of packet.preservation.depositTargets) {
  if (target.ready) {
    readyTargetIds.push(target.id);
  }
}
const preservationSummary = {
  identifier: packet.preservation.identifier,
  readyTargets: readyTargetIds,
  packageFiles: packet.preservation.packageFiles.length,
  gateStatus: packet.preservation.requiredGates.map(({ id, passed }) => `${id}:${passed}`),
};

// Key order here determines the order of keys in the printed JSON.
const demoSummary = {
  workspace: packet.workspace.title,
  artifacts: packet.manifest.artifacts.length,
  categories: packet.manifest.categories,
  fairScore: packet.fair.total,
  uploadWorkflow: uploadWorkflowSummary,
  previewKinds,
  runtimes: runtimeLabels,
  sandboxPolicies,
  preservation: preservationSummary,
  datasetDiff: {
    added: rowDiff.added.length,
    changed: rowDiff.changed.length,
    removed: rowDiff.removed.length,
  },
  packetHash: packet.packetHash,
};

console.log(JSON.stringify(demoSummary, null, 2));
Loading