From 38084c4b93bcd0afe01117c4a304a2aad52b0dda Mon Sep 17 00:00:00 2001
From: Gautam <gautam@aperturedata.io>
Date: Mon, 15 Sep 2025 16:26:50 -0400
Subject: [PATCH 1/4] image captions usign blip.

---
 .github/workflows/main.yml               |   1 +
 apps/caption-image/Dockerfile            |  15 +++
 apps/caption-image/README.md             |   6 ++
 apps/caption-image/app/app.sh            |   4 +
 apps/caption-image/app/caption_images.py |  35 +++++++
 apps/caption-image/app/images.py         | 128 +++++++++++++++++++++++
 apps/caption-image/app/weights.py        |  16 +++
 apps/caption-image/requirements.txt      |   1 +
 8 files changed, 206 insertions(+)
 create mode 100644 apps/caption-image/Dockerfile
 create mode 100644 apps/caption-image/README.md
 create mode 100644 apps/caption-image/app/app.sh
 create mode 100644 apps/caption-image/app/caption_images.py
 create mode 100644 apps/caption-image/app/images.py
 create mode 100644 apps/caption-image/app/weights.py
 create mode 100644 apps/caption-image/requirements.txt

diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
index 9fffe718..4344b194 100644
--- a/.github/workflows/main.yml
+++ b/.github/workflows/main.yml
@@ -99,6 +99,7 @@ jobs:
           sql-server,
           label-studio,
           ocr-extraction,
+          caption-image,
         ]
     runs-on:
       - ubuntu-latest
diff --git a/apps/caption-image/Dockerfile b/apps/caption-image/Dockerfile
new file mode 100644
index 00000000..42e107eb
--- /dev/null
+++ b/apps/caption-image/Dockerfile
@@ -0,0 +1,15 @@
+# Pull base image.
+ARG VERSION=latest
+FROM aperturedata/workflows-base:${VERSION}
+
+ENV APP_NAME=workflows-caption-image
+
+COPY requirements.txt /
+RUN pip install -U pip
+RUN pip install torch torchvision --index-url https://download.pytorch.org/whl/cpu
+RUN pip install --no-cache-dir -r /requirements.txt
+
+COPY app/weights.py /app/weights.py
+RUN python /app/weights.py
+
+COPY app /app/
diff --git a/apps/caption-image/README.md b/apps/caption-image/README.md
new file mode 100644
index 00000000..4f202533
--- /dev/null
+++ b/apps/caption-image/README.md
@@ -0,0 +1,6 @@
+# Example App
+
+This workflow retrieves all images from ApertureDB that have not been
+analyzed before, and runs them through a
+[BLIP (Bootstrapping Language-Image Pre-training)](https://github.com/salesforce/BLIP)
+model to generate a caption for each image.
diff --git a/apps/caption-image/app/app.sh b/apps/caption-image/app/app.sh
new file mode 100644
index 00000000..c29f107d
--- /dev/null
+++ b/apps/caption-image/app/app.sh
@@ -0,0 +1,4 @@
+#!/bin/bash
+set -e
+
+python3 caption_images.py
diff --git a/apps/caption-image/app/caption_images.py b/apps/caption-image/app/caption_images.py
new file mode 100644
index 00000000..9200d2e5
--- /dev/null
+++ b/apps/caption-image/app/caption_images.py
@@ -0,0 +1,35 @@
+import logging
+
+from typer import Typer
+
+from images import FindImageQueryGenerator
+from aperturedb import ParallelQuery
+from connection_pool import ConnectionPool
+
+app = Typer()
+DONE_PROPERTY = 'wf_caption_image'
+
+@app.command()
+def caption_images(
+    num_workers:int = 1,
+    batch_size:int = 1,
+    log_level:str = "INFO"
+):
+    logging.basicConfig(level=logging.getLevelName(log_level))
+    pool = ConnectionPool()
+    data = FindImageQueryGenerator(
+        pool,
+        done_property=DONE_PROPERTY)
+
+    print("Running Caption Image...")
+    with pool.get_connection() as db:
+        querier = ParallelQuery.ParallelQuery(db)
+        querier.query(data, batchsize=batch_size, numthreads=num_workers, stats=True)
+
+
+def main():
+
+    app()
+
+if __name__ == "__main__":
+    main()
\ No newline at end of file
diff --git a/apps/caption-image/app/images.py b/apps/caption-image/app/images.py
new file mode 100644
index 00000000..c28bec78
--- /dev/null
+++ b/apps/caption-image/app/images.py
@@ -0,0 +1,128 @@
+import io
+import math
+import logging
+
+from PIL import Image
+
+from aperturedb import QueryGenerator
+from connection_pool import ConnectionPool
+
+from PIL import Image
+from transformers import AutoProcessor, BlipForConditionalGeneration
+
+processor = AutoProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
+model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")
+
+logger = logging.getLogger(__name__)
+
+
+class FindImageQueryGenerator(QueryGenerator.QueryGenerator):
+
+    """
+        Generates n FindImage Queries
+    """
+
+    def __init__(self, pool, done_property: str):
+
+        self.pool = pool
+        self.done_property = done_property
+
+        query = [{
+            "FindImage": {
+                "constraints": {
+                    self.done_property: ["==", None]
+                },
+                "results": {
+                    "count": True
+                }
+            }
+        }]
+
+        _, response, _ = self.pool.execute_query(query)
+
+        try:
+            total_images = response[0]["FindImage"]["count"]
+        except:
+            logger.error("Error retrieving the number of images. No images in the db?")
+            exit(0)
+
+        if total_images == 0:
+            logger.warning("No images to be processed. Continuing!")
+
+        logger.info(f"Total images to process: {total_images}")
+
+        self.batch_size = 32
+        self.total_batches = int(math.ceil(total_images / self.batch_size))
+
+        self.len = self.total_batches
+
+    def __len__(self):
+        return self.len
+
+    def getitem(self, idx):
+
+        if idx < 0 or self.len <= idx:
+            return None
+
+        query = [{
+            "FindImage": {
+                "blobs": True,
+                "constraints": {
+                    self.done_property: ["==", None]
+                },
+                "batch": {
+                    "batch_size": self.batch_size,
+                    "batch_id": idx
+                },
+                "results": {
+                    "list": ["_uniqueid"]
+                }
+            }
+        }]
+
+        return query, []
+
+    def response_handler(self, query, blobs, response, r_blobs):
+
+        try:
+            uniqueids = [i["_uniqueid"]
+                         for i in response[0]["FindImage"]["entities"]]
+        except:
+            logger.exception(f"error: {response}")
+            return 0
+
+        desc_blobs = []
+
+        captions = []
+        for b in r_blobs:
+            image = Image.open(io.BytesIO(b))
+            text = "A picture of"
+            inputs = processor(images=image, text=text, return_tensors="pt")
+            output = model.generate(**inputs)
+            caption = processor.decode(output[0], skip_special_tokens=True)
+            captions.append(caption)
+
+        query = []
+        for uniqueid, i in zip(uniqueids, range(len(uniqueids))):
+
+            query.append({
+                "FindImage": {
+                    "_ref": i + 1,
+                    "constraints": {
+                        "_uniqueid": ["==", uniqueid]
+                    },
+                }
+            })
+
+            query.append({
+                "UpdateImage": {
+                    "ref": i + 1,
+                    "properties": {
+                        self.done_property: captions[i]
+                    },
+                }
+            })
+
+
+
+        self.pool.execute_query(query)
\ No newline at end of file
diff --git a/apps/caption-image/app/weights.py b/apps/caption-image/app/weights.py
new file mode 100644
index 00000000..6f701877
--- /dev/null
+++ b/apps/caption-image/app/weights.py
@@ -0,0 +1,16 @@
+from PIL import Image
+import requests
+from transformers import AutoProcessor, BlipForConditionalGeneration
+
+processor = AutoProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
+model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")
+
+url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+image = Image.open(requests.get(url, stream=True).raw)
+text = "A picture of"
+
+inputs = processor(images=image, text=text, return_tensors="pt")
+
+output = model.generate(**inputs)
+caption = processor.decode(output[0], skip_special_tokens=True)
+print(caption)
diff --git a/apps/caption-image/requirements.txt b/apps/caption-image/requirements.txt
new file mode 100644
index 00000000..976a2b1f
--- /dev/null
+++ b/apps/caption-image/requirements.txt
@@ -0,0 +1 @@
+transformers

From 2dfb0b831bc11a63b7abfc522cd81e60ca589409 Mon Sep 17 00:00:00 2001
From: Gautam Saluja <52312085+gsaluja9@users.noreply.github.com>
Date: Wed, 17 Sep 2025 14:20:49 -0400
Subject: [PATCH 2/4] Adding devcontainers. (#208)

---
 .devcontainer/caption-image/devcontainer.json | 30 ++++++
 .../caption-image/docker-compose.yml          | 99 +++++++++++++++++++
 .devcontainer/crawl-website/devcontainer.json | 30 ++++++
 .../crawl-website/docker-compose.yml          | 99 +++++++++++++++++++
 .../dataset-ingestion/devcontainer.json       | 30 ++++++
 .../dataset-ingestion/docker-compose.yml      | 99 +++++++++++++++++++
 .gitignore                                    |  1 +
 .vscode/launch.json                           | 16 +++
 apps/caption-image/README.md                  | 62 +++++++++++-
 apps/caption-image/app/weights.py             |  2 +
 base/docker/scripts/sitecustomize.py          | 12 ++-
 configuration_params.py                       | 11 +++
 initcommand.sh                                |  4 +
 pipeline.py                                   | 20 ++++
 postinstall.sh                                |  4 +
 workflows-devcontiner.code-workspace          | 11 +++
 16 files changed, 525 insertions(+), 5 deletions(-)
 create mode 100644 .devcontainer/caption-image/devcontainer.json
 create mode 100644 .devcontainer/caption-image/docker-compose.yml
 create mode 100644 .devcontainer/crawl-website/devcontainer.json
 create mode 100644 .devcontainer/crawl-website/docker-compose.yml
 create mode 100644 .devcontainer/dataset-ingestion/devcontainer.json
 create mode 100644 .devcontainer/dataset-ingestion/docker-compose.yml
 create mode 100644 .vscode/launch.json
 create mode 100644 configuration_params.py
 create mode 100755 initcommand.sh
 create mode 100644 pipeline.py
 create mode 100755 postinstall.sh
 create mode 100644 workflows-devcontiner.code-workspace

diff --git a/.devcontainer/caption-image/devcontainer.json b/.devcontainer/caption-image/devcontainer.json
new file mode 100644
index 00000000..d515086f
--- /dev/null
+++ b/.devcontainer/caption-image/devcontainer.json
@@ -0,0 +1,30 @@
+{
+    "name": "caption-image",
+    "dockerComposeFile": [
+        "docker-compose.yml"
+    ],
+    "service": "caption-image",
+    "workspaceFolder": "/workflows",
+    "customizations": {
+        "vscode": {
+            "extensions": [
+                "ms-python.python",
+                "ms-python.pylint",
+                "ms-python.black-formatter",
+                "ms-toolsai.jupyter"
+            ]
+        }
+    },
+    "settings": {
+        "python.defaultInterpreterPath": "/opt/venv/bin/python",
+        "python.formatting.provider": "black",
+        "python.linting.enabled": true,
+        "python.linting.pylintEnabled": true,
+        "files.exclude": {
+            "**/__pycache__": true,
+            "**/*.pyc": true
+        }
+    },
+    "initializeCommand": "./initcommand.sh",
+    "postCreateCommand": "./postinstall.sh"
+}
\ No newline at end of file
diff --git a/.devcontainer/caption-image/docker-compose.yml b/.devcontainer/caption-image/docker-compose.yml
new file mode 100644
index 00000000..0c372911
--- /dev/null
+++ b/.devcontainer/caption-image/docker-compose.yml
@@ -0,0 +1,99 @@
+name: aperturedb-local-linux
+
+services:
+  ca:
+    image: alpine/openssl
+    restart: on-failure
+    command: req -x509 -newkey rsa:4096 -days 3650 -nodes -keyout /cert/tls.key -out /cert/tls.crt -subj "/C=US/O=ApertureData Inc./CN=localhost"
+    volumes:
+      - ./aperturedb/certificate:/cert
+
+  lenz:
+    depends_on:
+      ca:
+        condition: service_completed_successfully
+      aperturedb:
+        condition: service_started
+    image: aperturedata/lenz:latest
+    ports:
+      - ${ADB_PORT}:55551
+    restart: always
+    environment:
+      LNZ_HEALTH_PORT: 58085
+      LNZ_TCP_PORT: 55551
+      LNZ_HTTP_PORT: 8080
+      LNZ_ADB_BACKENDS: '["aperturedb:55553"]'
+      LNZ_REPLICAS: 1
+      LNZ_ADB_MAX_CONCURRENCY: 48
+      LNZ_FORCE_SSL: false
+      LNZ_CERTIFICATE_PATH: /etc/lenz/certificate/tls.crt
+      LNZ_PRIVATE_KEY_PATH: /etc/lenz/certificate/tls.key
+    volumes:
+      - ./aperturedb/certificate:/etc/lenz/certificate
+
+  aperturedb:
+    image: aperturedata/aperturedb-community:latest
+    volumes:
+      - ./aperturedb/db:/aperturedb/db
+      - ./aperturedb/logs:/aperturedb/logs
+    restart: always
+    environment:
+      ADB_KVGD_DB_SIZE: "204800"
+      ADB_LOG_PATH: "logs"
+      ADB_ENABLE_DEBUG: 1
+      ADB_MASTER_KEY: "admin"
+      ADB_PORT: 55553
+      ADB_FORCE_SSL: false
+
+  webui:
+    image: aperturedata/aperturedata-platform-web-private:latest
+    restart: always
+
+  nginx:
+    depends_on:
+      ca:
+        condition: service_completed_successfully
+    image: nginx
+    restart: always
+    ports:
+      - 8081:80
+      - 8443:443
+    configs:
+      - source: nginx.conf
+        target: /etc/nginx/conf.d/default.conf
+    volumes:
+      - ./aperturedb/certificate:/etc/nginx/certificate
+
+  caption-image:
+    build:
+      context: ../../apps/caption-image
+    volumes:
+      - ../../:/workflows
+    environment:
+      WF_LOGS_AWS_CREDENTIALS: "aws-credentials"
+      DB_HOST: lenz
+      DB_PORT: 55551
+      PORT: 8080
+      PROMETHEUS_PORT: 8001
+    command: bash -c "while true; do sleep 1000; done"
+    depends_on:
+      aperturedb:
+        condition: service_started
+
+configs:
+  nginx.conf:
+    content: |
+      server {
+        listen 80;
+        listen 443 ssl;
+        client_max_body_size 256m;
+        ssl_certificate /etc/nginx/certificate/tls.crt;
+        ssl_certificate_key /etc/nginx/certificate/tls.key;
+        location / {
+          proxy_pass http://webui;
+        }
+        location /api/ {
+          proxy_pass http://lenz:8080;
+        }
+      }
+
diff --git a/.devcontainer/crawl-website/devcontainer.json b/.devcontainer/crawl-website/devcontainer.json
new file mode 100644
index 00000000..a36eb044
--- /dev/null
+++ b/.devcontainer/crawl-website/devcontainer.json
@@ -0,0 +1,30 @@
+{
+    "name": "crawl-website",
+    "dockerComposeFile": [
+        "docker-compose.yml"
+    ],
+    "service": "crawl-website",
+    "workspaceFolder": "/workflows",
+    "customizations": {
+        "vscode": {
+            "extensions": [
+                "ms-python.python",
+                "ms-python.pylint",
+                "ms-python.black-formatter",
+                "ms-toolsai.jupyter"
+            ]
+        }
+    },
+    "settings": {
+        "python.defaultInterpreterPath": "/opt/venv/bin/python",
+        "python.formatting.provider": "black",
+        "python.linting.enabled": true,
+        "python.linting.pylintEnabled": true,
+        "files.exclude": {
+            "**/__pycache__": true,
+            "**/*.pyc": true
+        }
+    },
+    "initializeCommand": "./initcommand.sh",
+    "postCreateCommand": "./postinstall.sh"
+}
\ No newline at end of file
diff --git a/.devcontainer/crawl-website/docker-compose.yml b/.devcontainer/crawl-website/docker-compose.yml
new file mode 100644
index 00000000..87ae1bc8
--- /dev/null
+++ b/.devcontainer/crawl-website/docker-compose.yml
@@ -0,0 +1,99 @@
+name: aperturedb-local-linux
+
+services:
+  ca:
+    image: alpine/openssl
+    restart: on-failure
+    command: req -x509 -newkey rsa:4096 -days 3650 -nodes -keyout /cert/tls.key -out /cert/tls.crt -subj "/C=US/O=ApertureData Inc./CN=localhost"
+    volumes:
+      - ./aperturedb/certificate:/cert
+
+  lenz:
+    depends_on:
+      ca:
+        condition: service_completed_successfully
+      aperturedb:
+        condition: service_started
+    image: aperturedata/lenz:latest
+    ports:
+      - ${ADB_PORT}:55551
+    restart: always
+    environment:
+      LNZ_HEALTH_PORT: 58085
+      LNZ_TCP_PORT: 55551
+      LNZ_HTTP_PORT: 8080
+      LNZ_ADB_BACKENDS: '["aperturedb:55553"]'
+      LNZ_REPLICAS: 1
+      LNZ_ADB_MAX_CONCURRENCY: 48
+      LNZ_FORCE_SSL: false
+      LNZ_CERTIFICATE_PATH: /etc/lenz/certificate/tls.crt
+      LNZ_PRIVATE_KEY_PATH: /etc/lenz/certificate/tls.key
+    volumes:
+      - ./aperturedb/certificate:/etc/lenz/certificate
+
+  aperturedb:
+    image: aperturedata/aperturedb-community:latest
+    volumes:
+      - ./aperturedb/db:/aperturedb/db
+      - ./aperturedb/logs:/aperturedb/logs
+    restart: always
+    environment:
+      ADB_KVGD_DB_SIZE: "204800"
+      ADB_LOG_PATH: "logs"
+      ADB_ENABLE_DEBUG: 1
+      ADB_MASTER_KEY: "admin"
+      ADB_PORT: 55553
+      ADB_FORCE_SSL: false
+
+  webui:
+    image: aperturedata/aperturedata-platform-web-private:latest
+    restart: always
+
+  nginx:
+    depends_on:
+      ca:
+        condition: service_completed_successfully
+    image: nginx
+    restart: always
+    ports:
+      - 8081:80
+      - 8443:443
+    configs:
+      - source: nginx.conf
+        target: /etc/nginx/conf.d/default.conf
+    volumes:
+      - ./aperturedb/certificate:/etc/nginx/certificate
+
+  crawl-website:
+    build:
+      context: ../../apps/crawl-website
+    volumes:
+      - ../../:/workflows
+    environment:
+      WF_LOGS_AWS_CREDENTIALS: "aws-credentials"
+      DB_HOST: lenz
+      DB_PORT: 55551
+      PORT: 8080
+      PROMETHEUS_PORT: 8001
+    command: bash -c "while true; do sleep 1000; done"
+    depends_on:
+      aperturedb:
+        condition: service_started
+
+configs:
+  nginx.conf:
+    content: |
+      server {
+        listen 80;
+        listen 443 ssl;
+        client_max_body_size 256m;
+        ssl_certificate /etc/nginx/certificate/tls.crt;
+        ssl_certificate_key /etc/nginx/certificate/tls.key;
+        location / {
+          proxy_pass http://webui;
+        }
+        location /api/ {
+          proxy_pass http://lenz:8080;
+        }
+      }
+
diff --git a/.devcontainer/dataset-ingestion/devcontainer.json b/.devcontainer/dataset-ingestion/devcontainer.json
new file mode 100644
index 00000000..223ded36
--- /dev/null
+++ b/.devcontainer/dataset-ingestion/devcontainer.json
@@ -0,0 +1,30 @@
+{
+    "name": "dataset-ingestion",
+    "dockerComposeFile": [
+        "docker-compose.yml"
+    ],
+    "service": "dataset-ingestion",
+    "workspaceFolder": "/workflows",
+    "customizations": {
+        "vscode": {
+            "extensions": [
+                "ms-python.python",
+                "ms-python.pylint",
+                "ms-python.black-formatter",
+                "ms-toolsai.jupyter"
+            ]
+        }
+    },
+    "settings": {
+        "python.defaultInterpreterPath": "/opt/venv/bin/python",
+        "python.formatting.provider": "black",
+        "python.linting.enabled": true,
+        "python.linting.pylintEnabled": true,
+        "files.exclude": {
+            "**/__pycache__": true,
+            "**/*.pyc": true
+        }
+    },
+    "initializeCommand": "./initcommand.sh",
+    "postCreateCommand": "./postinstall.sh"
+}
\ No newline at end of file
diff --git a/.devcontainer/dataset-ingestion/docker-compose.yml b/.devcontainer/dataset-ingestion/docker-compose.yml
new file mode 100644
index 00000000..f050eafa
--- /dev/null
+++ b/.devcontainer/dataset-ingestion/docker-compose.yml
@@ -0,0 +1,99 @@
+name: aperturedb-local-linux
+
+services:
+  ca:
+    image: alpine/openssl
+    restart: on-failure
+    command: req -x509 -newkey rsa:4096 -days 3650 -nodes -keyout /cert/tls.key -out /cert/tls.crt -subj "/C=US/O=ApertureData Inc./CN=localhost"
+    volumes:
+      - ./aperturedb/certificate:/cert
+
+  lenz:
+    depends_on:
+      ca:
+        condition: service_completed_successfully
+      aperturedb:
+        condition: service_started
+    image: aperturedata/lenz:latest
+    ports:
+      - ${ADB_PORT}:55551
+    restart: always
+    environment:
+      LNZ_HEALTH_PORT: 58085
+      LNZ_TCP_PORT: 55551
+      LNZ_HTTP_PORT: 8080
+      LNZ_ADB_BACKENDS: '["aperturedb:55553"]'
+      LNZ_REPLICAS: 1
+      LNZ_ADB_MAX_CONCURRENCY: 48
+      LNZ_FORCE_SSL: false
+      LNZ_CERTIFICATE_PATH: /etc/lenz/certificate/tls.crt
+      LNZ_PRIVATE_KEY_PATH: /etc/lenz/certificate/tls.key
+    volumes:
+      - ./aperturedb/certificate:/etc/lenz/certificate
+
+  aperturedb:
+    image: aperturedata/aperturedb-community:latest
+    volumes:
+      - ./aperturedb/db:/aperturedb/db
+      - ./aperturedb/logs:/aperturedb/logs
+    restart: always
+    environment:
+      ADB_KVGD_DB_SIZE: "204800"
+      ADB_LOG_PATH: "logs"
+      ADB_ENABLE_DEBUG: 1
+      ADB_MASTER_KEY: "admin"
+      ADB_PORT: 55553
+      ADB_FORCE_SSL: false
+
+  webui:
+    image: aperturedata/aperturedata-platform-web-private:latest
+    restart: always
+
+  nginx:
+    depends_on:
+      ca:
+        condition: service_completed_successfully
+    image: nginx
+    restart: always
+    ports:
+      - 8081:80
+      - 8443:443
+    configs:
+      - source: nginx.conf
+        target: /etc/nginx/conf.d/default.conf
+    volumes:
+      - ./aperturedb/certificate:/etc/nginx/certificate
+
+  dataset-ingestion:
+    build:
+      context: ../../apps/dataset-ingestion
+    volumes:
+      - ../../:/workflows
+    environment:
+      WF_DATA_SOURCE_GCP_BUCKET: "ad-demos-datasets"
+      WF_LOGS_AWS_CREDENTIALS: "aws-credentials"
+      DB_HOST: lenz
+      DB_PORT: 55551
+      PORT: 8080
+      PROMETHEUS_PORT: 8001
+    command: bash -c "while true; do sleep 1000; done"
+    depends_on:
+      aperturedb:
+        condition: service_started
+
+configs:
+  nginx.conf:
+    content: |
+      server {
+        listen 80;
+        listen 443 ssl;
+        client_max_body_size 256m;
+        ssl_certificate /etc/nginx/certificate/tls.crt;
+        ssl_certificate_key /etc/nginx/certificate/tls.key;
+        location / {
+          proxy_pass http://webui;
+        }
+        location /api/ {
+          proxy_pass http://lenz:8080;
+        }
+      }
diff --git a/.gitignore b/.gitignore
index d73f198c..dfefecac 100644
--- a/.gitignore
+++ b/.gitignore
@@ -173,3 +173,4 @@ cython_debug/
 apps/dataset-ingestion/input
 log.txt
 input/
+aperturedb/
\ No newline at end of file
diff --git a/.vscode/launch.json b/.vscode/launch.json
new file mode 100644
index 00000000..ff8c469c
--- /dev/null
+++ b/.vscode/launch.json
@@ -0,0 +1,16 @@
+{
+    // Use IntelliSense to learn about possible attributes.
+    // Hover to view descriptions of existing attributes.
+    // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
+    "version": "0.2.0",
+    "configurations": [
+
+        {
+            "name": "Python Debugger: Current File with Arguments",
+            "type": "debugpy",
+            "request": "launch",
+            "program": "${file}",
+            "console": "integratedTerminal"
+        }
+    ]
+}
\ No newline at end of file
diff --git a/apps/caption-image/README.md b/apps/caption-image/README.md
index 4f202533..c13166a9 100644
--- a/apps/caption-image/README.md
+++ b/apps/caption-image/README.md
@@ -1,6 +1,66 @@
-# Example App
+# Caption Image App
 
 This workflow retrieves all images from ApertureDB that have not been
 analyzed before, and runs them through a
 [BLIP (Bootstrapping Language-Image Pre-training)](https://github.com/salesforce/BLIP)
 model to generate a caption for each image.
+
+The workflow will run once and process all uncaptioned images.
+
+## Database details
+
+```mermaid
+sequenceDiagram
+    participant W as Caption Image
+    participant A as ApertureDB instance
+
+    W->>A: FindImage
+    A-->>W: count
+    loop Until done
+        W->>A: FindImage
+        A-->>W: images
+        W->>A: UpdateImage
+    end
+```
+
+Each image is updated with a caption property (`wf_caption_image`) containing the generated caption text. The BLIP model processes each image to generate descriptive text that describes the visual content of the image.
+
+## Running in Docker
+
+```
+docker run \
+           -e RUN_NAME=my_testing_run \
+           -e DB_HOST=workflowstesting.gcp.cloud.aperturedata.dev \
+           -e DB_PASS="password" \
+           -e NUM_WORKERS=4 \
+           -e BATCH_SIZE=32 \
+           -e LOG_LEVEL=INFO \
+           aperturedata/workflows-caption-image
+```
+
+Parameters:
+* **`NUM_WORKERS`**: Specifies the number of worker threads that will be running simultaneously,
+retrieving and processing images in parallel. Default is `1`.
+* **`BATCH_SIZE`**: Specifies the batch size for processing images. Default is `1`.
+* **`LOG_LEVEL`**: Set log level for workflow code. Available options: DEBUG, INFO, WARNING, ERROR. Default is `INFO`.
+
+See [Common Parameters](../../README.md#common-parameters) for common parameters.
+
+## Cleaning up
+
+To clean all captions generated by this workflow, simply run the following query:
+
+```
+q = [
+        {
+            "UpdateImage": {
+                "constraints": {
+                    "wf_caption_image": ["!=", null]
+                },
+                "remove_props": ["wf_caption_image"]
+            }
+        }
+    ]
+```
+
+or manually remove the `wf_caption_image` property from images that have been processed.
\ No newline at end of file
diff --git a/apps/caption-image/app/weights.py b/apps/caption-image/app/weights.py
index 6f701877..61f570ef 100644
--- a/apps/caption-image/app/weights.py
+++ b/apps/caption-image/app/weights.py
@@ -14,3 +14,5 @@
 output = model.generate(**inputs)
 caption = processor.decode(output[0], skip_special_tokens=True)
 print(caption)
+
+assert "cat" in caption.lower(), f"{caption} does not contain 'cat'"
diff --git a/base/docker/scripts/sitecustomize.py b/base/docker/scripts/sitecustomize.py
index d2558a11..26aa47eb 100644
--- a/base/docker/scripts/sitecustomize.py
+++ b/base/docker/scripts/sitecustomize.py
@@ -1,17 +1,21 @@
+"""Site customization module for setting up global exception handling."""
 import sys
-from status_tools import StatusUpdater, WorkFlowError
 import logging
 
+from status_tools import StatusUpdater, WorkFlowError
+
+
 old_handler = sys.excepthook
 
 logging.info("Setting up exception handler")
 updater = StatusUpdater()
 
-def exception_handler(type, value, tb):
+def exception_handler(etype, value, tb):
+    """Handle uncaught exceptions by posting status updates."""
     updater.post_update(
-        error_message=f"Exception: {type.__name__} {value}",
+        error_message=f"Exception: {etype.__name__} {value}",
         error_code=WorkFlowError.WORKFLOW_ERROR
     )
-    old_handler(type, value, tb)
+    old_handler(etype, value, tb)
 
 sys.excepthook = exception_handler
diff --git a/configuration_params.py b/configuration_params.py
new file mode 100644
index 00000000..70a1b5bb
--- /dev/null
+++ b/configuration_params.py
@@ -0,0 +1,11 @@
+import platform
+
+
+def is_mac():
+    return platform.system() == "Darwin"
+
+def main():
+    print(f"ADB_PORT={55557 if is_mac() else 55555}")
+
+if __name__ == "__main__":
+    main()
\ No newline at end of file
diff --git a/initcommand.sh b/initcommand.sh
new file mode 100755
index 00000000..7ec1075d
--- /dev/null
+++ b/initcommand.sh
@@ -0,0 +1,4 @@
+#!/bin/bash
+
+docker build --build-arg WORKFLOW_VERSION=\"latest\" -t aperturedata/workflows-base base/docker
+python3 configuration_params.py > .devcontainer/caption-image/.env
\ No newline at end of file
diff --git a/pipeline.py b/pipeline.py
new file mode 100644
index 00000000..f1a01b79
--- /dev/null
+++ b/pipeline.py
@@ -0,0 +1,20 @@
+from prefect import flow, task
+import httpx
+
+
+@task(log_prints=True)
+def get_stars(repo: str):
+    url = f"https://api.github.com/repos/{repo}"
+    count = httpx.get(url).json()["stargazers_count"]
+    print(f"{repo} has {count} stars!")
+
+
+@flow(name="GitHub Stars")
+def github_stars(repos: list[str]):
+    for repo in repos:
+        get_stars(repo)
+
+
+# run the flow!
+if __name__=="__main__":
+    github_stars(["PrefectHQ/Prefect"])
\ No newline at end of file
diff --git a/postinstall.sh b/postinstall.sh
new file mode 100755
index 00000000..10196c24
--- /dev/null
+++ b/postinstall.sh
@@ -0,0 +1,4 @@
+#!/bin/bash
+
+/opt/venv/bin/adb config create default --host=${DB_HOST} --port=${DB_PORT} --no-interactive
+/opt/venv/bin/adb --install-completion
\ No newline at end of file
diff --git a/workflows-devcontiner.code-workspace b/workflows-devcontiner.code-workspace
new file mode 100644
index 00000000..6ff8bea8
--- /dev/null
+++ b/workflows-devcontiner.code-workspace
@@ -0,0 +1,11 @@
+{
+	"folders": [
+		{
+			"path": "."
+		},
+		{
+			"path": "../app"
+		}
+	],
+	"settings": {}
+}
\ No newline at end of file

From e3b59949c7db6ae41c978c8040d28bd5faf64e96 Mon Sep 17 00:00:00 2001
From: Gautam Saluja <gautam@aperturedata.io>
Date: Thu, 18 Sep 2025 15:58:08 -0400
Subject: [PATCH 3/4] stray file.

---
 pipeline.py | 20 --------------------
 1 file changed, 20 deletions(-)
 delete mode 100644 pipeline.py

diff --git a/pipeline.py b/pipeline.py
deleted file mode 100644
index f1a01b79..00000000
--- a/pipeline.py
+++ /dev/null
@@ -1,20 +0,0 @@
-from prefect import flow, task
-import httpx
-
-
-@task(log_prints=True)
-def get_stars(repo: str):
-    url = f"https://api.github.com/repos/{repo}"
-    count = httpx.get(url).json()["stargazers_count"]
-    print(f"{repo} has {count} stars!")
-
-
-@flow(name="GitHub Stars")
-def github_stars(repos: list[str]):
-    for repo in repos:
-        get_stars(repo)
-
-
-# run the flow!
-if __name__=="__main__":
-    github_stars(["PrefectHQ/Prefect"])
\ No newline at end of file

From 4a3651fe06538722d06f612ca90f3b43bafa77d7 Mon Sep 17 00:00:00 2001
From: Gautam Saluja <gautam@aperturedata.io>
Date: Fri, 19 Sep 2025 09:29:08 -0400
Subject: [PATCH 4/4] Some review feedback

---
 .devcontainer/caption-image/devcontainer.json          |  1 -
 .devcontainer/crawl-website/devcontainer.json          |  1 -
 .devcontainer/dataset-ingestion/devcontainer.json      |  1 -
 apps/caption-image/Dockerfile                          |  4 ++--
 apps/caption-image/README.md                           |  2 +-
 apps/caption-image/app/caption_images.py               |  6 +++---
 apps/caption-image/app/images.py                       | 10 +++++-----
 .../app/{weights.py => warmup_validate.py}             |  3 +++
 ...-workspace => workflows-devcontainer.code-workspace |  0
 9 files changed, 14 insertions(+), 14 deletions(-)
 rename apps/caption-image/app/{weights.py => warmup_validate.py} (85%)
 rename workflows-devcontiner.code-workspace => workflows-devcontainer.code-workspace (100%)

diff --git a/.devcontainer/caption-image/devcontainer.json b/.devcontainer/caption-image/devcontainer.json
index d515086f..053110e4 100644
--- a/.devcontainer/caption-image/devcontainer.json
+++ b/.devcontainer/caption-image/devcontainer.json
@@ -10,7 +10,6 @@
             "extensions": [
                 "ms-python.python",
                 "ms-python.pylint",
-                "ms-python.black-formatter",
                 "ms-toolsai.jupyter"
             ]
         }
diff --git a/.devcontainer/crawl-website/devcontainer.json b/.devcontainer/crawl-website/devcontainer.json
index a36eb044..8ac82ddd 100644
--- a/.devcontainer/crawl-website/devcontainer.json
+++ b/.devcontainer/crawl-website/devcontainer.json
@@ -10,7 +10,6 @@
             "extensions": [
                 "ms-python.python",
                 "ms-python.pylint",
-                "ms-python.black-formatter",
                 "ms-toolsai.jupyter"
             ]
         }
diff --git a/.devcontainer/dataset-ingestion/devcontainer.json b/.devcontainer/dataset-ingestion/devcontainer.json
index 223ded36..bff9c435 100644
--- a/.devcontainer/dataset-ingestion/devcontainer.json
+++ b/.devcontainer/dataset-ingestion/devcontainer.json
@@ -10,7 +10,6 @@
             "extensions": [
                 "ms-python.python",
                 "ms-python.pylint",
-                "ms-python.black-formatter",
                 "ms-toolsai.jupyter"
             ]
         }
diff --git a/apps/caption-image/Dockerfile b/apps/caption-image/Dockerfile
index 42e107eb..3c079c5b 100644
--- a/apps/caption-image/Dockerfile
+++ b/apps/caption-image/Dockerfile
@@ -9,7 +9,7 @@ RUN pip install -U pip
 RUN pip install torch torchvision --index-url https://download.pytorch.org/whl/cpu
 RUN pip install --no-cache-dir -r /requirements.txt
 
-COPY app/weights.py /app/weights.py
-RUN python /app/weights.py
+COPY app/warmup_validate.py /app/warmup_validate.py
+RUN python /app/warmup_validate.py
 
 COPY app /app/
diff --git a/apps/caption-image/README.md b/apps/caption-image/README.md
index c13166a9..4fd3707c 100644
--- a/apps/caption-image/README.md
+++ b/apps/caption-image/README.md
@@ -42,7 +42,7 @@ Parameters:
 * **`NUM_WORKERS`**: Specifies the number of worker threads that will be running simultaneously,
 retrieving and processing images in parallel. Default is `1`.
 * **`BATCH_SIZE`**: Specifies the batch size for processing images. Default is `1`.
-* **`LOG_LEVEL`**: Set log level for workflow code. Available options: DEBUG, INFO, WARNING, ERROR. Default is `INFO`.
+* **`LOG_LEVEL`**: Set log level for workflow code. Available options: DEBUG, INFO, WARNING, ERROR. Default is `WARNING`.
 
 See [Common Parameters](../../README.md#common-parameters) for common parameters.
 
diff --git a/apps/caption-image/app/caption_images.py b/apps/caption-image/app/caption_images.py
index 9200d2e5..b9bb8e00 100644
--- a/apps/caption-image/app/caption_images.py
+++ b/apps/caption-image/app/caption_images.py
@@ -7,19 +7,19 @@
 from connection_pool import ConnectionPool
 
 app = Typer()
-DONE_PROPERTY = 'wf_caption_image'
+CAPTION_IMAGE_PROPERTY = 'wf_caption_image'
 
 @app.command()
 def caption_images(
     num_workers:int = 1,
     batch_size:int = 1,
-    log_level:str = "INFO"
+    log_level:str = "WARNING"
 ):
     logging.basicConfig(level=logging.getLevelName(log_level))
     pool = ConnectionPool()
     data = FindImageQueryGenerator(
         pool,
-        done_property=DONE_PROPERTY)
+        caption_image_property=CAPTION_IMAGE_PROPERTY)
 
     print("Running Caption Image...")
     with pool.get_connection() as db:
diff --git a/apps/caption-image/app/images.py b/apps/caption-image/app/images.py
index c28bec78..9b717d0f 100644
--- a/apps/caption-image/app/images.py
+++ b/apps/caption-image/app/images.py
@@ -22,15 +22,15 @@ class FindImageQueryGenerator(QueryGenerator.QueryGenerator):
         Generates n FindImage Queries
     """
 
-    def __init__(self, pool, done_property: str):
+    def __init__(self, pool, caption_image_property: str):
 
         self.pool = pool
-        self.done_property = done_property
+        self.caption_image_property = caption_image_property
 
         query = [{
             "FindImage": {
                 "constraints": {
-                    self.done_property: ["==", None]
+                    self.caption_image_property: ["==", None]
                 },
                 "results": {
                     "count": True
@@ -68,7 +68,7 @@ def getitem(self, idx):
             "FindImage": {
                 "blobs": True,
                 "constraints": {
-                    self.done_property: ["==", None]
+                    self.caption_image_property: ["==", None]
                 },
                 "batch": {
                     "batch_size": self.batch_size,
@@ -118,7 +118,7 @@ def response_handler(self, query, blobs, response, r_blobs):
                 "UpdateImage": {
                     "ref": i + 1,
                     "properties": {
-                        self.done_property: captions[i]
+                        self.caption_image_property: captions[i]
                     },
                 }
             })
diff --git a/apps/caption-image/app/weights.py b/apps/caption-image/app/warmup_validate.py
similarity index 85%
rename from apps/caption-image/app/weights.py
rename to apps/caption-image/app/warmup_validate.py
index 61f570ef..7e496d23 100644
--- a/apps/caption-image/app/weights.py
+++ b/apps/caption-image/app/warmup_validate.py
@@ -2,6 +2,9 @@
 import requests
 from transformers import AutoProcessor, BlipForConditionalGeneration
 
+
+# This serves as a warmup for the model to load into memory
+# It also validates that the model is working correctly
 processor = AutoProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
 model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")
 
diff --git a/workflows-devcontiner.code-workspace b/workflows-devcontainer.code-workspace
similarity index 100%
rename from workflows-devcontiner.code-workspace
rename to workflows-devcontainer.code-workspace