diff --git a/.gitignore b/.gitignore
index a781c98..b0bf348 100644
--- a/.gitignore
+++ b/.gitignore
@@ -327,3 +327,9 @@ dist
 
 # Ignore master key for decrypting credentials and more.
 /config/master.key
+
+pyrightconfig.json
+
+
+meta-llama-8b-instruct-q4_K_M.gguf
+llama-model.gguf
diff --git a/llama-demo/Dockerfile.gcp b/llama-demo/Dockerfile.gcp
new file mode 100644
index 0000000..944afbc
--- /dev/null
+++ b/llama-demo/Dockerfile.gcp
@@ -0,0 +1,27 @@
+# Image for the FastAPI chat frontend (main.py).
+FROM python:3.11-slim
+
+WORKDIR /code
+
+# C/C++ toolchain; drop the apt package lists afterwards to keep the layer small.
+RUN apt-get update \
+    && apt-get install -y --no-install-recommends g++ gcc \
+    && rm -rf /var/lib/apt/lists/*
+
+# Install dependencies before copying the sources so that this layer stays
+# cached when only application code changes.
+COPY ./requirements.txt /code/requirements.txt
+RUN pip install --no-cache-dir uv
+RUN uv pip install --system -r requirements.txt
+
+COPY ./main.py /code/main.py
+COPY ./infra.py /code/infra.py
+COPY ./llama_demo /code/llama_demo
+COPY ./templates /code/templates
+
+ENV PORT=80
+EXPOSE $PORT
+
+# Shell form so that $PORT is expanded; `exec` makes the server PID 1 so it
+# receives the container's stop signal directly.
+CMD exec fastapi run main.py --host 0.0.0.0 --port $PORT
diff --git a/llama-demo/infra.py b/llama-demo/infra.py
new file mode 100644
index 0000000..dbd8b7c
--- /dev/null
+++ b/llama-demo/infra.py
@@ -0,0 +1,50 @@
+"""infra.py
+
+This file is used to customize the infrastructure your application deploys to.
+
+Create your cloud infrastructure with:
+    lf create
+
+Deploy your application with:
+    lf deploy
+
+"""
+
+import launchflow as lf
+
+if lf.environment == "lf-llama-gcp":  # one set of resources per LaunchFlow environment name
+    llama_service = lf.gcp.ComputeEngineService(
+        "launchflow-llama-service",
+        dockerfile="Dockerfile.gcp",  # Path to your Dockerfile
+        machine_type="e2-standard-4",
+        build_directory="llama_server",
+        disk_size_gb=50,
+    )
+    model_bucket = lf.gcp.GCSBucket("launchflow-llama-demo")  # same name as the gs:// URL in llama_server/download_and_run.sh
+elif lf.environment == "lf-llama-aws":
+    llama_service = lf.aws.ECSFargateService(
+        "launchflow-llama-service",
+        dockerfile="Dockerfile.aws",  # Path to your Dockerfile
+        build_directory="llama_server",
+        cpu=8192,  # 8 cpus. NOTE(review): the former "GPU support" rationale looks wrong for Fargate — confirm
+        memory=16384,  # 16 GB of memory. NOTE(review): presumably sized for CPU-only inference — confirm
+        # load_balancer=lf.aws.alb.InternalHTTP(),
+    )
+    serving_service = lf.aws.LambdaService(
+        "launchflow-llama-serving-demo",
+        handler="main.handler",  # the Mangum adapter defined at the bottom of main.py
+        build_ignore=[
+            "llama_server",
+            "Dockerfile.*",
+            "requirements*",
+            "launchflow.yaml",
+        ],
+        runtime=lf.aws.lambda_service.PythonRuntime(
+            requirements_txt_path="requirements-aws.txt"
+        ),
+        timeout_seconds=900,  # 15 minutes
+        env={"LLAMA_SERVER_ADDRESS": lf.Depends(llama_service).service_url},  # type: ignore
+    )
+    model_bucket = lf.aws.S3Bucket("launchflow-llama-demo")  # same name as the s3:// URL in llama_server/download_and_run.sh
+else:
+    raise ValueError(f"Unknown environment: {lf.environment}")  # any other environment name is rejected at import time
diff --git a/llama-demo/launchflow.yaml b/llama-demo/launchflow.yaml
new file mode 100644
index 0000000..ff44589
--- /dev/null
+++ b/llama-demo/launchflow.yaml
@@ -0,0 +1,2 @@
+project: llama-demo
+backend: lf://default
diff --git a/llama-demo/llama_demo/chat_router.py b/llama-demo/llama_demo/chat_router.py
new file mode 100644
index 0000000..7a524ec
--- /dev/null
+++ b/llama-demo/llama_demo/chat_router.py
@@ -0,0 +1,75 @@
+from typing import Annotated
+from fastapi import APIRouter
+from fastapi.params import Depends
+
+from fastapi.responses import JSONResponse
+from pydantic import BaseModel
+from starlette.concurrency import run_in_threadpool
+from starlette.responses import StreamingResponse
+
+from llama_demo.settings import settings
+from llama_demo.schemas import Chat
+from llama_demo.llama_client import LlamaClient
+
+router = APIRouter(prefix="/v1", tags=["v1"])
+
+
+class ChatResponse(BaseModel):
+    content: str
+
+
+def _trim_to_window(messages: list[dict], window: int) -> list[dict]:
+    """Return the newest messages whose combined content fits in ``window`` characters.
+
+    Messages are considered newest-first. The first message that does not fit
+    completely is cut down to its trailing characters so that the total stays
+    within ``window``; everything older is dropped. The input list and its
+    dicts are not modified, and the result is in chronological order.
+    """
+    kept = []
+    remaining = window
+    for msg in reversed(messages):
+        content = msg["content"]
+        if len(content) <= remaining:
+            kept.append(msg)
+            remaining -= len(content)
+            continue
+        if remaining > 0:
+            # Keep only the tail. Guarded because ``content[-0:]`` would be
+            # the whole string rather than an empty one.
+            kept.append({**msg, "content": content[-remaining:]})
+        break
+    kept.reverse()
+    return kept
+
+
+@router.post("/chat", response_model=None)
+async def chat(
+    chat: Chat, model: Annotated[LlamaClient, Depends(LlamaClient)]
+) -> StreamingResponse | JSONResponse:
+    """Send the conversation in ``chat.context`` to the model and return its reply.
+
+    The history is first trimmed to ``settings.context_window`` characters.
+    ``chat.message`` is not read: the bundled page (templates/index.html)
+    already appends the new user message to ``context`` before posting.
+
+    Returns a ``StreamingResponse`` of text chunks when ``settings.streaming``
+    is set, otherwise a ``JSONResponse`` of the form ``{"content": <reply>}``.
+    """
+    messages = [
+        {"role": message.role, "content": message.content}
+        for message in chat.context
+    ]
+    messages = _trim_to_window(messages, settings.context_window)
+
+    # ``model.chat`` is a blocking generator; no request is made until it is
+    # iterated.
+    completion = model.chat(messages=messages)
+
+    if settings.streaming:
+        # Starlette iterates synchronous iterators in a worker thread.
+        return StreamingResponse(completion)
+
+    # Drain the blocking generator in a worker thread instead of on the event loop.
+    full_content = await run_in_threadpool("".join, completion)
+    return JSONResponse(content={"content": full_content})
diff --git a/llama-demo/llama_demo/llama_client.py b/llama-demo/llama_demo/llama_client.py
new file mode 100644
index 0000000..4df9e22
--- /dev/null
+++ b/llama-demo/llama_demo/llama_client.py
@@ -0,0 +1,46 @@
+import httpx
+from openai.types.chat import ChatCompletionMessageParam
+from llama_demo.settings import settings
+import openai
+from typing import Dict, Any, Iterable, Iterator
+
+
+async def httpx_client():
+    """Yield an ``httpx.AsyncClient`` that is closed when the generator finishes."""
+    async with httpx.AsyncClient() as client:
+        yield client
+
+
+class LlamaClient:
+    """Streaming chat client for the server at ``settings.llama_server_address``."""
+
+    def __init__(self):
+        # Placeholder key. NOTE(review): presumably the llama server does not
+        # validate it — confirm.
+        self.client = openai.Client(
+            base_url=settings.llama_server_address, api_key="no-api-key"
+        )
+
+    def chat(self, messages: Iterable[ChatCompletionMessageParam]) -> Iterator[str]:
+        """Stream the assistant reply to ``messages`` as text fragments.
+
+        This is a blocking generator: the request is only sent once iteration
+        starts. Chunks that carry no choices or no text content are skipped.
+        """
+        result = self.client.chat.completions.create(
+            messages=messages,
+            # NOTE(review): presumably ignored by the llama server, which is
+            # started with a single model file — confirm.
+            model="gpt-3.5-turbo",
+            stream=True,
+            max_tokens=settings.context_window // 2,
+            timeout=600,
+        )
+        for chunk in result:
+            # A streamed chunk may have an empty ``choices`` list; indexing it
+            # would raise IndexError.
+            if not chunk.choices:
+                continue
+            content = chunk.choices[0].delta.content
+            if content is not None:
+                yield content
diff --git a/llama-demo/llama_demo/schemas.py b/llama-demo/llama_demo/schemas.py
new file mode 100644
index 0000000..94c6e08
--- /dev/null
+++ b/llama-demo/llama_demo/schemas.py
@@ -0,0 +1,13 @@
+from typing import Literal
+
+from pydantic import BaseModel
+
+
+class ChatMessage(BaseModel):  # one turn of the conversation
+    content: str  # text of the turn
+    role: Literal["system", "user", "assistant"]  # speaker of the turn; restricted to these three values
+
+
+class Chat(BaseModel):  # request body of POST /v1/chat
+    message: str  # newest user input; the /v1/chat handler in chat_router.py does not read it
+    context: list[ChatMessage]  # conversation history that the handler forwards to the model
diff --git a/llama-demo/llama_demo/settings.py b/llama-demo/llama_demo/settings.py
new file mode 100644
index 0000000..cf187c2
--- /dev/null
+++ b/llama-demo/llama_demo/settings.py
@@ -0,0 +1,17 @@
+import os
+
+from dotenv import load_dotenv
+
+load_dotenv()
+
+
+class Settings:  # attributes are computed once, when this module is first imported
+    context_window: int = int(os.environ.get("CONTEXT_WINDOW", 5000))  # character budget for chat history; halved for max_tokens in llama_client.py
+    llama_server_address: str = os.environ.get(
+        "LLAMA_SERVER_ADDRESS",
+        "http://launchflow-llama-service-l-79dfc-799820891.us-east-1.elb.amazonaws.com",  # NOTE(review): deployment-specific fallback — confirm it should be hard-coded
+    )
+    streaming: bool = os.environ.get("LAUNCHFLOW_ENVIRONMENT") == "lf-llama-gcp"  # True only in the "lf-llama-gcp" environment
+
+
+settings = Settings()
diff --git a/llama-demo/llama_server/Dockerfile.aws b/llama-demo/llama_server/Dockerfile.aws
new file mode 100644
index 0000000..7022a1d
--- /dev/null
+++ b/llama-demo/llama_server/Dockerfile.aws
@@ -0,0 +1,28 @@
+# Start from the stock Ubuntu 22.04 image on public.ecr.aws
+FROM public.ecr.aws/docker/library/ubuntu:22.04
+
+# Install the build toolchain and the AWS CLI (download_and_run.sh calls `aws s3 cp`)
+ARG DEBIAN_FRONTEND=noninteractive
+ENV TZ=Etc/UTC
+RUN apt-get update && apt-get install -y \
+    git build-essential cmake awscli wget unzip \
+    && rm -rf /var/lib/apt/lists/*
+
+# Clone the llama.cpp repository (no tag or commit is pinned)
+RUN git clone https://github.com/ggerganov/llama.cpp.git /app/llama.cpp
+WORKDIR /app/llama.cpp
+
+# Build only the llama-server target
+RUN make llama-server
+
+# download_and_run.sh starts llama-server on port 80
+EXPOSE 80
+
+# Copy the entrypoint script, which downloads the model at container start
+COPY download_and_run.sh /app/download_and_run.sh
+RUN chmod +x /app/download_and_run.sh
+
+WORKDIR /app
+
+# Start the server through the entrypoint script
+ENTRYPOINT ["/app/download_and_run.sh"]
diff --git a/llama-demo/llama_server/Dockerfile.gcp b/llama-demo/llama_server/Dockerfile.gcp
new file mode 100644
index 0000000..5cc8ae5
--- /dev/null
+++ b/llama-demo/llama_server/Dockerfile.gcp
@@ -0,0 +1,32 @@
+# Start from the stock Ubuntu 22.04 image
+FROM ubuntu:22.04
+
+# Install the build toolchain, plus curl and Python for the Cloud SDK installer below
+RUN apt-get update && apt-get install -y \
+    git build-essential cmake wget unzip curl python3 python3-distutils \
+    && rm -rf /var/lib/apt/lists/*
+
+# Install Google Cloud SDK (download_and_run.sh calls `gsutil cp`)
+RUN curl -sSL https://sdk.cloud.google.com | bash && \
+    /root/google-cloud-sdk/install.sh
+
+ENV PATH=$PATH:/root/google-cloud-sdk/bin
+
+# Clone the llama.cpp repository (no tag or commit is pinned)
+RUN git clone https://github.com/ggerganov/llama.cpp.git /app/llama.cpp
+WORKDIR /app/llama.cpp
+
+# Build only the llama-server target
+RUN make llama-server
+
+# download_and_run.sh starts llama-server on port 80
+EXPOSE 80
+
+# Copy the entrypoint script, which downloads the model at container start
+COPY download_and_run.sh /app/download_and_run.sh
+RUN chmod +x /app/download_and_run.sh
+
+WORKDIR /app
+
+# Start the server through the entrypoint script
+ENTRYPOINT ["/app/download_and_run.sh"]
diff --git a/llama-demo/llama_server/download_and_run.sh b/llama-demo/llama_server/download_and_run.sh
new file mode 100644
index 0000000..34efc01
--- /dev/null
+++ b/llama-demo/llama_server/download_and_run.sh
@@ -0,0 +1,37 @@
+#!/bin/bash
+#
+# Download the model for the active LaunchFlow environment, then start
+# llama-server on port 80.
+
+# Abort on the first failed command so that a failed download does not go on
+# to start the server without a model.
+set -euo pipefail
+
+# Local destination and remote object name of the model file
+LOCAL_MODAL_PATH="/models/llama.gguf"
+REMOTE_MODEL_FILE_NAME="meta-llama-8b-instruct-q4_K_M.gguf"
+
+mkdir -p "$(dirname "${LOCAL_MODAL_PATH}")"
+
+if [ "${LAUNCHFLOW_ENVIRONMENT:-}" = "lf-llama-aws" ]; then
+    bucket_url="s3://launchflow-llama-demo"
+    echo "Downloading model from ${bucket_url}/${REMOTE_MODEL_FILE_NAME}..."
+    aws s3 cp "${bucket_url}/${REMOTE_MODEL_FILE_NAME}" "${LOCAL_MODAL_PATH}"
+elif [ "${LAUNCHFLOW_ENVIRONMENT:-}" = "lf-llama-gcp" ]; then
+    bucket_url="gs://launchflow-llama-demo"
+    echo "Downloading model from ${bucket_url}/${REMOTE_MODEL_FILE_NAME}..."
+    gsutil cp "${bucket_url}/${REMOTE_MODEL_FILE_NAME}" "${LOCAL_MODAL_PATH}"
+else
+    # No download for other environments; a model may already be present locally.
+    echo "LAUNCHFLOW_ENVIRONMENT='${LAUNCHFLOW_ENVIRONMENT:-}' is not recognised; skipping download." >&2
+fi
+
+if [ ! -f "${LOCAL_MODAL_PATH}" ]; then
+    echo "Model file ${LOCAL_MODAL_PATH} not found; cannot start the server." >&2
+    exit 1
+fi
+
+# Start the server with the downloaded model. `exec` replaces the shell so
+# that llama-server receives the container's signals directly.
+cd llama.cpp
+exec ./llama-server --model "${LOCAL_MODAL_PATH}" --ctx-size 5000 --port 80 --host 0.0.0.0
diff --git a/llama-demo/main.py b/llama-demo/main.py
new file mode 100644
index 0000000..1cee5fd
--- /dev/null
+++ b/llama-demo/main.py
@@ -0,0 +1,40 @@
+from pathlib import Path
+
+from fastapi import FastAPI
+from fastapi.requests import Request
+from fastapi.templating import Jinja2Templates
+from llama_demo.chat_router import router
+from mangum import Mangum
+
+from llama_demo.settings import settings
+
+app = FastAPI()
+
+app.include_router(router)
+
+# Resolve the template directory relative to this file rather than the working
+# directory, so the app also starts when launched from another directory.
+templates = Jinja2Templates(directory=Path(__file__).resolve().parent / "templates")
+
+
+@app.get("/")
+def index(request: Request):
+    """Render the chat page; ``streaming`` selects which client code the template emits."""
+    return templates.TemplateResponse(
+        request=request,
+        name="index.html",
+        context={
+            "navigation": [
+                {
+                    "caption": "Llama Chat Demo",
+                    "href": "/",
+                    "active": True,
+                }
+            ],
+            "streaming": settings.streaming,
+        },
+    )
+
+
+# AWS Lambda entry point (referenced as "main.handler" in infra.py).
+handler = Mangum(app, lifespan="off")
diff --git a/llama-demo/requirements-aws.in b/llama-demo/requirements-aws.in
new file mode 100644
index 0000000..466a46c
--- /dev/null
+++ b/llama-demo/requirements-aws.in
@@ -0,0 +1,7 @@
+fastapi
+launchflow[aws]
+pydantic
+openai
+mangum
+durationpy==0.6
+python-dotenv
diff --git a/llama-demo/requirements-aws.txt b/llama-demo/requirements-aws.txt
new file mode 100644
index 0000000..5f7b0bb
--- /dev/null
+++ b/llama-demo/requirements-aws.txt
@@ -0,0 +1,181 @@
+# This file was autogenerated by uv v0.1.1 via the following command:
+#    uv pip compile requirements-aws.in -o requirements-aws.txt
+annotated-types==0.7.0
+    # via pydantic
+anyio==4.6.0
+    # via
+    #   httpx
+    #   openai
+    #   starlette
+backoff==2.2.1
+    # via posthog
+beaupy==3.9.2
+    # via launchflow
+boto3==1.35.27
+    # via launchflow
+botocore==1.35.27
+    # via
+    #   boto3
+    #   s3transfer
+cachetools==5.5.0
+    # via google-auth
+certifi==2024.8.30
+    # via
+    #   httpcore
+    #   httpx
+    #   kubernetes
+    #   requests
+charset-normalizer==3.3.2
+    # via requests
+click==8.1.7
+    # via typer
+deepdiff==8.0.1
+    # via launchflow
+distro==1.9.0
+    # via openai
+docker==7.1.0
+    # via launchflow
+durationpy==0.6
+emoji==2.13.2
+    # via beaupy
+fastapi==0.115.0
+google-auth==2.35.0
+    # via kubernetes
+h11==0.14.0
+    # via httpcore
+httpcore==1.0.5
+    # via httpx
+httpx==0.27.2
+    # via
+    #   launchflow
+    #   openai
+idna==3.10
+    # via
+    #   anyio
+    #   httpx
+    #   requests
+jinja2==3.1.4
+    # via launchflow
+jiter==0.5.0
+    # via openai
+jmespath==1.0.1
+    # via
+    #   boto3
+    #   botocore
+kubernetes==30.1.0
+    # via launchflow
+launchflow==0.4.12
+mangum==0.18.0
+markdown-it-py==3.0.0
+    # via rich
+markupsafe==2.1.5
+    # via jinja2
+mdurl==0.1.2
+    # via markdown-it-py
+monotonic==1.6
+    # via posthog
+oauthlib==3.2.2
+    # via
+    #   kubernetes
+    #   requests-oauthlib
+openai==1.48.0
+orderly-set==5.2.2
+    # via deepdiff
+pathspec==0.12.1
+    # via launchflow
+posthog==3.6.6
+    # via launchflow
+pyasn1==0.6.1
+    # via
+    #   pyasn1-modules
+    #   rsa
+pyasn1-modules==0.4.1
+    # via google-auth
+pydantic==2.9.2
+    # via
+    #   fastapi
+    #   launchflow
+    #   openai
+pydantic-core==2.23.4
+    # via pydantic
+pygments==2.18.0
+    # via rich
+pyjwt==2.9.0
+    # via launchflow
+python-dateutil==2.9.0.post0
+    # via
+    #   botocore
+    #   kubernetes
+    #   posthog
+python-dotenv==1.0.1
+python-yakh==0.3.2
+    # via
+    #   beaupy
+    #   questo
+pyyaml==6.0.2
+    # via
+    #   kubernetes
+    #   launchflow
+questo==0.3.0
+    # via beaupy
+requests==2.32.3
+    # via
+    #   docker
+    #   kubernetes
+    #   launchflow
+    #   posthog
+    #   requests-oauthlib
+requests-oauthlib==2.0.0
+    # via kubernetes
+rich==13.8.1
+    # via
+    #   beaupy
+    #   launchflow
+    #   questo
+    #   typer
+rsa==4.9
+    # via google-auth
+s3transfer==0.10.2
+    # via boto3
+setuptools==75.1.0
+    # via launchflow
+shellingham==1.5.4
+    # via typer
+six==1.16.0
+    # via
+    #   kubernetes
+    #   posthog
+    #   python-dateutil
+sniffio==1.3.1
+    # via
+    #   anyio
+    #   httpx
+    #   openai
+starlette==0.38.6
+    # via fastapi
+terminaltexteffects==0.11.0
+    # via launchflow
+toml==0.10.2
+    # via launchflow
+tqdm==4.66.5
+    # via openai
+typer==0.12.5
+    # via launchflow
+typing-extensions==4.12.2
+    # via
+    #   fastapi
+    #   mangum
+    #   openai
+    #   pydantic
+    #   pydantic-core
+    #   typer
+urllib3==2.2.3
+    # via
+    #   botocore
+    #   docker
+    #   kubernetes
+    #   requests
+uvloop==0.20.0
+    # via launchflow
+websocket-client==1.8.0
+    # via kubernetes
diff --git a/llama-demo/requirements-gcp.in b/llama-demo/requirements-gcp.in
new file mode 100644
index 0000000..b6cbbe8
--- /dev/null
+++ b/llama-demo/requirements-gcp.in
@@ -0,0 +1,8 @@
+fastapi[standard]
+launchflow[gcp]
+pydantic
+pydantic-settings
+openai
+mangum
+durationpy==0.6
+python-dotenv
diff --git a/llama-demo/requirements-gcp.txt b/llama-demo/requirements-gcp.txt
new file mode 100644
index 0000000..8d49ead
--- /dev/null
+++ b/llama-demo/requirements-gcp.txt
@@ -0,0 +1,391 @@
+# This file was autogenerated by uv v0.1.1 via the following command:
+#    uv pip compile requirements.in -o requirements.txt
+aiofiles==24.1.0
+    # via cloud-sql-python-connector
+aiohappyeyeballs==2.4.0
+    # via aiohttp
+aiohttp==3.10.6
+    # via cloud-sql-python-connector
+aiosignal==1.3.1
+    # via aiohttp
+annotated-types==0.7.0
+    # via pydantic
+anyio==4.6.0
+    # via
+    #   httpx
+    #   openai
+    #   starlette
+    #   watchfiles
+attrs==24.2.0
+    # via aiohttp
+backoff==2.2.1
+    # via posthog
+beaupy==3.9.2
+    # via launchflow
+boto3==1.35.26
+    # via launchflow
+botocore==1.35.26
+    # via
+    #   boto3
+    #   s3transfer
+cachetools==5.5.0
+    # via google-auth
+certifi==2024.8.30
+    # via
+    #   httpcore
+    #   httpx
+    #   kubernetes
+    #   requests
+cffi==1.17.1
+    # via cryptography
+charset-normalizer==3.3.2
+    # via requests
+click==8.1.7
+    # via
+    #   typer
+    #   uvicorn
+cloud-sql-python-connector==1.12.1
+    # via launchflow
+cryptography==43.0.1
+    # via cloud-sql-python-connector
+deepdiff==8.0.1
+    # via launchflow
+distro==1.9.0
+    # via openai
+dnspython==2.6.1
+    # via email-validator
+docker==7.1.0
+    # via launchflow
+durationpy==0.6
+email-validator==2.2.0
+    # via fastapi
+emoji==2.13.2
+    # via beaupy
+fastapi==0.115.0
+fastapi-cli==0.0.5
+    # via fastapi
+frozenlist==1.4.1
+    # via
+    #   aiohttp
+    #   aiosignal
+google-api-core==2.20.0
+    # via
+    #   google-api-python-client
+    #   google-cloud-bigquery
+    #   google-cloud-billing
+    #   google-cloud-build
+    #   google-cloud-compute
+    #   google-cloud-container
+    #   google-cloud-core
+    #   google-cloud-pubsub
+    #   google-cloud-resource-manager
+    #   google-cloud-run
+    #   google-cloud-secret-manager
+    #   google-cloud-service-usage
+    #   google-cloud-storage
+    #   google-cloud-tasks
+google-api-python-client==2.147.0
+    # via launchflow
+google-auth==2.35.0
+    # via
+    #   cloud-sql-python-connector
+    #   google-api-core
+    #   google-api-python-client
+    #   google-auth-httplib2
+    #   google-cloud-bigquery
+    #   google-cloud-billing
+    #   google-cloud-build
+    #   google-cloud-compute
+    #   google-cloud-container
+    #   google-cloud-core
+    #   google-cloud-pubsub
+    #   google-cloud-resource-manager
+    #   google-cloud-run
+    #   google-cloud-secret-manager
+    #   google-cloud-service-usage
+    #   google-cloud-storage
+    #   google-cloud-tasks
+    #   kubernetes
+google-auth-httplib2==0.2.0
+    # via google-api-python-client
+google-cloud-bigquery==3.25.0
+    # via launchflow
+google-cloud-billing==1.13.6
+    # via launchflow
+google-cloud-build==3.25.0
+    # via launchflow
+google-cloud-compute==1.19.2
+    # via launchflow
+google-cloud-container==2.51.0
+    # via launchflow
+google-cloud-core==2.4.1
+    # via
+    #   google-cloud-bigquery
+    #   google-cloud-storage
+google-cloud-pubsub==2.23.1
+    # via launchflow
+google-cloud-resource-manager==1.12.5
+    # via launchflow
+google-cloud-run==0.10.8
+    # via launchflow
+google-cloud-secret-manager==2.20.2
+    # via launchflow
+google-cloud-service-usage==1.10.5
+    # via launchflow
+google-cloud-storage==2.18.2
+    # via launchflow
+google-cloud-tasks==2.16.5
+    # via launchflow
+google-crc32c==1.6.0
+    # via
+    #   google-cloud-storage
+    #   google-resumable-media
+google-resumable-media==2.7.2
+    # via
+    #   google-cloud-bigquery
+    #   google-cloud-storage
+googleapis-common-protos==1.65.0
+    # via
+    #   google-api-core
+    #   grpc-google-iam-v1
+    #   grpcio-status
+grpc-google-iam-v1==0.13.1
+    # via
+    #   google-cloud-billing
+    #   google-cloud-build
+    #   google-cloud-pubsub
+    #   google-cloud-resource-manager
+    #   google-cloud-run
+    #   google-cloud-secret-manager
+    #   google-cloud-tasks
+grpcio==1.66.1
+    # via
+    #   google-api-core
+    #   google-cloud-pubsub
+    #   googleapis-common-protos
+    #   grpc-google-iam-v1
+    #   grpcio-status
+grpcio-status==1.66.1
+    # via
+    #   google-api-core
+    #   google-cloud-pubsub
+h11==0.14.0
+    # via
+    #   httpcore
+    #   uvicorn
+httpcore==1.0.5
+    # via httpx
+httplib2==0.22.0
+    # via
+    #   google-api-python-client
+    #   google-auth-httplib2
+httptools==0.6.1
+    # via uvicorn
+httpx==0.27.2
+    # via
+    #   fastapi
+    #   launchflow
+    #   openai
+idna==3.10
+    # via
+    #   anyio
+    #   email-validator
+    #   httpx
+    #   requests
+    #   yarl
+jinja2==3.1.4
+    # via
+    #   fastapi
+    #   launchflow
+jiter==0.5.0
+    # via openai
+jmespath==1.0.1
+    # via
+    #   boto3
+    #   botocore
+kubernetes==30.1.0
+    # via launchflow
+launchflow==0.4.12
+mangum==0.18.0
+markdown-it-py==3.0.0
+    # via rich
+markupsafe==2.1.5
+    # via jinja2
+mdurl==0.1.2
+    # via markdown-it-py
+monotonic==1.6
+    # via posthog
+multidict==6.1.0
+    # via
+    #   aiohttp
+    #   yarl
+oauthlib==3.2.2
+    # via
+    #   kubernetes
+    #   requests-oauthlib
+openai==1.48.0
+orderly-set==5.2.2
+    # via deepdiff
+packaging==24.1
+    # via google-cloud-bigquery
+pathspec==0.12.1
+    # via launchflow
+posthog==3.6.6
+    # via launchflow
+proto-plus==1.24.0
+    # via
+    #   google-api-core
+    #   google-cloud-billing
+    #   google-cloud-build
+    #   google-cloud-compute
+    #   google-cloud-container
+    #   google-cloud-pubsub
+    #   google-cloud-resource-manager
+    #   google-cloud-run
+    #   google-cloud-secret-manager
+    #   google-cloud-service-usage
+    #   google-cloud-tasks
+protobuf==5.28.2
+    # via
+    #   google-api-core
+    #   google-cloud-billing
+    #   google-cloud-build
+    #   google-cloud-compute
+    #   google-cloud-container
+    #   google-cloud-pubsub
+    #   google-cloud-resource-manager
+    #   google-cloud-run
+    #   google-cloud-secret-manager
+    #   google-cloud-service-usage
+    #   google-cloud-tasks
+    #   googleapis-common-protos
+    #   grpc-google-iam-v1
+    #   grpcio-status
+    #   proto-plus
+pyasn1==0.6.1
+    # via
+    #   pyasn1-modules
+    #   rsa
+pyasn1-modules==0.4.1
+    # via google-auth
+pycparser==2.22
+    # via cffi
+pydantic==2.9.2
+    # via
+    #   fastapi
+    #   launchflow
+    #   openai
+    #   pydantic-settings
+pydantic-core==2.23.4
+    # via pydantic
+pydantic-settings==2.5.2
+pygments==2.18.0
+    # via rich
+pyjwt==2.9.0
+    # via launchflow
+pyparsing==3.1.4
+    # via httplib2
+python-dateutil==2.9.0.post0
+    # via
+    #   botocore
+    #   google-cloud-bigquery
+    #   kubernetes
+    #   posthog
+python-dotenv==1.0.1
+    # via
+    #   pydantic-settings
+    #   uvicorn
+python-multipart==0.0.10
+    # via fastapi
+python-yakh==0.3.2
+    # via
+    #   beaupy
+    #   questo
+pyyaml==6.0.2
+    # via
+    #   kubernetes
+    #   launchflow
+    #   uvicorn
+questo==0.3.0
+    # via beaupy
+requests==2.32.3
+    # via
+    #   cloud-sql-python-connector
+    #   docker
+    #   google-api-core
+    #   google-cloud-bigquery
+    #   google-cloud-storage
+    #   kubernetes
+    #   launchflow
+    #   posthog
+    #   requests-oauthlib
+requests-oauthlib==2.0.0
+    # via kubernetes
+rich==13.8.1
+    # via
+    #   beaupy
+    #   launchflow
+    #   questo
+    #   typer
+rsa==4.9
+    # via google-auth
+s3transfer==0.10.2
+    # via boto3
+setuptools==75.1.0
+    # via launchflow
+shellingham==1.5.4
+    # via typer
+six==1.16.0
+    # via
+    #   kubernetes
+    #   posthog
+    #   python-dateutil
+sniffio==1.3.1
+    # via
+    #   anyio
+    #   httpx
+    #   openai
+starlette==0.38.6
+    # via fastapi
+terminaltexteffects==0.11.0
+    # via launchflow
+toml==0.10.2
+    # via launchflow
+tqdm==4.66.5
+    # via openai
+typer==0.12.5
+    # via
+    #   fastapi-cli
+    #   launchflow
+typing-extensions==4.12.2
+    # via
+    #   fastapi
+    #   mangum
+    #   openai
+    #   pydantic
+    #   pydantic-core
+    #   typer
+uritemplate==4.1.1
+    # via google-api-python-client
+urllib3==2.2.3
+    # via
+    #   botocore
+    #   docker
+    #   kubernetes
+    #   requests
+uvicorn==0.30.6
+    # via
+    #   fastapi
+    #   fastapi-cli
+uvloop==0.20.0
+    # via
+    #   launchflow
+    #   uvicorn
+watchfiles==0.24.0
+    # via uvicorn
+websocket-client==1.8.0
+    # via kubernetes
+websockets==13.1
+    # via uvicorn
+yarl==1.12.1
+    # via aiohttp
diff --git a/llama-demo/templates/index.html b/llama-demo/templates/index.html
new file mode 100644
index 0000000..c1bf2be
--- /dev/null
+++ b/llama-demo/templates/index.html
@@ -0,0 +1,249 @@
+<!doctype html>
+<html lang="en" class="h-full dark">
+    <head>
+        <meta charset="utf-8" />
+        <meta name="viewport" content="width=device-width" />
+        <title>LaunchFlow Demo</title>
+        <link rel="preconnect" href="https://rsms.me/" />
+        <link rel="stylesheet" href="https://rsms.me/inter/inter.css" />
+        <script src="https://cdn.tailwindcss.com"></script>
+        <script src="https://cdn.jsdelivr.net/npm/marked/marked.min.js"></script>
+
+        <style>
+            :root {
+                font-family: Inter, sans-serif;
+                font-feature-settings:
+                    "liga" 1,
+                    "calt" 1; /* fix for Chrome */
+            }
+            @supports (font-variation-settings: normal) {
+                :root {
+                    font-family: InterVariable, sans-serif;
+                }
+            }
+
+            html {
+                color-scheme: dark;
+            }
+        </style>
+
+        <link
+            rel="stylesheet"
+            href="https://unpkg.com/franken-ui/dist/css/core.min.css"
+        />
+
+        <script>
+            const htmlElement = document.documentElement;
+
+            if (
+                localStorage.getItem("mode") === "dark" ||
+                (!("mode" in localStorage) &&
+                    window.matchMedia("(prefers-color-scheme: dark)").matches)
+            ) {
+                // Stored preference is "dark", or nothing is stored and the OS prefers dark.
+                htmlElement.classList.add("dark");
+            } else {
+                htmlElement.classList.remove("dark");
+            }
+
+            htmlElement.classList.add("uk-theme-zinc");
+        </script>
+
+        <script
+            type="module"
+            src="https://unpkg.com/franken-ui/dist/js/core.iife.js"
+        ></script>
+        <script
+            type="module"
+            src="https://unpkg.com/franken-ui/dist/js/icon.iife.js"
+        ></script>
+    </head>
+    <body class="bg-background text-foreground h-full">
+        <main class="w-full min-h-screen flex-1">
+            <nav
+                class="fixed z-50 top-0 pl-4 uk-navbar-container w-full"
+                uk-navbar
+            >
+                <div class="uk-navbar-left">
+                    <ul class="uk-navbar-nav">
+                        {% for item in navigation %} {% if item.active %}
+                        <li class="uk-active">
+                        {% else %}
+                        <li>
+                        {% endif %}
+                            <a href="{{ item.href }}">{{ item.caption }}</a>
+                        </li>
+                        {% endfor %}
+                    </ul>
+                </div>
+            </nav>
+
+            <div class="flex flex-col w-full h-screen mx-auto items-center">
+                <div
+                    class="flex-grow pt-20 overflow-auto w-full items-center flex flex-col"
+                    id="content"
+                ></div>
+                <div class="py-4 max-w-[800px] w-full flex">
+                    <form id="chat-form" class="w-full">
+                        <textarea
+                            class="uk-textarea"
+                            id="chat-input"
+                            rows="5"
+                            placeholder="Message Llama Demo Bot..."
+                            aria-label="Textarea"
+                        ></textarea>
+                    </form>
+                </div>
+            </div>
+        </main>
+        <script>
+            // Stream the reply from /v1/chat into messageContent, re-rendering the
+            // accumulated markdown after every chunk. Resolves to the full reply text.
+            async function streamChat(
+                messageContent,
+                contentDiv,
+                inputValue,
+                context,
+            ) {
+                const response = await fetch("/v1/chat", {
+                    method: "POST",
+                    headers: { "Content-Type": "application/json" },
+                    body: JSON.stringify({
+                        message: inputValue,
+                        context: context,
+                    }),
+                });
+                if (!response.ok) {
+                    throw new Error(`Chat request failed with status ${response.status}`);
+                }
+
+                const reader = response.body.getReader();
+                const decoder = new TextDecoder();
+                let done = false;
+                let responseText = "";
+                while (!done) {
+                    const { value, done: readerDone } = await reader.read();
+                    done = readerDone;
+                    // Chunks are raw text and are appended verbatim: chunk boundaries
+                    // are arbitrary, so quotes at the edges must not be stripped.
+                    responseText += decoder.decode(value, { stream: !done });
+                    // NOTE: marked does not sanitize HTML contained in the reply.
+                    messageContent.innerHTML = marked.parse(responseText);
+                    contentDiv.scrollTop = contentDiv.scrollHeight;
+                }
+                return responseText;
+            }
+
+            // Non-streaming variant: POST the conversation and render the complete
+            // reply from the JSON body ({"content": ...}). Resolves to the reply text.
+            async function fetchChat(
+                messageContent,
+                contentDiv,
+                inputValue,
+                context,
+            ) {
+                const response = await fetch("/v1/chat", {
+                    method: "POST",
+                    headers: { "Content-Type": "application/json" },
+                    body: JSON.stringify({
+                        message: inputValue,
+                        context: context,
+                    }),
+                });
+                if (!response.ok) {
+                    throw new Error(`Chat request failed with status ${response.status}`);
+                }
+                const jsonResponse = await response.json();
+                const responseText = jsonResponse.content;
+                messageContent.innerHTML = marked.parse(responseText);
+                contentDiv.scrollTop = contentDiv.scrollHeight;
+                return responseText;
+            }
+
+            document.addEventListener("DOMContentLoaded", (event) => {
+                document.querySelector("#chat-input").focus();
+            });
+
+            const context = [];
+
+            const chatForm = document.getElementById("chat-form");
+
+            // Handle a chat submission: show the user's message, request the reply
+            // and record both turns in `context` for the next request.
+            chatForm.addEventListener("submit", async function (event) {
+                event.preventDefault();
+                const input = document.getElementById("chat-input");
+                const contentDiv = document.getElementById("content");
+
+                const inputValue = input.value;
+                if (inputValue.trim() === "") {
+                    return; // nothing to send
+                }
+
+                context.push({
+                    content: inputValue,
+                    role: "user",
+                });
+
+                // NOTE: marked does not sanitize HTML, and the result is injected via innerHTML.
+                const message = marked.parse(inputValue);
+                contentDiv.innerHTML += `<div class="-mr-3 max-w-[800px] flex w-full justify-end pb-4"><span class="uk-background-muted rounded-lg px-4 py-2 max-w-[70%]">${message}</span></div>`;
+                input.value = ""; // Clear the input after sending
+                contentDiv.scrollTop = contentDiv.scrollHeight;
+
+                input.disabled = true;
+
+                const responseDiv = document.createElement("div");
+                responseDiv.classList.add(
+                    "max-w-[800px]",
+                    "pl-2.5",
+                    "flex",
+                    "w-full",
+                    "justify-start",
+                    "pb-4",
+                );
+                const messageContent = document.createElement("span");
+                messageContent.classList.add(
+                    "uk-background-muted",
+                    "rounded-lg",
+                    "px-4",
+                    "py-2",
+                    "max-w-[70%]",
+                );
+                const spinnerDiv = document.createElement("div");
+                spinnerDiv.setAttribute("uk-spinner", "ratio: .5");
+                messageContent.appendChild(spinnerDiv);
+                responseDiv.appendChild(messageContent);
+                contentDiv.appendChild(responseDiv);
+                contentDiv.scrollTop = contentDiv.scrollHeight;
+
+                try {
+                    {% if streaming %}
+                    const responseText = await streamChat(messageContent, contentDiv, inputValue, context);
+                    {% else %}
+                    const responseText = await fetchChat(messageContent, contentDiv, inputValue, context);
+                    {% endif %}
+                    context.push({ content: responseText, role: "assistant" });
+                } catch (error) {
+                    console.error(error);
+                    // Replace the spinner or partial reply with a visible error message.
+                    messageContent.textContent = "Something went wrong. Please try again.";
+                    context.pop(); // drop the unanswered user turn from the history
+                } finally {
+                    // Re-enable the input even when the request failed.
+                    input.disabled = false;
+                    input.focus();
+                }
+            });
+
+            // Enter sends, Shift+Enter inserts a newline; an Enter that confirms an IME composition is ignored.
+            document.getElementById("chat-input")
+                .addEventListener("keydown", function (event) {
+                    if (event.key === "Enter" && !event.shiftKey && !event.isComposing) {
+                        event.preventDefault(); // Prevent the default action to stop from inserting a new line
+                        chatForm.dispatchEvent(new Event("submit")); // Programmatically submit the form
+                    }
+                });
+        </script>
+    </body>
+</html>