Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -327,3 +327,9 @@ dist

# Ignore master key for decrypting credentials and more.
/config/master.key

pyrightconfig.json


meta-llama-8b-instruct-q4_K_M.gguf
llama-model.gguf
19 changes: 19 additions & 0 deletions llama-demo/Dockerfile.gcp
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
FROM python:3.11-slim

WORKDIR /code

# g++/gcc are needed to compile any Python dependencies that ship only as
# source distributions; drop the apt lists in the same layer to keep the
# image small.
RUN apt-get update \
    && apt-get install -y --no-install-recommends g++ gcc \
    && rm -rf /var/lib/apt/lists/*

# Install dependencies before copying application code so that code-only
# changes do not invalidate the (slow) dependency layer; --no-cache-dir
# keeps the pip download cache out of the image.
COPY ./requirements.txt /code/requirements.txt
RUN pip install --no-cache-dir uv \
    && uv pip install --system -r requirements.txt

COPY ./main.py /code/main.py
COPY ./infra.py /code/infra.py
COPY ./llama_demo /code/llama_demo
COPY ./templates /code/templates

ENV PORT=80
EXPOSE $PORT

# Shell form on purpose: $PORT must be expanded by the shell at start-up.
CMD fastapi run main.py --host 0.0.0.0 --port $PORT
50 changes: 50 additions & 0 deletions llama-demo/infra.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
"""infra.py

This file is used to customize the infrastructure your application deploys to.

Create your cloud infrastructure with:
lf create

Deploy your application with:
lf deploy

"""

import launchflow as lf

if lf.environment == "lf-llama-gcp":
llama_service = lf.gcp.ComputeEngineService(
"launchflow-llama-service",
dockerfile="Dockerfile.gcp", # Path to your Dockerfile
machine_type="e2-standard-4",
build_directory="llama_server",
disk_size_gb=50,
)
model_bucket = lf.gcp.GCSBucket("launchflow-llama-demo")
elif lf.environment == "lf-llama-aws":
llama_service = lf.aws.ECSFargateService(
"launchflow-llama-service",
dockerfile="Dockerfile.aws", # Path to your Dockerfile
build_directory="llama_server",
cpu=8192, # 8 cpus are required for GPU support
memory=16384, # 16 GB of memory are required for GPU support
# load_balancer=lf.aws.alb.InternalHTTP(),
)
serving_service = lf.aws.LambdaService(
"launchflow-llama-serving-demo",
handler="main.handler",
build_ignore=[
"llama_server",
"Dockerfile.*",
"requirements*",
"launchflow.yaml",
],
runtime=lf.aws.lambda_service.PythonRuntime(
requirements_txt_path="requirements-aws.txt"
),
timeout_seconds=900,
env={"LLAMA_SERVER_ADDRESS": lf.Depends(llama_service).service_url}, # type: ignore
)
model_bucket = lf.aws.S3Bucket("launchflow-llama-demo")
else:
raise ValueError(f"Unknown environment: {lf.environment}")
2 changes: 2 additions & 0 deletions llama-demo/launchflow.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
project: llama-demo
backend: lf://default
56 changes: 56 additions & 0 deletions llama-demo/llama_demo/chat_router.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
from typing import Annotated
from fastapi import APIRouter
from fastapi.params import Depends

from fastapi.responses import JSONResponse
from pydantic import BaseModel
from starlette.responses import StreamingResponse

from llama_demo.settings import settings
from llama_demo.schemas import Chat
from llama_demo.llama_client import LlamaClient

router = APIRouter(prefix="/v1", tags=["v1"])


class ChatResponse(BaseModel):
    """Shape of a non-streaming /chat reply.

    NOTE(review): not referenced in this file (the endpoint returns a raw
    JSONResponse with `response_model=None`) — verify it is used elsewhere
    before removing.
    """

    # Full generated completion text.
    content: str


@router.post("/chat", response_model=None)
async def chat(
chat: Chat, model: Annotated[LlamaClient, Depends(LlamaClient)]
) -> StreamingResponse | JSONResponse:
messages = []
for message in chat.context:
messages.append({"role": message.role, "content": message.content})

context_string = "".join([msg["content"] for msg in messages])
if len(context_string) > 512:
trimmed_context = []
current_length = 0
for msg in reversed(messages):
msg_length = len(msg["content"])
if current_length + msg_length <= settings.context_window:
trimmed_context.append(msg)
current_length += msg_length
else:
num_to_append = msg_length - (settings.context_window - current_length)
msg["content"] = msg["content"][-num_to_append:]
trimmed_context.append(msg)
break
messages = list(reversed(trimmed_context))

completion = model.chat(messages=messages)

def iter_content():
for item in completion:
yield item

if settings.streaming:
return StreamingResponse(iter_content())

full_content = ""
for item in iter_content():
full_content += item
return JSONResponse(content={"content": full_content})
30 changes: 30 additions & 0 deletions llama-demo/llama_demo/llama_client.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
import httpx
from openai.types.chat import ChatCompletionMessageParam
from llama_demo.settings import settings
import openai
from typing import Dict, Any, Iterable


async def httpx_client():
    """FastAPI-style dependency that yields a managed ``httpx.AsyncClient``.

    The client is closed automatically when the dependency goes out of scope.
    NOTE(review): not referenced anywhere in this file — presumably wired up
    elsewhere (or dead code); verify before removing.
    """
    async with httpx.AsyncClient() as client:
        yield client


class LlamaClient:
    """Thin wrapper around the OpenAI SDK pointed at the self-hosted llama server."""

    def __init__(self):
        # The server at llama_server_address speaks the OpenAI-compatible
        # API; the api_key is a placeholder since no auth is configured.
        self.client = openai.Client(
            base_url=settings.llama_server_address, api_key="no-api-key"
        )

    def chat(self, messages: Iterable[ChatCompletionMessageParam]):
        """Stream a completion for *messages*, yielding non-empty text deltas."""
        result = self.client.chat.completions.create(
            messages=messages,
            # NOTE(review): presumably ignored by the backing llama.cpp
            # server (this is not a real OpenAI call) — confirm.
            model="gpt-3.5-turbo",
            stream=True,
            # Cap the reply at half the context window, leaving room for the prompt.
            max_tokens=int(settings.context_window / 2),
            timeout=600,
        )
        for r in result:
            content = r.choices[0].delta.content
            # Skip chunks with no content (e.g. role-only or final chunks).
            if content is not None:
                yield content
13 changes: 13 additions & 0 deletions llama-demo/llama_demo/schemas.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
from typing import Literal

from pydantic import BaseModel


class ChatMessage(BaseModel):
    """A single message in a chat transcript."""

    # Message text.
    content: str
    # Author of the message, following the OpenAI chat-role convention.
    role: Literal["system", "user", "assistant"]


class Chat(BaseModel):
    """Request body for the /v1/chat endpoint."""

    # The newest user message.
    message: str
    # Prior conversation history — presumably oldest first; confirm with callers.
    context: list[ChatMessage]
17 changes: 17 additions & 0 deletions llama-demo/llama_demo/settings.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
import os

from dotenv import load_dotenv

load_dotenv()


class Settings:
    """Application settings, resolved once from the environment at import time."""

    # Context budget forwarded to the llama server.
    # NOTE(review): compared against character counts in chat_router even
    # though a model's window is measured in tokens — confirm the intent.
    context_window: int = int(os.environ.get("CONTEXT_WINDOW", 5000))
    # Base URL of the OpenAI-compatible llama server; the default points at
    # a specific AWS load balancer and only works for that deployment.
    llama_server_address: str = os.environ.get(
        "LLAMA_SERVER_ADDRESS",
        "http://launchflow-llama-service-l-79dfc-799820891.us-east-1.elb.amazonaws.com",
    )
    # Stream responses only on the GCP deployment — presumably because the
    # AWS path serves through Lambda; confirm.
    streaming: bool = os.environ.get("LAUNCHFLOW_ENVIRONMENT") == "lf-llama-gcp"


# Module-level singleton imported by the rest of the app.
settings = Settings()
28 changes: 28 additions & 0 deletions llama-demo/llama_server/Dockerfile.aws
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
# Build llama.cpp's OpenAI-compatible server on Ubuntu; model weights are
# downloaded from S3 at container start (see download_and_run.sh).
FROM public.ecr.aws/docker/library/ubuntu:22.04

# Install necessary dependencies; remove the apt lists in the same layer
# to keep the image small.
ARG DEBIAN_FRONTEND=noninteractive
ENV TZ=Etc/UTC
RUN apt-get update && apt-get install -y \
    git build-essential cmake awscli wget unzip \
    && rm -rf /var/lib/apt/lists/*

# Shallow-clone llama.cpp — history is not needed to build.
# NOTE(review): this tracks upstream master, so builds are not reproducible;
# pin a tag or commit for production use.
RUN git clone --depth 1 https://github.com/ggerganov/llama.cpp.git /app/llama.cpp
WORKDIR /app/llama.cpp

# Build only the server binary.
RUN make llama-server

# The server listens on port 80 (flags set in download_and_run.sh).
EXPOSE 80

# Copy a custom script that handles the model download at runtime.
COPY download_and_run.sh /app/download_and_run.sh
RUN chmod +x /app/download_and_run.sh

WORKDIR /app

# Start the server through the custom script
ENTRYPOINT ["/app/download_and_run.sh"]
32 changes: 32 additions & 0 deletions llama-demo/llama_server/Dockerfile.gcp
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
# Build llama.cpp's OpenAI-compatible server on Ubuntu; model weights are
# downloaded from GCS at container start (see download_and_run.sh).
FROM ubuntu:22.04

# Install necessary dependencies; remove the apt lists in the same layer
# to keep the image small.
RUN apt-get update && apt-get install -y \
    git build-essential cmake wget unzip curl python3 python3-distutils \
    && rm -rf /var/lib/apt/lists/*

# Install Google Cloud SDK (for gsutil).
# NOTE(review): curl | bash fetches an unpinned installer at build time —
# consider the apt-based google-cloud-cli package for reproducibility.
RUN curl -sSL https://sdk.cloud.google.com | bash && \
    /root/google-cloud-sdk/install.sh

ENV PATH=$PATH:/root/google-cloud-sdk/bin

# Shallow-clone llama.cpp — history is not needed to build.
# NOTE(review): this tracks upstream master, so builds are not reproducible;
# pin a tag or commit for production use.
RUN git clone --depth 1 https://github.com/ggerganov/llama.cpp.git /app/llama.cpp
WORKDIR /app/llama.cpp

# Build only the server binary.
RUN make llama-server

# The server listens on port 80 (flags set in download_and_run.sh).
EXPOSE 80

# Copy a custom script that handles the model download at runtime.
COPY download_and_run.sh /app/download_and_run.sh
RUN chmod +x /app/download_and_run.sh

WORKDIR /app

# Start the server through the custom script
ENTRYPOINT ["/app/download_and_run.sh"]
22 changes: 22 additions & 0 deletions llama-demo/llama_server/download_and_run.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
#!/bin/bash
# Download the model weights from the environment-appropriate bucket, then
# start the llama.cpp server. Used as the container ENTRYPOINT.
set -euo pipefail

LOCAL_MODEL_PATH="/models/llama.gguf"
REMOTE_MODEL_FILE_NAME="meta-llama-8b-instruct-q4_K_M.gguf"

# Make sure the target directory exists before downloading into it.
mkdir -p "$(dirname "${LOCAL_MODEL_PATH}")"

if [ "${LAUNCHFLOW_ENVIRONMENT:-}" = "lf-llama-aws" ]; then
    bucket_url="s3://launchflow-llama-demo"
    # Fixed: previously echoed undefined ${S3_BUCKET_URL}/${MODEL_FILENAME}.
    echo "Downloading model from ${bucket_url}/${REMOTE_MODEL_FILE_NAME}..."
    aws s3 cp "${bucket_url}/${REMOTE_MODEL_FILE_NAME}" "${LOCAL_MODEL_PATH}"
elif [ "${LAUNCHFLOW_ENVIRONMENT:-}" = "lf-llama-gcp" ]; then
    bucket_url="gs://launchflow-llama-demo"
    echo "Downloading model from ${bucket_url}/${REMOTE_MODEL_FILE_NAME}..."
    gsutil cp "${bucket_url}/${REMOTE_MODEL_FILE_NAME}" "${LOCAL_MODEL_PATH}"
else
    # Fail loudly instead of starting the server with no model present.
    echo "Unknown LAUNCHFLOW_ENVIRONMENT: '${LAUNCHFLOW_ENVIRONMENT:-}'" >&2
    exit 1
fi

# Start the server with the downloaded model; exec so it receives container
# signals (SIGTERM) directly.
cd /app/llama.cpp
exec ./llama-server --model "${LOCAL_MODEL_PATH}" --ctx-size 5000 --port 80 --host 0.0.0.0
34 changes: 34 additions & 0 deletions llama-demo/main.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
from fastapi import FastAPI
from fastapi.requests import Request
from fastapi.templating import Jinja2Templates
from llama_demo.chat_router import router
from mangum import Mangum

from llama_demo.settings import settings

app = FastAPI()

# Mount the /v1 chat API.
app.include_router(router)

# Jinja templates rendered from the local ./templates directory
# (path is relative to the process working directory).
templates = Jinja2Templates(directory="templates")


@app.get("/")
def index(request: Request):
return templates.TemplateResponse(
request=request,
name="index.html",
context={
"navigation": [
{
"caption": "Llama Chat Demo",
"href": "/",
"active": True,
}
],
"streaming": settings.streaming,
},
)


handler = Mangum(app, lifespan="off")
7 changes: 7 additions & 0 deletions llama-demo/requirements-aws.in
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
fastapi
launchflow[aws]
pydantic
openai
mangum
durationpy==0.6
python-dotenv
Loading