Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -327,3 +327,9 @@ dist

# Ignore master key for decrypting credentials and more.
/config/master.key

pyrightconfig.json


meta-llama-8b-instruct-q4_K_M.gguf
llama-model.gguf
19 changes: 19 additions & 0 deletions llama-demo/Dockerfile.gcp
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
FROM python:3.11-slim

WORKDIR /code

# g++/gcc are needed to compile any Python dependencies that ship only as
# source distributions; drop the apt lists in the same layer to keep the
# image small.
RUN apt-get update \
    && apt-get install -y --no-install-recommends g++ gcc \
    && rm -rf /var/lib/apt/lists/*

# Install dependencies before copying application code so that code-only
# changes do not invalidate the (slow) dependency layer; --no-cache-dir
# keeps the pip download cache out of the image.
COPY ./requirements.txt /code/requirements.txt
RUN pip install --no-cache-dir uv \
    && uv pip install --system -r requirements.txt

COPY ./main.py /code/main.py
COPY ./infra.py /code/infra.py
COPY ./llama_demo /code/llama_demo
COPY ./templates /code/templates

ENV PORT=80
EXPOSE $PORT

# Shell form on purpose: $PORT must be expanded by the shell at start-up.
CMD fastapi run main.py --host 0.0.0.0 --port $PORT
50 changes: 50 additions & 0 deletions llama-demo/infra.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
"""infra.py

This file is used to customize the infrastructure your application deploys to.

Create your cloud infrastructure with:
lf create

Deploy your application with:
lf deploy

"""

import launchflow as lf

if lf.environment == "lf-llama-gcp":
llama_service = lf.gcp.ComputeEngineService(
"launchflow-llama-service",
dockerfile="Dockerfile.gcp", # Path to your Dockerfile
machine_type="e2-standard-4",
build_directory="llama_server",
disk_size_gb=50,
)
model_bucket = lf.gcp.GCSBucket("launchflow-llama-demo")
elif lf.environment == "lf-llama-aws":
llama_service = lf.aws.ECSFargateService(
"launchflow-llama-service",
dockerfile="Dockerfile.aws", # Path to your Dockerfile
build_directory="llama_server",
cpu=8192, # 8 cpus are required for GPU support
memory=16384, # 16 GB of memory are required for GPU support
# load_balancer=lf.aws.alb.InternalHTTP(),
)
serving_service = lf.aws.LambdaService(
"launchflow-llama-serving-demo",
handler="main.handler",
build_ignore=[
"llama_server",
"Dockerfile.*",
"requirements*",
"launchflow.yaml",
],
runtime=lf.aws.lambda_service.PythonRuntime(
requirements_txt_path="requirements-aws.txt"
),
timeout_seconds=900,
env={"LLAMA_SERVER_ADDRESS": lf.Depends(llama_service).service_url}, # type: ignore
)
model_bucket = lf.aws.S3Bucket("launchflow-llama-demo")
else:
raise ValueError(f"Unknown environment: {lf.environment}")
2 changes: 2 additions & 0 deletions llama-demo/launchflow.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
project: llama-demo
backend: lf://default
56 changes: 56 additions & 0 deletions llama-demo/llama_demo/chat_router.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
from typing import Annotated
from fastapi import APIRouter
from fastapi.params import Depends

from fastapi.responses import JSONResponse
from pydantic import BaseModel
from starlette.responses import StreamingResponse

from llama_demo.settings import settings
from llama_demo.schemas import Chat
from llama_demo.llama_client import LlamaClient

router = APIRouter(prefix="/v1", tags=["v1"])


class ChatResponse(BaseModel):
    """Shape of a non-streaming /chat reply.

    NOTE(review): not referenced in this file (the endpoint returns a raw
    JSONResponse with `response_model=None`) — verify it is used elsewhere
    before removing.
    """

    # Full generated completion text.
    content: str


@router.post("/chat", response_model=None)
async def chat(
chat: Chat, model: Annotated[LlamaClient, Depends(LlamaClient)]
) -> StreamingResponse | JSONResponse:
messages = []
for message in chat.context:
messages.append({"role": message.role, "content": message.content})

context_string = "".join([msg["content"] for msg in messages])
if len(context_string) > 512:
trimmed_context = []
current_length = 0
for msg in reversed(messages):
msg_length = len(msg["content"])
if current_length + msg_length <= settings.context_window:
trimmed_context.append(msg)
current_length += msg_length
else:
num_to_append = msg_length - (settings.context_window - current_length)
msg["content"] = msg["content"][-num_to_append:]
trimmed_context.append(msg)
break
messages = list(reversed(trimmed_context))

completion = model.chat(messages=messages)

def iter_content():
for item in completion:
yield item

if settings.streaming:
return StreamingResponse(iter_content())

full_content = ""
for item in iter_content():
full_content += item
return JSONResponse(content={"content": full_content})
30 changes: 30 additions & 0 deletions llama-demo/llama_demo/llama_client.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
import httpx
from openai.types.chat import ChatCompletionMessageParam
from llama_demo.settings import settings
import openai
from typing import Dict, Any, Iterable


async def httpx_client():
    """FastAPI-style dependency that yields a managed ``httpx.AsyncClient``.

    The client is closed automatically when the dependency goes out of scope.
    NOTE(review): not referenced anywhere in this file — presumably wired up
    elsewhere (or dead code); verify before removing.
    """
    async with httpx.AsyncClient() as client:
        yield client


class LlamaClient:
    """Thin wrapper around the OpenAI SDK pointed at the self-hosted llama server."""

    def __init__(self):
        # The server at llama_server_address speaks the OpenAI-compatible
        # API; the api_key is a placeholder since no auth is configured.
        self.client = openai.Client(
            base_url=settings.llama_server_address, api_key="no-api-key"
        )

    def chat(self, messages: Iterable[ChatCompletionMessageParam]):
        """Stream a completion for *messages*, yielding non-empty text deltas."""
        result = self.client.chat.completions.create(
            messages=messages,
            # NOTE(review): presumably ignored by the backing llama.cpp
            # server (this is not a real OpenAI call) — confirm.
            model="gpt-3.5-turbo",
            stream=True,
            # Cap the reply at half the context window, leaving room for the prompt.
            max_tokens=int(settings.context_window / 2),
            timeout=600,
        )
        for r in result:
            content = r.choices[0].delta.content
            # Skip chunks with no content (e.g. role-only or final chunks).
            if content is not None:
                yield content
13 changes: 13 additions & 0 deletions llama-demo/llama_demo/schemas.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
from typing import Literal

from pydantic import BaseModel


class ChatMessage(BaseModel):
    """A single message in a chat transcript."""

    # Message text.
    content: str
    # Author of the message, following the OpenAI chat-role convention.
    role: Literal["system", "user", "assistant"]


class Chat(BaseModel):
    """Request body for the /v1/chat endpoint."""

    # The newest user message.
    message: str
    # Prior conversation history — presumably oldest first; confirm with callers.
    context: list[ChatMessage]
17 changes: 17 additions & 0 deletions llama-demo/llama_demo/settings.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
import os

from dotenv import load_dotenv

load_dotenv()


class Settings:
    """Application settings, resolved once from the environment at import time."""

    # Context budget forwarded to the llama server.
    # NOTE(review): compared against character counts in chat_router even
    # though a model's window is measured in tokens — confirm the intent.
    context_window: int = int(os.environ.get("CONTEXT_WINDOW", 5000))
    # Base URL of the OpenAI-compatible llama server; the default points at
    # a specific AWS load balancer and only works for that deployment.
    llama_server_address: str = os.environ.get(
        "LLAMA_SERVER_ADDRESS",
        "http://launchflow-llama-service-l-79dfc-799820891.us-east-1.elb.amazonaws.com",
    )
    # Stream responses only on the GCP deployment — presumably because the
    # AWS path serves through Lambda; confirm.
    streaming: bool = os.environ.get("LAUNCHFLOW_ENVIRONMENT") == "lf-llama-gcp"


# Module-level singleton imported by the rest of the app.
settings = Settings()
28 changes: 28 additions & 0 deletions llama-demo/llama_server/Dockerfile.aws
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
# Build llama.cpp's OpenAI-compatible server on Ubuntu; model weights are
# downloaded from S3 at container start (see download_and_run.sh).
FROM public.ecr.aws/docker/library/ubuntu:22.04

# Install necessary dependencies; remove the apt lists in the same layer
# to keep the image small.
ARG DEBIAN_FRONTEND=noninteractive
ENV TZ=Etc/UTC
RUN apt-get update && apt-get install -y \
    git build-essential cmake awscli wget unzip \
    && rm -rf /var/lib/apt/lists/*

# Shallow-clone llama.cpp — history is not needed to build.
# NOTE(review): this tracks upstream master, so builds are not reproducible;
# pin a tag or commit for production use.
RUN git clone --depth 1 https://github.com/ggerganov/llama.cpp.git /app/llama.cpp
WORKDIR /app/llama.cpp

# Build only the server binary.
RUN make llama-server

# The server listens on port 80 (flags set in download_and_run.sh).
EXPOSE 80

# Copy a custom script that handles the model download at runtime.
COPY download_and_run.sh /app/download_and_run.sh
RUN chmod +x /app/download_and_run.sh

WORKDIR /app

# Start the server through the custom script
ENTRYPOINT ["/app/download_and_run.sh"]
32 changes: 32 additions & 0 deletions llama-demo/llama_server/Dockerfile.gcp
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
# Build llama.cpp's OpenAI-compatible server on Ubuntu; model weights are
# downloaded from GCS at container start (see download_and_run.sh).
FROM ubuntu:22.04

# Install necessary dependencies; remove the apt lists in the same layer
# to keep the image small.
RUN apt-get update && apt-get install -y \
    git build-essential cmake wget unzip curl python3 python3-distutils \
    && rm -rf /var/lib/apt/lists/*

# Install Google Cloud SDK (for gsutil).
# NOTE(review): curl | bash fetches an unpinned installer at build time —
# consider the apt-based google-cloud-cli package for reproducibility.
RUN curl -sSL https://sdk.cloud.google.com | bash && \
    /root/google-cloud-sdk/install.sh

ENV PATH=$PATH:/root/google-cloud-sdk/bin

# Shallow-clone llama.cpp — history is not needed to build.
# NOTE(review): this tracks upstream master, so builds are not reproducible;
# pin a tag or commit for production use.
RUN git clone --depth 1 https://github.com/ggerganov/llama.cpp.git /app/llama.cpp
WORKDIR /app/llama.cpp

# Build only the server binary.
RUN make llama-server

# The server listens on port 80 (flags set in download_and_run.sh).
EXPOSE 80

# Copy a custom script that handles the model download at runtime.
COPY download_and_run.sh /app/download_and_run.sh
RUN chmod +x /app/download_and_run.sh

WORKDIR /app

# Start the server through the custom script
ENTRYPOINT ["/app/download_and_run.sh"]
22 changes: 22 additions & 0 deletions llama-demo/llama_server/download_and_run.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
#!/bin/bash
# Download the model weights from the environment-appropriate bucket, then
# start the llama.cpp server. Used as the container ENTRYPOINT.
set -euo pipefail

LOCAL_MODEL_PATH="/models/llama.gguf"
REMOTE_MODEL_FILE_NAME="meta-llama-8b-instruct-q4_K_M.gguf"

# Make sure the target directory exists before downloading into it.
mkdir -p "$(dirname "${LOCAL_MODEL_PATH}")"

if [ "${LAUNCHFLOW_ENVIRONMENT:-}" = "lf-llama-aws" ]; then
    bucket_url="s3://launchflow-llama-demo"
    # Fixed: previously echoed undefined ${S3_BUCKET_URL}/${MODEL_FILENAME}.
    echo "Downloading model from ${bucket_url}/${REMOTE_MODEL_FILE_NAME}..."
    aws s3 cp "${bucket_url}/${REMOTE_MODEL_FILE_NAME}" "${LOCAL_MODEL_PATH}"
elif [ "${LAUNCHFLOW_ENVIRONMENT:-}" = "lf-llama-gcp" ]; then
    bucket_url="gs://launchflow-llama-demo"
    echo "Downloading model from ${bucket_url}/${REMOTE_MODEL_FILE_NAME}..."
    gsutil cp "${bucket_url}/${REMOTE_MODEL_FILE_NAME}" "${LOCAL_MODEL_PATH}"
else
    # Fail loudly instead of starting the server with no model present.
    echo "Unknown LAUNCHFLOW_ENVIRONMENT: '${LAUNCHFLOW_ENVIRONMENT:-}'" >&2
    exit 1
fi

# Start the server with the downloaded model; exec so it receives container
# signals (SIGTERM) directly.
cd /app/llama.cpp
exec ./llama-server --model "${LOCAL_MODEL_PATH}" --ctx-size 5000 --port 80 --host 0.0.0.0
34 changes: 34 additions & 0 deletions llama-demo/main.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
from fastapi import FastAPI
from fastapi.requests import Request
from fastapi.templating import Jinja2Templates
from llama_demo.chat_router import router
from mangum import Mangum

from llama_demo.settings import settings

app = FastAPI()

# Mount the /v1 chat API.
app.include_router(router)

# Jinja templates rendered from the local ./templates directory
# (path is relative to the process working directory).
templates = Jinja2Templates(directory="templates")


@app.get("/")
def index(request: Request):
return templates.TemplateResponse(
request=request,
name="index.html",
context={
"navigation": [
{
"caption": "Llama Chat Demo",
"href": "/",
"active": True,
}
],
"streaming": settings.streaming,
},
)


handler = Mangum(app, lifespan="off")
7 changes: 7 additions & 0 deletions llama-demo/requirements-aws.in
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
fastapi
launchflow[aws]
pydantic
openai
mangum
durationpy==0.6
python-dotenv
Loading