# .github/workflows/eval.yml
# Workflow: "Clinical AI Eval (CI)" — workflow file as rendered for run #37.
# (The surrounding GitHub web-UI text from the copy-paste has been removed.)
name: Clinical AI Eval (CI)

on:
  # Manual runs: lets a maintainer pick the provider/model and cap cost.
  workflow_dispatch:
    inputs:
      provider:
        description: "LLM provider (openai or mock)"
        required: true
        default: "openai"
      model:
        description: "Model id (provider-specific)"
        required: true
        default: "gpt-4.1-mini"
      max_cases:
        description: "Max cases to run (cost control)"
        required: true
        default: "25"
      prompt_version:
        description: "Prompt version label"
        required: true
        default: "v1"
  # Automatic runs: only when evaluation-relevant files change on main.
  push:
    branches: ["main"]
    paths:
      - "dataset/**"
      - "src/**"
      - "docs/**"
      - "requirements.txt"
      - ".github/workflows/eval.yml"

# `contents: write` is required so the job can commit results back to the repo.
permissions:
  contents: write
jobs:
  run-eval:
    # Prevent infinite loops when the workflow commits results back to the repo
    if: ${{ github.actor != 'github-actions[bot]' }}
    runs-on: ubuntu-latest
    env:
      # Reads from GitHub Secrets in real runs
      OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
      # Defaults for push-triggered runs (workflow_dispatch overrides below)
      PROVIDER: "openai"
      MODEL_ID: "gpt-4.1-mini"
      MAX_CASES: "25"
      PROMPT_VERSION: "v1"
      # Ensure `from src...` imports work when running scripts
      PYTHONPATH: "."
    steps:
      - name: Checkout
        uses: actions/checkout@v4

      - name: Setup Python
        uses: actions/setup-python@v5
        with:
          python-version: "3.11"
          cache: "pip"

      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          pip install -r requirements.txt

      # For manual runs, the dispatch inputs replace the env defaults above.
      - name: Set inputs (workflow_dispatch)
        if: ${{ github.event_name == 'workflow_dispatch' }}
        run: |
          echo "PROVIDER=${{ inputs.provider }}" >> "$GITHUB_ENV"
          echo "MODEL_ID=${{ inputs.model }}" >> "$GITHUB_ENV"
          echo "MAX_CASES=${{ inputs.max_cases }}" >> "$GITHUB_ENV"
          echo "PROMPT_VERSION=${{ inputs.prompt_version }}" >> "$GITHUB_ENV"

      - name: Safety check (OpenAI key required for openai provider)
        run: |
          if [ "$PROVIDER" = "openai" ] && [ -z "$OPENAI_API_KEY" ]; then
            echo "OPENAI_API_KEY is missing. Add it in GitHub repo Settings → Secrets and variables → Actions."
            exit 1
          fi

      - name: Generate answers
        run: python src/generate_answers.py --dataset dataset/clinical_questions.csv --provider "$PROVIDER" --model "$MODEL_ID" --prompt-version "$PROMPT_VERSION" --max-cases "$MAX_CASES"

      - name: Run evaluation
        run: python src/run_evaluation.py --dataset dataset/clinical_questions.csv --max-ctx-anchors 8

      # Diagnostic step: never fails the run (all probes end in `|| true`).
      - name: Debug results directory
        run: |
          echo "PWD: $(pwd)"
          echo "---- repo root ----"
          ls -la
          echo "---- results/ ----"
          ls -la results || true
          echo "---- raw_generations.jsonl (head) ----"
          head -n 2 results/raw_generations.jsonl || true
          echo "---- evaluation_output.csv exists? ----"
          test -f results/evaluation_output.csv && echo "YES" || echo "NO"

      # Hard gate: fails the job if the evaluation CSV was not produced.
      - name: Assert evaluation output exists
        run: |
          echo "---- results dir listing ----"
          ls -la results || true
          echo "---- find evaluation_output.csv ----"
          find . -maxdepth 4 -name "evaluation_output.csv" -print || true
          echo "---- raw_generations exists? ----"
          test -f results/raw_generations.jsonl && echo "raw_generations.jsonl: YES" || echo "raw_generations.jsonl: NO"
          echo "---- evaluation_output exists? ----"
          test -f results/evaluation_output.csv && echo "evaluation_output.csv: YES" || (echo "evaluation_output.csv: NO" && exit 1)

      - name: Summarize results
        run: python src/summarize_results.py --top-n 10

      # `[skip ci]` in the commit message plus the actor guard above prevents
      # this push from retriggering the workflow.
      - name: Commit results back to repo
        run: |
          git config user.name "github-actions[bot]"
          git config user.email "github-actions[bot]@users.noreply.github.com"
          # Only commit result artifacts (avoid committing raw prompts if you later decide to exclude them)
          git add results/*.csv results/*.jsonl results/*.md || true
          if git diff --cached --quiet; then
            echo "No result changes to commit."
            exit 0
          fi
          git commit -m "Update evaluation results [skip ci]"
          git push