Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
132 changes: 132 additions & 0 deletions .github/scripts/build_diff_payload.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,132 @@
"""Build the JSON payload sent to GraphRAG-UI's /api/admin/update-graph.

Invoked from .github/workflows/update-graph.yml after a push to main:
reads BASE_SHA + HEAD_SHA from env, computes the .md diff, reads file
content for added+modified entries, and writes payload.json. Sets the
``skip`` step output to ``true`` when nothing ingestable changed so the
workflow can short-circuit before the network call.
"""

from __future__ import annotations

import json
import os
import pathlib
import subprocess
import sys

# git's well-known empty-tree SHA — used as the "before" when a push
# carries an all-zero ``before`` (i.e., first push to a brand-new branch).
EMPTY_TREE_SHA = "4b825dc642cb6eb9a060e54bf8d69288fbee4904"


def _git_diff_name_status(base: str, head: str) -> str:
    """Return the raw ``git diff --name-status`` output between two SHAs.

    Each output line is a status field, a tab, and one path (two paths,
    tab-separated, for records that carry a similarity score).

    ``core.quotePath`` is switched off for this invocation so paths
    containing non-ASCII bytes are printed verbatim. With git's default
    they come back wrapped in double quotes with octal escapes, which
    would no longer end in ``.md`` and would be skipped by the caller.
    Paths containing a tab, newline, double quote or backslash are still
    C-quoted by git regardless of this setting.

    Args:
        base: Commit (or tree) SHA to diff from.
        head: Commit SHA to diff to.

    Returns:
        git's stdout, decoded as UTF-8.

    Raises:
        subprocess.CalledProcessError: if git exits non-zero (for example
            when ``base`` is not present in the local object store).
    """
    return subprocess.run(
        [
            "git", "-c", "core.quotePath=false",
            "diff", "--name-status", base, head,
        ],
        # Decode explicitly so the result does not depend on the locale.
        capture_output=True, text=True, encoding="utf-8", check=True,
    ).stdout


def _read_at(head: str, path: str) -> str | None:
    """Read a file's content at a specific commit, regardless of what's
    currently checked out in the working tree.

    Uses ``git show <head>:<path>``, which pulls the blob from the
    object store. Reading from disk via ``pathlib`` would only work if
    the runner had already checked out ``head``; this is more robust
    and lets the script be exercised locally against historical
    commits without checking them out first.

    Args:
        head: Commit SHA to read from.
        path: Repository-relative path of the file.

    Returns:
        The blob decoded as UTF-8, or None if ``git show`` exits
        non-zero (e.g. the path doesn't exist at ``head``). In that
        case a GitHub Actions ``::warning::`` line is written to stderr
        so the dropped file is visible in the job log instead of
        disappearing silently.
    """
    proc = subprocess.run(
        ["git", "show", f"{head}:{path}"],
        # Decode explicitly so the result does not depend on the locale.
        capture_output=True, text=True, encoding="utf-8", check=False,
    )
    if proc.returncode != 0:
        # Keep the annotation on one line: use only git's first line.
        detail = (proc.stderr.strip().splitlines() or ["git show failed"])[0]
        print(
            f"::warning::Could not read {path!r} at {head}: {detail}",
            file=sys.stderr,
        )
        return None
    return proc.stdout
Comment on lines +31 to +48
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟠 Major | ⚡ Quick win

Fail closed when git show cannot read a changed file.

Returning `None` here lets `_collect_md_changes()` silently drop an added/modified Markdown file while the workflow still succeeds. On a rename, that can send `deleted=["old.md"]` without re-adding `new.md`, leaving the graph incomplete. This should raise and fail the job instead of emitting a partial payload.

🔧 Proposed fix
-def _read_at(head: str, path: str) -> str | None:
+def _read_at(head: str, path: str) -> str:
     """Read a file's content at a specific commit, regardless of what's
     currently checked out in the working tree.
@@
     proc = subprocess.run(
         ["git", "show", f"{head}:{path}"],
         capture_output=True, text=True, check=False,
     )
     if proc.returncode != 0:
-        return None
+        raise RuntimeError(
+            f"Failed to read {path!r} at {head}: {proc.stderr.strip() or 'git show failed'}"
+        )
     return proc.stdout

After this, the `if content is not None:` guards in `_collect_md_changes()` can be removed because unreadable changed files will already fail the step.

🧰 Tools
🪛 Ruff (0.15.12)

[error] 42-42: subprocess call: check for execution of untrusted input

(S603)


[error] 43-43: Starting a process with a partial executable path

(S607)

🤖 Prompt for AI Agents
Verify each finding against current code. Fix only still-valid issues, skip the
rest with a brief reason, keep changes minimal, and validate.

In @.github/scripts/build_diff_payload.py around lines 31 - 48, The _read_at
function currently returns None on a failing `git show`, allowing
_collect_md_changes to silently skip changed files; change _read_at (the
function named _read_at) to raise a clear exception (e.g., RuntimeError or a
custom error) when subprocess.run returns non-zero so the script fails fast when
a file cannot be read, and then remove the now-unnecessary `if content is not
None:` guards in _collect_md_changes to rely on the raised error for unreadable
changed files.



def _collect_md_changes(
    diff_output: str, head: str,
) -> tuple[dict[str, str], dict[str, str], list[str]]:
    """Parse ``git diff --name-status`` and bucket .md changes.

    Renames (``R``) are split into delete-old + add-new so the SDK
    re-extracts the content under the new path. Copies (``C``) add the
    new path and leave the old one alone. Non-.md files are skipped.
    File content for added/modified entries is read from the git object
    store at ``head``, not from disk; an entry whose content cannot be
    read is left out of its bucket.

    Args:
        diff_output: Raw ``git diff --name-status`` text, one
            tab-separated record per line.
        head: Commit SHA the file contents are read at.

    Returns:
        ``(added, modified, deleted)``: two path → content mappings and
        a list of deleted paths, in the order they appear in the diff.
    """
    added: dict[str, str] = {}
    modified: dict[str, str] = {}
    deleted: list[str] = []

    for line in diff_output.splitlines():
        parts = line.split("\t")
        # ``str.split`` never returns an empty list, so test the status
        # field itself: a blank line yields [""] and has no status letter.
        if not parts[0]:
            continue
        status = parts[0][0]  # strip similarity score, e.g. R100 → R

        if status in ("R", "C") and len(parts) >= 3:
            old, new = parts[1], parts[2]
            # Only a rename removes the old path; a copy keeps it.
            if status == "R" and old.endswith(".md"):
                deleted.append(old)
            if new.endswith(".md"):
                content = _read_at(head, new)
                if content is not None:
                    added[new] = content
            continue

        if len(parts) < 2 or not parts[1].endswith(".md"):
            continue
        path = parts[1]
        if status in ("A", "M"):
            content = _read_at(head, path)
            if content is not None:
                bucket = added if status == "A" else modified
                bucket[path] = content
        elif status == "D":
            deleted.append(path)

    return added, modified, deleted


def _set_output(name: str, value: str) -> None:
    """Record ``name=value`` as a step output for later workflow steps.

    The line is appended to the file named by the ``GITHUB_OUTPUT``
    environment variable. When that variable is unset or empty (local
    dev / standalone run) nothing is written.
    """
    target = os.environ.get("GITHUB_OUTPUT")
    if target:
        with open(target, "a", encoding="utf-8") as sink:
            sink.write("{}={}\n".format(name, value))


def main() -> int:
    """Build ``payload.json`` from the .md diff between two commits.

    Reads ``BASE_SHA`` and ``HEAD_SHA`` from the environment (both
    required) and ``GRAPH_ID`` (optional, defaults to
    ``docs_benchmark``). Sets the ``skip`` step output to ``true`` when
    no .md file was added, modified or deleted, and to ``false`` after
    writing the payload otherwise.

    Returns:
        Always 0; failures surface as exceptions.
    """
    base_sha = os.environ["BASE_SHA"]
    head_sha = os.environ["HEAD_SHA"]
    # An all-zero "before" has no commit to diff against, so compare
    # with git's empty tree instead.
    if set(base_sha) == {"0"}:
        base_sha = EMPTY_TREE_SHA

    added, modified, deleted = _collect_md_changes(
        _git_diff_name_status(base_sha, head_sha), head_sha,
    )

    if added or modified or deleted:
        body = json.dumps({
            "graph_id": os.environ.get("GRAPH_ID", "docs_benchmark"),
            "files": {"added": added, "modified": modified, "deleted": deleted},
        })
        pathlib.Path("payload.json").write_text(body, encoding="utf-8")
        print(f"::notice::Diff: +{len(added)} ~{len(modified)} -{len(deleted)} files")
        _set_output("skip", "false")
    else:
        print("::notice::No .md changes — skipping graph update.", file=sys.stderr)
        _set_output("skip", "true")
    return 0


# Script entry point: main()'s return value becomes the process exit status.
if __name__ == "__main__":
    raise SystemExit(main())
51 changes: 51 additions & 0 deletions .github/workflows/update-graph.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
# Incrementally updates the FalkorDB docs knowledge graph whenever .md
# files change on main. Computes the diff against the previous HEAD,
# POSTs it to GraphRAG-UI's /api/admin/update-graph endpoint, which does
# the SDK ingestion + smoke test + atomic alias flip server-side.

name: Update graph (incremental)

on:
push:
branches:
- main
paths:
- "**/*.md"

# `github.ref_name` is "main"; all pushes to main share one queue.
concurrency:
group: update-graph-${{ github.ref_name }}
cancel-in-progress: false
Comment thread
coderabbitai[bot] marked this conversation as resolved.

jobs:
update-graph:
runs-on: ubuntu-latest
timeout-minutes: 30
permissions:
contents: read
env:
GRAPH_ID: docs_benchmark
GRAPHRAG_UI_URL: ${{ vars.GRAPHRAG_UI_URL }}
steps:
- name: Checkout docs
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6
with:
fetch-depth: 0 # full history needed for the diff below

- name: Build diff payload
id: payload
env:
BASE_SHA: ${{ github.event.before }}
HEAD_SHA: ${{ github.sha }}
run: python3 .github/scripts/build_diff_payload.py
Comment thread
galshubeli marked this conversation as resolved.

# POST payload.json to the admin endpoint; skipped when the payload
# step reported that nothing ingestable changed.
- name: Call admin update-graph endpoint
  if: steps.payload.outputs.skip != 'true'
  env:
    # Hand the secret to the shell through the environment rather than
    # expanding it into the script text, so its value is never parsed
    # as shell syntax.
    UPDATE_GRAPH_TOKEN: ${{ secrets.UPDATE_GRAPH_TOKEN }}
  run: |
    curl -X POST "$GRAPHRAG_UI_URL/api/admin/update-graph" \
      -H "Authorization: Bearer $UPDATE_GRAPH_TOKEN" \
      -H "Content-Type: application/json" \
      --data-binary @payload.json \
      --fail-with-body \
      --show-error \
      --max-time 1800