-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathgithub_summary.py
More file actions
89 lines (65 loc) · 3.17 KB
/
github_summary.py
File metadata and controls
89 lines (65 loc) · 3.17 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
# github_summary.py
from typing import Dict, Any
from venv import logger
from utils.clone_repo import clone_repository_impl
from utils.repo_scan import analyze_repo
from utils.chunking import chunk_file
from utils.filter_file_map import filter_file_map
from utils.llm_setup import DELAY_API_RATE_LIMITS_SEC
from agents.file_summarization import initialize_file_summary, filesummary_graph
from agents.global_summarization import repo_summary_graph
from utils.logging_config import setup_logging
import os
import time
import logging
# Module-level logger named after this module. This assignment rebinds the
# `logger` name introduced by `from venv import logger` in the import block,
# so every later `logger.*` call in this file uses this logger instead.
logger = logging.getLogger(__name__)
# Set this logger's own level threshold to DEBUG.
logger.setLevel(logging.DEBUG)
def summarize_github_repo(repo_path: str, metadata: Dict[str, Any], max_iterations: int = 25) -> Dict[str, Any]:
    """Build a repository-level summary, seeded with the README summaries.

    Steps:
      1. Run ``analyze_repo`` on ``repo_path`` and filter the resulting file
         map with ``filter_file_map``.
      2. Summarize every filtered entry whose key contains "readme"
         (case-insensitive) via ``initialize_file_summary``.
      3. Invoke ``repo_summary_graph`` with the remaining entries, using the
         README summary text as the initial ``global_summary``.

    Args:
        repo_path: Location passed to ``analyze_repo``.
        metadata: Mutable mapping updated in place with the keys
            ``repo_tree``, ``file_map``, ``ext_counts`` and
            ``filtered_file_map``; it is also forwarded to the summarizers.
        max_iterations: Upper bound for the ``iter_max`` value handed to the
            repo summary graph.

    Returns:
        A dict with the keys ``summary``, ``technologies``, ``structure``,
        ``number_of_iterations``, ``readme_summary``,
        ``relevant_files_count``, ``processed_files_global`` and
        ``additional_files_summarized_global``.
    """
    # Analyze the repo: tree, file paths, and extension counts.
    repo_tree, file_map, ext_counts = analyze_repo(repo_path)
    metadata['repo_tree'] = repo_tree
    metadata['file_map'] = file_map
    metadata['ext_counts'] = ext_counts

    # Filter out unwanted files.
    metadata['filtered_file_map'] = filter_file_map(metadata['file_map'])
    candidate_files = list(metadata['filtered_file_map'].keys())

    # Summarize README-like files first (if any).
    documentation_files = [path for path in candidate_files if "readme" in path.lower()]
    readme_parts = []
    for doc_file in documentation_files:
        doc_state = initialize_file_summary(doc_file, metadata=metadata, global_summary="")
        doc_summary = doc_state['file_summary']
        # Keep every non-empty README summary; previously each iteration
        # overwrote the last, so only the final README's summary survived.
        if doc_summary:
            readme_parts.append(str(doc_summary))
        if DELAY_API_RATE_LIMITS_SEC > 0:
            logger.info(
                "Sleeping for %s seconds after summarizing README to respect API rate limits...",
                DELAY_API_RATE_LIMITS_SEC,
            )
            time.sleep(DELAY_API_RATE_LIMITS_SEC)
    # With a single README this equals that README's summary unchanged.
    readme_summary = "\n\n".join(readme_parts)

    # Drop the README files so the global summary does not process them again.
    already_summarized = set(documentation_files)
    relevant_files = [path for path in candidate_files if path not in already_summarized]

    # Limit iterations: roughly 10% of the remaining files, at least 10,
    # and never more than max_iterations.
    number_of_iterations = min(max_iterations, max(10, int(0.1 * len(relevant_files))))

    repo_state = {
        "metadata": metadata,
        "global_summary": readme_summary,
        "technologies": [],
        "structure": '',
        "relevant_files": relevant_files,
        "files_to_summarize": None,
        "additional_files_summarized": {},
        "processed_files": [],
        "additional_files_summarized_global": {},
        "processed_files_global": [],
        "iter_max": number_of_iterations,
        "counter": 0,
    }

    repo_summary_result = repo_summary_graph.invoke(repo_state)

    return {
        "summary": repo_summary_result.get("global_summary", ""),
        "technologies": repo_summary_result.get("technologies", []),
        "structure": repo_summary_result.get("structure", ""),
        "number_of_iterations": number_of_iterations,
        "readme_summary": readme_summary,
        "relevant_files_count": len(relevant_files),
        "processed_files_global": repo_summary_result.get("processed_files_global", []),
        "additional_files_summarized_global": repo_summary_result.get("additional_files_summarized_global", {}),
    }