-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathindex_abcd.py
More file actions
75 lines (64 loc) · 2.18 KB
/
index_abcd.py
File metadata and controls
75 lines (64 loc) · 2.18 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
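#!/usr/bin/env python
r"""
Build the ABCD publications PaperQA index.

Approximate Python equivalent of:
pqa --agent.index.name abcd_pubs \
--agent.index.index_directory abcd-index \
--parsing.parse_pdf "paperqa_pypdf.parse_pdf_to_pages" \
--agent.index.paper_directory ../data/ABCD.Pubs \
--agent.index.concurrency 2 \
index ../data/ABCD.Pubs

Differences from the command above: this script passes an index concurrency
of 1 rather than 2, skips the papers listed in ``files_to_skip``, and sets the
LLM, summary LLM, embedding model and per-model rate limits explicitly.
"""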
import asyncio
from pathlib import Path
from paperqa import Settings
from paperqa.settings import AgentSettings, IndexSettings, ParsingSettings
from paperqa.agents import get_directory_index
import paperqa_pypdf
from litellm.litellm_core_utils.logging_callback_manager import LoggingCallbackManager
# Override the MAX_CALLBACKS class attribute on litellm's LoggingCallbackManager.
# NOTE(review): presumably this raises the cap on registered logging callbacks
# so a long indexing run does not hit it; confirm against the installed
# litellm version.
LoggingCallbackManager.MAX_CALLBACKS = 2000
# Model name that main() uses for both the main LLM and the summary LLM.
DEFAULT_LLM_MODEL = "gpt-5-mini"
# Per-minute count that main() interpolates into its "<n> per 1 minute"
# rate-limit strings.
DEFAULT_RATE_LIMIT_PER_MINUTE = 10000
# Stems (file name minus its final extension) of the papers that must be left
# out of the index.
files_to_skip = [
    "Mummaneni-2023-10.1002_hbm.26488",
    "Park-2024-10.7554_eLife.88117",
]


def index_files(file_path: Path) -> bool:
    """Return True when *file_path* should be included in the index.

    A file is rejected only when its stem is listed in ``files_to_skip``;
    the directory part and the extension play no role in the decision.

    NOTE(review): because only the stem is checked, every other file in the
    paper directory is accepted regardless of extension. Confirm that is
    intended if the directory can contain non-document files.
    """
    is_excluded = file_path.stem in files_to_skip
    return not is_excluded
async def main():
    """Build the ``abcd_pubs`` index, then print its name and directory.

    The settings mirror the CLI invocation quoted in the module docstring,
    with the same model used for the main and the summary LLM and an
    identical rate limit applied to each.

    NOTE(review): the CLI command in the module docstring passes an index
    concurrency of 2, while this function passes 1. Confirm which value is
    intended.
    """
    model_name = DEFAULT_LLM_MODEL
    rate_limit_spec = f"{DEFAULT_RATE_LIMIT_PER_MINUTE} per 1 minute"

    # Where the papers live, where the index goes, and which files to take.
    index_settings = IndexSettings(
        name="abcd_pubs",
        index_directory=Path("abcd-index"),
        paper_directory=Path("../data/ABCD.Pubs"),
        concurrency=1,
        files_filter=index_files,
    )

    settings = Settings(
        verbosity=1,
        llm=model_name,
        summary_llm=model_name,
        embedding="text-embedding-3-small",
        # Two separate dict literals so the two configs never share state.
        llm_config={"rate_limit": {model_name: rate_limit_spec}},
        summary_llm_config={"rate_limit": {model_name: rate_limit_spec}},
        parsing=ParsingSettings(parse_pdf=paperqa_pypdf.parse_pdf_to_pages),
        agent=AgentSettings(index=index_settings),
    )

    # Build the index (equivalent to `pqa index ../data/ABCD.Pubs`); the
    # returned index object is not needed afterwards.
    await get_directory_index(settings=settings)

    print(f"Index built: {settings.agent.index.name}")
    print(f"Index directory: {settings.agent.index.index_directory}")


if __name__ == "__main__":
    # Run the coroutine only when executed as a script.
    asyncio.run(main())