|
| 1 | +import base64 |
1 | 2 | import os |
2 | 3 | import shutil |
3 | 4 | import subprocess |
| 5 | +from pathlib import Path |
| 6 | +from typing import List, Optional, Union |
4 | 7 |
|
5 | 8 | from pydantic import BaseModel |
6 | 9 |
|
7 | 10 | from metagpt.const import DEFAULT_WORKSPACE_ROOT |
| 11 | +from metagpt.logs import logger |
8 | 12 | from metagpt.tools.tool_registry import register_tool |
| 13 | +from metagpt.utils import read_docx |
| 14 | +from metagpt.utils.common import aread_bin, awrite_bin, run_coroutine_sync |
| 15 | +from metagpt.utils.repo_to_markdown import is_text_file |
9 | 16 | from metagpt.utils.report import EditorReporter |
10 | 17 |
|
11 | 18 |
|
@@ -40,12 +47,26 @@ def write(self, path: str, content: str): |
40 | 47 |
|
41 | 48 | def read(self, path: str) -> FileBlock: |
42 | 49 | """Read the whole content of a file. Using absolute paths as the argument for specifying the file location.""" |
43 | | - with open(path, "r") as f: |
44 | | - self.resource.report(path, "path") |
45 | | - lines = f.readlines() |
| 50 | + is_text, mime_type = run_coroutine_sync(is_text_file, path) |
| 51 | + if is_text: |
| 52 | + lines = self._read_text(path) |
| 53 | + elif mime_type == "application/pdf": |
| 54 | + lines = self._read_pdf(path) |
| 55 | + elif mime_type in { |
| 56 | + "application/msword", |
| 57 | + "application/vnd.openxmlformats-officedocument.wordprocessingml.document", |
| 58 | + "application/vnd.ms-word.document.macroEnabled.12", |
| 59 | + "application/vnd.openxmlformats-officedocument.wordprocessingml.template", |
| 60 | + "application/vnd.ms-word.template.macroEnabled.12", |
| 61 | + }: |
| 62 | + lines = self._read_docx(path) |
| 63 | + else: |
| 64 | + return FileBlock(file_path=str(path), block_content="") |
| 65 | + self.resource.report(str(path), "path") |
| 66 | + |
46 | 67 | lines_with_num = [f"{i + 1:03}|{line}" for i, line in enumerate(lines)] |
47 | 68 | result = FileBlock( |
48 | | - file_path=path, |
| 69 | + file_path=str(path), |
49 | 70 | block_content="".join(lines_with_num), |
50 | 71 | ) |
51 | 72 | return result |
@@ -196,3 +217,63 @@ def _lint_file(cls, file_path: str) -> (bool, str): |
196 | 217 | lint_passed = result.returncode == 0 |
197 | 218 | lint_message = result.stdout |
198 | 219 | return lint_passed, lint_message |
| 220 | + |
| 221 | + @staticmethod |
| 222 | + def _read_text(path: Union[str, Path]) -> List[str]: |
| 223 | + with open(str(path), "r") as f: |
| 224 | + lines = f.readlines() |
| 225 | + return lines |
| 226 | + |
| 227 | + @staticmethod |
| 228 | + def _read_pdf(path: Union[str, Path]) -> List[str]: |
| 229 | + result = run_coroutine_sync(Editor._omniparse_read_file, path) |
| 230 | + if result: |
| 231 | + return result |
| 232 | + |
| 233 | + from llama_index.readers.file import PDFReader |
| 234 | + |
| 235 | + reader = PDFReader() |
| 236 | + lines = reader.load_data(file=Path(path)) |
| 237 | + return [i.text for i in lines] |
| 238 | + |
| 239 | + @staticmethod |
| 240 | + def _read_docx(path: Union[str, Path]) -> List[str]: |
| 241 | + result = run_coroutine_sync(Editor._omniparse_read_file, path) |
| 242 | + if result: |
| 243 | + return result |
| 244 | + return read_docx(str(path)) |
| 245 | + |
| 246 | + @staticmethod |
| 247 | + async def _omniparse_read_file(path: Union[str, Path]) -> Optional[List[str]]: |
| 248 | + from metagpt.tools.libs import get_env_default |
| 249 | + from metagpt.utils.omniparse_client import OmniParseClient |
| 250 | + |
| 251 | + base_url = await get_env_default(key="base_url", app_name="OmniParse", default_value="") |
| 252 | + if not base_url: |
| 253 | + return None |
| 254 | + api_key = await get_env_default(key="api_key", app_name="OmniParse", default_value="") |
| 255 | + v = await get_env_default(key="timeout", app_name="OmniParse", default_value="120") |
| 256 | + try: |
| 257 | + timeout = int(v) or 120 |
| 258 | + except ValueError: |
| 259 | + timeout = 120 |
| 260 | + |
| 261 | + try: |
| 262 | + client = OmniParseClient(api_key=api_key, base_url=base_url, max_timeout=timeout) |
| 263 | + file_data = await aread_bin(filename=path) |
| 264 | + ret = await client.parse_document(file_input=file_data, bytes_filename=str(path)) |
| 265 | + except (ValueError, Exception) as e: |
| 266 | + logger.exception(f"{path}: {e}") |
| 267 | + return None |
| 268 | + if not ret.images: |
| 269 | + return [ret.text] if ret.text else None |
| 270 | + |
| 271 | + result = [ret.text] |
| 272 | + img_dir = Path(path).parent / (Path(path).name.replace(".", "_") + "_images") |
| 273 | + img_dir.mkdir(parents=True, exist_ok=True) |
| 274 | + for i in ret.images: |
| 275 | + byte_data = base64.b64decode(i.image) |
| 276 | + filename = img_dir / i.image_name |
| 277 | + await awrite_bin(filename=filename, data=byte_data) |
| 278 | + result.append(f"![{i.image_name}]({str(filename)})") |
| 279 | + return result |
0 commit comments