From c7b79442123d059e72b3dab60908dc40ceeb8c30 Mon Sep 17 00:00:00 2001 From: Aldo Gonzalez Date: Thu, 14 Nov 2024 08:51:09 -0600 Subject: [PATCH 1/5] airbyte-cdk: trying to fix unit tests --- .../file_based/file_types/unstructured_parser.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/airbyte_cdk/sources/file_based/file_types/unstructured_parser.py b/airbyte_cdk/sources/file_based/file_types/unstructured_parser.py index 5967a79a4..75da977de 100644 --- a/airbyte_cdk/sources/file_based/file_types/unstructured_parser.py +++ b/airbyte_cdk/sources/file_based/file_types/unstructured_parser.py @@ -162,6 +162,15 @@ def parse_records( logger.warn(f"File {file.uri} cannot be parsed. Skipping it.") else: raise e + except Exception as e: + exception_str = str(e) + logger.warn(f"File {file.uri} caused an error during parsing: {exception_str}.") + yield { + "content": None, + "document_key": file.uri, + "_ab_source_file_parse_error": exception_str, + } + logger.warn(f"File {file.uri} cannot be parsed. 
Skipping it.") def _read_file( self, @@ -186,7 +195,7 @@ def _read_file( remote_file, self._get_file_type_error_message(filetype), ) - if filetype in {FileType.MD, filetype is FileType.TXT}: + if filetype in {FileType.MD, FileType.TXT}: file_content: bytes = file_handle.read() decoded_content: str = optional_decode(file_content) return decoded_content From a83c0f7070c6714b46337f84cd555026851767c0 Mon Sep 17 00:00:00 2001 From: Aldo Gonzalez Date: Thu, 14 Nov 2024 16:28:24 -0600 Subject: [PATCH 2/5] airbyte-cdk: download dependencies if missing punkt_tab or punkt --- .../file_based/file_types/unstructured_parser.py | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/airbyte_cdk/sources/file_based/file_types/unstructured_parser.py b/airbyte_cdk/sources/file_based/file_types/unstructured_parser.py index 75da977de..df9e6fc28 100644 --- a/airbyte_cdk/sources/file_based/file_types/unstructured_parser.py +++ b/airbyte_cdk/sources/file_based/file_types/unstructured_parser.py @@ -35,11 +35,18 @@ FileType, detect_filetype, ) +import nltk unstructured_partition_pdf = None unstructured_partition_docx = None unstructured_partition_pptx = None +try: + nltk.data.find("tokenizers/punkt.zip") + nltk.data.find("tokenizers/punkt_tab.zip") +except LookupError: + nltk.download('punkt') + nltk.download('punkt_tab') def optional_decode(contents: Union[str, bytes]) -> str: if isinstance(contents, bytes): @@ -164,13 +171,8 @@ def parse_records( raise e except Exception as e: exception_str = str(e) - logger.warn(f"File {file.uri} caused an error during parsing: {exception_str}.") - yield { - "content": None, - "document_key": file.uri, - "_ab_source_file_parse_error": exception_str, - } - logger.warn(f"File {file.uri} cannot be parsed. 
Skipping it.") + logger.error(f"File {file.uri} caused an error during parsing: {exception_str}.") + raise e def _read_file( self, From 70742a66b3728d7330cbb970f37a22db89b604c5 Mon Sep 17 00:00:00 2001 From: Aldo Gonzalez Date: Thu, 14 Nov 2024 16:34:31 -0600 Subject: [PATCH 3/5] airbyte-cdk: ruff fix --- .../sources/file_based/file_types/unstructured_parser.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/airbyte_cdk/sources/file_based/file_types/unstructured_parser.py b/airbyte_cdk/sources/file_based/file_types/unstructured_parser.py index df9e6fc28..7dafc83d9 100644 --- a/airbyte_cdk/sources/file_based/file_types/unstructured_parser.py +++ b/airbyte_cdk/sources/file_based/file_types/unstructured_parser.py @@ -45,8 +45,9 @@ nltk.data.find("tokenizers/punkt.zip") nltk.data.find("tokenizers/punkt_tab.zip") except LookupError: - nltk.download('punkt') - nltk.download('punkt_tab') + nltk.download("punkt") + nltk.download("punkt_tab") + def optional_decode(contents: Union[str, bytes]) -> str: if isinstance(contents, bytes): From fb545f8b456db276b6d23e4aed6739ce86073902 Mon Sep 17 00:00:00 2001 From: Aldo Gonzalez Date: Fri, 15 Nov 2024 13:46:13 -0600 Subject: [PATCH 4/5] airbyte-cdk: fix mypy and update .gitignore --- .gitignore | 1 + .../sources/file_based/file_types/unstructured_parser.py | 5 ++++- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index fe51d38a0..b95d9039b 100644 --- a/.gitignore +++ b/.gitignore @@ -12,4 +12,5 @@ dist .mypy_cache .venv .pytest_cache +.idea **/__pycache__ diff --git a/airbyte_cdk/sources/file_based/file_types/unstructured_parser.py b/airbyte_cdk/sources/file_based/file_types/unstructured_parser.py index 7dafc83d9..ba36f1616 100644 --- a/airbyte_cdk/sources/file_based/file_types/unstructured_parser.py +++ b/airbyte_cdk/sources/file_based/file_types/unstructured_parser.py @@ -430,7 +430,10 @@ def _render_markdown(self, elements: List[Any]) -> str: def 
_convert_to_markdown(self, el: Dict[str, Any]) -> str: if dpath.get(el, "type") == "Title": - heading_str = "#" * (dpath.get(el, "metadata/category_depth", default=1) or 1) + category_depth = dpath.get(el, "metadata/category_depth", default=1) or 1 + if not isinstance(category_depth, int): + category_depth = int(category_depth) if isinstance(category_depth, (str, float)) else 1 + heading_str = "#" * category_depth return f"{heading_str} {dpath.get(el, 'text')}" elif dpath.get(el, "type") == "ListItem": return f"- {dpath.get(el, 'text')}" From b79ad2105841c1a1443b26d4a1d93dd44f2e022d Mon Sep 17 00:00:00 2001 From: Aldo Gonzalez Date: Fri, 15 Nov 2024 13:50:11 -0600 Subject: [PATCH 5/5] airbyte-cdk: fix ruff --- .../sources/file_based/file_types/unstructured_parser.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/airbyte_cdk/sources/file_based/file_types/unstructured_parser.py b/airbyte_cdk/sources/file_based/file_types/unstructured_parser.py index ba36f1616..972240dc6 100644 --- a/airbyte_cdk/sources/file_based/file_types/unstructured_parser.py +++ b/airbyte_cdk/sources/file_based/file_types/unstructured_parser.py @@ -432,7 +432,9 @@ def _convert_to_markdown(self, el: Dict[str, Any]) -> str: if dpath.get(el, "type") == "Title": category_depth = dpath.get(el, "metadata/category_depth", default=1) or 1 if not isinstance(category_depth, int): - category_depth = int(category_depth) if isinstance(category_depth, (str, float)) else 1 + category_depth = ( + int(category_depth) if isinstance(category_depth, (str, float)) else 1 + ) heading_str = "#" * category_depth return f"{heading_str} {dpath.get(el, 'text')}" elif dpath.get(el, "type") == "ListItem":