From c7b79442123d059e72b3dab60908dc40ceeb8c30 Mon Sep 17 00:00:00 2001
From: Aldo Gonzalez <aldo.gonzalez@airbyte.io>
Date: Thu, 14 Nov 2024 08:51:09 -0600
Subject: [PATCH 1/5] airbyte-cdk: trying to fix unit tests

---
 .../file_based/file_types/unstructured_parser.py      | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

diff --git a/airbyte_cdk/sources/file_based/file_types/unstructured_parser.py b/airbyte_cdk/sources/file_based/file_types/unstructured_parser.py
index 5967a79a4..75da977de 100644
--- a/airbyte_cdk/sources/file_based/file_types/unstructured_parser.py
+++ b/airbyte_cdk/sources/file_based/file_types/unstructured_parser.py
@@ -162,6 +162,15 @@ def parse_records(
                     logger.warn(f"File {file.uri} cannot be parsed. Skipping it.")
                 else:
                     raise e
+            except Exception as e:
+                exception_str = str(e)
+                logger.warn(f"File {file.uri} caused an error during parsing: {exception_str}.")
+                yield {
+                    "content": None,
+                    "document_key": file.uri,
+                    "_ab_source_file_parse_error": exception_str,
+                }
+                logger.warn(f"File {file.uri} cannot be parsed. Skipping it.")
 
     def _read_file(
         self,
@@ -186,7 +195,7 @@ def _read_file(
                 remote_file,
                 self._get_file_type_error_message(filetype),
             )
-        if filetype in {FileType.MD, filetype is FileType.TXT}:
+        if filetype in {FileType.MD, FileType.TXT}:
             file_content: bytes = file_handle.read()
             decoded_content: str = optional_decode(file_content)
             return decoded_content

From a83c0f7070c6714b46337f84cd555026851767c0 Mon Sep 17 00:00:00 2001
From: Aldo Gonzalez <aldo.gonzalez@airbyte.io>
Date: Thu, 14 Nov 2024 16:28:24 -0600
Subject: [PATCH 2/5] airbyte-cdk: download dependencies if missing punkt_tab or
 punkt

---
 .../file_based/file_types/unstructured_parser.py | 16 +++++++++-------
 1 file changed, 9 insertions(+), 7 deletions(-)

diff --git a/airbyte_cdk/sources/file_based/file_types/unstructured_parser.py b/airbyte_cdk/sources/file_based/file_types/unstructured_parser.py
index 75da977de..df9e6fc28 100644
--- a/airbyte_cdk/sources/file_based/file_types/unstructured_parser.py
+++ b/airbyte_cdk/sources/file_based/file_types/unstructured_parser.py
@@ -35,11 +35,18 @@
     FileType,
     detect_filetype,
 )
+import nltk
 
 unstructured_partition_pdf = None
 unstructured_partition_docx = None
 unstructured_partition_pptx = None
 
+try:
+    nltk.data.find("tokenizers/punkt.zip")
+    nltk.data.find("tokenizers/punkt_tab.zip")
+except LookupError:
+    nltk.download('punkt')
+    nltk.download('punkt_tab')
 
 def optional_decode(contents: Union[str, bytes]) -> str:
     if isinstance(contents, bytes):
@@ -164,13 +171,8 @@ def parse_records(
                     raise e
             except Exception as e:
                 exception_str = str(e)
-                logger.warn(f"File {file.uri} caused an error during parsing: {exception_str}.")
-                yield {
-                    "content": None,
-                    "document_key": file.uri,
-                    "_ab_source_file_parse_error": exception_str,
-                }
-                logger.warn(f"File {file.uri} cannot be parsed. Skipping it.")
+                logger.error(f"File {file.uri} caused an error during parsing: {exception_str}.")
+                raise e
 
     def _read_file(
         self,

From 70742a66b3728d7330cbb970f37a22db89b604c5 Mon Sep 17 00:00:00 2001
From: Aldo Gonzalez <aldo.gonzalez@airbyte.io>
Date: Thu, 14 Nov 2024 16:34:31 -0600
Subject: [PATCH 3/5] airbyte-cdk: ruff fix

---
 .../sources/file_based/file_types/unstructured_parser.py     | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/airbyte_cdk/sources/file_based/file_types/unstructured_parser.py b/airbyte_cdk/sources/file_based/file_types/unstructured_parser.py
index df9e6fc28..7dafc83d9 100644
--- a/airbyte_cdk/sources/file_based/file_types/unstructured_parser.py
+++ b/airbyte_cdk/sources/file_based/file_types/unstructured_parser.py
@@ -45,8 +45,9 @@
     nltk.data.find("tokenizers/punkt.zip")
     nltk.data.find("tokenizers/punkt_tab.zip")
 except LookupError:
-    nltk.download('punkt')
-    nltk.download('punkt_tab')
+    nltk.download("punkt")
+    nltk.download("punkt_tab")
+
 
 def optional_decode(contents: Union[str, bytes]) -> str:
     if isinstance(contents, bytes):

From fb545f8b456db276b6d23e4aed6739ce86073902 Mon Sep 17 00:00:00 2001
From: Aldo Gonzalez <aldo.gonzalez@airbyte.io>
Date: Fri, 15 Nov 2024 13:46:13 -0600
Subject: [PATCH 4/5] airbyte-cdk: fix mypy and update .gitignore

---
 .gitignore                                                   | 1 +
 .../sources/file_based/file_types/unstructured_parser.py     | 5 ++++-
 2 files changed, 5 insertions(+), 1 deletion(-)

diff --git a/.gitignore b/.gitignore
index fe51d38a0..b95d9039b 100644
--- a/.gitignore
+++ b/.gitignore
@@ -12,4 +12,5 @@ dist
 .mypy_cache
 .venv
 .pytest_cache
+.idea
 **/__pycache__
diff --git a/airbyte_cdk/sources/file_based/file_types/unstructured_parser.py b/airbyte_cdk/sources/file_based/file_types/unstructured_parser.py
index 7dafc83d9..ba36f1616 100644
--- a/airbyte_cdk/sources/file_based/file_types/unstructured_parser.py
+++ b/airbyte_cdk/sources/file_based/file_types/unstructured_parser.py
@@ -430,7 +430,10 @@ def _render_markdown(self, elements: List[Any]) -> str:
 
     def _convert_to_markdown(self, el: Dict[str, Any]) -> str:
         if dpath.get(el, "type") == "Title":
-            heading_str = "#" * (dpath.get(el, "metadata/category_depth", default=1) or 1)
+            category_depth = dpath.get(el, "metadata/category_depth", default=1) or 1
+            if not isinstance(category_depth, int):
+                category_depth = int(category_depth) if isinstance(category_depth, (str, float)) else 1
+            heading_str = "#" * category_depth
             return f"{heading_str} {dpath.get(el, 'text')}"
         elif dpath.get(el, "type") == "ListItem":
             return f"- {dpath.get(el, 'text')}"

From b79ad2105841c1a1443b26d4a1d93dd44f2e022d Mon Sep 17 00:00:00 2001
From: Aldo Gonzalez <aldo.gonzalez@airbyte.io>
Date: Fri, 15 Nov 2024 13:50:11 -0600
Subject: [PATCH 5/5] airbyte-cdk: fix ruff

---
 .../sources/file_based/file_types/unstructured_parser.py      | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/airbyte_cdk/sources/file_based/file_types/unstructured_parser.py b/airbyte_cdk/sources/file_based/file_types/unstructured_parser.py
index ba36f1616..972240dc6 100644
--- a/airbyte_cdk/sources/file_based/file_types/unstructured_parser.py
+++ b/airbyte_cdk/sources/file_based/file_types/unstructured_parser.py
@@ -432,7 +432,9 @@ def _convert_to_markdown(self, el: Dict[str, Any]) -> str:
         if dpath.get(el, "type") == "Title":
             category_depth = dpath.get(el, "metadata/category_depth", default=1) or 1
             if not isinstance(category_depth, int):
-                category_depth = int(category_depth) if isinstance(category_depth, (str, float)) else 1
+                category_depth = (
+                    int(category_depth) if isinstance(category_depth, (str, float)) else 1
+                )
             heading_str = "#" * category_depth
             return f"{heading_str} {dpath.get(el, 'text')}"
         elif dpath.get(el, "type") == "ListItem":