From 19f8c353ca060c4325ffc04d1f937f7c93ba58b5 Mon Sep 17 00:00:00 2001
From: drunkpig <60862764+drunkpig@users.noreply.github.com>
Date: Sun, 10 Aug 2025 12:30:45 +0800
Subject: [PATCH 1/3] fix: make noclip pre-extractors process main_html instead of html

---
 llm_web_kit/config/pipe_tpl/noclip_html.jsonc |  4 +-
 .../config/pipe_tpl/noclip_html_test.jsonc    |  4 +-
 llm_web_kit/extractor/html/pre_extractor.py   | 41 +++++++++++++++++--
 3 files changed, 41 insertions(+), 8 deletions(-)

diff --git a/llm_web_kit/config/pipe_tpl/noclip_html.jsonc b/llm_web_kit/config/pipe_tpl/noclip_html.jsonc
index 25692972..20200c8e 100644
--- a/llm_web_kit/config/pipe_tpl/noclip_html.jsonc
+++ b/llm_web_kit/config/pipe_tpl/noclip_html.jsonc
@@ -10,11 +10,11 @@
                 },
                 {
                     "enable": true,
-                    "python_class": "llm_web_kit.extractor.html.pre_extractor.HTMLFileFormatFilterTablePreExtractor"
+                    "python_class": "llm_web_kit.extractor.html.pre_extractor.HTMLFileFormatNoClipFilterTablePreExtractor"
                 },
                 {
                     "enable": true,
-                    "python_class": "llm_web_kit.extractor.html.pre_extractor.HTMLFileFormatCleanTagsPreExtractor",
+                    "python_class": "llm_web_kit.extractor.html.pre_extractor.HTMLFileFormatNoClipCleanTagsPreExtractor",
                     "class_init_kwargs": {}
                 }
             ],
diff --git a/llm_web_kit/config/pipe_tpl/noclip_html_test.jsonc b/llm_web_kit/config/pipe_tpl/noclip_html_test.jsonc
index 0e471ce6..cce73d0d 100644
--- a/llm_web_kit/config/pipe_tpl/noclip_html_test.jsonc
+++ b/llm_web_kit/config/pipe_tpl/noclip_html_test.jsonc
@@ -17,11 +17,11 @@
                 },
                 {
                     "enable": true,
-                    "python_class": "llm_web_kit.extractor.html.pre_extractor.HTMLFileFormatFilterTablePreExtractor"
+                    "python_class": "llm_web_kit.extractor.html.pre_extractor.HTMLFileFormatNoClipFilterTablePreExtractor"
                 },
                 {
                     "enable": true,
-                    "python_class": "llm_web_kit.extractor.html.pre_extractor.HTMLFileFormatCleanTagsPreExtractor",
+                    "python_class": "llm_web_kit.extractor.html.pre_extractor.HTMLFileFormatNoClipCleanTagsPreExtractor",
                     "class_init_kwargs": {}
                 }
             ],
diff --git a/llm_web_kit/extractor/html/pre_extractor.py b/llm_web_kit/extractor/html/pre_extractor.py
index d95f06e5..832b6bab 100644
--- a/llm_web_kit/extractor/html/pre_extractor.py
+++ b/llm_web_kit/extractor/html/pre_extractor.py
@@ -49,7 +49,14 @@ def _do_pre_extract(self, data_json: DataJson) -> DataJson:
 
     def __remove_format_table(self, data_json: DataJson):
         """remove 排版table."""
-        html_content = data_json['html']
+        html_content = self._get_html_content(data_json)
+        return self.__do_remove_layout_table(html_content)
+
+    def _get_html_content(self, data_json: DataJson):
+        return data_json['html']
+
+    def __do_remove_layout_table(self, html_content: str):
+        """Remove layout tables."""
         html_str = html_to_element(html_content)
         first_structure = html_str.xpath('/html/body/table') != []
         second_structure = html_str.xpath('/html/body/center/table') != []
@@ -95,12 +102,12 @@ def __init__(self, config: dict):
 
     @override
     def _do_pre_extract(self, data_json: DataJson) -> DataJson:
-        data_json['html'] = self.__clean_invisible_elements(data_json)
+        html_content = data_json['html']
+        data_json['html'] = self.__clean_invisible_elements(html_content, data_json)
         return data_json
 
-    def __clean_invisible_elements(self, data_json: DataJson) -> str:
+    def __clean_invisible_elements(self, html_content: str, data_json: DataJson) -> str:
         """清理隐藏标签."""
-        html_content = data_json['html']
         tree = html_to_element(html_content)
         # 遍历所有配置的隐藏标签规则
         for tag in INVISIBLE_TAGS:
@@ -184,3 +191,29 @@ def __clean_interactive_elements(self, data_json: DataJson) -> str:
             if len(form.getchildren()) == 0 or not form.text_content().strip():
                 form.getparent().remove(form)
         return element_to_html(tree)
+
+# ##############################################################################
+# Resolve the mix-up between main_html and html processing
+# ##############################################################################
+
+
+class HTMLFileFormatNoClipFilterTablePreExtractor(HTMLFileFormatFilterTablePreExtractor):
+    """Pre-process main_html for the noclip pipeline."""
+    def __init__(self, config: dict):
+        super().__init__(config)
+
+    @override
+    def _get_html_content(self, data_json: DataJson):
+        return data_json['main_html']
+
+
+class HTMLFileFormatNoClipCleanTagsPreExtractor(HTMLFileFormatCleanTagsPreExtractor):
+    """Pre-process main_html for the noclip pipeline."""
+    def __init__(self, config: dict):
+        super().__init__(config)
+
+    @override
+    def _do_pre_extract(self, data_json: DataJson) -> DataJson:
+        html_content = data_json['main_html']
+        data_json['main_html'] = self.__clean_invisible_elements(html_content, data_json)
+        return data_json

From 51a14bdb3395d3bec36f20efd1904c6803957d65 Mon Sep 17 00:00:00 2001
From: drunkpig <60862764+drunkpig@users.noreply.github.com>
Date: Sun, 10 Aug 2025 12:45:36 +0800
Subject: [PATCH 2/3] fix: make function visible in subclass

---
 llm_web_kit/extractor/html/pre_extractor.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/llm_web_kit/extractor/html/pre_extractor.py b/llm_web_kit/extractor/html/pre_extractor.py
index 832b6bab..c4247c03 100644
--- a/llm_web_kit/extractor/html/pre_extractor.py
+++ b/llm_web_kit/extractor/html/pre_extractor.py
@@ -103,10 +103,10 @@ def __init__(self, config: dict):
     @override
     def _do_pre_extract(self, data_json: DataJson) -> DataJson:
         html_content = data_json['html']
-        data_json['html'] = self.__clean_invisible_elements(html_content, data_json)
+        data_json['html'] = self._clean_invisible_elements(html_content, data_json)
         return data_json
 
-    def __clean_invisible_elements(self, html_content: str, data_json: DataJson) -> str:
+    def _clean_invisible_elements(self, html_content: str, data_json: DataJson) -> str:
         """清理隐藏标签."""
         tree = html_to_element(html_content)
         # 遍历所有配置的隐藏标签规则
@@ -215,5 +215,5 @@ def __init__(self, config: dict):
     @override
     def _do_pre_extract(self, data_json: DataJson) -> DataJson:
         html_content = data_json['main_html']
-        data_json['main_html'] = self.__clean_invisible_elements(html_content, data_json)
+        data_json['main_html'] = self._clean_invisible_elements(html_content, data_json)
         return data_json

From f4bb5ed208c94a67356950289b27a93e21db357e Mon Sep 17 00:00:00 2001
From: drunkpig <60862764+drunkpig@users.noreply.github.com>
Date: Sun, 10 Aug 2025 13:09:12 +0800
Subject: [PATCH 3/3] fix: unit test of hidden text

---
 tests/llm_web_kit/extractor/html/recognizer/test_text.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/tests/llm_web_kit/extractor/html/recognizer/test_text.py b/tests/llm_web_kit/extractor/html/recognizer/test_text.py
index a17f16cb..9dd16050 100644
--- a/tests/llm_web_kit/extractor/html/recognizer/test_text.py
+++ b/tests/llm_web_kit/extractor/html/recognizer/test_text.py
@@ -428,7 +428,8 @@ def test_normalize_space4(self):
         input_data = DataJson(test_data)
         result = chain.extract(input_data)
         content_md = result.get_content_list().to_mm_md()
-        assert 'Show Ignored Content\n  1. DrDu\n\n Lieber Hendrik, \n\n kannst Du hierzu was beitragen? \n\n Ich finde keinen rechten Grund' in content_md
+        assert '1. DrDu\n\n Lieber Hendrik, \n\n kannst Du hierzu was beitragen? \n\n Ich finde keinen rechten Grund' in content_md
+        assert 'Show Ignored Content' not in content_md  # This text sits in a hidden tag and must not be extracted
 
     def test_Lack_content1(self):
         """