From 19f8c353ca060c4325ffc04d1f937f7c93ba58b5 Mon Sep 17 00:00:00 2001 From: drunkpig <60862764+drunkpig@users.noreply.github.com> Date: Sun, 10 Aug 2025 12:30:45 +0800 Subject: [PATCH 1/3] fix: noclip pre-extract problem --- llm_web_kit/config/pipe_tpl/noclip_html.jsonc | 4 +- .../config/pipe_tpl/noclip_html_test.jsonc | 4 +- llm_web_kit/extractor/html/pre_extractor.py | 41 +++++++++++++++++-- 3 files changed, 41 insertions(+), 8 deletions(-) diff --git a/llm_web_kit/config/pipe_tpl/noclip_html.jsonc b/llm_web_kit/config/pipe_tpl/noclip_html.jsonc index 25692972..20200c8e 100644 --- a/llm_web_kit/config/pipe_tpl/noclip_html.jsonc +++ b/llm_web_kit/config/pipe_tpl/noclip_html.jsonc @@ -10,11 +10,11 @@ }, { "enable": true, - "python_class": "llm_web_kit.extractor.html.pre_extractor.HTMLFileFormatFilterTablePreExtractor" + "python_class": "llm_web_kit.extractor.html.pre_extractor.HTMLFileFormatNoClipFilterTablePreExtractor" }, { "enable": true, - "python_class": "llm_web_kit.extractor.html.pre_extractor.HTMLFileFormatCleanTagsPreExtractor", + "python_class": "llm_web_kit.extractor.html.pre_extractor.HTMLFileFormatNoClipCleanTagsPreExtractor", "class_init_kwargs": {} } ], diff --git a/llm_web_kit/config/pipe_tpl/noclip_html_test.jsonc b/llm_web_kit/config/pipe_tpl/noclip_html_test.jsonc index 0e471ce6..cce73d0d 100644 --- a/llm_web_kit/config/pipe_tpl/noclip_html_test.jsonc +++ b/llm_web_kit/config/pipe_tpl/noclip_html_test.jsonc @@ -17,11 +17,11 @@ }, { "enable": true, - "python_class": "llm_web_kit.extractor.html.pre_extractor.HTMLFileFormatFilterTablePreExtractor" + "python_class": "llm_web_kit.extractor.html.pre_extractor.HTMLFileFormatNoClipFilterTablePreExtractor" }, { "enable": true, - "python_class": "llm_web_kit.extractor.html.pre_extractor.HTMLFileFormatCleanTagsPreExtractor", + "python_class": "llm_web_kit.extractor.html.pre_extractor.HTMLFileFormatNoClipCleanTagsPreExtractor", "class_init_kwargs": {} } ], diff --git a/llm_web_kit/extractor/html/pre_extractor.py b/llm_web_kit/extractor/html/pre_extractor.py index d95f06e5..832b6bab 100644 --- a/llm_web_kit/extractor/html/pre_extractor.py +++ b/llm_web_kit/extractor/html/pre_extractor.py @@ -49,7 +49,14 @@ def _do_pre_extract(self, data_json: DataJson) -> DataJson: def __remove_format_table(self, data_json: DataJson): """remove 排版table.""" - html_content = data_json['html'] + html_content = self._get_html_content(data_json) + return self.__do_remove_layout_table(html_content) + + def _get_html_content(self, data_json: DataJson): + return data_json['html'] + + def __do_remove_layout_table(self, html_content: str): + """remove 排版table.""" html_str = html_to_element(html_content) first_structure = html_str.xpath('/html/body/table') != [] second_structure = html_str.xpath('/html/body/center/table') != [] @@ -95,12 +102,12 @@ def __init__(self, config: dict): @override def _do_pre_extract(self, data_json: DataJson) -> DataJson: - data_json['html'] = self.__clean_invisible_elements(data_json) + html_content = data_json['html'] + data_json['html'] = self.__clean_invisible_elements(html_content, data_json) return data_json - def __clean_invisible_elements(self, data_json: DataJson) -> str: + def __clean_invisible_elements(self, html_content: str, data_json: DataJson) -> str: """清理隐藏标签.""" - html_content = data_json['html'] tree = html_to_element(html_content) # 遍历所有配置的隐藏标签规则 for tag in INVISIBLE_TAGS: @@ -184,3 +191,29 @@ def __clean_interactive_elements(self, data_json: DataJson) -> str: if len(form.getchildren()) == 0 or not form.text_content().strip(): form.getparent().remove(form) return element_to_html(tree) + +# ############################################################################## +# 解决 main_html和html处理混乱的问题 +# ############################################################################## + + +class HTMLFileFormatNoClipFilterTablePreExtractor(HTMLFileFormatFilterTablePreExtractor): + """noclip管线对main_html预处理.""" + def __init__(self, config: dict): + super().__init__(config) + + @override + def _get_html_content(self, data_json: DataJson): + return data_json['main_html'] + + +class HTMLFileFormatNoClipCleanTagsPreExtractor(HTMLFileFormatCleanTagsPreExtractor): + """noclip管线对main_html预处理.""" + def __init__(self, config: dict): + super().__init__(config) + + @override + def _do_pre_extract(self, data_json: DataJson) -> DataJson: + html_content = data_json['main_html'] + data_json['main_html'] = self.__clean_invisible_elements(html_content, data_json) + return data_json From 51a14bdb3395d3bec36f20efd1904c6803957d65 Mon Sep 17 00:00:00 2001 From: drunkpig <60862764+drunkpig@users.noreply.github.com> Date: Sun, 10 Aug 2025 12:45:36 +0800 Subject: [PATCH 2/3] fix: make function visible in subclass --- llm_web_kit/extractor/html/pre_extractor.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/llm_web_kit/extractor/html/pre_extractor.py b/llm_web_kit/extractor/html/pre_extractor.py index 832b6bab..c4247c03 100644 --- a/llm_web_kit/extractor/html/pre_extractor.py +++ b/llm_web_kit/extractor/html/pre_extractor.py @@ -103,10 +103,10 @@ def __init__(self, config: dict): @override def _do_pre_extract(self, data_json: DataJson) -> DataJson: html_content = data_json['html'] - data_json['html'] = self.__clean_invisible_elements(html_content, data_json) + data_json['html'] = self._clean_invisible_elements(html_content, data_json) return data_json - def __clean_invisible_elements(self, html_content: str, data_json: DataJson) -> str: + def _clean_invisible_elements(self, html_content: str, data_json: DataJson) -> str: """清理隐藏标签.""" tree = html_to_element(html_content) # 遍历所有配置的隐藏标签规则 @@ -215,5 +215,5 @@ def __init__(self, config: dict): @override def _do_pre_extract(self, data_json: DataJson) -> DataJson: html_content = data_json['main_html'] - data_json['main_html'] = self.__clean_invisible_elements(html_content, data_json) + data_json['main_html'] = self._clean_invisible_elements(html_content, data_json) return data_json From f4bb5ed208c94a67356950289b27a93e21db357e Mon Sep 17 00:00:00 2001 From: drunkpig <60862764+drunkpig@users.noreply.github.com> Date: Sun, 10 Aug 2025 13:09:12 +0800 Subject: [PATCH 3/3] fix: unit test of hidden text --- tests/llm_web_kit/extractor/html/recognizer/test_text.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/llm_web_kit/extractor/html/recognizer/test_text.py b/tests/llm_web_kit/extractor/html/recognizer/test_text.py index a17f16cb..9dd16050 100644 --- a/tests/llm_web_kit/extractor/html/recognizer/test_text.py +++ b/tests/llm_web_kit/extractor/html/recognizer/test_text.py @@ -428,7 +428,8 @@ def test_normalize_space4(self): input_data = DataJson(test_data) result = chain.extract(input_data) content_md = result.get_content_list().to_mm_md() - assert 'Show Ignored Content\n 1. DrDu\n\n Lieber Hendrik, \n\n kannst Du hierzu was beitragen? \n\n Ich finde keinen rechten Grund' in content_md + assert '1. DrDu\n\n Lieber Hendrik, \n\n kannst Du hierzu was beitragen? \n\n Ich finde keinen rechten Grund' in content_md + assert 'Show Ignored Content' not in content_md # 这个是隐藏标签,不应该被识别出来 def test_Lack_content1(self): """