ccprocessor · e06084 · Aug 12, 2025 · Aug 6, 2025 · Aug 8, 2025 · Aug 8, 2025
diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml
@@ -124,4 +124,4 @@ jobs:
       - name: Publish distribution to PyPI
         run: |
           pip install twine
-          twine upload --verbose --repository-url https://test.pypi.org/legacy/ dist/* -u __token__ -p ${{ secrets.PYPI_TOKEN }}
+          twine upload --verbose dist/* -u __token__ -p ${{ secrets.PYPI_TOKEN }}
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -5,19 +5,19 @@ repos:
     rev: 5.0.4
     hooks:
       - id: flake8
-        args: [ "--max-line-length=2200", "--ignore=E131,E125,W503,W504,E203,E231,E702,E128" ]
+        args: [ "--max-line-length=2200", "--ignore=E131,E125,W503,W504,E203,E231,E702,E128,E402" ]
         exclude: '^tests/.*/assets/'
   - repo: https://github.com/PyCQA/isort
     rev: 5.11.5
     hooks:
       - id: isort
         exclude: '^tests/.*/assets/'
-  - repo: https://github.com/pre-commit/mirrors-yapf
-    rev: v0.32.0
-    hooks:
-      - id: yapf
-        args: ["--style={based_on_style: google, column_limit: 200, indent_width: 4}"]
-        exclude: '^tests/.*/assets/'
+  # - repo: https://github.com/pre-commit/mirrors-yapf
+  #   rev: v0.32.0
+  #   hooks:
+  #     - id: yapf
+  #       args: ["--style={based_on_style: google, column_limit: 200, indent_width: 4}"]
+  #       exclude: '^tests/.*/assets/'
   # - repo: https://github.com/codespell-project/codespell
   #   rev: v2.2.1
   #   hooks:

diff --git a/llm_web_kit/config/pipe_tpl/html-test.jsonc b/llm_web_kit/config/pipe_tpl/html-test.jsonc
@@ -30,7 +30,7 @@
         "post_extractor": [
             {
                 "enable": true,
-                "python_class": "llm_web_kit.extractor.html.post_extractor.HTMLStripSpacePostExtractor"
+                "python_class": "llm_web_kit.extractor.html.post_extractor.ContentListStripSpacePostExtractor"
             }
         ]
     }

diff --git a/llm_web_kit/config/pipe_tpl/html.jsonc b/llm_web_kit/config/pipe_tpl/html.jsonc
@@ -23,7 +23,7 @@
             "post_extractor": [
             {
                 "enable": true,
-                "python_class": "llm_web_kit.extractor.html.post_extractor.HTMLStripSpacePostExtractor"
+                "python_class": "llm_web_kit.extractor.html.post_extractor.ContentListStripSpacePostExtractor"
             }
         ]
     }

diff --git a/llm_web_kit/config/pipe_tpl/noclip_html.jsonc b/llm_web_kit/config/pipe_tpl/noclip_html.jsonc
@@ -10,11 +10,11 @@
                 },
                 {
                     "enable": true,
-                    "python_class": "llm_web_kit.extractor.html.pre_extractor.HTMLFileFormatFilterTablePreExtractor"
+                    "python_class": "llm_web_kit.extractor.html.pre_extractor.HTMLFileFormatNoClipFilterTablePreExtractor"
                 },
                 {
                     "enable": true,
-                    "python_class": "llm_web_kit.extractor.html.pre_extractor.HTMLFileFormatCleanTagsPreExtractor",
+                    "python_class": "llm_web_kit.extractor.html.pre_extractor.HTMLFileFormatNoClipCleanTagsPreExtractor",
                     "class_init_kwargs": {}
                 }
             ],
@@ -28,7 +28,7 @@
             "post_extractor": [
             {
                 "enable": true,
-                "python_class": "llm_web_kit.extractor.html.post_extractor.HTMLStripSpacePostExtractor"
+                "python_class": "llm_web_kit.extractor.html.post_extractor.ContentListStripSpacePostExtractor"
             }
         ]
     }

diff --git a/llm_web_kit/config/pipe_tpl/noclip_html_test.jsonc b/llm_web_kit/config/pipe_tpl/noclip_html_test.jsonc
@@ -17,11 +17,11 @@
                 },
                 {
                     "enable": true,
-                    "python_class": "llm_web_kit.extractor.html.pre_extractor.HTMLFileFormatFilterTablePreExtractor"
+                    "python_class": "llm_web_kit.extractor.html.pre_extractor.HTMLFileFormatNoClipFilterTablePreExtractor"
                 },
                 {
                     "enable": true,
-                    "python_class": "llm_web_kit.extractor.html.pre_extractor.HTMLFileFormatCleanTagsPreExtractor",
+                    "python_class": "llm_web_kit.extractor.html.pre_extractor.HTMLFileFormatNoClipCleanTagsPreExtractor",
                     "class_init_kwargs": {}
                 }
             ],
@@ -35,7 +35,7 @@
             "post_extractor": [
             {
                 "enable": true,
-                "python_class": "llm_web_kit.extractor.html.post_extractor.HTMLStripSpacePostExtractor"
+                "python_class": "llm_web_kit.extractor.html.post_extractor.ContentListStripSpacePostExtractor"
             }
         ]
     }

diff --git a/llm_web_kit/extractor/html/post_extractor.py b/llm_web_kit/extractor/html/post_extractor.py
@@ -37,7 +37,7 @@ def _do_post_extract(self, data_json: DataJson) -> DataJson:
         raise NotImplementedError('Subclass must implement abstract method')
 
 
-class HTMLStripSpacePostExtractor(BaseFileFormatPostExtractor):
+class ContentListStripSpacePostExtractor(BaseFileFormatPostExtractor):
     """对段落文本进行处理：
     1. 连续的多个空格转换成1个
     2. 连续的\t转换成1个

diff --git a/llm_web_kit/extractor/html/pre_extractor.py b/llm_web_kit/extractor/html/pre_extractor.py
@@ -49,7 +49,14 @@ def _do_pre_extract(self, data_json: DataJson) -> DataJson:
 
     def __remove_format_table(self, data_json: DataJson):
         """remove 排版table."""
-        html_content = data_json['html']
+        html_content = self._get_html_content(data_json)
+        return self.__do_remove_layout_table(html_content)
+
+    def _get_html_content(self, data_json: DataJson):
+        return data_json['html']  # default source field; subclasses may override this hook to read another key (e.g. 'main_html')
+
+    def __do_remove_layout_table(self, html_content: str):
+        """remove 排版table."""
         html_str = html_to_element(html_content)
         first_structure = html_str.xpath('/html/body/table') != []
         second_structure = html_str.xpath('/html/body/center/table') != []
@@ -95,12 +102,12 @@ def __init__(self, config: dict):
 
     @override
     def _do_pre_extract(self, data_json: DataJson) -> DataJson:
-        data_json['html'] = self.__clean_invisible_elements(data_json)
+        html_content = data_json['html']
+        data_json['html'] = self._clean_invisible_elements(html_content, data_json)
         return data_json
 
-    def __clean_invisible_elements(self, data_json: DataJson) -> str:
+    def _clean_invisible_elements(self, html_content: str, data_json: DataJson) -> str:
         """清理隐藏标签."""
-        html_content = data_json['html']
         tree = html_to_element(html_content)
         # 遍历所有配置的隐藏标签规则
         for tag in INVISIBLE_TAGS:
@@ -184,3 +191,29 @@ def __clean_interactive_elements(self, data_json: DataJson) -> str:
             if len(form.getchildren()) == 0 or not form.text_content().strip():
                 form.getparent().remove(form)
         return element_to_html(tree)
+
+# ##############################################################################
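+# Noclip pipeline variants: pre-process `main_html` instead of `html`, so that handling of the two fields is no longer mixed up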
+# ##############################################################################
+
+
+class HTMLFileFormatNoClipFilterTablePreExtractor(HTMLFileFormatFilterTablePreExtractor):
+    """Noclip-pipeline variant of the parent pre-extractor: overrides the HTML source hook so the parent's logic reads `main_html` instead of `html`."""
+    def __init__(self, config: dict):
+        super().__init__(config)
+
+    @override
+    def _get_html_content(self, data_json: DataJson):
+        return data_json['main_html']
+
+
+class HTMLFileFormatNoClipCleanTagsPreExtractor(HTMLFileFormatCleanTagsPreExtractor):
+    """Noclip-pipeline variant of the parent pre-extractor: runs the invisible-element cleanup on `main_html` and writes the result back to `main_html`."""
+    def __init__(self, config: dict):
+        super().__init__(config)
+
+    @override
+    def _do_pre_extract(self, data_json: DataJson) -> DataJson:
+        html_content = data_json['main_html']
+        data_json['main_html'] = self._clean_invisible_elements(html_content, data_json)
+        return data_json
diff --git a/llm_web_kit/extractor/html/recognizer/cc_math/common.py b/llm_web_kit/extractor/html/recognizer/cc_math/common.py
@@ -1,10 +1,20 @@
+import logging
 import os
 import re
 from pathlib import Path
 from typing import List, Tuple
 
 from lxml import etree
 from lxml.html import HtmlElement
+
+# 在导入前就设置严格的日志控制
+logging.basicConfig(level=logging.WARNING, force=True)
+
+# 设置py_asciimath的日志级别，完全禁用其日志输出
+py_asciimath_logger = logging.getLogger('py_asciimath')
+py_asciimath_logger.setLevel(logging.ERROR)
+py_asciimath_logger.disabled = True
+
 from py_asciimath.translator.translator import ASCIIMath2Tex
 
 from llm_web_kit.extractor.html.recognizer.recognizer import CCTag
@@ -15,6 +25,7 @@
 from llm_web_kit.libs.text_utils import normalize_ctl_text
 
 asciimath2tex = ASCIIMath2Tex(log=False)
+
 color_regex = re.compile(r'\\textcolor\[.*?\]\{.*?\}')
 
 
@@ -137,9 +148,6 @@ class MATHINSIGHT:
 }
 
 
-asciimath2tex = ASCIIMath2Tex(log=False)
-
-
 def text_strip(text):
     return text.strip() if text else text
 

diff --git a/llm_web_kit/extractor/html/recognizer/cc_math/tag_script.py b/llm_web_kit/extractor/html/recognizer/cc_math/tag_script.py
@@ -15,7 +15,7 @@ def modify_tree(cm: CCMATH, math_render: str, o_html: str, node: HtmlElement, pa
         text = node.text
         if text and text_strip(text):
             if node.tag not in ['script', 'style']:
-                new_span = create_new_span([(CCMATH_INTERLINE,MathType.LATEX)], cm.wrap_math_md(text), node, math_render, o_html)
+                new_span = create_new_span([(CCMATH_INLINE,MathType.LATEX)], cm.wrap_math_md(text), node, math_render, o_html)
                 node.addnext(new_span)
             else:
                 katex_pattern = re.compile(r'katex.render')
@@ -28,7 +28,7 @@ def modify_tree(cm: CCMATH, math_render: str, o_html: str, node: HtmlElement, pa
                             target_element = target_elements[0]
                             o_html = element_to_html(target_element)
                             target_element.text = None
-                            new_span = create_new_span([(CCMATH_INTERLINE,MathType.LATEX)], cm.wrap_math_md(formula_content), target_element, math_render, o_html)
+                            new_span = create_new_span([(CCMATH_INLINE,MathType.LATEX)], cm.wrap_math_md(formula_content), target_element, math_render, o_html)
                             target_element.insert(0, new_span)
                 elif node.get('type') and 'math/tex' in node.get('type'):
                     tag_math_type_list = cm.get_equation_type(o_html)

diff --git a/llm_web_kit/extractor/html/recognizer/ccmath.py b/llm_web_kit/extractor/html/recognizer/ccmath.py
@@ -168,11 +168,13 @@ def process_ccmath_html(self, cc_html: str, o_html: str, math_render: BaseMathRe
 
                 # span.katex
                 if node.tag == 'script' or 'math' == node.get('class') or 'katex' == node.get('class'):
+                    # print('匹配到script/math/katex标签: ', original_html)
                     tag_script.modify_tree(self.cm, math_render_type, original_html, node, parent)
                 # 只有有渲染器的网站才会走下面文本匹配逻辑
                 if math_render_type:
                     # 14. 只处理只有一层的p标签
                     if node.tag == 'p' and len(node.getchildren()) == 0:
+                        # print('匹配到p标签: ', original_html)
                         tag_common_modify.modify_tree(self.cm, math_render_type, original_html, node, parent)
 
             # 修改：传入tree节点，mathjax方案作为process2，不参与上面process1节点的遍历

diff --git a/llm_web_kit/extractor/html/recognizer/text.py b/llm_web_kit/extractor/html/recognizer/text.py
@@ -194,8 +194,15 @@ def __combine_text(self, text1:str, text2:str, lang='en') -> str:
             txt = text1 + text2
             return self.replace_entities(txt.strip(), entities_map)
         else:
+            # 如果text2为空，直接返回text1
+            if not text2:
+                return self.replace_entities(text1.strip(), entities_map)
+            # 如果text1为空，直接返回text2
+            if not text1:
+                return self.replace_entities(text2.strip(), entities_map)
+
             # 根据text1的最后一个字符和text2的第一个字符判断两个text之间的连接
-            if (text2[0] in string.punctuation) or (text2[0] in special_symbols) or (text2[0] in other_symbols) or (text1 and text1[-1] in other_symbols):
+            if (text2 and text2[0] in string.punctuation) or (text2 and text2[0] in special_symbols) or (text2 and text2[0] in other_symbols) or (text1 and text1[-1] in other_symbols):
                 words_sep = ''
             else:
                 words_sep = ' '

diff --git a/llm_web_kit/input/pre_data_json.py b/llm_web_kit/input/pre_data_json.py
@@ -20,6 +20,8 @@ class PreDataJsonKey:
     TYPICAL_SIMPLIFIED_HTML = 'typical_simplified_html'
     # 模型打标字典
     LLM_RESPONSE = 'llm_response'
+    # 模型结果都为0
+    LLM_RESPONSE_EMPTY = 'llm_response_empty'
     # 映射模版正文树结构的元素字典
     HTML_ELEMENT_DICT = 'html_element_dict'
     # 映射模版正文时的文本列表

diff --git a/llm_web_kit/libs/version.py b/llm_web_kit/libs/version.py
@@ -1 +1 @@
-__version__ = '3.2.0'
+__version__ = '3.2.1'
diff --git a/llm_web_kit/main_html_parser/parser/layout_batch_parser.py b/llm_web_kit/main_html_parser/parser/layout_batch_parser.py
@@ -46,8 +46,15 @@ def parse(self, pre_data: PreDataJson) -> PreDataJson:
         self.dynamic_classid_enable = pre_data.get(PreDataJsonKey.DYNAMIC_CLASSID_ENABLE, False)
         self.more_noise_enable = pre_data.get(PreDataJsonKey.MORE_NOISE_ENABLE, False)
         self.dynamic_classid_similarity_threshold = pre_data.get(PreDataJsonKey.DYNAMIC_CLASSID_SIM_THRESH, 0.85)
+        response_empty = pre_data.get(PreDataJsonKey.LLM_RESPONSE_EMPTY, False)
         template_data_str = pre_data[PreDataJsonKey.HTML_ELEMENT_DICT]
         template_data = dict()
+        # If the upstream stage flagged the LLM response as empty (LLM_RESPONSE_EMPTY), return empty main HTML immediately
+        if response_empty:
+            pre_data[PreDataJsonKey.MAIN_HTML] = ''
+            pre_data[PreDataJsonKey.MAIN_HTML_BODY] = ''
+            pre_data[PreDataJsonKey.MAIN_HTML_SUCCESS] = False
+            return pre_data
         if isinstance(template_data_str, str):
             template_data_str = json.loads(template_data_str)
             for layer, layer_dict in template_data_str.items():
@@ -57,19 +64,6 @@ def parse(self, pre_data: PreDataJson) -> PreDataJson:
             template_data = template_data_str
         else:
             raise ValueError(f'template_data 类型错误: {type(template_data_str)}')
-        # 检查第0层第一个元素是否为green，如果是则返回空的HTML
-        if 0 in template_data:
-            layer_0_elements = template_data[0]
-            if layer_0_elements:
-                # 获取第一个元素
-                first_element_info = next(iter(layer_0_elements.values()))
-                if isinstance(first_element_info, tuple) and len(first_element_info) > 0:
-                    label = first_element_info[0]  # 获取标签（red/green）
-                    if label == 'green':
-                        pre_data[PreDataJsonKey.MAIN_HTML] = ''
-                        pre_data[PreDataJsonKey.MAIN_HTML_BODY] = ''
-                        pre_data[PreDataJsonKey.MAIN_HTML_SUCCESS] = False
-                        return pre_data
 
         self.template_data = template_data
         content, body = self.process(html_source, template_dict_html)
@@ -375,8 +369,6 @@ def __match_tag_class(self, layer_nodes, current_layer_key, parent_key, node_htm
     def __match_tag(self, layer_nodes, current_layer_key, parent_key, node_html, template_doc, class_must=False,
                     id_exist=False):
         current_norm_key = (self.normalize_key((current_layer_key[0], None, None)), parent_key)
-        current_norm_key_with_first_class = (
-            self.normalize_key((current_layer_key[0], current_layer_key[1].strip().split(' ')[0], None)), parent_key)
         for ele_keyy, ele_value in layer_nodes.items():
             # class id要存在
             if class_must and not ele_keyy[1]:
@@ -405,10 +397,14 @@ def __match_tag(self, layer_nodes, current_layer_key, parent_key, node_html, tem
                 if template_sim >= self.dynamic_classid_similarity_threshold:
                     return ele_label, self.normalize_key(ele_keyy[0:3]), is_drop_tail
             # first class方案
-            norm_ele_keyy_with_first_class = self.normalize_key((ele_keyy[0], ele_keyy[1].strip().split(' ')[0], None))
-            norm_ele_keyy_parent_with_first_class = (norm_ele_keyy_with_first_class, ele_parent_keyy)
-            if current_norm_key_with_first_class == norm_ele_keyy_parent_with_first_class:
-                return ele_label, self.normalize_key(ele_keyy[0:3]), is_drop_tail
+            if ele_keyy[1] is not None and current_layer_key[1] is not None:
+                current_norm_key_with_first_class = (
+                    self.normalize_key((current_layer_key[0], current_layer_key[1].strip().split(' ')[0], None)),
+                    parent_key)
+                norm_ele_keyy_with_first_class = self.normalize_key((ele_keyy[0], ele_keyy[1].strip().split(' ')[0], None))
+                norm_ele_keyy_parent_with_first_class = (norm_ele_keyy_with_first_class, ele_parent_keyy)
+                if current_norm_key_with_first_class == norm_ele_keyy_parent_with_first_class:
+                    return ele_label, self.normalize_key(ele_keyy[0:3]), is_drop_tail
 
         return None, None, None
 

diff --git a/llm_web_kit/main_html_parser/parser/tag_mapping.py b/llm_web_kit/main_html_parser/parser/tag_mapping.py
@@ -57,6 +57,7 @@ def parse(self, pre_data: PreDataJson) -> PreDataJson:
                 pre_data[PreDataJsonKey.TYPICAL_DICT_HTML] = template_dict_html
                 pre_data[PreDataJsonKey.SIMILARITY_LAYER] = 0
                 pre_data[PreDataJsonKey.TYPICAL_MAIN_HTML_SUCCESS] = False
+                pre_data[PreDataJsonKey.LLM_RESPONSE_EMPTY] = True
                 return pre_data
 
             # 模版抽取正文html