ccprocessor · e06084 · Aug 15, 2025 · Aug 13, 2025 · Aug 13, 2025 · Aug 13, 2025
diff --git a/llm_web_kit/extractor/html/recognizer/cc_math/render/mathjax.py b/llm_web_kit/extractor/html/recognizer/cc_math/render/mathjax.py
@@ -533,6 +533,42 @@ def _detect_ascii_math(self, tree: HtmlElement) -> bool:
         return processascii
 
 
+class MathJaxRenderMock(MathJaxRender):
+    """Mock MathJax renderer for pages without MathJax config that still need MathJax parsing.
+
+    This class mainly covers the following scenario:
+    1. The page has no explicit MathJax configuration (e.g. <script type="text/x-mathjax-config">)
+    2. Yet math elements were detected while parsing the HTML (e.g. <math> tags, formula-related classes)
+    3. The MathJax renderer pass must scan all content so that no formula is missed
+
+    Differences from the regular MathJaxRender:
+    - MathJaxRender: parses the MathJax configuration in the HTML and uses its custom delimiters and options
+    - MathJaxRenderMock: uses the default MathJax configuration directly and does not parse the HTML configuration
+    """
+
+    def __init__(self):
+        """Initialize the mock MathJax renderer."""
+        super().__init__()
+        self.render_type = MathRenderType.MATHJAX_MOCK
+        # Start from a copy of the default MathJax options
+        self.options = MATHJAX_OPTIONS.copy()
+
+    def get_options(self, html: str) -> Dict[str, Any]:
+        """Return the default options directly, without parsing any HTML configuration.
+
+        Args:
+            html: HTML string (ignored)
+
+        Returns:
+            Dict[str, Any]: the default MathJax options dictionary
+        """
+        return self.options
+
+    def is_customized_options(self) -> bool:
+        """Always return False, meaning the default configuration is in use."""
+        return False
+
+
 # 使用示例
 if __name__ == '__main__':
     # MathJax示例

diff --git a/llm_web_kit/extractor/html/recognizer/cc_math/render/render.py b/llm_web_kit/extractor/html/recognizer/cc_math/render/render.py
@@ -11,6 +11,7 @@
 class MathRenderType:
     """数学公式渲染器类型."""
     MATHJAX = 'mathjax'
+    MATHJAX_MOCK = 'mathjax_mock'  # 虚拟的mathjax渲染器
     MATHJAX_CUSTOMIZED = 'mathjax_customized'  # 临时增加这个type，未来区分走自定义解析的数据
     KATEX = 'katex'
 

diff --git a/llm_web_kit/extractor/html/recognizer/cc_math/tag_script.py b/llm_web_kit/extractor/html/recognizer/cc_math/tag_script.py
@@ -14,10 +14,16 @@ def modify_tree(cm: CCMATH, math_render: str, o_html: str, node: HtmlElement, pa
     try:
         text = node.text
         if text and text_strip(text):
+            # 先处理非script标签和style标签的节点：即class为math/katex的节点
+            # 例子：<div class="math">f(x) \sim x^2, \quad x\to\infty</div>
             if node.tag not in ['script', 'style']:
                 new_span = create_new_span([(CCMATH_INLINE,MathType.LATEX)], cm.wrap_math_md(text), node, math_render, o_html)
-                node.addnext(new_span)
+                # node.addnext(new_span)
+                replace_element(node, new_span)  # 替换节点，而不是添加
+
+            # 下面是katex逻辑
             else:
+                # 例子：<script type = "e44e-text/javascript">katex.render("f(a,b,c) = (a^2+b^2+c^2)^3", mykatex);</script>
                 katex_pattern = re.compile(r'katex.render')
                 node_text = text_strip(text)
                 if katex_pattern.findall(node_text):
@@ -28,8 +34,17 @@ def modify_tree(cm: CCMATH, math_render: str, o_html: str, node: HtmlElement, pa
                             target_element = target_elements[0]
                             o_html = element_to_html(target_element)
                             target_element.text = None
-                            new_span = create_new_span([(CCMATH_INLINE,MathType.LATEX)], cm.wrap_math_md(formula_content), target_element, math_render, o_html)
+                            wrapped_formula = cm.wrap_math_md(formula_content)
+                            # 转化为ccmath，例子：
+                            # <ccmath-inline type="latex" by="katex" html='...'>f(a,b,c) = (a^2+b^2+c^2)^3</ccmath-inline>
+                            new_span = create_new_span([(CCMATH_INLINE, MathType.LATEX)], wrapped_formula,
+                                                       target_element, math_render, o_html)
+                            # 插入到span标签内，例子：
+                            # <span id="mykatex"><ccmath-inline ... </ccmath-inline></span>
                             target_element.insert(0, new_span)
+
+                # Handle script nodes whose type contains math/tex
+                # 例子：<html><head><script type="math/tex">x^2 + y^2 = z^2</script></head></html>
                 elif node.get('type') and 'math/tex' in node.get('type'):
                     tag_math_type_list = cm.get_equation_type(o_html)
                     if not tag_math_type_list:

diff --git a/llm_web_kit/extractor/html/recognizer/ccmath.py b/llm_web_kit/extractor/html/recognizer/ccmath.py
@@ -24,6 +24,7 @@ class MathRecognizer(BaseHTMLElementRecognizer):
     def __init__(self):
         super().__init__()
         self.cm = CCMATH()
+        self.mathjax_detected = False  # 添加检测标记
 
     @override
     def recognize(self, base_url: str, main_html_lst: List[Tuple[HtmlElement, HtmlElement]], raw_html: str, language:str = 'en') -> List[Tuple[HtmlElement, HtmlElement]]:
@@ -122,8 +123,9 @@ def process_ccmath_html(self, cc_html: str, o_html: str, math_render: BaseMathRe
             self.cm.url = base_url
             tree = cc_html
             math_render_type = math_render.get_render_type()
-            # 打印遍历node次数
-            # count = 0
+            self.mathjax_detected = False  # 重置标记
+
+            # process1: node循环逻辑
             for node in iter_node(tree):
                 assert isinstance(node, HtmlElement)
                 original_html = self._element_to_html(node)
@@ -134,9 +136,11 @@ def process_ccmath_html(self, cc_html: str, o_html: str, math_render: BaseMathRe
                         node.tag == 'span' and
                         node.get('class') in [CSDN.INLINE, CSDN.DISPLAY]):
                     tag_script.process_katex_mathml(self.cm, math_render_type, node)
+                    self.mathjax_detected = True
 
                 if ZHIHU.DOMAIN in self.cm.url and node.tag == 'span' and node.get('class') == ZHIHU.MATH:
                     tag_script.process_zhihu_custom_tag(self.cm, math_render_type, node)
+                    self.mathjax_detected = True
 
                 # tag = span， class 为 math-containerm， 或者 mathjax 或者 wp-katex-eq
                 if node.tag == 'span' and node.get('class') and (
@@ -147,44 +151,50 @@ def process_ccmath_html(self, cc_html: str, o_html: str, math_render: BaseMathRe
                         'tex' in node.get('class')
                 ):
                     tag_common_modify.modify_tree(self.cm, math_render_type, original_html, node, parent)
-
-                # script[type="math/tex"]
-                # if node.tag == 'script' and node.get('type') and 'math/tex' in node.get('type'):
-                #     print('匹配到script标签: ', node.get('type'))
-                #     tag_common_modify.modify_tree(cm, math_render_type, original_html, node, parent)
+                    self.mathjax_detected = True
 
                 # math tags
                 if node.tag == 'math' or node.tag.endswith(':math'):
                     # print(f"匹配到数学标签: {node.tag}")
                     # print(f"标签内容: {original_html}")
                     tag_math.modify_tree(self.cm, math_render_type, original_html, node, parent)
+                    self.mathjax_detected = True
 
                 if node.tag == 'mjx-container':
                     tag_mjx.modify_tree(self.cm, math_render, original_html, node)
+                    self.mathjax_detected = True
 
                 # img中的latex
                 if node.tag == 'img':
                     tag_img.modify_tree(self.cm, math_render_type, original_html, node, parent)
+                    self.mathjax_detected = True
 
                 # span.katex
                 if node.tag == 'script' or 'math' == node.get('class') or 'katex' == node.get('class'):
                     # print('匹配到script/math/katex标签: ', original_html)
                     tag_script.modify_tree(self.cm, math_render_type, original_html, node, parent)
+                    self.mathjax_detected = True
                 # 只有有渲染器的网站才会走下面文本匹配逻辑
                 if math_render_type:
                     # 14. 只处理只有一层的p标签
                     if node.tag == 'p' and len(node.getchildren()) == 0:
                         # print('匹配到p标签: ', original_html)
                         tag_common_modify.modify_tree(self.cm, math_render_type, original_html, node, parent)
+                        self.mathjax_detected = True
 
-            # 修改：传入tree节点，mathjax方案作为process2，不参与上面process1节点的遍历
-            if math_render_type:
-                try:
-                    if math_render_type == MathRenderType.MATHJAX:
-                        math_render.find_math(tree)
-                except Exception as e:
-                    raise HtmlMathMathjaxRenderRecognizerException(f'处理MathjaxRender数学公式失败: {e}')
-
+            # process2: MathJax renderer logic
+            try:
+                # case1：有mathjax配置
+                if math_render_type == MathRenderType.MATHJAX:
+                    math_render.find_math(tree)
+                # case2：无Mathjax配置但是开启Mathjax逻辑开关（node循环抽到公式的情况）
+                elif math_render_type is None and self.mathjax_detected:
+                    from llm_web_kit.extractor.html.recognizer.cc_math.render.mathjax import \
+                        MathJaxRenderMock
+                    math_render = MathJaxRenderMock()
+                    math_render.find_math(tree)
+            except Exception as e:
+                raise HtmlMathMathjaxRenderRecognizerException(f'处理MathjaxRender数学公式失败: {e}')
             # 保存处理后的html
             # with open('test20250702_result.html', 'w', encoding='utf-8') as f:
             #     f.write(self._element_to_html(tree))

diff --git a/llm_web_kit/extractor/html/recognizer/list.py b/llm_web_kit/extractor/html/recognizer/list.py
@@ -199,13 +199,14 @@ def __extract_list_item_text_recusive(el: HtmlElement):
                 # item['c'].strip(): 会导致前面处理br标签，添加的\n\n失效
                 result['c'] = ' '.join(normalize_text_segment(item['c'].strip()) for item in paragraph)
             return result
-        list_item_tags = ('li', 'dd', 'dt', 'ul', 'div', 'p', 'span')
-        if child.tag in list_item_tags:
-            paragraph = __extract_list_item_text_recusive(child)
-            if len(paragraph) > 0:
-                tem_json = json.dumps(paragraph).replace('$br$\"}', '\"}')
-                new_paragraph = json.loads(tem_json)
-                text_paragraph.append(new_paragraph)
+        # list_item_tags = ('li', 'dd', 'dt', 'ul', 'div', 'p', 'span')
+        # if child.tag in list_item_tags:
+        # 去掉if限制条件，允许非标准结构的列表通过
+        paragraph = __extract_list_item_text_recusive(child)
+        if len(paragraph) > 0:
+            tem_json = json.dumps(paragraph).replace('$br$\"}', '\"}')
+            new_paragraph = json.loads(tem_json)
+            text_paragraph.append(new_paragraph)
 
         for n, item in enumerate(text_paragraph):
             tem_json = json.dumps(item).replace('$br$', '\\n\\n')

diff --git a/.../llm_web_kit/extractor/assets/extractor_chain_input/good_data/html/math_mathjax_mock.html b/.../llm_web_kit/extractor/assets/extractor_chain_input/good_data/html/math_mathjax_mock.html
diff --git a/tests/llm_web_kit/extractor/assets/extractor_chain_input/good_data/html_data_input.jsonl b/tests/llm_web_kit/extractor/assets/extractor_chain_input/good_data/html_data_input.jsonl
@@ -100,4 +100,5 @@
 {"track_id": "test_mjx_container", "dataset_name": "test_mjx_container", "url": "https://test.com","data_source_category": "HTML",  "path":"testmathjax.html", "file_bytes": 1000, "page_layout_type":"artical", "meta_info": {"input_datetime": "2020-01-01 00:00:00"}}
 {"track_id": "test_word_press", "dataset_name": "test_word_press", "url": "https://test.com","data_source_category": "HTML",  "path":"word_press.html", "file_bytes": 1000, "page_layout_type":"artical", "meta_info": {"input_datetime": "2020-01-01 00:00:00"}}
 {"track_id": "test_ascii_delimiter", "dataset_name": "test_ascii_delimiter", "url": "https://montalk.net/notes/342/tuning-forks-and-megalithic-technology","data_source_category": "HTML",  "path":"math_test_ascii_delimiter.html", "file_bytes": 1000, "page_layout_type":"artical", "meta_info": {"input_datetime": "2020-01-01 00:00:00"}}
-{"track_id": "test_htmlmath_sub_sup", "dataset_name": "test_htmlmath_sub_sup", "url": "https://cccbdb.nist.gov/compvibs3.asp?casno=123911&charge=0&method=42&basis=0","data_source_category": "HTML",  "path":"math_table_title_htmlmath_sub_sup.html", "file_bytes": 1000, "page_layout_type":"artical", "meta_info": {"input_datetime": "2020-01-01 00:00:00"}}
+{"track_id": "test_htmlmath_sub_sup", "dataset_name": "test_htmlmath_sub_sup", "url": "https://cccbdb.nist.gov/compvibs3.asp?casno=123911&charge=0&method=42&basis=0","data_source_category": "HTML",  "path":"math_table_title_htmlmath_sub_sup.html", "file_bytes": 1000, "page_layout_type":"artical", "meta_info": {"input_datetime": "2020-01-01 00:00:00"}}
+{"track_id": "test_mathjax_mock", "dataset_name": "test_mathjax_mock", "url": "http://mathonline.wikidot.com/monotone-sequences-of-real-numbers","data_source_category": "HTML",  "path":"math_mathjax_mock.html", "file_bytes": 1000, "page_layout_type":"artical", "meta_info": {"input_datetime": "2020-01-01 00:00:00"}}
diff --git a/tests/llm_web_kit/extractor/html/recognizer/assets/ccmath/math_class_math.html b/tests/llm_web_kit/extractor/html/recognizer/assets/ccmath/math_class_math.html
diff --git a/tests/llm_web_kit/extractor/html/recognizer/assets/ccmath/math_class_math_1.html b/tests/llm_web_kit/extractor/html/recognizer/assets/ccmath/math_class_math_1.html
diff --git a/tests/llm_web_kit/extractor/html/recognizer/assets/ccmath/math_class_math_inline_1.html b/tests/llm_web_kit/extractor/html/recognizer/assets/ccmath/math_class_math_inline_1.html
@@ -0,0 +1,8 @@
+1 yr = 525600 min
+1 yr → 525600 min
+4.7 yr → T
+T
+T
+4.7 yr → 2470320 min
+4.7 years = 2470320 minutes
+4.7 yr ≅ 2470320 min
diff --git a/tests/llm_web_kit/extractor/html/recognizer/test_list.py b/tests/llm_web_kit/extractor/html/recognizer/test_list.py
@@ -375,3 +375,38 @@ def test_get_attribute_standalone_improved(self):
             error_msg = str(context.exception)
             self.assertIn('中没有cclist标签', error_msg)
             self.assertIn(element.tag, error_msg)
+
+    def test_no_standard_get_list_content_list(self):
+        """Test content_list extraction from a list with a non-standard structure."""
+        # Fetch the name-mangled private method __get_list_content_list
+        get_list_content_list_method = getattr(self.__list_recognize, '_ListRecognizer__get_list_content_list')
+
+        # Build the test data: a <ul> whose direct children are <figure> elements rather than <li>
+        test_elements = [
+            html_to_element('''<ul id="productslist">
+                                    <figure class="list">
+                                        <figcaption><h4>How to Process Oxidized Lead Zinc Ore by Flotation</h4>
+                                            <p>How to Process Oxidized Lead Zinc Ore by Flotation. Metallurgical Content. The
+                                                Flowsheet. Crushing Section; GRINDING; Conditioning and Flotation; Thickening and
+                                                Filtering; Sampling; ORE TESTING LABORATORY; The problem of treating oxidized lead
+                                                zinc ores for the production of high grade lead zinc concentrates is a complex </p>
+                                        </figcaption>
+                                    </figure>
+                                    <figure class="list">
+                                        <figcaption><h4>ore dressing flotation machine,fluorite ore flotation </h4>
+                                            <p>Ore dressing flotation machine is widely used to conduct flotation of copper ore,
+                                                lead zinc ore, glod ore, etc. Mail to sales@sinofote</p>
+                                        </figcaption>
+                                    </figure>
+                                    <figure class="list">
+                                        <figcaption><h4>Zinc Ore Mining Crusher wffofoundation</h4>
+                                            <p>Zinc ore mining process can 14 2016 31 Mar Lead zinc ore dressing equipment zinc ore
+                                                Once processing in the flotation circuit was complete, the zinc </p>
+                                        </figcaption>
+                                    </figure>
+                                </ul>''')
+        ]
+
+        for i, element in enumerate(test_elements):
+            list_content_list = get_list_content_list_method(element, 1)
+            assert len(list_content_list) == 3
diff --git a/tests/llm_web_kit/extractor/html/recognizer/test_math.py b/tests/llm_web_kit/extractor/html/recognizer/test_math.py
@@ -207,6 +207,14 @@
         'base_url': 'https://physicshelpforum.com/t/latex-upgrade-physics-forum-powered-by-mathjax-v3.17489/',
         'expected': 'assets/ccmath/math_physicsforums_2_1.html',
         'expected_inline': 'assets/ccmath/math_physicsforums_2_inline_1.html'
+    },
+    {
+        'input': [
+            'assets/ccmath/math_class_math.html',
+        ],
+        'base_url': 'https://convertoctopus.com/4-7-years-to-minutes',
+        'expected': 'assets/ccmath/math_class_math_1.html',
+        'expected_inline': 'assets/ccmath/math_class_math_inline_1.html'
     }
 ]
 

diff --git a/tests/llm_web_kit/extractor/test_extractor_chain.py b/tests/llm_web_kit/extractor/test_extractor_chain.py
@@ -64,7 +64,7 @@ def setUp(self):
                     continue
                 self.data_json.append(json.loads(line))
 
-        assert len(self.data_json) == 103
+        assert len(self.data_json) == 104
 
         # Config for HTML extraction
         self.config = load_pipe_tpl('html-test')
@@ -810,16 +810,27 @@ def test_ascii_delimiter(self):
         input_data = DataJson(test_data)
         result = chain.extract(input_data)
         md_content = result.get_content_list().to_nlp_md()
-        # with open('mathjax抽取case222.md', 'w', encoding='utf-8') as f:
-        #     f.write(md_content)
         self.assertIn(r'$f = \frac{1}{T} ^ 2 \sqrt{\frac{A E}{\rho}}$', md_content)
         self.assertIn(r'${m}^{2}$', md_content)
         self.assertIn(r'\rho$', md_content)
         self.assertIn(r'$f = \frac{1}{2 L} \sqrt{\frac{E}{\rho}}$', md_content)
         self.assertIn(r'$L = {T}^{2} / \left(2 W\right)$', md_content)
 
+    def test_mathjax_mock(self):
+        """Test the mock MathJax renderer."""
+        chain = ExtractSimpleFactory.create(self.config)
+        self.assertIsNotNone(chain)
+        test_data = self.data_json[103]
+        input_data = DataJson(test_data)
+        result = chain.extract(input_data)
+        md_content = result.get_content_list().to_nlp_md()
+        self.assertIn(r'$(a_n)$', md_content)
+        self.assertIn(r'$a_n ≤ a_{n+1}$', md_content)
+        self.assertIn(r'$n \in \mathbb{N}$', md_content)
+        self.assertIn(r'$\left ( \frac{1}{n} \right ) = (1, \frac{1}{2}, \frac{1}{3}, ..., \frac{1}{n}, \frac{1}{n+1}, ... )$', md_content)
+
     def test_htmlmath_sub_sup(self):
-        """测试ascii分隔符."""
+        """测试htmlmath中的上下标标签."""
         chain = ExtractSimpleFactory.create(self.config)
         self.assertIsNotNone(chain)
         test_data = self.data_json[102]