Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
36 changes: 36 additions & 0 deletions llm_web_kit/extractor/html/recognizer/cc_math/render/mathjax.py
Original file line number Diff line number Diff line change
Expand Up @@ -533,6 +533,42 @@ def _detect_ascii_math(self, tree: HtmlElement) -> bool:
return processascii


class MathJaxRenderMock(MathJaxRender):
    """Stand-in MathJax renderer for pages that carry no MathJax configuration.

    Intended for the following situation:
    1. The page has no explicit MathJax configuration
       (e.g. no <script type="text/x-mathjax-config">).
    2. Math elements were nevertheless detected while parsing the HTML
       (e.g. <math> tags, formula-related classes).
    3. The MathJax renderer logic should still sweep all of the content so
       that no formula is missed during extraction.

    Difference from the regular MathJaxRender:
    - MathJaxRender: parses the MathJax configuration found in the HTML and
      uses the custom delimiters and options.
    - MathJaxRenderMock: uses the default MathJax configuration directly and
      does not parse any configuration from the HTML.
    """

    def __init__(self) -> None:
        """Initialise the mock renderer with the default MathJax options."""
        super().__init__()
        self.render_type = MathRenderType.MATHJAX_MOCK
        # Shallow copy of the default MathJax options
        self.options = MATHJAX_OPTIONS.copy()

    def get_options(self, html: str) -> Dict[str, Any]:
        """Return the default options without parsing the HTML configuration.

        Args:
            html: HTML string (ignored).

        Returns:
            Dict[str, Any]: the default MathJax options held by this instance.
        """
        return self.options

    def is_customized_options(self) -> bool:
        """Always return False: the mock renderer uses the default configuration."""
        return False


# 使用示例
if __name__ == '__main__':
# MathJax示例
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
class MathRenderType:
"""数学公式渲染器类型."""
MATHJAX = 'mathjax'
MATHJAX_MOCK = 'mathjax_mock' # 虚拟的mathjax渲染器
MATHJAX_CUSTOMIZED = 'mathjax_customized' # 临时增加这个type,未来区分走自定义解析的数据
KATEX = 'katex'

Expand Down
40 changes: 25 additions & 15 deletions llm_web_kit/extractor/html/recognizer/ccmath.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ class MathRecognizer(BaseHTMLElementRecognizer):
def __init__(self):
    """Set up the recognizer: a CCMATH helper plus a cleared detection flag."""
    super().__init__()
    self.cm = CCMATH()
    # Detection flag; starts out False
    self.mathjax_detected = False

@override
def recognize(self, base_url: str, main_html_lst: List[Tuple[HtmlElement, HtmlElement]], raw_html: str, language:str = 'en') -> List[Tuple[HtmlElement, HtmlElement]]:
Expand Down Expand Up @@ -122,8 +123,9 @@ def process_ccmath_html(self, cc_html: str, o_html: str, math_render: BaseMathRe
self.cm.url = base_url
tree = cc_html
math_render_type = math_render.get_render_type()
# 打印遍历node次数
# count = 0
self.mathjax_detected = False # 重置标记

# process1: node循环逻辑
for node in iter_node(tree):
assert isinstance(node, HtmlElement)
original_html = self._element_to_html(node)
Expand All @@ -134,9 +136,11 @@ def process_ccmath_html(self, cc_html: str, o_html: str, math_render: BaseMathRe
node.tag == 'span' and
node.get('class') in [CSDN.INLINE, CSDN.DISPLAY]):
tag_script.process_katex_mathml(self.cm, math_render_type, node)
self.mathjax_detected = True

if ZHIHU.DOMAIN in self.cm.url and node.tag == 'span' and node.get('class') == ZHIHU.MATH:
tag_script.process_zhihu_custom_tag(self.cm, math_render_type, node)
self.mathjax_detected = True

# tag = span, class 为 math-containerm, 或者 mathjax 或者 wp-katex-eq
if node.tag == 'span' and node.get('class') and (
Expand All @@ -147,44 +151,50 @@ def process_ccmath_html(self, cc_html: str, o_html: str, math_render: BaseMathRe
'tex' in node.get('class')
):
tag_common_modify.modify_tree(self.cm, math_render_type, original_html, node, parent)

# script[type="math/tex"]
# if node.tag == 'script' and node.get('type') and 'math/tex' in node.get('type'):
# print('匹配到script标签: ', node.get('type'))
# tag_common_modify.modify_tree(cm, math_render_type, original_html, node, parent)
self.mathjax_detected = True

# math tags
if node.tag == 'math' or node.tag.endswith(':math'):
# print(f"匹配到数学标签: {node.tag}")
# print(f"标签内容: {original_html}")
tag_math.modify_tree(self.cm, math_render_type, original_html, node, parent)
self.mathjax_detected = True

if node.tag == 'mjx-container':
tag_mjx.modify_tree(self.cm, math_render, original_html, node)
self.mathjax_detected = True

# img中的latex
if node.tag == 'img':
tag_img.modify_tree(self.cm, math_render_type, original_html, node, parent)
self.mathjax_detected = True

# span.katex
if node.tag == 'script' or 'math' == node.get('class') or 'katex' == node.get('class'):
# print('匹配到script/math/katex标签: ', original_html)
tag_script.modify_tree(self.cm, math_render_type, original_html, node, parent)
self.mathjax_detected = True
# 只有有渲染器的网站才会走下面文本匹配逻辑
if math_render_type:
# 14. 只处理只有一层的p标签
if node.tag == 'p' and len(node.getchildren()) == 0:
# print('匹配到p标签: ', original_html)
tag_common_modify.modify_tree(self.cm, math_render_type, original_html, node, parent)
self.mathjax_detected = True

# 修改:传入tree节点,mathjax方案作为process2,不参与上面process1节点的遍历
if math_render_type:
try:
if math_render_type == MathRenderType.MATHJAX:
math_render.find_math(tree)
except Exception as e:
raise HtmlMathMathjaxRenderRecognizerException(f'处理MathjaxRender数学公式失败: {e}')

# process2: mathjax渲染器逻辑
try:
# case1:有mathjax配置
if math_render_type == MathRenderType.MATHJAX:
math_render.find_math(tree)
# case2:无Mathjax配置但是开启Mathjax逻辑开关(node循环抽到公式的情况)
elif math_render_type is None and self.mathjax_detected:
from llm_web_kit.extractor.html.recognizer.cc_math.render.mathjax import \
MathJaxRenderMock
math_render = MathJaxRenderMock()
math_render.find_math(tree)
except Exception as e:
raise HtmlMathMathjaxRenderRecognizerException(f'处理MathjaxRender数学公式失败: {e}')
# 保存处理后的html
# with open('test20250702_result.html', 'w', encoding='utf-8') as f:
# f.write(self._element_to_html(tree))
Expand Down

Large diffs are not rendered by default.

Original file line number Diff line number Diff line change
Expand Up @@ -100,4 +100,5 @@
{"track_id": "test_mjx_container", "dataset_name": "test_mjx_container", "url": "https://test.com","data_source_category": "HTML", "path":"testmathjax.html", "file_bytes": 1000, "page_layout_type":"artical", "meta_info": {"input_datetime": "2020-01-01 00:00:00"}}
{"track_id": "test_word_press", "dataset_name": "test_word_press", "url": "https://test.com","data_source_category": "HTML", "path":"word_press.html", "file_bytes": 1000, "page_layout_type":"artical", "meta_info": {"input_datetime": "2020-01-01 00:00:00"}}
{"track_id": "test_ascii_delimiter", "dataset_name": "test_ascii_delimiter", "url": "https://montalk.net/notes/342/tuning-forks-and-megalithic-technology","data_source_category": "HTML", "path":"math_test_ascii_delimiter.html", "file_bytes": 1000, "page_layout_type":"artical", "meta_info": {"input_datetime": "2020-01-01 00:00:00"}}
{"track_id": "test_htmlmath_sub_sup", "dataset_name": "test_htmlmath_sub_sup", "url": "https://cccbdb.nist.gov/compvibs3.asp?casno=123911&charge=0&method=42&basis=0","data_source_category": "HTML", "path":"math_table_title_htmlmath_sub_sup.html", "file_bytes": 1000, "page_layout_type":"artical", "meta_info": {"input_datetime": "2020-01-01 00:00:00"}}
{"track_id": "test_htmlmath_sub_sup", "dataset_name": "test_htmlmath_sub_sup", "url": "https://cccbdb.nist.gov/compvibs3.asp?casno=123911&charge=0&method=42&basis=0","data_source_category": "HTML", "path":"math_table_title_htmlmath_sub_sup.html", "file_bytes": 1000, "page_layout_type":"artical", "meta_info": {"input_datetime": "2020-01-01 00:00:00"}}
{"track_id": "test_mathjax_mock", "dataset_name": "test_mathjax_mock", "url": "http://mathonline.wikidot.com/monotone-sequences-of-real-numbers","data_source_category": "HTML", "path":"math_mathjax_mock.html", "file_bytes": 1000, "page_layout_type":"artical", "meta_info": {"input_datetime": "2020-01-01 00:00:00"}}
19 changes: 15 additions & 4 deletions tests/llm_web_kit/extractor/test_extractor_chain.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,7 @@ def setUp(self):
continue
self.data_json.append(json.loads(line))

assert len(self.data_json) == 103
assert len(self.data_json) == 104

# Config for HTML extraction
self.config = load_pipe_tpl('html-test')
Expand Down Expand Up @@ -810,16 +810,27 @@ def test_ascii_delimiter(self):
input_data = DataJson(test_data)
result = chain.extract(input_data)
md_content = result.get_content_list().to_nlp_md()
# with open('mathjax抽取case222.md', 'w', encoding='utf-8') as f:
# f.write(md_content)
self.assertIn(r'$f = \frac{1}{T} ^ 2 \sqrt{\frac{A E}{\rho}}$', md_content)
self.assertIn(r'${m}^{2}$', md_content)
self.assertIn(r'\rho$', md_content)
self.assertIn(r'$f = \frac{1}{2 L} \sqrt{\frac{E}{\rho}}$', md_content)
self.assertIn(r'$L = {T}^{2} / \left(2 W\right)$', md_content)

def test_mathjax_mock(self):
    """Check formula extraction through the mock MathJax renderer."""
    extractor_chain = ExtractSimpleFactory.create(self.config)
    self.assertIsNotNone(extractor_chain)
    sample = self.data_json[103]
    extracted = extractor_chain.extract(DataJson(sample))
    markdown = extracted.get_content_list().to_nlp_md()
    # Formulas that must appear in the produced markdown, checked in order
    expected_formulas = (
        r'$(a_n)$',
        r'$a_n ≤ a_{n+1}$',
        r'$n \in \mathbb{N}$',
        r'$\left ( \frac{1}{n} \right ) = (1, \frac{1}{2}, \frac{1}{3}, ..., \frac{1}{n}, \frac{1}{n+1}, ... )$',
    )
    for formula in expected_formulas:
        self.assertIn(formula, markdown)

def test_htmlmath_sub_sup(self):
"""测试ascii分隔符."""
"""测试htmlmath中的上下标标签."""
chain = ExtractSimpleFactory.create(self.config)
self.assertIsNotNone(chain)
test_data = self.data_json[102]
Expand Down