Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
36 changes: 36 additions & 0 deletions llm_web_kit/extractor/html/recognizer/cc_math/render/mathjax.py
Original file line number Diff line number Diff line change
Expand Up @@ -533,6 +533,42 @@ def _detect_ascii_math(self, tree: HtmlElement) -> bool:
return processascii


class MathJaxRenderMock(MathJaxRender):
    """Stand-in MathJax renderer for pages that carry no MathJax configuration.

    Intended for the situation where:
    1. the page has no explicit MathJax configuration
       (e.g. no ``<script type="text/x-mathjax-config">``),
    2. yet formula elements were detected while parsing the HTML
       (``<math>`` tags, formula-related classes, ...), and
    3. the MathJax rendering pass should still scan all content so that
       no formula is missed.

    Difference from the regular ``MathJaxRender``:
    - ``MathJaxRender`` parses the MathJax configuration found in the HTML
      and uses its custom delimiters and options.
    - ``MathJaxRenderMock`` uses the default MathJax options directly and
      does not parse any configuration from the HTML.
    """

    def __init__(self):
        """Initialise the mock renderer with the default MathJax options."""
        super().__init__()
        # Take a copy of the module-level defaults rather than the object itself.
        default_options = MATHJAX_OPTIONS.copy()
        self.render_type = MathRenderType.MATHJAX_MOCK
        self.options = default_options

    def get_options(self, html: str) -> Dict[str, Any]:
        """Return the default options without inspecting the HTML.

        Args:
            html: HTML string (ignored).

        Returns:
            Dict[str, Any]: the default MathJax options held by this instance.
        """
        return self.options

    def is_customized_options(self) -> bool:
        """Always report ``False``: the mock renderer uses the default configuration."""
        return False


# 使用示例
if __name__ == '__main__':
# MathJax示例
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
class MathRenderType:
"""数学公式渲染器类型."""
MATHJAX = 'mathjax'
MATHJAX_MOCK = 'mathjax_mock' # 虚拟的mathjax渲染器
MATHJAX_CUSTOMIZED = 'mathjax_customized' # 临时增加这个type,未来区分走自定义解析的数据
KATEX = 'katex'

Expand Down
19 changes: 17 additions & 2 deletions llm_web_kit/extractor/html/recognizer/cc_math/tag_script.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,10 +14,16 @@ def modify_tree(cm: CCMATH, math_render: str, o_html: str, node: HtmlElement, pa
try:
text = node.text
if text and text_strip(text):
# 先处理非script标签和style标签的节点:即class为math/katex的节点
# 例子:<div class="math">f(x) \sim x^2, \quad x\to\infty</div>
if node.tag not in ['script', 'style']:
new_span = create_new_span([(CCMATH_INLINE,MathType.LATEX)], cm.wrap_math_md(text), node, math_render, o_html)
node.addnext(new_span)
# node.addnext(new_span)
replace_element(node, new_span) # 替换节点,而不是添加

# 下面是katex逻辑
else:
# 例子:<script type = "e44e-text/javascript">katex.render("f(a,b,c) = (a^2+b^2+c^2)^3", mykatex);</script>
katex_pattern = re.compile(r'katex.render')
node_text = text_strip(text)
if katex_pattern.findall(node_text):
Expand All @@ -28,8 +34,17 @@ def modify_tree(cm: CCMATH, math_render: str, o_html: str, node: HtmlElement, pa
target_element = target_elements[0]
o_html = element_to_html(target_element)
target_element.text = None
new_span = create_new_span([(CCMATH_INLINE,MathType.LATEX)], cm.wrap_math_md(formula_content), target_element, math_render, o_html)
wrapped_formula = cm.wrap_math_md(formula_content)
# 转化为ccmath,例子:
# <ccmath-inline type="latex" by="katex" html='...'>f(a,b,c) = (a^2+b^2+c^2)^3</ccmath-inline>
new_span = create_new_span([(CCMATH_INLINE, MathType.LATEX)], wrapped_formula,
target_element, math_render, o_html)
# 插入到span标签内,例子:
# <span id="mykatex"><ccmath-inline ... </ccmath-inline></span>
target_element.insert(0, new_span)

# 处理script且type为math/tex的节点
# 例子:<html><head><script type="math/tex">x^2 + y^2 = z^2</script></head></html>
elif node.get('type') and 'math/tex' in node.get('type'):
tag_math_type_list = cm.get_equation_type(o_html)
if not tag_math_type_list:
Expand Down
40 changes: 25 additions & 15 deletions llm_web_kit/extractor/html/recognizer/ccmath.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ class MathRecognizer(BaseHTMLElementRecognizer):
def __init__(self):
super().__init__()
self.cm = CCMATH()
self.mathjax_detected = False # 添加检测标记

@override
def recognize(self, base_url: str, main_html_lst: List[Tuple[HtmlElement, HtmlElement]], raw_html: str, language:str = 'en') -> List[Tuple[HtmlElement, HtmlElement]]:
Expand Down Expand Up @@ -122,8 +123,9 @@ def process_ccmath_html(self, cc_html: str, o_html: str, math_render: BaseMathRe
self.cm.url = base_url
tree = cc_html
math_render_type = math_render.get_render_type()
# 打印遍历node次数
# count = 0
self.mathjax_detected = False # 重置标记

# process1: node循环逻辑
for node in iter_node(tree):
assert isinstance(node, HtmlElement)
original_html = self._element_to_html(node)
Expand All @@ -134,9 +136,11 @@ def process_ccmath_html(self, cc_html: str, o_html: str, math_render: BaseMathRe
node.tag == 'span' and
node.get('class') in [CSDN.INLINE, CSDN.DISPLAY]):
tag_script.process_katex_mathml(self.cm, math_render_type, node)
self.mathjax_detected = True

if ZHIHU.DOMAIN in self.cm.url and node.tag == 'span' and node.get('class') == ZHIHU.MATH:
tag_script.process_zhihu_custom_tag(self.cm, math_render_type, node)
self.mathjax_detected = True

# tag = span, class 为 math-containerm, 或者 mathjax 或者 wp-katex-eq
if node.tag == 'span' and node.get('class') and (
Expand All @@ -147,44 +151,50 @@ def process_ccmath_html(self, cc_html: str, o_html: str, math_render: BaseMathRe
'tex' in node.get('class')
):
tag_common_modify.modify_tree(self.cm, math_render_type, original_html, node, parent)

# script[type="math/tex"]
# if node.tag == 'script' and node.get('type') and 'math/tex' in node.get('type'):
# print('匹配到script标签: ', node.get('type'))
# tag_common_modify.modify_tree(cm, math_render_type, original_html, node, parent)
self.mathjax_detected = True

# math tags
if node.tag == 'math' or node.tag.endswith(':math'):
# print(f"匹配到数学标签: {node.tag}")
# print(f"标签内容: {original_html}")
tag_math.modify_tree(self.cm, math_render_type, original_html, node, parent)
self.mathjax_detected = True

if node.tag == 'mjx-container':
tag_mjx.modify_tree(self.cm, math_render, original_html, node)
self.mathjax_detected = True

# img中的latex
if node.tag == 'img':
tag_img.modify_tree(self.cm, math_render_type, original_html, node, parent)
self.mathjax_detected = True

# span.katex
if node.tag == 'script' or 'math' == node.get('class') or 'katex' == node.get('class'):
# print('匹配到script/math/katex标签: ', original_html)
tag_script.modify_tree(self.cm, math_render_type, original_html, node, parent)
self.mathjax_detected = True
# 只有有渲染器的网站才会走下面文本匹配逻辑
if math_render_type:
# 14. 只处理只有一层的p标签
if node.tag == 'p' and len(node.getchildren()) == 0:
# print('匹配到p标签: ', original_html)
tag_common_modify.modify_tree(self.cm, math_render_type, original_html, node, parent)
self.mathjax_detected = True

# 修改:传入tree节点,mathjax方案作为process2,不参与上面process1节点的遍历
if math_render_type:
try:
if math_render_type == MathRenderType.MATHJAX:
math_render.find_math(tree)
except Exception as e:
raise HtmlMathMathjaxRenderRecognizerException(f'处理MathjaxRender数学公式失败: {e}')

# process2: mathjax渲染器逻辑
try:
# case1:有mathjax配置
if math_render_type == MathRenderType.MATHJAX:
math_render.find_math(tree)
# case2:无Mathjax配置但是开启Mathjax逻辑开关(node循环抽到公式的情况)
elif math_render_type is None and self.mathjax_detected:
from llm_web_kit.extractor.html.recognizer.cc_math.render.mathjax import \
MathJaxRenderMock
math_render = MathJaxRenderMock()
math_render.find_math(tree)
except Exception as e:
raise HtmlMathMathjaxRenderRecognizerException(f'处理MathjaxRender数学公式失败: {e}')
# 保存处理后的html
# with open('test20250702_result.html', 'w', encoding='utf-8') as f:
# f.write(self._element_to_html(tree))
Expand Down
15 changes: 8 additions & 7 deletions llm_web_kit/extractor/html/recognizer/list.py
Original file line number Diff line number Diff line change
Expand Up @@ -199,13 +199,14 @@ def __extract_list_item_text_recusive(el: HtmlElement):
# item['c'].strip(): 会导致前面处理br标签,添加的\n\n失效
result['c'] = ' '.join(normalize_text_segment(item['c'].strip()) for item in paragraph)
return result
list_item_tags = ('li', 'dd', 'dt', 'ul', 'div', 'p', 'span')
if child.tag in list_item_tags:
paragraph = __extract_list_item_text_recusive(child)
if len(paragraph) > 0:
tem_json = json.dumps(paragraph).replace('$br$\"}', '\"}')
new_paragraph = json.loads(tem_json)
text_paragraph.append(new_paragraph)
# list_item_tags = ('li', 'dd', 'dt', 'ul', 'div', 'p', 'span')
# if child.tag in list_item_tags:
# 去掉if限制条件,允许非标准结构的列表通过
paragraph = __extract_list_item_text_recusive(child)
if len(paragraph) > 0:
tem_json = json.dumps(paragraph).replace('$br$\"}', '\"}')
new_paragraph = json.loads(tem_json)
text_paragraph.append(new_paragraph)

for n, item in enumerate(text_paragraph):
tem_json = json.dumps(item).replace('$br$', '\\n\\n')
Expand Down

Large diffs are not rendered by default.

Original file line number Diff line number Diff line change
Expand Up @@ -100,4 +100,5 @@
{"track_id": "test_mjx_container", "dataset_name": "test_mjx_container", "url": "https://test.com","data_source_category": "HTML", "path":"testmathjax.html", "file_bytes": 1000, "page_layout_type":"artical", "meta_info": {"input_datetime": "2020-01-01 00:00:00"}}
{"track_id": "test_word_press", "dataset_name": "test_word_press", "url": "https://test.com","data_source_category": "HTML", "path":"word_press.html", "file_bytes": 1000, "page_layout_type":"artical", "meta_info": {"input_datetime": "2020-01-01 00:00:00"}}
{"track_id": "test_ascii_delimiter", "dataset_name": "test_ascii_delimiter", "url": "https://montalk.net/notes/342/tuning-forks-and-megalithic-technology","data_source_category": "HTML", "path":"math_test_ascii_delimiter.html", "file_bytes": 1000, "page_layout_type":"artical", "meta_info": {"input_datetime": "2020-01-01 00:00:00"}}
{"track_id": "test_htmlmath_sub_sup", "dataset_name": "test_htmlmath_sub_sup", "url": "https://cccbdb.nist.gov/compvibs3.asp?casno=123911&charge=0&method=42&basis=0","data_source_category": "HTML", "path":"math_table_title_htmlmath_sub_sup.html", "file_bytes": 1000, "page_layout_type":"artical", "meta_info": {"input_datetime": "2020-01-01 00:00:00"}}
{"track_id": "test_htmlmath_sub_sup", "dataset_name": "test_htmlmath_sub_sup", "url": "https://cccbdb.nist.gov/compvibs3.asp?casno=123911&charge=0&method=42&basis=0","data_source_category": "HTML", "path":"math_table_title_htmlmath_sub_sup.html", "file_bytes": 1000, "page_layout_type":"artical", "meta_info": {"input_datetime": "2020-01-01 00:00:00"}}
{"track_id": "test_mathjax_mock", "dataset_name": "test_mathjax_mock", "url": "http://mathonline.wikidot.com/monotone-sequences-of-real-numbers","data_source_category": "HTML", "path":"math_mathjax_mock.html", "file_bytes": 1000, "page_layout_type":"artical", "meta_info": {"input_datetime": "2020-01-01 00:00:00"}}

Large diffs are not rendered by default.

Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
1 yr = 525600 min
1 yr → 525600 min
4.7 yr → T
T
T
4.7 yr → 2470320 min
4.7 years = 2470320 minutes
4.7 yr ≅ 2470320 min
35 changes: 35 additions & 0 deletions tests/llm_web_kit/extractor/html/recognizer/test_list.py
Original file line number Diff line number Diff line change
Expand Up @@ -375,3 +375,38 @@ def test_get_attribute_standalone_improved(self):
error_msg = str(context.exception)
self.assertIn('中没有cclist标签', error_msg)
self.assertIn(element.tag, error_msg)

def test_no_standard_get_list_content_list(self):
    """Test content-list extraction for a list with a non-standard structure.

    The ``<ul>`` below contains three ``<figure>`` children instead of
    ``<li>`` items; the extracted content list is expected to have three
    entries.
    """
    # Fetch the private method ``__get_list_content_list`` through its
    # name-mangled attribute name.
    get_list_content_list_method = getattr(self.__list_recognize, '_ListRecognizer__get_list_content_list')

    # Build the test data.
    test_elements = [
        html_to_element('''<ul id="productslist">
<figure class="list">
<figcaption><h4>How to Process Oxidized Lead Zinc Ore by Flotation</h4>
<p>How to Process Oxidized Lead Zinc Ore by Flotation. Metallurgical Content. The
Flowsheet. Crushing Section; GRINDING; Conditioning and Flotation; Thickening and
Filtering; Sampling; ORE TESTING LABORATORY; The problem of treating oxidized lead
zinc ores for the production of high grade lead zinc concentrates is a complex </p>
</figcaption>
</figure>
<figure class="list">
<figcaption><h4>ore dressing flotation machine,fluorite ore flotation </h4>
<p>Ore dressing flotation machine is widely used to conduct flotation of copper ore,
lead zinc ore, glod ore, etc. Mail to sales@sinofote</p>
</figcaption>
</figure>
<figure class="list">
<figcaption><h4>Zinc Ore Mining Crusher wffofoundation</h4>
<p>Zinc ore mining process can 14 2016 31 Mar Lead zinc ore dressing equipment zinc ore
Once processing in the flotation circuit was complete, the zinc </p>
</figcaption>
</figure>
</ul>''')
    ]

    for element in test_elements:
        list_content_list = get_list_content_list_method(element, 1)
        # Use the unittest assertion (as the rest of this test class does) so
        # the check is not stripped under ``python -O`` and a failure reports
        # the actual length.
        self.assertEqual(len(list_content_list), 3)
assert len(list_content_list) == 3
8 changes: 8 additions & 0 deletions tests/llm_web_kit/extractor/html/recognizer/test_math.py
Original file line number Diff line number Diff line change
Expand Up @@ -207,6 +207,14 @@
'base_url': 'https://physicshelpforum.com/t/latex-upgrade-physics-forum-powered-by-mathjax-v3.17489/',
'expected': 'assets/ccmath/math_physicsforums_2_1.html',
'expected_inline': 'assets/ccmath/math_physicsforums_2_inline_1.html'
},
{
'input': [
'assets/ccmath/math_class_math.html',
],
'base_url': 'https://convertoctopus.com/4-7-years-to-minutes',
'expected': 'assets/ccmath/math_class_math_1.html',
'expected_inline': 'assets/ccmath/math_class_math_inline_1.html'
}
]

Expand Down
19 changes: 15 additions & 4 deletions tests/llm_web_kit/extractor/test_extractor_chain.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,7 @@ def setUp(self):
continue
self.data_json.append(json.loads(line))

assert len(self.data_json) == 103
assert len(self.data_json) == 104

# Config for HTML extraction
self.config = load_pipe_tpl('html-test')
Expand Down Expand Up @@ -810,16 +810,27 @@ def test_ascii_delimiter(self):
input_data = DataJson(test_data)
result = chain.extract(input_data)
md_content = result.get_content_list().to_nlp_md()
# with open('mathjax抽取case222.md', 'w', encoding='utf-8') as f:
# f.write(md_content)
self.assertIn(r'$f = \frac{1}{T} ^ 2 \sqrt{\frac{A E}{\rho}}$', md_content)
self.assertIn(r'${m}^{2}$', md_content)
self.assertIn(r'\rho$', md_content)
self.assertIn(r'$f = \frac{1}{2 L} \sqrt{\frac{E}{\rho}}$', md_content)
self.assertIn(r'$L = {T}^{2} / \left(2 W\right)$', md_content)

def test_mathjax_mock(self):
    """Test the mock MathJax renderer.

    Runs the extractor chain on the fixture at index 103 of
    ``self.data_json`` and checks that each expected formula appears in
    the generated markdown.
    """
    chain = ExtractSimpleFactory.create(self.config)
    self.assertIsNotNone(chain)
    extraction = chain.extract(DataJson(self.data_json[103]))
    md_content = extraction.get_content_list().to_nlp_md()
    expected_formulas = (
        r'$(a_n)$',
        r'$a_n ≤ a_{n+1}$',
        r'$n \in \mathbb{N}$',
        r'$\left ( \frac{1}{n} \right ) = (1, \frac{1}{2}, \frac{1}{3}, ..., \frac{1}{n}, \frac{1}{n+1}, ... )$',
    )
    # Checked in the same order as before; the first missing formula fails the test.
    for formula in expected_formulas:
        self.assertIn(formula, md_content)

def test_htmlmath_sub_sup(self):
"""测试ascii分隔符."""
"""测试htmlmath中的上下标标签."""
chain = ExtractSimpleFactory.create(self.config)
self.assertIsNotNone(chain)
test_data = self.data_json[102]
Expand Down