Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 17 additions & 2 deletions llm_web_kit/extractor/html/recognizer/cc_math/tag_script.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,10 +14,16 @@ def modify_tree(cm: CCMATH, math_render: str, o_html: str, node: HtmlElement, pa
try:
text = node.text
if text and text_strip(text):
# 先处理非script标签和style标签的节点:即class为math/katex的节点
# 例子:<div class="math">f(x) \sim x^2, \quad x\to\infty</div>
Comment thread
1041206149 marked this conversation as resolved.
if node.tag not in ['script', 'style']:
new_span = create_new_span([(CCMATH_INLINE,MathType.LATEX)], cm.wrap_math_md(text), node, math_render, o_html)
node.addnext(new_span)
# node.addnext(new_span)
replace_element(node, new_span) # 替换节点,而不是添加

# 下面是katex逻辑
else:
# 例子:<script type = "e44e-text/javascript">katex.render("f(a,b,c) = (a^2+b^2+c^2)^3", mykatex);</script>
katex_pattern = re.compile(r'katex.render')
node_text = text_strip(text)
if katex_pattern.findall(node_text):
Expand All @@ -28,8 +34,17 @@ def modify_tree(cm: CCMATH, math_render: str, o_html: str, node: HtmlElement, pa
target_element = target_elements[0]
o_html = element_to_html(target_element)
target_element.text = None
new_span = create_new_span([(CCMATH_INLINE,MathType.LATEX)], cm.wrap_math_md(formula_content), target_element, math_render, o_html)
wrapped_formula = cm.wrap_math_md(formula_content)
# 转化为ccmath,例子:
# <ccmath-inline type="latex" by="katex" html='...'>f(a,b,c) = (a^2+b^2+c^2)^3</ccmath-inline>
new_span = create_new_span([(CCMATH_INLINE, MathType.LATEX)], wrapped_formula,
target_element, math_render, o_html)
# 插入到span标签内,例子:
# <span id="mykatex"><ccmath-inline ... </ccmath-inline></span>
target_element.insert(0, new_span)

# 处理script且type为math/tex的节点
# 例子:<html><head><script type="math/tex">x^2 + y^2 = z^2</script></head></html>
elif node.get('type') and 'math/tex' in node.get('type'):
tag_math_type_list = cm.get_equation_type(o_html)
if not tag_math_type_list:
Expand Down

Large diffs are not rendered by default.

Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
1 yr = 525600 min
1 yr → 525600 min
4.7 yr → T
T
T
4.7 yr → 2470320 min
4.7 years = 2470320 minutes
4.7 yr ≅ 2470320 min
8 changes: 8 additions & 0 deletions tests/llm_web_kit/extractor/html/recognizer/test_math.py
Original file line number Diff line number Diff line change
Expand Up @@ -207,6 +207,14 @@
'base_url': 'https://physicshelpforum.com/t/latex-upgrade-physics-forum-powered-by-mathjax-v3.17489/',
'expected': 'assets/ccmath/math_physicsforums_2_1.html',
'expected_inline': 'assets/ccmath/math_physicsforums_2_inline_1.html'
},
{
'input': [
'assets/ccmath/math_class_math.html',
],
'base_url': 'https://convertoctopus.com/4-7-years-to-minutes',
'expected': 'assets/ccmath/math_class_math_1.html',
'expected_inline': 'assets/ccmath/math_class_math_inline_1.html'
}
]

Expand Down