diff --git a/llm_web_kit/extractor/html/recognizer/cc_math/render/mathjax.py b/llm_web_kit/extractor/html/recognizer/cc_math/render/mathjax.py index bb309468..06ac62a9 100644 --- a/llm_web_kit/extractor/html/recognizer/cc_math/render/mathjax.py +++ b/llm_web_kit/extractor/html/recognizer/cc_math/render/mathjax.py @@ -533,6 +533,42 @@ def _detect_ascii_math(self, tree: HtmlElement) -> bool: return processascii +class MathJaxRenderMock(MathJaxRender): + """虚拟的MathJax渲染器,用于没有MathJax配置但需要使用MathJax解析逻辑的情况. + + 这个类主要用于处理以下场景: + 1. 网页中没有显式的MathJax配置(如 katex_pattern = re.compile(r'katex.render') node_text = text_strip(text) if katex_pattern.findall(node_text): @@ -28,8 +34,17 @@ def modify_tree(cm: CCMATH, math_render: str, o_html: str, node: HtmlElement, pa target_element = target_elements[0] o_html = element_to_html(target_element) target_element.text = None - new_span = create_new_span([(CCMATH_INLINE,MathType.LATEX)], cm.wrap_math_md(formula_content), target_element, math_render, o_html) + wrapped_formula = cm.wrap_math_md(formula_content) + # 转化为ccmath,例子: + # f(a,b,c) = (a^2+b^2+c^2)^3 + new_span = create_new_span([(CCMATH_INLINE, MathType.LATEX)], wrapped_formula, + target_element, math_render, o_html) + # 插入到span标签内,例子: + # target_element.insert(0, new_span) + + # 处理sript且type为math/tex的节点 + # 例子: elif node.get('type') and 'math/tex' in node.get('type'): tag_math_type_list = cm.get_equation_type(o_html) if not tag_math_type_list: diff --git a/llm_web_kit/extractor/html/recognizer/ccmath.py b/llm_web_kit/extractor/html/recognizer/ccmath.py index 08021dbf..28078250 100644 --- a/llm_web_kit/extractor/html/recognizer/ccmath.py +++ b/llm_web_kit/extractor/html/recognizer/ccmath.py @@ -24,6 +24,7 @@ class MathRecognizer(BaseHTMLElementRecognizer): def __init__(self): super().__init__() self.cm = CCMATH() + self.mathjax_detected = False # 添加检测标记 @override def recognize(self, base_url: str, main_html_lst: List[Tuple[HtmlElement, HtmlElement]], raw_html: str, language:str = 
'en') -> List[Tuple[HtmlElement, HtmlElement]]: @@ -122,8 +123,9 @@ def process_ccmath_html(self, cc_html: str, o_html: str, math_render: BaseMathRe self.cm.url = base_url tree = cc_html math_render_type = math_render.get_render_type() - # 打印遍历node次数 - # count = 0 + self.mathjax_detected = False # 重置标记 + + # process1: node循环逻辑 for node in iter_node(tree): assert isinstance(node, HtmlElement) original_html = self._element_to_html(node) @@ -134,9 +136,11 @@ def process_ccmath_html(self, cc_html: str, o_html: str, math_render: BaseMathRe node.tag == 'span' and node.get('class') in [CSDN.INLINE, CSDN.DISPLAY]): tag_script.process_katex_mathml(self.cm, math_render_type, node) + self.mathjax_detected = True if ZHIHU.DOMAIN in self.cm.url and node.tag == 'span' and node.get('class') == ZHIHU.MATH: tag_script.process_zhihu_custom_tag(self.cm, math_render_type, node) + self.mathjax_detected = True # tag = span, class 为 math-containerm, 或者 mathjax 或者 wp-katex-eq if node.tag == 'span' and node.get('class') and ( @@ -147,44 +151,50 @@ def process_ccmath_html(self, cc_html: str, o_html: str, math_render: BaseMathRe 'tex' in node.get('class') ): tag_common_modify.modify_tree(self.cm, math_render_type, original_html, node, parent) - - # script[type="math/tex"] - # if node.tag == 'script' and node.get('type') and 'math/tex' in node.get('type'): - # print('匹配到script标签: ', node.get('type')) - # tag_common_modify.modify_tree(cm, math_render_type, original_html, node, parent) + self.mathjax_detected = True # math tags if node.tag == 'math' or node.tag.endswith(':math'): # print(f"匹配到数学标签: {node.tag}") # print(f"标签内容: {original_html}") tag_math.modify_tree(self.cm, math_render_type, original_html, node, parent) + self.mathjax_detected = True if node.tag == 'mjx-container': tag_mjx.modify_tree(self.cm, math_render, original_html, node) + self.mathjax_detected = True # img中的latex if node.tag == 'img': tag_img.modify_tree(self.cm, math_render_type, original_html, node, parent) + 
self.mathjax_detected = True # span.katex if node.tag == 'script' or 'math' == node.get('class') or 'katex' == node.get('class'): # print('匹配到script/math/katex标签: ', original_html) tag_script.modify_tree(self.cm, math_render_type, original_html, node, parent) + self.mathjax_detected = True # 只有有渲染器的网站才会走下面文本匹配逻辑 if math_render_type: # 14. 只处理只有一层的p标签 if node.tag == 'p' and len(node.getchildren()) == 0: # print('匹配到p标签: ', original_html) tag_common_modify.modify_tree(self.cm, math_render_type, original_html, node, parent) + self.mathjax_detected = True - # 修改:传入tree节点,mathjax方案作为process2,不参与上面process1节点的遍历 - if math_render_type: - try: - if math_render_type == MathRenderType.MATHJAX: - math_render.find_math(tree) - except Exception as e: - raise HtmlMathMathjaxRenderRecognizerException(f'处理MathjaxRender数学公式失败: {e}') - + # procsee2: mathjax渲染器逻辑 + try: + # case1:有mathjax配置 + if math_render_type == MathRenderType.MATHJAX: + math_render.find_math(tree) + # case2:无Mathjax配置但是开启Mathjax逻辑开关(node循环抽到公式的情况) + elif math_render_type is None and self.mathjax_detected: + from llm_web_kit.extractor.html.recognizer.cc_math.render.mathjax import \ + MathJaxRenderMock + math_render = MathJaxRenderMock() + math_render.find_math(tree) + except Exception as e: + raise HtmlMathMathjaxRenderRecognizerException(f'处理MathjaxRender数学公式失败: {e}') # 保存处理后的html # with open('test20250702_result.html', 'w', encoding='utf-8') as f: # f.write(self._element_to_html(tree)) diff --git a/llm_web_kit/extractor/html/recognizer/list.py b/llm_web_kit/extractor/html/recognizer/list.py index d91caa23..c3599fc8 100644 --- a/llm_web_kit/extractor/html/recognizer/list.py +++ b/llm_web_kit/extractor/html/recognizer/list.py @@ -199,13 +199,14 @@ def __extract_list_item_text_recusive(el: HtmlElement): # item['c'].strip(): 会导致前面处理br标签,添加的\n\n失效 result['c'] = ' '.join(normalize_text_segment(item['c'].strip()) for item in paragraph) return result - list_item_tags = ('li', 'dd', 'dt', 'ul', 'div', 'p', 'span') - if 
child.tag in list_item_tags: - paragraph = __extract_list_item_text_recusive(child) - if len(paragraph) > 0: - tem_json = json.dumps(paragraph).replace('$br$\"}', '\"}') - new_paragraph = json.loads(tem_json) - text_paragraph.append(new_paragraph) + # list_item_tags = ('li', 'dd', 'dt', 'ul', 'div', 'p', 'span') + # if child.tag in list_item_tags: + # 去掉if限制条件,允许非标准结构的列表通过 + paragraph = __extract_list_item_text_recusive(child) + if len(paragraph) > 0: + tem_json = json.dumps(paragraph).replace('$br$\"}', '\"}') + new_paragraph = json.loads(tem_json) + text_paragraph.append(new_paragraph) for n, item in enumerate(text_paragraph): tem_json = json.dumps(item).replace('$br$', '\\n\\n') diff --git a/tests/llm_web_kit/extractor/assets/extractor_chain_input/good_data/html/math_mathjax_mock.html b/tests/llm_web_kit/extractor/assets/extractor_chain_input/good_data/html/math_mathjax_mock.html new file mode 100644 index 00000000..2928e55a --- /dev/null +++ b/tests/llm_web_kit/extractor/assets/extractor_chain_input/good_data/html/math_mathjax_mock.html @@ -0,0 +1 @@ + Monotone Sequences of Real Numbers - Mathonline
Monotone Sequences of Real Numbers

Monotone Sequences of Real Numbers

We will now look at two new types of sequences, increasing sequences and decreasing sequences.

Definition: A sequence of real numbers $(a_n)$ is said to be Increasing if $a_n ≤ a_{n+1}$ for all $n \in \mathbb{N}$. Similarly, a sequence of real numbers $(a_n)$ is said to be Decreasing if $a_n ≥ a_{n+1}$ for all $n \in \mathbb{N}$. A sequence $(a_n)$ is said to be Monotone or Monotonic if it is either increasing or decreasing.

A sequence $(a_n)$ is said to be Strictly Increasing if $a_n < a_{n+1}$ for all $n \in \mathbb{N}$ and Strictly Decreasing if $a_n > a_{n+1}$ for all $n \in \mathbb{N}$.

For example, consider the sequence $\left ( \frac{1}{n} \right ) = (1, \frac{1}{2}, \frac{1}{3}, ..., \frac{1}{n}, \frac{1}{n+1}, ... )$. We note that $\forall n \in \mathbb{N}$, $n < n+1$ and so $\frac{1}{n} > \frac{1}{n+1}$, and so this sequence is decreasing and hence monotone.

The following graph represents the first 10 terms of the monotonically decreasing sequence $\left ( \frac{1}{n} \right )$:

Screen Shot 2014-12-04 at 3.58.31 PM.png

One such example of an increasing sequence is the sequence $(n + 2)$. Clearly $\forall n \in \mathbb{N}$, $n + 2 < (n+1) + 2 = n + 3$ (since if not, then $n + 2 ≥ n + 3$ which implies that $0 ≥ 1$, which is a contradiction). The following graph represents the first 10 terms of the monotonically increasing sequence $(n + 2)$:

Screen Shot 2014-12-04 at 4.02.37 PM.png

From the definitions of increasing and decreasing sequences, we should note that EVERY successive term in the sequence must be either at least as large as the previous term (increasing sequences) or at most as large as the previous term (decreasing sequences). Therefore the sequence $(1, 2, 1, \frac{1}{2}, \frac{1}{3}, \frac{1}{4}, ...)$ cannot be considered a decreasing sequence, since $1 = a_1 \not ≥ a_2 = 2$. From this, we will formulate the following definitions:

Definition: A sequence of real numbers $(a_n)$ is said to be Ultimately Increasing if for some $K \in \mathbb{N}$ we have that $a_n ≤ a_{n+1}$ for all $n ≥ K$. Similarly, a sequence of real numbers $(a_n)$ is said to be Ultimately Decreasing if for some $K \in \mathbb{N}$ we have that $a_n ≥ a_{n+1}$ for all $n ≥ K$. A sequence $(a_n)$ is said to be Ultimately Monotone or Ultimately Monotonic if it is either ultimately increasing or ultimately decreasing.

Consider the sequence $(n^2 - 4n + 3) = (0, -1, 0, 3, 8, ...)$. This is an ultimately increasing sequence, since for $n ≥ 2$ we have that $a_n ≤ a_{n+1}$. The following graph represents the first 7 terms of this ultimately increasing sequence:

Screen Shot 2014-12-04 at 4.20.53 PM.png
Unless otherwise stated, the content of this page is licensed under Creative Commons Attribution-ShareAlike 3.0 License
\ No newline at end of file diff --git a/tests/llm_web_kit/extractor/assets/extractor_chain_input/good_data/html_data_input.jsonl b/tests/llm_web_kit/extractor/assets/extractor_chain_input/good_data/html_data_input.jsonl index 7ccc1eb1..22cebca1 100644 --- a/tests/llm_web_kit/extractor/assets/extractor_chain_input/good_data/html_data_input.jsonl +++ b/tests/llm_web_kit/extractor/assets/extractor_chain_input/good_data/html_data_input.jsonl @@ -100,4 +100,5 @@ {"track_id": "test_mjx_container", "dataset_name": "test_mjx_container", "url": "https://test.com","data_source_category": "HTML", "path":"testmathjax.html", "file_bytes": 1000, "page_layout_type":"artical", "meta_info": {"input_datetime": "2020-01-01 00:00:00"}} {"track_id": "test_word_press", "dataset_name": "test_word_press", "url": "https://test.com","data_source_category": "HTML", "path":"word_press.html", "file_bytes": 1000, "page_layout_type":"artical", "meta_info": {"input_datetime": "2020-01-01 00:00:00"}} {"track_id": "test_ascii_delimiter", "dataset_name": "test_ascii_delimiter", "url": "https://montalk.net/notes/342/tuning-forks-and-megalithic-technology","data_source_category": "HTML", "path":"math_test_ascii_delimiter.html", "file_bytes": 1000, "page_layout_type":"artical", "meta_info": {"input_datetime": "2020-01-01 00:00:00"}} -{"track_id": "test_htmlmath_sub_sup", "dataset_name": "test_htmlmath_sub_sup", "url": "https://cccbdb.nist.gov/compvibs3.asp?casno=123911&charge=0&method=42&basis=0","data_source_category": "HTML", "path":"math_table_title_htmlmath_sub_sup.html", "file_bytes": 1000, "page_layout_type":"artical", "meta_info": {"input_datetime": "2020-01-01 00:00:00"}} \ No newline at end of file +{"track_id": "test_htmlmath_sub_sup", "dataset_name": "test_htmlmath_sub_sup", "url": "https://cccbdb.nist.gov/compvibs3.asp?casno=123911&charge=0&method=42&basis=0","data_source_category": "HTML", "path":"math_table_title_htmlmath_sub_sup.html", "file_bytes": 1000, "page_layout_type":"artical", 
"meta_info": {"input_datetime": "2020-01-01 00:00:00"}} +{"track_id": "test_mathjax_mock", "dataset_name": "test_mathjax_mock", "url": "http://mathonline.wikidot.com/monotone-sequences-of-real-numbers","data_source_category": "HTML", "path":"math_mathjax_mock.html", "file_bytes": 1000, "page_layout_type":"artical", "meta_info": {"input_datetime": "2020-01-01 00:00:00"}} \ No newline at end of file diff --git a/tests/llm_web_kit/extractor/html/recognizer/assets/ccmath/math_class_math.html b/tests/llm_web_kit/extractor/html/recognizer/assets/ccmath/math_class_math.html new file mode 100644 index 00000000..ec9e8518 --- /dev/null +++ b/tests/llm_web_kit/extractor/html/recognizer/assets/ccmath/math_class_math.html @@ -0,0 +1 @@ + 4.7 Years In Minutes - How Many Minutes Is 4.7 Years?

Unit Converter

Conversion formula

The conversion factor from years to minutes is 525600, which means that 1 year is equal to 525600 minutes:

1 yr = 525600 min

To convert 4.7 years into minutes we have to multiply 4.7 by the conversion factor in order to get the time amount from years to minutes. We can also form a simple proportion to calculate the result:

1 yr → 525600 min

4.7 yr → T(min)

Solve the above proportion to obtain the time T in minutes:

T(min) = (4.7 yr × 525600 min) / 1 yr

T(min) = 2470320 min

The final result is:

4.7 yr → 2470320 min

We conclude that 4.7 years is equivalent to 2470320 minutes:

4.7 years = 2470320 minutes

4.7 years is equal to 2470320 minutes

Alternative conversion

We can also convert by utilizing the inverse value of the conversion factor. In this case 1 minute is equal to 4.0480585511189E-7 × 4.7 years.

Another way is saying that 4.7 years is equal to 1 ÷ 4.0480585511189E-7 minutes.

Approximate result

For practical purposes we can round our final result to an approximate numerical value. We can say that four point seven years is approximately two million four hundred seventy thousand three hundred twenty minutes:

4.7 yr ≅ 2470320 min

Alternatively, one minute is approximately zero (about 4.05 × 10⁻⁷) times four point seven years.

Conversion table

years to minutes chart

For quick reference purposes, below is the conversion table you can use to convert from years to minutes:

years (yr) minutes (min)
5.7 years 2995920 minutes
6.7 years 3521520 minutes
7.7 years 4047120 minutes
8.7 years 4572720 minutes
9.7 years 5098320 minutes
10.7 years 5623920 minutes
11.7 years 6149520 minutes
12.7 years 6675120 minutes
13.7 years 7200720 minutes
14.7 years 7726320 minutes
\ No newline at end of file diff --git a/tests/llm_web_kit/extractor/html/recognizer/assets/ccmath/math_class_math_1.html b/tests/llm_web_kit/extractor/html/recognizer/assets/ccmath/math_class_math_1.html new file mode 100644 index 00000000..e69de29b diff --git a/tests/llm_web_kit/extractor/html/recognizer/assets/ccmath/math_class_math_inline_1.html b/tests/llm_web_kit/extractor/html/recognizer/assets/ccmath/math_class_math_inline_1.html new file mode 100644 index 00000000..cda8dd54 --- /dev/null +++ b/tests/llm_web_kit/extractor/html/recognizer/assets/ccmath/math_class_math_inline_1.html @@ -0,0 +1,8 @@ +1 yr = 525600 min +1 yr → 525600 min +4.7 yr → T +T +T +4.7 yr → 2470320 min +4.7 years = 2470320 minutes +4.7 yr ≅ 2470320 min \ No newline at end of file diff --git a/tests/llm_web_kit/extractor/html/recognizer/test_list.py b/tests/llm_web_kit/extractor/html/recognizer/test_list.py index dbe79347..5f8d61de 100644 --- a/tests/llm_web_kit/extractor/html/recognizer/test_list.py +++ b/tests/llm_web_kit/extractor/html/recognizer/test_list.py @@ -375,3 +375,38 @@ def test_get_attribute_standalone_improved(self): error_msg = str(context.exception) self.assertIn('中没有cclist标签', error_msg) self.assertIn(element.tag, error_msg) + + def test_no_standard_get_list_content_list(self): + """测试非标准结构的list获取content_list.""" + # 获取私有方法 __get_list_content_list + get_list_content_list_method = getattr(self.__list_recognize, '_ListRecognizer__get_list_content_list') + + # 创建测试数据 + test_elements = [ + html_to_element('''''') + ] + + for i, element in enumerate(test_elements): + list_content_list = get_list_content_list_method(element, 1) + assert len(list_content_list) == 3 diff --git a/tests/llm_web_kit/extractor/html/recognizer/test_math.py b/tests/llm_web_kit/extractor/html/recognizer/test_math.py index f51c1869..6069c590 100644 --- a/tests/llm_web_kit/extractor/html/recognizer/test_math.py +++ b/tests/llm_web_kit/extractor/html/recognizer/test_math.py @@ -207,6 +207,14 @@ 
'base_url': 'https://physicshelpforum.com/t/latex-upgrade-physics-forum-powered-by-mathjax-v3.17489/', 'expected': 'assets/ccmath/math_physicsforums_2_1.html', 'expected_inline': 'assets/ccmath/math_physicsforums_2_inline_1.html' + }, + { + 'input': [ + 'assets/ccmath/math_class_math.html', + ], + 'base_url': 'https://convertoctopus.com/4-7-years-to-minutes', + 'expected': 'assets/ccmath/math_class_math_1.html', + 'expected_inline': 'assets/ccmath/math_class_math_inline_1.html' } ] diff --git a/tests/llm_web_kit/extractor/test_extractor_chain.py b/tests/llm_web_kit/extractor/test_extractor_chain.py index dc53e015..7f1bf8c9 100644 --- a/tests/llm_web_kit/extractor/test_extractor_chain.py +++ b/tests/llm_web_kit/extractor/test_extractor_chain.py @@ -64,7 +64,7 @@ def setUp(self): continue self.data_json.append(json.loads(line)) - assert len(self.data_json) == 103 + assert len(self.data_json) == 104 # Config for HTML extraction self.config = load_pipe_tpl('html-test') @@ -810,16 +810,27 @@ def test_ascii_delimiter(self): input_data = DataJson(test_data) result = chain.extract(input_data) md_content = result.get_content_list().to_nlp_md() - # with open('mathjax抽取case222.md', 'w', encoding='utf-8') as f: - # f.write(md_content) self.assertIn(r'$f = \frac{1}{T} ^ 2 \sqrt{\frac{A E}{\rho}}$', md_content) self.assertIn(r'${m}^{2}$', md_content) self.assertIn(r'\rho$', md_content) self.assertIn(r'$f = \frac{1}{2 L} \sqrt{\frac{E}{\rho}}$', md_content) self.assertIn(r'$L = {T}^{2} / \left(2 W\right)$', md_content) + def test_mathjax_mock(self): + """测试虚拟mathjax渲染器.""" + chain = ExtractSimpleFactory.create(self.config) + self.assertIsNotNone(chain) + test_data = self.data_json[103] + input_data = DataJson(test_data) + result = chain.extract(input_data) + md_content = result.get_content_list().to_nlp_md() + self.assertIn(r'$(a_n)$', md_content) + self.assertIn(r'$a_n ≤ a_{n+1}$', md_content) + self.assertIn(r'$n \in \mathbb{N}$', md_content) + self.assertIn(r'$\left ( 
\frac{1}{n} \right ) = (1, \frac{1}{2}, \frac{1}{3}, ..., \frac{1}{n}, \frac{1}{n+1}, ... )$', md_content) + def test_htmlmath_sub_sup(self): - """测试ascii分隔符.""" + """测试htmlmath中的上下标标签.""" chain = ExtractSimpleFactory.create(self.config) self.assertIsNotNone(chain) test_data = self.data_json[102]