From b7b26c6472543ea31778956aac531020ea22fc33 Mon Sep 17 00:00:00 2001 From: chupei Date: Wed, 6 Aug 2025 10:25:05 +0800 Subject: [PATCH 01/11] fix: set logging ERROR level in ASCIIMath2Tex (#508) --- .pre-commit-config.yaml | 14 +++++++------- .../extractor/html/recognizer/cc_math/common.py | 14 +++++++++++--- 2 files changed, 18 insertions(+), 10 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index c0710e80..dd9b4265 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -5,19 +5,19 @@ repos: rev: 5.0.4 hooks: - id: flake8 - args: [ "--max-line-length=2200", "--ignore=E131,E125,W503,W504,E203,E231,E702,E128" ] + args: [ "--max-line-length=2200", "--ignore=E131,E125,W503,W504,E203,E231,E702,E128,E402" ] exclude: '^tests/.*/assets/' - repo: https://github.com/PyCQA/isort rev: 5.11.5 hooks: - id: isort exclude: '^tests/.*/assets/' - - repo: https://github.com/pre-commit/mirrors-yapf - rev: v0.32.0 - hooks: - - id: yapf - args: ["--style={based_on_style: google, column_limit: 200, indent_width: 4}"] - exclude: '^tests/.*/assets/' + # - repo: https://github.com/pre-commit/mirrors-yapf + # rev: v0.32.0 + # hooks: + # - id: yapf + # args: ["--style={based_on_style: google, column_limit: 200, indent_width: 4}"] + # exclude: '^tests/.*/assets/' # - repo: https://github.com/codespell-project/codespell # rev: v2.2.1 # hooks: diff --git a/llm_web_kit/extractor/html/recognizer/cc_math/common.py b/llm_web_kit/extractor/html/recognizer/cc_math/common.py index 51bc3d5b..d9178840 100644 --- a/llm_web_kit/extractor/html/recognizer/cc_math/common.py +++ b/llm_web_kit/extractor/html/recognizer/cc_math/common.py @@ -1,3 +1,4 @@ +import logging import os import re from pathlib import Path @@ -5,6 +6,15 @@ from lxml import etree from lxml.html import HtmlElement + +# 在导入前就设置严格的日志控制 +logging.basicConfig(level=logging.WARNING, force=True) + +# 设置py_asciimath的日志级别,完全禁用其日志输出 +py_asciimath_logger = logging.getLogger('py_asciimath') 
+py_asciimath_logger.setLevel(logging.ERROR) +py_asciimath_logger.disabled = True + from py_asciimath.translator.translator import ASCIIMath2Tex from llm_web_kit.extractor.html.recognizer.recognizer import CCTag @@ -15,6 +25,7 @@ from llm_web_kit.libs.text_utils import normalize_ctl_text asciimath2tex = ASCIIMath2Tex(log=False) + color_regex = re.compile(r'\\textcolor\[.*?\]\{.*?\}') @@ -137,9 +148,6 @@ class MATHINSIGHT: } -asciimath2tex = ASCIIMath2Tex(log=False) - - def text_strip(text): return text.strip() if text else text From 6251def44c8d51047d2fb4b7c68a6a6e8e9d12dc Mon Sep 17 00:00:00 2001 From: Kaiwen Liu Date: Fri, 8 Aug 2025 14:43:09 +0800 Subject: [PATCH 02/11] : improve dealing with response 0 (#509) --- llm_web_kit/input/pre_data_json.py | 2 ++ .../parser/layout_batch_parser.py | 20 +++++++------------ .../main_html_parser/parser/tag_mapping.py | 1 + tests/llm_web_kit/input/test_pre_data_json.py | 1 + 4 files changed, 11 insertions(+), 13 deletions(-) diff --git a/llm_web_kit/input/pre_data_json.py b/llm_web_kit/input/pre_data_json.py index 80cf7439..3298de94 100644 --- a/llm_web_kit/input/pre_data_json.py +++ b/llm_web_kit/input/pre_data_json.py @@ -20,6 +20,8 @@ class PreDataJsonKey: TYPICAL_SIMPLIFIED_HTML = 'typical_simplified_html' # 模型打标字典 LLM_RESPONSE = 'llm_response' + # 模型结果都为0 + LLM_RESPONSE_EMPTY = 'llm_response_empty' # 映射模版正文树结构的元素字典 HTML_ELEMENT_DICT = 'html_element_dict' # 映射模版正文时的文本列表 diff --git a/llm_web_kit/main_html_parser/parser/layout_batch_parser.py b/llm_web_kit/main_html_parser/parser/layout_batch_parser.py index a2b834e0..b24e958b 100644 --- a/llm_web_kit/main_html_parser/parser/layout_batch_parser.py +++ b/llm_web_kit/main_html_parser/parser/layout_batch_parser.py @@ -46,8 +46,15 @@ def parse(self, pre_data: PreDataJson) -> PreDataJson: self.dynamic_classid_enable = pre_data.get(PreDataJsonKey.DYNAMIC_CLASSID_ENABLE, False) self.more_noise_enable = pre_data.get(PreDataJsonKey.MORE_NOISE_ENABLE, False) 
self.dynamic_classid_similarity_threshold = pre_data.get(PreDataJsonKey.DYNAMIC_CLASSID_SIM_THRESH, 0.85) + response_empty = pre_data.get(PreDataJsonKey.LLM_RESPONSE_EMPTY, False) template_data_str = pre_data[PreDataJsonKey.HTML_ELEMENT_DICT] template_data = dict() + # 检查第0层第一个元素是否为green,如果是则返回空的HTML + if response_empty: + pre_data[PreDataJsonKey.MAIN_HTML] = '' + pre_data[PreDataJsonKey.MAIN_HTML_BODY] = '' + pre_data[PreDataJsonKey.MAIN_HTML_SUCCESS] = False + return pre_data if isinstance(template_data_str, str): template_data_str = json.loads(template_data_str) for layer, layer_dict in template_data_str.items(): @@ -57,19 +64,6 @@ def parse(self, pre_data: PreDataJson) -> PreDataJson: template_data = template_data_str else: raise ValueError(f'template_data 类型错误: {type(template_data_str)}') - # 检查第0层第一个元素是否为green,如果是则返回空的HTML - if 0 in template_data: - layer_0_elements = template_data[0] - if layer_0_elements: - # 获取第一个元素 - first_element_info = next(iter(layer_0_elements.values())) - if isinstance(first_element_info, tuple) and len(first_element_info) > 0: - label = first_element_info[0] # 获取标签(red/green) - if label == 'green': - pre_data[PreDataJsonKey.MAIN_HTML] = '' - pre_data[PreDataJsonKey.MAIN_HTML_BODY] = '' - pre_data[PreDataJsonKey.MAIN_HTML_SUCCESS] = False - return pre_data self.template_data = template_data content, body = self.process(html_source, template_dict_html) diff --git a/llm_web_kit/main_html_parser/parser/tag_mapping.py b/llm_web_kit/main_html_parser/parser/tag_mapping.py index c47e74b1..d633da40 100644 --- a/llm_web_kit/main_html_parser/parser/tag_mapping.py +++ b/llm_web_kit/main_html_parser/parser/tag_mapping.py @@ -57,6 +57,7 @@ def parse(self, pre_data: PreDataJson) -> PreDataJson: pre_data[PreDataJsonKey.TYPICAL_DICT_HTML] = template_dict_html pre_data[PreDataJsonKey.SIMILARITY_LAYER] = 0 pre_data[PreDataJsonKey.TYPICAL_MAIN_HTML_SUCCESS] = False + pre_data[PreDataJsonKey.LLM_RESPONSE_EMPTY] = True return pre_data # 模版抽取正文html diff 
--git a/tests/llm_web_kit/input/test_pre_data_json.py b/tests/llm_web_kit/input/test_pre_data_json.py index 6cc469a5..9e0e2540 100644 --- a/tests/llm_web_kit/input/test_pre_data_json.py +++ b/tests/llm_web_kit/input/test_pre_data_json.py @@ -380,6 +380,7 @@ def test_pre_data_json_key_constants(self): assert hasattr(PreDataJsonKey, 'TYPICAL_RAW_TAG_HTML') assert hasattr(PreDataJsonKey, 'TYPICAL_SIMPLIFIED_HTML') assert hasattr(PreDataJsonKey, 'LLM_RESPONSE') + assert hasattr(PreDataJsonKey, 'LLM_RESPONSE_EMPTY') assert hasattr(PreDataJsonKey, 'HTML_ELEMENT_DICT') assert hasattr(PreDataJsonKey, 'HTML_TARGET_LIST') assert hasattr(PreDataJsonKey, 'MAIN_HTML') From 52bd6d8fa261b08d738809e004d525a05d38be6a Mon Sep 17 00:00:00 2001 From: linfeng <56671143+LollipopsAndWine@users.noreply.github.com> Date: Fri, 8 Aug 2025 18:27:17 +0800 Subject: [PATCH 03/11] =?UTF-8?q?refactor:=20=E9=87=8D=E6=9E=84simple=20(#?= =?UTF-8?q?511)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- llm_web_kit/extractor/html/recognizer/text.py | 2 +- llm_web_kit/simple.py | 51 +++++++++++-------- tests/llm_web_kit/simple/test_simple.py | 12 +++-- 3 files changed, 38 insertions(+), 27 deletions(-) diff --git a/llm_web_kit/extractor/html/recognizer/text.py b/llm_web_kit/extractor/html/recognizer/text.py index 2bbd1f47..7768ef50 100644 --- a/llm_web_kit/extractor/html/recognizer/text.py +++ b/llm_web_kit/extractor/html/recognizer/text.py @@ -195,7 +195,7 @@ def __combine_text(self, text1:str, text2:str, lang='en') -> str: return self.replace_entities(txt.strip(), entities_map) else: # 根据text1的最后一个字符和text2的第一个字符判断两个text之间的连接 - if (text2[0] in string.punctuation) or (text2[0] in special_symbols) or (text2[0] in other_symbols) or (text1 and text1[-1] in other_symbols): + if (text2 and text2[0] in string.punctuation) or (text2 and text2[0] in special_symbols) or (text2 and text2[0] in other_symbols) or (text1 and text1[-1] in other_symbols): words_sep = '' 
else: words_sep = ' ' diff --git a/llm_web_kit/simple.py b/llm_web_kit/simple.py index ae687351..fefb535a 100644 --- a/llm_web_kit/simple.py +++ b/llm_web_kit/simple.py @@ -5,12 +5,14 @@ from llm_web_kit.config.cfg_reader import load_pipe_tpl from llm_web_kit.extractor.extractor_chain import ExtractSimpleFactory -from llm_web_kit.extractor.html.extractor import ( - HTMLPageLayoutType, MagicHTMLFIleFormatorExtractor, - NoClipHTMLFIleFormatorExtractor) from llm_web_kit.input.datajson import DataJson +class PipeType: + HTML = 'html' + NOCLIP = 'noclip_html' + + class ExtractorType: HTML = 'html' PDF = 'pdf' @@ -19,24 +21,32 @@ class ExtractorType: class ExtractorFactory: """factory class for extractor.""" - html_extractor = None + magic_html_extractor = None + noclip_html_extractor = None pdf_extractor = None ebook_extractor = None @staticmethod - def get_extractor(extractor_type: str): + def get_extractor(extractor_type: str, pipe_tpl_name: str): if extractor_type == ExtractorType.HTML: - if ExtractorFactory.html_extractor is None: - extractor_cfg = load_pipe_tpl('html') - chain = ExtractSimpleFactory.create(extractor_cfg) - ExtractorFactory.html_extractor = chain - return ExtractorFactory.html_extractor + if pipe_tpl_name == PipeType.HTML: + if ExtractorFactory.magic_html_extractor is None: + extractor_cfg = load_pipe_tpl(pipe_tpl_name) + chain = ExtractSimpleFactory.create(extractor_cfg) + ExtractorFactory.magic_html_extractor = chain + return ExtractorFactory.magic_html_extractor + if pipe_tpl_name == PipeType.NOCLIP: + if ExtractorFactory.noclip_html_extractor is None: + extractor_cfg = load_pipe_tpl(pipe_tpl_name) + chain = ExtractSimpleFactory.create(extractor_cfg) + ExtractorFactory.noclip_html_extractor = chain + return ExtractorFactory.noclip_html_extractor else: raise ValueError(f'Invalid extractor type: {extractor_type}') def __extract_main_html_by_no_clip_html(url:str, html_content: str, raw_html:str) -> DataJson: - extractor = 
NoClipHTMLFIleFormatorExtractor(load_pipe_tpl('noclip_html')) + extractor = ExtractorFactory.get_extractor(ExtractorType.HTML, PipeType.NOCLIP) if raw_html == '': raw_html = html_content input_data_dict = { @@ -54,14 +64,8 @@ def __extract_main_html_by_no_clip_html(url:str, html_content: str, raw_html:str return result -def __extract_main_html_by_maigic_html(url:str, html_str: str, page_layout_type:str) -> DataJson: - magic_html_extractor = MagicHTMLFIleFormatorExtractor(load_pipe_tpl('html')) - main_html, method, title = magic_html_extractor._extract_main_html(html_str, url, page_layout_type) - return main_html, title - - def __extract_html(url:str, html_content: str) -> DataJson: - extractor = ExtractorFactory.get_extractor(ExtractorType.HTML) + extractor = ExtractorFactory.get_extractor(ExtractorType.HTML, PipeType.HTML) input_data_dict = { 'track_id': str(uuid.uuid4()), 'url': url, @@ -94,7 +98,10 @@ def extract_html_to_mm_md(url:str, html_content: str, clip_html=True, raw_html=' return result.get_content_list().to_mm_md() -def extract_main_html_by_maigic_html(url:str, html_str: str, page_layout_type:str = HTMLPageLayoutType.LAYOUT_ARTICLE) -> str: - """extract main html.""" - result = __extract_main_html_by_maigic_html(url, html_str, page_layout_type) - return result[0], result[1] +def extract_main_html(url:str, html_content: str, clip_html=True, raw_html='') -> str: + if clip_html: + result = __extract_html(url, html_content) + else: + result = __extract_main_html_by_no_clip_html(url, html_content, raw_html) + main_html = result.get('main_html') + return main_html diff --git a/tests/llm_web_kit/simple/test_simple.py b/tests/llm_web_kit/simple/test_simple.py index 233eaead..3931b511 100644 --- a/tests/llm_web_kit/simple/test_simple.py +++ b/tests/llm_web_kit/simple/test_simple.py @@ -2,7 +2,7 @@ import unittest from llm_web_kit.simple import (extract_html_to_md, extract_html_to_mm_md, - extract_main_html_by_maigic_html) + extract_main_html) class 
TestSimple(unittest.TestCase): @@ -136,9 +136,13 @@ def test_extract_pure_html_to_mm_md(self): mm_md = extract_html_to_mm_md(self.url, self.html_content, clip_html=True) self.assertEqual(mm_md, '# Test Content\n\nThis is a test paragraph.\n\n![Test Image](e5db82b5bf63d49d80c5533616892d3386f43955369520986d67653c700fc53c)\n') - def test_extract_magic_html(self): - magic_html, title = extract_main_html_by_maigic_html(self.url, self.html_content) - self.assertEqual(magic_html, '

Test Content

This is a test paragraph.

Test Image
') + def test_extract_magic_main_html(self): + magic_main_html = extract_main_html(self.url, self.html_content, clip_html=True) + self.assertEqual(magic_main_html, '

Test Content

This is a test paragraph.

Test Image
') + + def test_extract_noclip_main_html(self): + magic_main_html = extract_main_html(self.url, self.html_content, clip_html=False, raw_html=self.html_content) + self.assertEqual(magic_main_html, '

Test Content

This is a test paragraph.

Test Image') def test_extract_real_html_to_md(self): md = extract_html_to_md(self.url, self.real_html_content, clip_html=False) From 0875de8ba9755aa48e09cdb6fe79a565c49e5c55 Mon Sep 17 00:00:00 2001 From: chupei Date: Fri, 8 Aug 2025 19:53:37 +0800 Subject: [PATCH 04/11] fix: combine_text with empty text (#510) --- .github/workflows/python-package.yml | 2 +- llm_web_kit/extractor/html/recognizer/text.py | 7 ++++ llm_web_kit/libs/version.py | 2 +- requirements/runtime.txt | 2 +- .../extractor/html/recognizer/test_text.py | 33 +++++++++++++++++++ ...est_HTMLFileFormatCleanTagsPreExtractor.py | 11 +++++++ 6 files changed, 54 insertions(+), 3 deletions(-) diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml index b1f606b6..7e7fdb91 100644 --- a/.github/workflows/python-package.yml +++ b/.github/workflows/python-package.yml @@ -124,4 +124,4 @@ jobs: - name: Publish distribution to PyPI run: | pip install twine - twine upload --verbose --repository-url https://test.pypi.org/legacy/ dist/* -u __token__ -p ${{ secrets.PYPI_TOKEN }} + twine upload --verbose dist/* -u __token__ -p ${{ secrets.PYPI_TOKEN }} diff --git a/llm_web_kit/extractor/html/recognizer/text.py b/llm_web_kit/extractor/html/recognizer/text.py index 7768ef50..0052872c 100644 --- a/llm_web_kit/extractor/html/recognizer/text.py +++ b/llm_web_kit/extractor/html/recognizer/text.py @@ -194,6 +194,13 @@ def __combine_text(self, text1:str, text2:str, lang='en') -> str: txt = text1 + text2 return self.replace_entities(txt.strip(), entities_map) else: + # 如果text2为空,直接返回text1 + if not text2: + return self.replace_entities(text1.strip(), entities_map) + # 如果text1为空,直接返回text2 + if not text1: + return self.replace_entities(text2.strip(), entities_map) + # 根据text1的最后一个字符和text2的第一个字符判断两个text之间的连接 if (text2 and text2[0] in string.punctuation) or (text2 and text2[0] in special_symbols) or (text2 and text2[0] in other_symbols) or (text1 and text1[-1] in other_symbols): words_sep = '' diff 
--git a/llm_web_kit/libs/version.py b/llm_web_kit/libs/version.py index f71b21a5..b50da94d 100644 --- a/llm_web_kit/libs/version.py +++ b/llm_web_kit/libs/version.py @@ -1 +1 @@ -__version__ = '3.1.2' +__version__ = '3.2.1' diff --git a/requirements/runtime.txt b/requirements/runtime.txt index 52708878..c54ad7f4 100644 --- a/requirements/runtime.txt +++ b/requirements/runtime.txt @@ -11,7 +11,7 @@ jupyter==1.1.1 langdetect_zh==1.0.4 lightgbm==4.5.0 loguru==0.7.2 -lxml==5.3.0 +lxml<=5.3.0 lxml_html_clean==0.4.2 nbconvert==7.16.6 nltk==3.8.1 diff --git a/tests/llm_web_kit/extractor/html/recognizer/test_text.py b/tests/llm_web_kit/extractor/html/recognizer/test_text.py index 0445d0c6..a17f16cb 100644 --- a/tests/llm_web_kit/extractor/html/recognizer/test_text.py +++ b/tests/llm_web_kit/extractor/html/recognizer/test_text.py @@ -453,3 +453,36 @@ def test_Lack_content1(self): result = chain.extract(input_data) content_md = result.get_content_list().to_mm_md() assert 'a) Electronic mail: airegg.py90g@nctu.edu.tw .' 
in content_md + + def test_empty_string_fix(self): + """ + 测试修复字符串索引越界问题 - 当文本处理中出现空字符串时不应抛出IndexError + 这个测试验证了__combine_text方法能够正确处理空字符串的情况 + """ + # 直接测试__combine_text方法处理空字符串的能力 + text_recognizer = TextParagraphRecognizer() + + # 测试各种空字符串组合 + result1 = text_recognizer._TextParagraphRecognizer__combine_text('hello', '', 'en') + self.assertEqual(result1, 'hello') + + result2 = text_recognizer._TextParagraphRecognizer__combine_text('', 'world', 'en') + self.assertEqual(result2, 'world') + + result3 = text_recognizer._TextParagraphRecognizer__combine_text('', '', 'en') + self.assertEqual(result3, '') + + # 测试包含空格但trim后变空的情况 + result4 = text_recognizer._TextParagraphRecognizer__combine_text('hello', ' ', 'en') + self.assertEqual(result4, 'hello') + + result5 = text_recognizer._TextParagraphRecognizer__combine_text(' ', 'world', 'en') + self.assertEqual(result5, 'world') + + # 测试正常情况仍然工作 + result6 = text_recognizer._TextParagraphRecognizer__combine_text('hello', 'world', 'en') + self.assertEqual(result6, 'hello world') + + # 测试标点符号情况 + result7 = text_recognizer._TextParagraphRecognizer__combine_text('hello', ',world', 'en') + self.assertEqual(result7, 'hello,world') diff --git a/tests/llm_web_kit/extractor/html/test_HTMLFileFormatCleanTagsPreExtractor.py b/tests/llm_web_kit/extractor/html/test_HTMLFileFormatCleanTagsPreExtractor.py index 88fb6abf..7a2a9093 100644 --- a/tests/llm_web_kit/extractor/html/test_HTMLFileFormatCleanTagsPreExtractor.py +++ b/tests/llm_web_kit/extractor/html/test_HTMLFileFormatCleanTagsPreExtractor.py @@ -104,6 +104,17 @@ 'expected_html': '''

正常内容1

正常内容2

正常内容3

''' }, + # 测试display: none样式过滤 (带空格版本) + { + 'input': { + 'content_list': [], + 'data_source_category': 'html', + 'url': 'https://example.com/page', + 'html': '''

正常内容

''', + }, + 'expected_html': '

正常内容

' + }, + # 测试保留tail文本 { 'input': { From 0c4ec294934cc5783977ac6908045b4d027f0838 Mon Sep 17 00:00:00 2001 From: drunkpig <60862764+drunkpig@users.noreply.github.com> Date: Sun, 10 Aug 2025 13:14:54 +0800 Subject: [PATCH 05/11] fix: noclip pre-extract problem (#513) --- llm_web_kit/config/pipe_tpl/noclip_html.jsonc | 4 +- .../config/pipe_tpl/noclip_html_test.jsonc | 4 +- llm_web_kit/extractor/html/pre_extractor.py | 41 +++++++++++++++++-- .../extractor/html/recognizer/test_text.py | 3 +- 4 files changed, 43 insertions(+), 9 deletions(-) diff --git a/llm_web_kit/config/pipe_tpl/noclip_html.jsonc b/llm_web_kit/config/pipe_tpl/noclip_html.jsonc index 25692972..20200c8e 100644 --- a/llm_web_kit/config/pipe_tpl/noclip_html.jsonc +++ b/llm_web_kit/config/pipe_tpl/noclip_html.jsonc @@ -10,11 +10,11 @@ }, { "enable": true, - "python_class": "llm_web_kit.extractor.html.pre_extractor.HTMLFileFormatFilterTablePreExtractor" + "python_class": "llm_web_kit.extractor.html.pre_extractor.HTMLFileFormatNoClipFilterTablePreExtractor" }, { "enable": true, - "python_class": "llm_web_kit.extractor.html.pre_extractor.HTMLFileFormatCleanTagsPreExtractor", + "python_class": "llm_web_kit.extractor.html.pre_extractor.HTMLFileFormatNoClipCleanTagsPreExtractor", "class_init_kwargs": {} } ], diff --git a/llm_web_kit/config/pipe_tpl/noclip_html_test.jsonc b/llm_web_kit/config/pipe_tpl/noclip_html_test.jsonc index 0e471ce6..cce73d0d 100644 --- a/llm_web_kit/config/pipe_tpl/noclip_html_test.jsonc +++ b/llm_web_kit/config/pipe_tpl/noclip_html_test.jsonc @@ -17,11 +17,11 @@ }, { "enable": true, - "python_class": "llm_web_kit.extractor.html.pre_extractor.HTMLFileFormatFilterTablePreExtractor" + "python_class": "llm_web_kit.extractor.html.pre_extractor.HTMLFileFormatNoClipFilterTablePreExtractor" }, { "enable": true, - "python_class": "llm_web_kit.extractor.html.pre_extractor.HTMLFileFormatCleanTagsPreExtractor", + "python_class": 
"llm_web_kit.extractor.html.pre_extractor.HTMLFileFormatNoClipCleanTagsPreExtractor", "class_init_kwargs": {} } ], diff --git a/llm_web_kit/extractor/html/pre_extractor.py b/llm_web_kit/extractor/html/pre_extractor.py index d95f06e5..c4247c03 100644 --- a/llm_web_kit/extractor/html/pre_extractor.py +++ b/llm_web_kit/extractor/html/pre_extractor.py @@ -49,7 +49,14 @@ def _do_pre_extract(self, data_json: DataJson) -> DataJson: def __remove_format_table(self, data_json: DataJson): """remove 排版table.""" - html_content = data_json['html'] + html_content = self._get_html_content(data_json) + return self.__do_remove_layout_table(html_content) + + def _get_html_content(self, data_json: DataJson): + return data_json['html'] + + def __do_remove_layout_table(self, html_content: str): + """remove 排版table.""" html_str = html_to_element(html_content) first_structure = html_str.xpath('/html/body/table') != [] second_structure = html_str.xpath('/html/body/center/table') != [] @@ -95,12 +102,12 @@ def __init__(self, config: dict): @override def _do_pre_extract(self, data_json: DataJson) -> DataJson: - data_json['html'] = self.__clean_invisible_elements(data_json) + html_content = data_json['html'] + data_json['html'] = self._clean_invisible_elements(html_content, data_json) return data_json - def __clean_invisible_elements(self, data_json: DataJson) -> str: + def _clean_invisible_elements(self, html_content: str, data_json: DataJson) -> str: """清理隐藏标签.""" - html_content = data_json['html'] tree = html_to_element(html_content) # 遍历所有配置的隐藏标签规则 for tag in INVISIBLE_TAGS: @@ -184,3 +191,29 @@ def __clean_interactive_elements(self, data_json: DataJson) -> str: if len(form.getchildren()) == 0 or not form.text_content().strip(): form.getparent().remove(form) return element_to_html(tree) + +# ############################################################################## +# 解决 main_html和html处理混乱的问题 +# ############################################################################## + + +class 
HTMLFileFormatNoClipFilterTablePreExtractor(HTMLFileFormatFilterTablePreExtractor): + """noclip管线对main_html预处理.""" + def __init__(self, config: dict): + super().__init__(config) + + @override + def _get_html_content(self, data_json: DataJson): + return data_json['main_html'] + + +class HTMLFileFormatNoClipCleanTagsPreExtractor(HTMLFileFormatCleanTagsPreExtractor): + """noclip管线对main_html预处理.""" + def __init__(self, config: dict): + super().__init__(config) + + @override + def _do_pre_extract(self, data_json: DataJson) -> DataJson: + html_content = data_json['main_html'] + data_json['main_html'] = self._clean_invisible_elements(html_content, data_json) + return data_json diff --git a/tests/llm_web_kit/extractor/html/recognizer/test_text.py b/tests/llm_web_kit/extractor/html/recognizer/test_text.py index a17f16cb..9dd16050 100644 --- a/tests/llm_web_kit/extractor/html/recognizer/test_text.py +++ b/tests/llm_web_kit/extractor/html/recognizer/test_text.py @@ -428,7 +428,8 @@ def test_normalize_space4(self): input_data = DataJson(test_data) result = chain.extract(input_data) content_md = result.get_content_list().to_mm_md() - assert 'Show Ignored Content\n 1. DrDu\n\n Lieber Hendrik, \n\n kannst Du hierzu was beitragen? \n\n Ich finde keinen rechten Grund' in content_md + assert '1. DrDu\n\n Lieber Hendrik, \n\n kannst Du hierzu was beitragen? 
\n\n Ich finde keinen rechten Grund' in content_md + assert 'Show Ignored Content' not in content_md # 这个是隐藏标签,不应该被识别出来 def test_Lack_content1(self): """ From c98ac407ac4c0039b599fbae7b42481d7123f2fe Mon Sep 17 00:00:00 2001 From: liukaiwen Date: Mon, 11 Aug 2025 16:09:40 +0800 Subject: [PATCH 06/11] : fix None error --- .../main_html_parser/parser/layout_batch_parser.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/llm_web_kit/main_html_parser/parser/layout_batch_parser.py b/llm_web_kit/main_html_parser/parser/layout_batch_parser.py index b24e958b..a0873d38 100644 --- a/llm_web_kit/main_html_parser/parser/layout_batch_parser.py +++ b/llm_web_kit/main_html_parser/parser/layout_batch_parser.py @@ -399,10 +399,11 @@ def __match_tag(self, layer_nodes, current_layer_key, parent_key, node_html, tem if template_sim >= self.dynamic_classid_similarity_threshold: return ele_label, self.normalize_key(ele_keyy[0:3]), is_drop_tail # first class方案 - norm_ele_keyy_with_first_class = self.normalize_key((ele_keyy[0], ele_keyy[1].strip().split(' ')[0], None)) - norm_ele_keyy_parent_with_first_class = (norm_ele_keyy_with_first_class, ele_parent_keyy) - if current_norm_key_with_first_class == norm_ele_keyy_parent_with_first_class: - return ele_label, self.normalize_key(ele_keyy[0:3]), is_drop_tail + if ele_keyy[1] is not None: + norm_ele_keyy_with_first_class = self.normalize_key((ele_keyy[0], ele_keyy[1].strip().split(' ')[0], None)) + norm_ele_keyy_parent_with_first_class = (norm_ele_keyy_with_first_class, ele_parent_keyy) + if current_norm_key_with_first_class == norm_ele_keyy_parent_with_first_class: + return ele_label, self.normalize_key(ele_keyy[0:3]), is_drop_tail return None, None, None From dcb44a00c183ba0e468f97efaad585c8c81743a3 Mon Sep 17 00:00:00 2001 From: liukaiwen Date: Mon, 11 Aug 2025 16:17:17 +0800 Subject: [PATCH 07/11] : fix None error --- llm_web_kit/main_html_parser/parser/layout_batch_parser.py | 7 ++++--- 1 file changed, 4 
insertions(+), 3 deletions(-) diff --git a/llm_web_kit/main_html_parser/parser/layout_batch_parser.py b/llm_web_kit/main_html_parser/parser/layout_batch_parser.py index a0873d38..13d198ee 100644 --- a/llm_web_kit/main_html_parser/parser/layout_batch_parser.py +++ b/llm_web_kit/main_html_parser/parser/layout_batch_parser.py @@ -369,8 +369,6 @@ def __match_tag_class(self, layer_nodes, current_layer_key, parent_key, node_htm def __match_tag(self, layer_nodes, current_layer_key, parent_key, node_html, template_doc, class_must=False, id_exist=False): current_norm_key = (self.normalize_key((current_layer_key[0], None, None)), parent_key) - current_norm_key_with_first_class = ( - self.normalize_key((current_layer_key[0], current_layer_key[1].strip().split(' ')[0], None)), parent_key) for ele_keyy, ele_value in layer_nodes.items(): # class id要存在 if class_must and not ele_keyy[1]: @@ -399,7 +397,10 @@ def __match_tag(self, layer_nodes, current_layer_key, parent_key, node_html, tem if template_sim >= self.dynamic_classid_similarity_threshold: return ele_label, self.normalize_key(ele_keyy[0:3]), is_drop_tail # first class方案 - if ele_keyy[1] is not None: + if ele_keyy[1] is not None and current_layer_key[1] is not None: + current_norm_key_with_first_class = ( + self.normalize_key((current_layer_key[0], current_layer_key[1].strip().split(' ')[0], None)), + parent_key) norm_ele_keyy_with_first_class = self.normalize_key((ele_keyy[0], ele_keyy[1].strip().split(' ')[0], None)) norm_ele_keyy_parent_with_first_class = (norm_ele_keyy_with_first_class, ele_parent_keyy) if current_norm_key_with_first_class == norm_ele_keyy_parent_with_first_class: From 093069b3a96fb5fc7a211b61e303c80ceeb73b73 Mon Sep 17 00:00:00 2001 From: chupei Date: Mon, 11 Aug 2025 18:54:52 +0800 Subject: [PATCH 08/11] feat: change HTMLStripSpacePostExtractor to ContentListStripSpacePostExtractor (#515) --- llm_web_kit/config/pipe_tpl/html-test.jsonc | 2 +- llm_web_kit/config/pipe_tpl/html.jsonc | 2 +- 
llm_web_kit/config/pipe_tpl/noclip_html.jsonc | 2 +- .../config/pipe_tpl/noclip_html_test.jsonc | 2 +- llm_web_kit/extractor/html/post_extractor.py | 2 +- ...or.py => test_ContentListPostExtractor.py} | 8 ++++---- tests/llm_web_kit/simple/test_simple.py | 20 +++++++++++++++++++ 7 files changed, 29 insertions(+), 9 deletions(-) rename tests/llm_web_kit/extractor/html/{test_HTMLStripSpacePostExtractor.py => test_ContentListPostExtractor.py} (88%) diff --git a/llm_web_kit/config/pipe_tpl/html-test.jsonc b/llm_web_kit/config/pipe_tpl/html-test.jsonc index 1b4441ee..50d96b51 100644 --- a/llm_web_kit/config/pipe_tpl/html-test.jsonc +++ b/llm_web_kit/config/pipe_tpl/html-test.jsonc @@ -30,7 +30,7 @@ "post_extractor": [ { "enable": true, - "python_class": "llm_web_kit.extractor.html.post_extractor.HTMLStripSpacePostExtractor" + "python_class": "llm_web_kit.extractor.html.post_extractor.ContentListStripSpacePostExtractor" } ] } diff --git a/llm_web_kit/config/pipe_tpl/html.jsonc b/llm_web_kit/config/pipe_tpl/html.jsonc index 6de370dc..6700f1b7 100644 --- a/llm_web_kit/config/pipe_tpl/html.jsonc +++ b/llm_web_kit/config/pipe_tpl/html.jsonc @@ -23,7 +23,7 @@ "post_extractor": [ { "enable": true, - "python_class": "llm_web_kit.extractor.html.post_extractor.HTMLStripSpacePostExtractor" + "python_class": "llm_web_kit.extractor.html.post_extractor.ContentListStripSpacePostExtractor" } ] } diff --git a/llm_web_kit/config/pipe_tpl/noclip_html.jsonc b/llm_web_kit/config/pipe_tpl/noclip_html.jsonc index 20200c8e..c9afee94 100644 --- a/llm_web_kit/config/pipe_tpl/noclip_html.jsonc +++ b/llm_web_kit/config/pipe_tpl/noclip_html.jsonc @@ -28,7 +28,7 @@ "post_extractor": [ { "enable": true, - "python_class": "llm_web_kit.extractor.html.post_extractor.HTMLStripSpacePostExtractor" + "python_class": "llm_web_kit.extractor.html.post_extractor.ContentListStripSpacePostExtractor" } ] } diff --git a/llm_web_kit/config/pipe_tpl/noclip_html_test.jsonc 
b/llm_web_kit/config/pipe_tpl/noclip_html_test.jsonc index cce73d0d..9ff6a25a 100644 --- a/llm_web_kit/config/pipe_tpl/noclip_html_test.jsonc +++ b/llm_web_kit/config/pipe_tpl/noclip_html_test.jsonc @@ -35,7 +35,7 @@ "post_extractor": [ { "enable": true, - "python_class": "llm_web_kit.extractor.html.post_extractor.HTMLStripSpacePostExtractor" + "python_class": "llm_web_kit.extractor.html.post_extractor.ContentListStripSpacePostExtractor" } ] } diff --git a/llm_web_kit/extractor/html/post_extractor.py b/llm_web_kit/extractor/html/post_extractor.py index b224d311..1d9adec2 100644 --- a/llm_web_kit/extractor/html/post_extractor.py +++ b/llm_web_kit/extractor/html/post_extractor.py @@ -37,7 +37,7 @@ def _do_post_extract(self, data_json: DataJson) -> DataJson: raise NotImplementedError('Subclass must implement abstract method') -class HTMLStripSpacePostExtractor(BaseFileFormatPostExtractor): +class ContentListStripSpacePostExtractor(BaseFileFormatPostExtractor): """对段落文本进行处理: 1. 连续的多个空格转换成1个 2. 
连续的\t转换成1个 diff --git a/tests/llm_web_kit/extractor/html/test_HTMLStripSpacePostExtractor.py b/tests/llm_web_kit/extractor/html/test_ContentListPostExtractor.py similarity index 88% rename from tests/llm_web_kit/extractor/html/test_HTMLStripSpacePostExtractor.py rename to tests/llm_web_kit/extractor/html/test_ContentListPostExtractor.py index 822cdd78..8699910c 100644 --- a/tests/llm_web_kit/extractor/html/test_HTMLStripSpacePostExtractor.py +++ b/tests/llm_web_kit/extractor/html/test_ContentListPostExtractor.py @@ -2,13 +2,13 @@ import unittest from llm_web_kit.extractor.html.post_extractor import \ - HTMLStripSpacePostExtractor + ContentListStripSpacePostExtractor from llm_web_kit.input.datajson import DataJson -class TestHTMLStripSpacePostExtractor(unittest.TestCase): +class TestContentListStripSpacePostExtractor(unittest.TestCase): def setUp(self): - self.extractor = HTMLStripSpacePostExtractor({}) + self.extractor = ContentListStripSpacePostExtractor({}) self.data_json = { # 构造一个测试数据,检测是否把文本中的连续空格字符转换为1个空格字符 'content_list': [ [ @@ -27,7 +27,7 @@ def setUp(self): ] } - def test_space_post_extractor(self): + def test_content_list_strip_space(self): # Test basic text normalization data_json = DataJson(self.data_json) processed = self.extractor.post_extract(data_json).get_content_list()._get_data() diff --git a/tests/llm_web_kit/simple/test_simple.py b/tests/llm_web_kit/simple/test_simple.py index 3931b511..4025da53 100644 --- a/tests/llm_web_kit/simple/test_simple.py +++ b/tests/llm_web_kit/simple/test_simple.py @@ -162,3 +162,23 @@ def test_extract_word_press(self): html_content = open(os.path.join(self.base_path, 'assets', 'word_press.html'), 'r').read() md = extract_html_to_md(self.url, html_content, clip_html=False) assert 'For descriptions of the methods (AM1, HF, MP2, ...) a' in md + + def test_filter_display_none_content(self): + """测试display:none的内容是否被正确过滤.""" + html_content = ''' + +

正常内容

+ ''' + + # 使用clip_html=False,因为我们要测试noclip模式下HTMLFileFormatCleanTagsPreExtractor是否正确处理main_html字段 + md = extract_html_to_md(self.url, html_content, clip_html=False) + + # 验证隐藏内容被过滤掉了 + self.assertNotIn('Room Only Rate', md) + self.assertNotIn('£1,230.00', md) + + # 验证正常内容被保留 + self.assertIn('正常内容', md) From a2a07919405af6eb91287970ab1c1b3bfe9592e5 Mon Sep 17 00:00:00 2001 From: chupei Date: Mon, 11 Aug 2025 19:03:40 +0800 Subject: [PATCH 09/11] fix: change test_ContentListStripSpacePostExtractor.py filename --- ...ostExtractor.py => test_ContentListStripSpacePostExtractor.py} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename tests/llm_web_kit/extractor/html/{test_ContentListPostExtractor.py => test_ContentListStripSpacePostExtractor.py} (100%) diff --git a/tests/llm_web_kit/extractor/html/test_ContentListPostExtractor.py b/tests/llm_web_kit/extractor/html/test_ContentListStripSpacePostExtractor.py similarity index 100% rename from tests/llm_web_kit/extractor/html/test_ContentListPostExtractor.py rename to tests/llm_web_kit/extractor/html/test_ContentListStripSpacePostExtractor.py From 5a1caaca0408b63fcf67408edc9a15f81ba11e66 Mon Sep 17 00:00:00 2001 From: Yanggq <1041206149@qq.com> Date: Tue, 12 Aug 2025 15:30:38 +0800 Subject: [PATCH 10/11] =?UTF-8?q?=E6=B7=BB=E5=8A=A0=E4=B8=80=E4=B8=AA?= =?UTF-8?q?=E9=BB=98=E8=AE=A4=E8=A1=8C=E5=86=85=E8=A1=8C=E8=A1=8C=E9=97=B4?= =?UTF-8?q?=E8=AE=BE=E7=BD=AE?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../html/recognizer/cc_math/tag_script.py | 4 +- .../extractor/html/recognizer/ccmath.py | 2 + .../ccmath/geoenergymath_img_inline_1.html | 0 .../ccmath/img_codecogs_com_inline_1.html | 6 + .../ccmath/img_mimetex_cgi_inline_1.html | 35 ++ .../assets/ccmath/katex_mathjax_1.html | 46 --- .../assets/ccmath/katex_mathjax_inline_1.html | 46 +++ ...libretexts_1_p_latex_mathjax_inline_1.html | 328 ++++++++++++++++++ .../ccmath/math_physicsforums_2_inline_1.html | 1 + 
.../ccmath/math_physicsforums_inline_1.html | 0 .../ccmath/mathjax-mml-chtml_inline_1.html | 5 + .../mathjax-mml-chtml_prefix_inline_1.html | 0 .../ccmath/mathjax_tex_chtml_inline_1.html | 5 + ...math-container_latex_mathjax_inline_1.html | 19 + .../wikipedia_1_math_annotation_inline_1.html | 211 +++++++++++ .../extractor/html/recognizer/test_math.py | 61 +++- 16 files changed, 703 insertions(+), 66 deletions(-) create mode 100644 tests/llm_web_kit/extractor/html/recognizer/assets/ccmath/geoenergymath_img_inline_1.html create mode 100644 tests/llm_web_kit/extractor/html/recognizer/assets/ccmath/img_codecogs_com_inline_1.html create mode 100644 tests/llm_web_kit/extractor/html/recognizer/assets/ccmath/img_mimetex_cgi_inline_1.html create mode 100644 tests/llm_web_kit/extractor/html/recognizer/assets/ccmath/katex_mathjax_inline_1.html create mode 100644 tests/llm_web_kit/extractor/html/recognizer/assets/ccmath/libretexts_1_p_latex_mathjax_inline_1.html create mode 100644 tests/llm_web_kit/extractor/html/recognizer/assets/ccmath/math_physicsforums_2_inline_1.html create mode 100644 tests/llm_web_kit/extractor/html/recognizer/assets/ccmath/math_physicsforums_inline_1.html create mode 100644 tests/llm_web_kit/extractor/html/recognizer/assets/ccmath/mathjax-mml-chtml_inline_1.html create mode 100644 tests/llm_web_kit/extractor/html/recognizer/assets/ccmath/mathjax-mml-chtml_prefix_inline_1.html create mode 100644 tests/llm_web_kit/extractor/html/recognizer/assets/ccmath/mathjax_tex_chtml_inline_1.html create mode 100644 tests/llm_web_kit/extractor/html/recognizer/assets/ccmath/stackexchange_1_span-math-container_latex_mathjax_inline_1.html create mode 100644 tests/llm_web_kit/extractor/html/recognizer/assets/ccmath/wikipedia_1_math_annotation_inline_1.html diff --git a/llm_web_kit/extractor/html/recognizer/cc_math/tag_script.py b/llm_web_kit/extractor/html/recognizer/cc_math/tag_script.py index 2208b4b5..0e218163 100644 --- 
a/llm_web_kit/extractor/html/recognizer/cc_math/tag_script.py +++ b/llm_web_kit/extractor/html/recognizer/cc_math/tag_script.py @@ -15,7 +15,7 @@ def modify_tree(cm: CCMATH, math_render: str, o_html: str, node: HtmlElement, pa text = node.text if text and text_strip(text): if node.tag not in ['script', 'style']: - new_span = create_new_span([(CCMATH_INTERLINE,MathType.LATEX)], cm.wrap_math_md(text), node, math_render, o_html) + new_span = create_new_span([(CCMATH_INLINE,MathType.LATEX)], cm.wrap_math_md(text), node, math_render, o_html) node.addnext(new_span) else: katex_pattern = re.compile(r'katex.render') @@ -28,7 +28,7 @@ def modify_tree(cm: CCMATH, math_render: str, o_html: str, node: HtmlElement, pa target_element = target_elements[0] o_html = element_to_html(target_element) target_element.text = None - new_span = create_new_span([(CCMATH_INTERLINE,MathType.LATEX)], cm.wrap_math_md(formula_content), target_element, math_render, o_html) + new_span = create_new_span([(CCMATH_INLINE,MathType.LATEX)], cm.wrap_math_md(formula_content), target_element, math_render, o_html) target_element.insert(0, new_span) elif node.get('type') and 'math/tex' in node.get('type'): tag_math_type_list = cm.get_equation_type(o_html) diff --git a/llm_web_kit/extractor/html/recognizer/ccmath.py b/llm_web_kit/extractor/html/recognizer/ccmath.py index fecf7dab..08021dbf 100644 --- a/llm_web_kit/extractor/html/recognizer/ccmath.py +++ b/llm_web_kit/extractor/html/recognizer/ccmath.py @@ -168,11 +168,13 @@ def process_ccmath_html(self, cc_html: str, o_html: str, math_render: BaseMathRe # span.katex if node.tag == 'script' or 'math' == node.get('class') or 'katex' == node.get('class'): + # print('匹配到script/math/katex标签: ', original_html) tag_script.modify_tree(self.cm, math_render_type, original_html, node, parent) # 只有有渲染器的网站才会走下面文本匹配逻辑 if math_render_type: # 14. 
只处理只有一层的p标签 if node.tag == 'p' and len(node.getchildren()) == 0: + # print('匹配到p标签: ', original_html) tag_common_modify.modify_tree(self.cm, math_render_type, original_html, node, parent) # 修改:传入tree节点,mathjax方案作为process2,不参与上面process1节点的遍历 diff --git a/tests/llm_web_kit/extractor/html/recognizer/assets/ccmath/geoenergymath_img_inline_1.html b/tests/llm_web_kit/extractor/html/recognizer/assets/ccmath/geoenergymath_img_inline_1.html new file mode 100644 index 00000000..e69de29b diff --git a/tests/llm_web_kit/extractor/html/recognizer/assets/ccmath/img_codecogs_com_inline_1.html b/tests/llm_web_kit/extractor/html/recognizer/assets/ccmath/img_codecogs_com_inline_1.html new file mode 100644 index 00000000..59b6ebdb --- /dev/null +++ b/tests/llm_web_kit/extractor/html/recognizer/assets/ccmath/img_codecogs_com_inline_1.html @@ -0,0 +1,6 @@ +IQR = Q_{3}-Q_{1} +IQR = Q_{3}-Q_{1} +\frac{42+43}{2} = \frac{85}{2} = 42.5 +\frac{42+43}{2} = \frac{85}{2} = 42.5 +\frac{51+54}{2} = \frac{105}{2} = 52.5 +\frac{51+54}{2} = \frac{105}{2} = 52.5 diff --git a/tests/llm_web_kit/extractor/html/recognizer/assets/ccmath/img_mimetex_cgi_inline_1.html b/tests/llm_web_kit/extractor/html/recognizer/assets/ccmath/img_mimetex_cgi_inline_1.html new file mode 100644 index 00000000..b10b0eb3 --- /dev/null +++ b/tests/llm_web_kit/extractor/html/recognizer/assets/ccmath/img_mimetex_cgi_inline_1.html @@ -0,0 +1,35 @@ +\Re(z)<0 +z \mapsto \exp(z)-1 +\Re(z)>0 +z \mapsto \ln(z+1) +\alpha(z) +\alpha(z)+k +\Re(z)<0 +k=\frac{-\pi i}{3} +\frac{\ln(z)}{3} +\frac{\ln(-z)}{3} +\Re(z)<0 +z \mapsto \exp(z)-1 +\Re(z)>0 +z \mapsto \ln(z+1) +\alpha(z) +\alpha(z)+k +\Re(z)<0 +k=\frac{-\pi i}{3} +\frac{\ln(z)}{3} +\frac{\ln(-z)}{3} +f(z)=\exp(z)-1 +\alpha_\eta(z) = \alpha(\frac{z}{e}-1)\;\;\;\alpha_\eta(\eta^z)=\alpha_\eta(z)+1\;\;\; +f(z)=\eta^z +f(z)=\exp(z)-1\;\;\; +\ln(\ln(\eta^{\eta^z})) +f(z)=\exp(z)-1 +\alpha_\eta(z) = \alpha(\frac{z}{e}-1)\;\;\;\alpha_\eta(\eta^z)=\alpha_\eta(z)+1\;\;\; +f(z)=\eta^z 
+f(z)=\exp(z)-1\;\;\; +\ln(\ln(\eta^{\eta^z})) +f(z)=\exp(z)-1 +\alpha_\eta(z) = \alpha(\frac{z}{e}-1)\;\;\;\alpha_\eta(\eta^z)=\alpha_\eta(z)+1\;\;\; +f(z)=\eta^z +f(z)=\exp(z)-1\;\;\; +\ln(\ln(\eta^{\eta^z})) diff --git a/tests/llm_web_kit/extractor/html/recognizer/assets/ccmath/katex_mathjax_1.html b/tests/llm_web_kit/extractor/html/recognizer/assets/ccmath/katex_mathjax_1.html index 29fe4f02..e69de29b 100644 --- a/tests/llm_web_kit/extractor/html/recognizer/assets/ccmath/katex_mathjax_1.html +++ b/tests/llm_web_kit/extractor/html/recognizer/assets/ccmath/katex_mathjax_1.html @@ -1,46 +0,0 @@ -\frac{1}{\Bigl(\sqrt{\phi \sqrt{5}}-\phi\Bigr) e^{\frac25 \pi}} \equiv 1+\frac{e^{-2\pi}} {1+\frac{e^{-4\pi}} {1+\frac{e^{-6\pi}} {1+\frac{e^{-8\pi}} {1+\cdots} } } } -\left( \sum_{k=1}^n a_k b_k \right)^2 \leq \left( \sum_{k=1}^n a_k^2 \right) \left( \sum_{k=1}^n b_k^2 \right) -\displaystyle\sum_{i=1}^{k+1}i -\displaystyle= \left(\sum_{i=1}^{k}i\right) +(k+1) -\displaystyle= \frac{k(k+1)}{2}+k+1 -\displaystyle= \frac{k(k+1)+2(k+1)}{2} -\displaystyle= \frac{(k+1)(k+2)}{2} -\displaystyle= \frac{(k+1)((k+1)+1)}{2} -\displaystyle1 + \frac{q^2}{(1-q)}+\frac{q^6}{(1-q)(1-q^2)}+\cdots -= \displaystyle \prod_{j=0}^{\infty}\frac{1}{(1-q^{5j+2})(1-q^{5j+3})}, -\displaystyle\text{ for }\lvert q\rvert < 1. 
-k_{n+1} = n^2 + k_n^2 - k_{n-1} -\Gamma\ \Delta\ \Theta\ \Lambda\ \Xi\ \Pi\ \Sigma\ \Upsilon\ \Phi\ \Psi\ \Omega -\alpha\ \beta\ \gamma\ \delta\ \epsilon\ \zeta\ \eta\ \theta\ \iota\ \kappa\ \lambda\ \mu\ \nu\ \xi -\ \omicron\ \pi\ \rho\ \sigma\ \tau\ \upsilon\ \phi\ \chi\ \psi\ \omega\ \varepsilon\ \vartheta\ \varpi\ \varrho\ \varsigma\ \varphi -\gets\ \to\ \leftarrow\ \rightarrow\ \uparrow\ \Uparrow\ \downarrow\ \Downarrow\ \updownarrow\ \Updownarrow -\Leftarrow\ \Rightarrow\ \leftrightarrow\ \Leftrightarrow\\mapsto\ \hookleftarrow -\leftharpoonup\ \leftharpoondown\ \rightleftharpoons\ \longleftarrow\ \Longleftarrow\ \longrightarrow -\Longrightarrow\ \longleftrightarrow\ \Longleftrightarrow\ \longmapsto\\hookrightarrow\ \rightharpoonup -\rightharpoondown\ \leadsto\ \nearrow\\searrow\ \swarrow\ \nwarrow -\surd\ \barwedge\ \veebar\ \odot\ \oplus\ \otimes\ \oslash\ \circledcirc\ \boxdot\\bigtriangleup -\bigtriangledown\ \dagger\ \diamond\ \star\ \triangleleft\\triangleright\ \angle\ \infty\ \prime\ \triangle -\int u \frac{dv}{dx}\,dx=uv-\int \frac{du}{dx}v\,dx -f(x) = \int_{-\infty}^\infty \hat f(\xi)\,e^{2 \pi i \xi x} -\oint \vec{F} \cdot d\vec{s}=0 -\begin{aligned}\dot{x} & = \sigma(y-x) \\\dot{y} & = \rho x - y - xz \\\dot{z} & = -\beta z + xy\end{aligned} -\mathbf{V}_1 \times \mathbf{V}_2 = \begin{vmatrix}\mathbf{i} & \mathbf{j} & \mathbf{k} \\\frac{\partial X}{\partial u} & \frac{\partial Y}{\partial u} & 0 \\\frac{\partial X}{\partial v} & \frac{\partial Y}{\partial v} & 0\end{vmatrix} -\hat{x}\ \vec{x}\ \ddot{x} -\left(\frac{x^2}{y^3}\right) -\left.\frac{x^3}{3}\right|_0^1 -f(n) = \begin{cases} \frac{n}{2}, & \text{if } n\text{ is even} \\ 3n+1, & \text{if } n\text{ is odd} \end{cases} -\begin{aligned}\nabla \times \vec{\mathbf{B}} -\, \frac1c\, \frac{\partial\vec{\mathbf{E}}}{\partial t} & = \frac{4\pi}{c}\vec{\mathbf{j}} \\\nabla \cdot \vec{\mathbf{E}} & = 4 \pi \rho \\\nabla \times \vec{\mathbf{E}}\, +\, \frac1c\, \frac{\partial\vec{\mathbf{B}}}{\partial 
t} & = \vec{\mathbf{0}} \\\nabla \cdot \vec{\mathbf{B}} & = 0 \end{aligned} -\begin{aligned}\nabla \times \vec{\mathbf{B}} -\, \frac1c\, \frac{\partial\vec{\mathbf{E}}}{\partial t} & = \frac{4\pi}{c}\vec{\mathbf{j}} \\[1em]\nabla \cdot \vec{\mathbf{E}} & = 4 \pi \rho \\[0.5em]\nabla \times \vec{\mathbf{E}}\, +\, \frac1c\, \frac{\partial\vec{\mathbf{B}}}{\partial t} & = \vec{\mathbf{0}} \\[1em]\nabla \cdot \vec{\mathbf{B}} & = 0 \end{aligned} -\frac{n!}{k!(n-k)!} = {^n}C_k -{n \choose k} -\frac{\frac{1}{x}+\frac{1}{y}}{y-z} -\sqrt[n]{1+x+x^2+x^3+\ldots} -\begin{pmatrix}a_{11} & a_{12} & a_{13}\\a_{21} & a_{22} & a_{23}\\a_{31} & a_{32} & a_{33}\end{pmatrix} -\begin{bmatrix} 0 & \cdots & 0 \\ \vdots & \ddots & \vdots \\ 0 & \cdots & 0 \end{bmatrix} -f(x) = \sqrt{1+x} \quad (x \ge -1) -f(x) \sim x^2 \quad (x\to\infty) -f(x) = \sqrt{1+x}, \quad x \ge -1 -f(x) \sim x^2, \quad x\to\infty -\mathcal L_{\mathcal T}(\vec{\lambda}) = \sum_{(\mathbf{x},\mathbf{s})\in \mathcal T} \log P(\mathbf{s}\mid\mathbf{x}) - \sum_{i=1}^m \frac{\lambda_i^2}{2\sigma^2} -S (\omega)=\frac{\alpha g^2}{\omega^5} \,e ^{[-0.74\bigl\{\frac{\omega U_\omega 19.5}{g}\bigr\}^{-4}]} -f(a,b,c) = (a^2+b^2+c^2)^3 diff --git a/tests/llm_web_kit/extractor/html/recognizer/assets/ccmath/katex_mathjax_inline_1.html b/tests/llm_web_kit/extractor/html/recognizer/assets/ccmath/katex_mathjax_inline_1.html new file mode 100644 index 00000000..29fe4f02 --- /dev/null +++ b/tests/llm_web_kit/extractor/html/recognizer/assets/ccmath/katex_mathjax_inline_1.html @@ -0,0 +1,46 @@ +\frac{1}{\Bigl(\sqrt{\phi \sqrt{5}}-\phi\Bigr) e^{\frac25 \pi}} \equiv 1+\frac{e^{-2\pi}} {1+\frac{e^{-4\pi}} {1+\frac{e^{-6\pi}} {1+\frac{e^{-8\pi}} {1+\cdots} } } } +\left( \sum_{k=1}^n a_k b_k \right)^2 \leq \left( \sum_{k=1}^n a_k^2 \right) \left( \sum_{k=1}^n b_k^2 \right) +\displaystyle\sum_{i=1}^{k+1}i +\displaystyle= \left(\sum_{i=1}^{k}i\right) +(k+1) +\displaystyle= \frac{k(k+1)}{2}+k+1 +\displaystyle= \frac{k(k+1)+2(k+1)}{2} 
+\displaystyle= \frac{(k+1)(k+2)}{2} +\displaystyle= \frac{(k+1)((k+1)+1)}{2} +\displaystyle1 + \frac{q^2}{(1-q)}+\frac{q^6}{(1-q)(1-q^2)}+\cdots += \displaystyle \prod_{j=0}^{\infty}\frac{1}{(1-q^{5j+2})(1-q^{5j+3})}, +\displaystyle\text{ for }\lvert q\rvert < 1. +k_{n+1} = n^2 + k_n^2 - k_{n-1} +\Gamma\ \Delta\ \Theta\ \Lambda\ \Xi\ \Pi\ \Sigma\ \Upsilon\ \Phi\ \Psi\ \Omega +\alpha\ \beta\ \gamma\ \delta\ \epsilon\ \zeta\ \eta\ \theta\ \iota\ \kappa\ \lambda\ \mu\ \nu\ \xi +\ \omicron\ \pi\ \rho\ \sigma\ \tau\ \upsilon\ \phi\ \chi\ \psi\ \omega\ \varepsilon\ \vartheta\ \varpi\ \varrho\ \varsigma\ \varphi +\gets\ \to\ \leftarrow\ \rightarrow\ \uparrow\ \Uparrow\ \downarrow\ \Downarrow\ \updownarrow\ \Updownarrow +\Leftarrow\ \Rightarrow\ \leftrightarrow\ \Leftrightarrow\\mapsto\ \hookleftarrow +\leftharpoonup\ \leftharpoondown\ \rightleftharpoons\ \longleftarrow\ \Longleftarrow\ \longrightarrow +\Longrightarrow\ \longleftrightarrow\ \Longleftrightarrow\ \longmapsto\\hookrightarrow\ \rightharpoonup +\rightharpoondown\ \leadsto\ \nearrow\\searrow\ \swarrow\ \nwarrow +\surd\ \barwedge\ \veebar\ \odot\ \oplus\ \otimes\ \oslash\ \circledcirc\ \boxdot\\bigtriangleup +\bigtriangledown\ \dagger\ \diamond\ \star\ \triangleleft\\triangleright\ \angle\ \infty\ \prime\ \triangle +\int u \frac{dv}{dx}\,dx=uv-\int \frac{du}{dx}v\,dx +f(x) = \int_{-\infty}^\infty \hat f(\xi)\,e^{2 \pi i \xi x} +\oint \vec{F} \cdot d\vec{s}=0 +\begin{aligned}\dot{x} & = \sigma(y-x) \\\dot{y} & = \rho x - y - xz \\\dot{z} & = -\beta z + xy\end{aligned} +\mathbf{V}_1 \times \mathbf{V}_2 = \begin{vmatrix}\mathbf{i} & \mathbf{j} & \mathbf{k} \\\frac{\partial X}{\partial u} & \frac{\partial Y}{\partial u} & 0 \\\frac{\partial X}{\partial v} & \frac{\partial Y}{\partial v} & 0\end{vmatrix} +\hat{x}\ \vec{x}\ \ddot{x} +\left(\frac{x^2}{y^3}\right) +\left.\frac{x^3}{3}\right|_0^1 +f(n) = \begin{cases} \frac{n}{2}, & \text{if } n\text{ is even} \\ 3n+1, & \text{if } n\text{ is odd} \end{cases} 
+\begin{aligned}\nabla \times \vec{\mathbf{B}} -\, \frac1c\, \frac{\partial\vec{\mathbf{E}}}{\partial t} & = \frac{4\pi}{c}\vec{\mathbf{j}} \\\nabla \cdot \vec{\mathbf{E}} & = 4 \pi \rho \\\nabla \times \vec{\mathbf{E}}\, +\, \frac1c\, \frac{\partial\vec{\mathbf{B}}}{\partial t} & = \vec{\mathbf{0}} \\\nabla \cdot \vec{\mathbf{B}} & = 0 \end{aligned} +\begin{aligned}\nabla \times \vec{\mathbf{B}} -\, \frac1c\, \frac{\partial\vec{\mathbf{E}}}{\partial t} & = \frac{4\pi}{c}\vec{\mathbf{j}} \\[1em]\nabla \cdot \vec{\mathbf{E}} & = 4 \pi \rho \\[0.5em]\nabla \times \vec{\mathbf{E}}\, +\, \frac1c\, \frac{\partial\vec{\mathbf{B}}}{\partial t} & = \vec{\mathbf{0}} \\[1em]\nabla \cdot \vec{\mathbf{B}} & = 0 \end{aligned} +\frac{n!}{k!(n-k)!} = {^n}C_k +{n \choose k} +\frac{\frac{1}{x}+\frac{1}{y}}{y-z} +\sqrt[n]{1+x+x^2+x^3+\ldots} +\begin{pmatrix}a_{11} & a_{12} & a_{13}\\a_{21} & a_{22} & a_{23}\\a_{31} & a_{32} & a_{33}\end{pmatrix} +\begin{bmatrix} 0 & \cdots & 0 \\ \vdots & \ddots & \vdots \\ 0 & \cdots & 0 \end{bmatrix} +f(x) = \sqrt{1+x} \quad (x \ge -1) +f(x) \sim x^2 \quad (x\to\infty) +f(x) = \sqrt{1+x}, \quad x \ge -1 +f(x) \sim x^2, \quad x\to\infty +\mathcal L_{\mathcal T}(\vec{\lambda}) = \sum_{(\mathbf{x},\mathbf{s})\in \mathcal T} \log P(\mathbf{s}\mid\mathbf{x}) - \sum_{i=1}^m \frac{\lambda_i^2}{2\sigma^2} +S (\omega)=\frac{\alpha g^2}{\omega^5} \,e ^{[-0.74\bigl\{\frac{\omega U_\omega 19.5}{g}\bigr\}^{-4}]} +f(a,b,c) = (a^2+b^2+c^2)^3 diff --git a/tests/llm_web_kit/extractor/html/recognizer/assets/ccmath/libretexts_1_p_latex_mathjax_inline_1.html b/tests/llm_web_kit/extractor/html/recognizer/assets/ccmath/libretexts_1_p_latex_mathjax_inline_1.html new file mode 100644 index 00000000..8156968d --- /dev/null +++ b/tests/llm_web_kit/extractor/html/recognizer/assets/ccmath/libretexts_1_p_latex_mathjax_inline_1.html @@ -0,0 +1,328 @@ +\PageIndex{1} +\PageIndex{2} +\PageIndex{3} +\PageIndex{4} +\PageIndex{5} +\PageIndex{6} +\PageIndex{7} +\PageIndex{8} 
+\PageIndex{9} +\PageIndex{10} +\PageIndex{11} +\PageIndex{12} +\PageIndex{13} +\PageIndex{14} +\PageIndex{15} +\PageIndex{16} +\PageIndex{17} +\PageIndex{18} +\PageIndex{19} +\PageIndex{20} +\PageIndex{21} +\PageIndex{22} +\PageIndex{23} +\PageIndex{24} +\PageIndex{25} +\PageIndex{26} +\PageIndex{27} +\PageIndex{28} +\PageIndex{29} +\PageIndex{30} +\PageIndex{31} +\PageIndex{32} +\PageIndex{33} +\PageIndex{34} +\PageIndex{35} +\PageIndex{36} +\PageIndex{37} +\PageIndex{38} +\PageIndex{39} +\PageIndex{40} +\PageIndex{41} +\PageIndex{42} +\PageIndex{43} +\PageIndex{44} +\PageIndex{45} +\PageIndex{46} +\PageIndex{47} +\PageIndex{48} +\PageIndex{49} +\PageIndex{50} +\PageIndex{51} +\PageIndex{52} +\PageIndex{53} +\PageIndex{54} +a\cdot b +a,\, b +\PageIndex{1} +5 +−3 +5, 3 +5, 3 +5 +\PageIndex{2} +7\cdot 4 = 28 +-8(-6) = 48 +\PageIndex{1} +\cdot +7(-9) = -63 +\cdot +-5\cdot 10= -50 +\PageIndex{2} +\PageIndex{1} +-9\cdot 3 +-2(-5) +4(-8) +7\cdot 6 +\PageIndex{2} +-6\cdot 8 +-4(-7) +9(-7) +5\cdot 12 +-48 +28 +-63 +60 +\PageIndex{3} +-8\cdot 7 +-6(-9) +7(-4) +3\cdot 13 +-56 +54 +-28 +39 +1 +−1 +−1 +−1 +−1 +\PageIndex{4} +-1 \cdot 7 +-1(-11) +\PageIndex{5} +-1\cdot 9 +-1\cdot(-17) +-9 +17 +\PageIndex{6} +-1\cdot 8 +-1\cdot(-16) +-8 +16 +15\div 3=5 +5 \cdot 3 = 15 +15 +15 +\PageIndex{3} +\PageIndex{4} +\PageIndex{7} +-27\div 3 +-100\div (-4) +\PageIndex{8} +-42\div 6 +-117\div (-3) +-7 +39 +\PageIndex{9} +-63\div 7 +-115\div (-5) +-9 +23 +\PageIndex{10} +7(-2)+4(-7)-6 +\PageIndex{11} +8(-3)+5(-7)-4 +-63 +\PageIndex{12} +9(-3)+7(-8)-1 +-84 +\PageIndex{13} +(-2)^{4} +-2^{4} +(−2) +4^{th} +2 +4^{th} +\PageIndex{14} +(-3)^{4} +-3^{4} +81 +-81 +\PageIndex{15} +(-7)^{2} +-7^{2} +49 +-49 +\PageIndex{16} +12-3(9 - 12) +\PageIndex{17} +17 - 4(8 - 11) +29 +\PageIndex{18} +16 - 6(7 - 13) +52 +\PageIndex{19} +8(-9)\div (-2)^{3} +\PageIndex{20} +12(-9)\div (-3)^{3} +4 +\PageIndex{21} +18(-4)\div (-2)^{3} +9 +\PageIndex{22} +-30\div 2 + (-3)(-7) +\PageIndex{23} +-27\div 3 + (-5)(-6) +21 
+\PageIndex{24} +-32\div 4 + (-2)(-7) +6 +\PageIndex{25} +n=−5 +n+1 +−n+1 +\PageIndex{26} +n=−8 +n+2 +−n+2 +-6 +10 +\PageIndex{27} +y=−9 +y+8 +−y+8 +-1 +17 +\PageIndex{28} +(x+y)^{2} +x = -18 +y = 24 +\PageIndex{29} +(x+y)^{2} +x = -15 +y = 29 +196 +\PageIndex{30} +(x+y)^{3} +x = -8 +y = 10 +8 +\PageIndex{31} +20 -z +z = 12 +z = -12 +\PageIndex{32} +17 - k +k = 19 +k = -19 +-2 +36 +\PageIndex{33} +-5 - b +b = 14 +b = -14 +-19 +9 +\PageIndex{34} +2x^{2} + 3x + 8 +x = 4 +4 +x +\PageIndex{35} +3x^{2} - 2x + 6 +x =-3 +39 +\PageIndex{36} +4x^{2} - x - 5 +x = -2 +13 +\PageIndex{37} +8 +−12 +3 +\PageIndex{38} +9 +−16 +4 +(9 + (-16)) + 4 - 3 +\PageIndex{39} +-8 +−12 +7 +(-8 + (-12)) + 7 - 13 +a−b +a +b +a +b +b +a +b +a +\PageIndex{5} +\PageIndex{40} +13 +−21 +24 +−19 +\PageIndex{41} +14 +−23 +21 +−17 +14 - (-23); 37 +-17 - 21; -38 +\PageIndex{42} +11 +−19 +18 +−11 +11 - (-19); 30 +-11 - 18; -29 +\PageIndex{43} +−2 +14 +\PageIndex{44} +−5 +12 +-5(12); -60 +\PageIndex{45} +8 +-13 +-8(13); -104 +\PageIndex{46} +−56 +−7 +\PageIndex{47} +−63 +−9 +-63\div (-9); 7 +\PageIndex{48} +−72 +−9 +-72\div (-9); 8 +\PageIndex{49} +11 +−9 +11 +-9 +11 - (-9) +20 +\PageIndex{50} +15 +30 +45 +\PageIndex{51} +−6 +−15 +9 +\PageIndex{52} +15 +3(-15) +-45 +45 +\PageIndex{53} +15 +105 +\PageIndex{54} diff --git a/tests/llm_web_kit/extractor/html/recognizer/assets/ccmath/math_physicsforums_2_inline_1.html b/tests/llm_web_kit/extractor/html/recognizer/assets/ccmath/math_physicsforums_2_inline_1.html new file mode 100644 index 00000000..01eda798 --- /dev/null +++ b/tests/llm_web_kit/extractor/html/recognizer/assets/ccmath/math_physicsforums_2_inline_1.html @@ -0,0 +1 @@ +ax^2 + bx + c = 0 \quad \text{versus} \quad \qty(a x^2 + b x + c = 0) diff --git a/tests/llm_web_kit/extractor/html/recognizer/assets/ccmath/math_physicsforums_inline_1.html b/tests/llm_web_kit/extractor/html/recognizer/assets/ccmath/math_physicsforums_inline_1.html new file mode 100644 index 00000000..e69de29b diff --git 
a/tests/llm_web_kit/extractor/html/recognizer/assets/ccmath/mathjax-mml-chtml_inline_1.html b/tests/llm_web_kit/extractor/html/recognizer/assets/ccmath/mathjax-mml-chtml_inline_1.html new file mode 100644 index 00000000..ce3a7483 --- /dev/null +++ b/tests/llm_web_kit/extractor/html/recognizer/assets/ccmath/mathjax-mml-chtml_inline_1.html @@ -0,0 +1,5 @@ +a\ne 0 +a{x}^{2}+bx+c=0 +k +n +\sqrt{3x-1}+{\left(1+x\right)}^{2} diff --git a/tests/llm_web_kit/extractor/html/recognizer/assets/ccmath/mathjax-mml-chtml_prefix_inline_1.html b/tests/llm_web_kit/extractor/html/recognizer/assets/ccmath/mathjax-mml-chtml_prefix_inline_1.html new file mode 100644 index 00000000..e69de29b diff --git a/tests/llm_web_kit/extractor/html/recognizer/assets/ccmath/mathjax_tex_chtml_inline_1.html b/tests/llm_web_kit/extractor/html/recognizer/assets/ccmath/mathjax_tex_chtml_inline_1.html new file mode 100644 index 00000000..8e010143 --- /dev/null +++ b/tests/llm_web_kit/extractor/html/recognizer/assets/ccmath/mathjax_tex_chtml_inline_1.html @@ -0,0 +1,5 @@ +a \ne 0 +ax^2 + bx + c = 0 +k +n +\sqrt{3x-1}+(1+x)^2 diff --git a/tests/llm_web_kit/extractor/html/recognizer/assets/ccmath/stackexchange_1_span-math-container_latex_mathjax_inline_1.html b/tests/llm_web_kit/extractor/html/recognizer/assets/ccmath/stackexchange_1_span-math-container_latex_mathjax_inline_1.html new file mode 100644 index 00000000..771c4c49 --- /dev/null +++ b/tests/llm_web_kit/extractor/html/recognizer/assets/ccmath/stackexchange_1_span-math-container_latex_mathjax_inline_1.html @@ -0,0 +1,19 @@ +h +M +M_{\odot} +\approx 1 +1 \over r +M_{\odot} +M_{\odot} +M_{\odot} +M_{\odot} +\delta L \over L +=10^{-21} +10^{-21} \times (1.3 \cdot 10^9)^2=10^{-3} +1/r +10^{-21} \times (1.3 \cdot 10^9)=10^{-9} +10^{-3} +1/1000 +9 \cdot 10^9 +1/r^2 +1/r diff --git a/tests/llm_web_kit/extractor/html/recognizer/assets/ccmath/wikipedia_1_math_annotation_inline_1.html 
b/tests/llm_web_kit/extractor/html/recognizer/assets/ccmath/wikipedia_1_math_annotation_inline_1.html new file mode 100644 index 00000000..bf9570d9 --- /dev/null +++ b/tests/llm_web_kit/extractor/html/recognizer/assets/ccmath/wikipedia_1_math_annotation_inline_1.html @@ -0,0 +1,211 @@ +{\displaystyle \sigma ^{2}} +{\displaystyle s^{2}} +{\displaystyle \operatorname {Var} (X)} +{\displaystyle V(X)} +{\displaystyle \mathbb {V} (X)} +{\displaystyle X} +{\displaystyle X} +{\displaystyle \mu =\operatorname {E} [X]} +{\displaystyle \operatorname {Var} (X)=\operatorname {E} \left[(X-\mu )^{2}\right].} +{\displaystyle \operatorname {Var} (X)=\operatorname {Cov} (X,X).} +{\displaystyle X} +{\displaystyle \operatorname {Var} (X)} +{\displaystyle V(X)} +{\displaystyle \mathbb {V} (X)} +{\displaystyle \sigma _{X}^{2}} +{\displaystyle \sigma ^{2}} +{\displaystyle {\begin{aligned}\operatorname {Var} (X)&=\operatorname {E} \left[(X-\operatorname {E} [X])^{2}\right]\\[4pt]&=\operatorname {E} \left[X^{2}-2X\operatorname {E} [X]+\operatorname {E} [X]^{2}\right]\\[4pt]&=\operatorname {E} \left[X^{2}\right]-2\operatorname {E} [X]\operatorname {E} [X]+\operatorname {E} [X]^{2}\\[4pt]&=\operatorname {E} \left[X^{2}\right]-2\operatorname {E} [X]^{2}+\operatorname {E} [X]^{2}\\[4pt]&=\operatorname {E} \left[X^{2}\right]-\operatorname {E} [X]^{2}\end{aligned}}} +{\displaystyle X} +{\displaystyle x_{1}\mapsto p_{1},x_{2}\mapsto p_{2},\ldots ,x_{n}\mapsto p_{n}} +{\displaystyle \operatorname {Var} (X)=\sum _{i=1}^{n}p_{i}\cdot (x_{i}-\mu )^{2},} +{\displaystyle \mu } +{\displaystyle \mu =\sum _{i=1}^{n}p_{i}x_{i}.} +{\displaystyle n} +{\displaystyle \operatorname {Var} (X)={\frac {1}{n}}\sum _{i=1}^{n}(x_{i}-\mu )^{2}} +{\displaystyle \mu } +{\displaystyle \mu ={\frac {1}{n}}\sum _{i=1}^{n}x_{i}.} +{\displaystyle n} +{\displaystyle \operatorname {Var} (X)={\frac {1}{n^{2}}}\sum _{i=1}^{n}\sum _{j=1}^{n}{\frac {1}{2}}(x_{i}-x_{j})^{2}={\frac {1}{n^{2}}}\sum _{i}\sum _{j>i}(x_{i}-x_{j})^{2}.} 
+{\displaystyle X} +{\displaystyle f(x)} +{\displaystyle F(x)} +{\displaystyle {\begin{aligned}\operatorname {Var} (X)=\sigma ^{2}&=\int _{\mathbb {R} }(x-\mu )^{2}f(x)\,dx\\[4pt]&=\int _{\mathbb {R} }x^{2}f(x)\,dx-2\mu \int _{\mathbb {R} }xf(x)\,dx+\mu ^{2}\int _{\mathbb {R} }f(x)\,dx\\[4pt]&=\int _{\mathbb {R} }x^{2}\,dF(x)-2\mu \int _{\mathbb {R} }x\,dF(x)+\mu ^{2}\int _{\mathbb {R} }\,dF(x)\\[4pt]&=\int _{\mathbb {R} }x^{2}\,dF(x)-2\mu \cdot \mu +\mu ^{2}\cdot 1\\[4pt]&=\int _{\mathbb {R} }x^{2}\,dF(x)-\mu ^{2},\end{aligned}}} +{\displaystyle \operatorname {Var} (X)=\int _{\mathbb {R} }x^{2}f(x)\,dx-\mu ^{2},} +{\displaystyle \mu } +{\displaystyle X} +{\displaystyle \mu =\int _{\mathbb {R} }xf(x)\,dx=\int _{\mathbb {R} }x\,dF(x).} +{\displaystyle dx} +{\displaystyle dF(x)} +{\displaystyle x^{2}f(x)} +{\displaystyle [a,b]\subset \mathbb {R} ,} +{\displaystyle \operatorname {Var} (X)=\int _{-\infty }^{+\infty }x^{2}f(x)\,dx-\mu ^{2},} +{\displaystyle f(x)=\lambda e^{-\lambda x}} +{\displaystyle \operatorname {E} [X]=\int _{0}^{\infty }x\lambda e^{-\lambda x}\,dx={\frac {1}{\lambda }}.} +{\displaystyle {\begin{aligned}\operatorname {E} \left[X^{2}\right]&=\int _{0}^{\infty }x^{2}\lambda e^{-\lambda x}\,dx\\&=\left[-x^{2}e^{-\lambda x}\right]_{0}^{\infty }+\int _{0}^{\infty }2xe^{-\lambda x}\,dx\\&=0+{\frac {2}{\lambda }}\operatorname {E} [X]\\&={\frac {2}{\lambda ^{2}}}.\end{aligned}}} +{\displaystyle \operatorname {Var} (X)=\operatorname {E} \left[X^{2}\right]-\operatorname {E} [X]^{2}={\frac {2}{\lambda ^{2}}}-\left({\frac {1}{\lambda }}\right)^{2}={\frac {1}{\lambda ^{2}}}.} +{\displaystyle (1+2+3+4+5+6)/6=7/2.} +{\displaystyle {\begin{aligned}\operatorname {Var} (X)&=\sum _{i=1}^{6}{\frac {1}{6}}\left(i-{\frac {7}{2}}\right)^{2}\\[5pt]&={\frac {1}{6}}\left((-5/2)^{2}+(-3/2)^{2}+(-1/2)^{2}+(1/2)^{2}+(3/2)^{2}+(5/2)^{2}\right)\\[5pt]&={\frac {35}{12}}\approx 2.92.\end{aligned}}} +{\displaystyle {\begin{aligned}\operatorname {Var} (X)&=\operatorname {E} 
\left(X^{2}\right)-(\operatorname {E} (X))^{2}\\[5pt]&={\frac {1}{n}}\sum _{i=1}^{n}i^{2}-\left({\frac {1}{n}}\sum _{i=1}^{n}i\right)^{2}\\[5pt]&={\frac {(n+1)(2n+1)}{6}}-\left({\frac {n+1}{2}}\right)^{2}\\[4pt]&={\frac {n^{2}-1}{12}}.\end{aligned}}} +{\displaystyle \Pr \,(X=k)={\binom {n}{k}}p^{k}(1-p)^{n-k}} +{\displaystyle np} +{\displaystyle np(1-p)} +{\displaystyle \Pr \,(X=k)=(1-p)^{k-1}p} +{\displaystyle {\frac {1}{p}}} +{\displaystyle {\frac {(1-p)}{p^{2}}}} +{\displaystyle f\left(x\mid \mu ,\sigma ^{2}\right)={\frac {1}{\sqrt {2\pi \sigma ^{2}}}}e^{-{\frac {(x-\mu )^{2}}{2\sigma ^{2}}}}} +{\displaystyle \mu } +{\displaystyle \sigma ^{2}} +{\displaystyle f(x\mid a,b)={\begin{cases}{\frac {1}{b-a}}&{\text{for }}a\leq x\leq b,\\[3pt]0&{\text{for }}x<a{\text{ or }}x>b\end{cases}}} +{\displaystyle {\frac {a+b}{2}}} +{\displaystyle {\frac {(b-a)^{2}}{12}}} +{\displaystyle f(x\mid \lambda )=\lambda e^{-\lambda x}} +{\displaystyle {\frac {1}{\lambda }}} +{\displaystyle {\frac {1}{\lambda ^{2}}}} +{\displaystyle f(k\mid \lambda )={\frac {e^{-\lambda }\lambda ^{k}}{k!}}} +{\displaystyle \lambda } +{\displaystyle \lambda } +{\displaystyle \operatorname {Var} (X)\geq 0.} +{\displaystyle \operatorname {Var} (a)=0.} +{\displaystyle \operatorname {Var} (X)=0\iff \exists a:P(X=a)=1.} +{\displaystyle k} +{\displaystyle 1 Date: Tue, 12 Aug 2025 19:51:43 +0800 Subject: [PATCH 11/11] fix --- llm_web_kit/libs/version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llm_web_kit/libs/version.py b/llm_web_kit/libs/version.py index 5cec2bf8..b50da94d 100644 --- a/llm_web_kit/libs/version.py +++ b/llm_web_kit/libs/version.py @@ -1 +1 @@ -__version__ = '3.2.1' \ No newline at end of file +__version__ = '3.2.1'