Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/python-package.yml
Original file line number Diff line number Diff line change
Expand Up @@ -124,4 +124,4 @@ jobs:
- name: Publish distribution to PyPI
run: |
pip install twine
twine upload --verbose --repository-url https://test.pypi.org/legacy/ dist/* -u __token__ -p ${{ secrets.PYPI_TOKEN }}
twine upload --verbose dist/* -u __token__ -p ${{ secrets.PYPI_TOKEN }}
14 changes: 7 additions & 7 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -5,19 +5,19 @@ repos:
rev: 5.0.4
hooks:
- id: flake8
args: [ "--max-line-length=2200", "--ignore=E131,E125,W503,W504,E203,E231,E702,E128" ]
args: [ "--max-line-length=2200", "--ignore=E131,E125,W503,W504,E203,E231,E702,E128,E402" ]
exclude: '^tests/.*/assets/'
- repo: https://github.com/PyCQA/isort
rev: 5.11.5
hooks:
- id: isort
exclude: '^tests/.*/assets/'
- repo: https://github.com/pre-commit/mirrors-yapf
rev: v0.32.0
hooks:
- id: yapf
args: ["--style={based_on_style: google, column_limit: 200, indent_width: 4}"]
exclude: '^tests/.*/assets/'
# - repo: https://github.com/pre-commit/mirrors-yapf
# rev: v0.32.0
# hooks:
# - id: yapf
# args: ["--style={based_on_style: google, column_limit: 200, indent_width: 4}"]
# exclude: '^tests/.*/assets/'
# - repo: https://github.com/codespell-project/codespell
# rev: v2.2.1
# hooks:
Expand Down
2 changes: 1 addition & 1 deletion llm_web_kit/config/pipe_tpl/html-test.jsonc
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@
"post_extractor": [
{
"enable": true,
"python_class": "llm_web_kit.extractor.html.post_extractor.HTMLStripSpacePostExtractor"
"python_class": "llm_web_kit.extractor.html.post_extractor.ContentListStripSpacePostExtractor"
}
]
}
Expand Down
2 changes: 1 addition & 1 deletion llm_web_kit/config/pipe_tpl/html.jsonc
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@
"post_extractor": [
{
"enable": true,
"python_class": "llm_web_kit.extractor.html.post_extractor.HTMLStripSpacePostExtractor"
"python_class": "llm_web_kit.extractor.html.post_extractor.ContentListStripSpacePostExtractor"
}
]
}
Expand Down
6 changes: 3 additions & 3 deletions llm_web_kit/config/pipe_tpl/noclip_html.jsonc
Original file line number Diff line number Diff line change
Expand Up @@ -10,11 +10,11 @@
},
{
"enable": true,
"python_class": "llm_web_kit.extractor.html.pre_extractor.HTMLFileFormatFilterTablePreExtractor"
"python_class": "llm_web_kit.extractor.html.pre_extractor.HTMLFileFormatNoClipFilterTablePreExtractor"
},
{
"enable": true,
"python_class": "llm_web_kit.extractor.html.pre_extractor.HTMLFileFormatCleanTagsPreExtractor",
"python_class": "llm_web_kit.extractor.html.pre_extractor.HTMLFileFormatNoClipCleanTagsPreExtractor",
"class_init_kwargs": {}
}
],
Expand All @@ -28,7 +28,7 @@
"post_extractor": [
{
"enable": true,
"python_class": "llm_web_kit.extractor.html.post_extractor.HTMLStripSpacePostExtractor"
"python_class": "llm_web_kit.extractor.html.post_extractor.ContentListStripSpacePostExtractor"
}
]
}
Expand Down
6 changes: 3 additions & 3 deletions llm_web_kit/config/pipe_tpl/noclip_html_test.jsonc
Original file line number Diff line number Diff line change
Expand Up @@ -17,11 +17,11 @@
},
{
"enable": true,
"python_class": "llm_web_kit.extractor.html.pre_extractor.HTMLFileFormatFilterTablePreExtractor"
"python_class": "llm_web_kit.extractor.html.pre_extractor.HTMLFileFormatNoClipFilterTablePreExtractor"
},
{
"enable": true,
"python_class": "llm_web_kit.extractor.html.pre_extractor.HTMLFileFormatCleanTagsPreExtractor",
"python_class": "llm_web_kit.extractor.html.pre_extractor.HTMLFileFormatNoClipCleanTagsPreExtractor",
"class_init_kwargs": {}
}
],
Expand All @@ -35,7 +35,7 @@
"post_extractor": [
{
"enable": true,
"python_class": "llm_web_kit.extractor.html.post_extractor.HTMLStripSpacePostExtractor"
"python_class": "llm_web_kit.extractor.html.post_extractor.ContentListStripSpacePostExtractor"
}
]
}
Expand Down
2 changes: 1 addition & 1 deletion llm_web_kit/extractor/html/post_extractor.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ def _do_post_extract(self, data_json: DataJson) -> DataJson:
raise NotImplementedError('Subclass must implement abstract method')


class HTMLStripSpacePostExtractor(BaseFileFormatPostExtractor):
class ContentListStripSpacePostExtractor(BaseFileFormatPostExtractor):
"""对段落文本进行处理:
1. 连续的多个空格转换成1个
2. 连续的\t转换成1个
Expand Down
41 changes: 37 additions & 4 deletions llm_web_kit/extractor/html/pre_extractor.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,14 @@ def _do_pre_extract(self, data_json: DataJson) -> DataJson:

def __remove_format_table(self, data_json: DataJson):
"""remove 排版table."""
html_content = data_json['html']
html_content = self._get_html_content(data_json)
return self.__do_remove_layout_table(html_content)

def _get_html_content(self, data_json: DataJson):
    """Return the HTML string that the layout-table filter should process.

    This implementation reads the ``'html'`` entry of ``data_json``. It is a
    separate hook so that a subclass can select a different entry.
    """
    html_content = data_json['html']
    return html_content

def __do_remove_layout_table(self, html_content: str):
"""remove 排版table."""
html_str = html_to_element(html_content)
first_structure = html_str.xpath('/html/body/table') != []
second_structure = html_str.xpath('/html/body/center/table') != []
Expand Down Expand Up @@ -95,12 +102,12 @@ def __init__(self, config: dict):

@override
def _do_pre_extract(self, data_json: DataJson) -> DataJson:
data_json['html'] = self.__clean_invisible_elements(data_json)
html_content = data_json['html']
data_json['html'] = self._clean_invisible_elements(html_content, data_json)
return data_json

def __clean_invisible_elements(self, data_json: DataJson) -> str:
def _clean_invisible_elements(self, html_content: str, data_json: DataJson) -> str:
"""清理隐藏标签."""
html_content = data_json['html']
tree = html_to_element(html_content)
# 遍历所有配置的隐藏标签规则
for tag in INVISIBLE_TAGS:
Expand Down Expand Up @@ -184,3 +191,29 @@ def __clean_interactive_elements(self, data_json: DataJson) -> str:
if len(form.getchildren()) == 0 or not form.text_content().strip():
form.getparent().remove(form)
return element_to_html(tree)

# ##############################################################################
# NoClip variants: pre-process `main_html` instead of `html`, so that the two
# fields are no longer mixed up.
# ##############################################################################


class HTMLFileFormatNoClipFilterTablePreExtractor(HTMLFileFormatFilterTablePreExtractor):
    """Layout-table pre-extractor for the noclip pipeline.

    Behaves like the parent class except that the HTML to process is taken
    from the ``'main_html'`` entry of the data instead of ``'html'``.
    """

    def __init__(self, config: dict):
        super().__init__(config)

    @override
    def _get_html_content(self, data_json: DataJson):
        # Only the read side is redirected here.
        # NOTE(review): confirm that the parent's _do_pre_extract stores the
        # result under the key the noclip pipeline expects.
        main_html = data_json['main_html']
        return main_html


class HTMLFileFormatNoClipCleanTagsPreExtractor(HTMLFileFormatCleanTagsPreExtractor):
    """Invisible-tag pre-extractor for the noclip pipeline.

    Runs the parent's ``_clean_invisible_elements`` on the ``'main_html'``
    entry and writes the cleaned markup back to that same entry.
    """

    def __init__(self, config: dict):
        super().__init__(config)

    @override
    def _do_pre_extract(self, data_json: DataJson) -> DataJson:
        # Read and write 'main_html' (not 'html'); the input object is
        # updated in place and returned.
        data_json['main_html'] = self._clean_invisible_elements(data_json['main_html'], data_json)
        return data_json
14 changes: 11 additions & 3 deletions llm_web_kit/extractor/html/recognizer/cc_math/common.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,20 @@
import logging
import os
import re
from pathlib import Path
from typing import List, Tuple

from lxml import etree
from lxml.html import HtmlElement

# Configure logging strictly *before* py_asciimath is imported (the import
# follows this block, which is why it sits in the middle of the import section).
# NOTE(review): basicConfig(force=True) removes and closes any handlers already
# attached to the root logger, so importing this module resets the host
# application's logging setup — confirm this process-wide effect is intended.
logging.basicConfig(level=logging.WARNING, force=True)

# Raise the 'py_asciimath' logger to ERROR and additionally disable it to
# suppress its output.
# NOTE(review): `disabled` applies to this logger object only; child loggers
# (e.g. 'py_asciimath.<module>') inherit the level but not the disabled flag.
py_asciimath_logger = logging.getLogger('py_asciimath')
py_asciimath_logger.setLevel(logging.ERROR)
py_asciimath_logger.disabled = True

from py_asciimath.translator.translator import ASCIIMath2Tex

from llm_web_kit.extractor.html.recognizer.recognizer import CCTag
Expand All @@ -15,6 +25,7 @@
from llm_web_kit.libs.text_utils import normalize_ctl_text

asciimath2tex = ASCIIMath2Tex(log=False)

color_regex = re.compile(r'\\textcolor\[.*?\]\{.*?\}')


Expand Down Expand Up @@ -137,9 +148,6 @@ class MATHINSIGHT:
}


asciimath2tex = ASCIIMath2Tex(log=False)


def text_strip(text):
    """Strip surrounding whitespace from ``text``.

    Falsy inputs (``None``, the empty string) are returned unchanged, so the
    function is safe to call on optional text values.
    """
    if not text:
        return text
    return text.strip()

Expand Down
4 changes: 2 additions & 2 deletions llm_web_kit/extractor/html/recognizer/cc_math/tag_script.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ def modify_tree(cm: CCMATH, math_render: str, o_html: str, node: HtmlElement, pa
text = node.text
if text and text_strip(text):
if node.tag not in ['script', 'style']:
new_span = create_new_span([(CCMATH_INTERLINE,MathType.LATEX)], cm.wrap_math_md(text), node, math_render, o_html)
new_span = create_new_span([(CCMATH_INLINE,MathType.LATEX)], cm.wrap_math_md(text), node, math_render, o_html)
node.addnext(new_span)
else:
katex_pattern = re.compile(r'katex.render')
Expand All @@ -28,7 +28,7 @@ def modify_tree(cm: CCMATH, math_render: str, o_html: str, node: HtmlElement, pa
target_element = target_elements[0]
o_html = element_to_html(target_element)
target_element.text = None
new_span = create_new_span([(CCMATH_INTERLINE,MathType.LATEX)], cm.wrap_math_md(formula_content), target_element, math_render, o_html)
new_span = create_new_span([(CCMATH_INLINE,MathType.LATEX)], cm.wrap_math_md(formula_content), target_element, math_render, o_html)
target_element.insert(0, new_span)
elif node.get('type') and 'math/tex' in node.get('type'):
tag_math_type_list = cm.get_equation_type(o_html)
Expand Down
2 changes: 2 additions & 0 deletions llm_web_kit/extractor/html/recognizer/ccmath.py
Original file line number Diff line number Diff line change
Expand Up @@ -168,11 +168,13 @@ def process_ccmath_html(self, cc_html: str, o_html: str, math_render: BaseMathRe

# span.katex
if node.tag == 'script' or 'math' == node.get('class') or 'katex' == node.get('class'):
# print('匹配到script/math/katex标签: ', original_html)
tag_script.modify_tree(self.cm, math_render_type, original_html, node, parent)
# 只有有渲染器的网站才会走下面文本匹配逻辑
if math_render_type:
# 14. 只处理只有一层的p标签
if node.tag == 'p' and len(node.getchildren()) == 0:
# print('匹配到p标签: ', original_html)
tag_common_modify.modify_tree(self.cm, math_render_type, original_html, node, parent)

# 修改:传入tree节点,mathjax方案作为process2,不参与上面process1节点的遍历
Expand Down
9 changes: 8 additions & 1 deletion llm_web_kit/extractor/html/recognizer/text.py
Original file line number Diff line number Diff line change
Expand Up @@ -194,8 +194,15 @@ def __combine_text(self, text1:str, text2:str, lang='en') -> str:
txt = text1 + text2
return self.replace_entities(txt.strip(), entities_map)
else:
# 如果text2为空,直接返回text1
if not text2:
return self.replace_entities(text1.strip(), entities_map)
# 如果text1为空,直接返回text2
if not text1:
return self.replace_entities(text2.strip(), entities_map)

# 根据text1的最后一个字符和text2的第一个字符判断两个text之间的连接
if (text2[0] in string.punctuation) or (text2[0] in special_symbols) or (text2[0] in other_symbols) or (text1 and text1[-1] in other_symbols):
if (text2 and text2[0] in string.punctuation) or (text2 and text2[0] in special_symbols) or (text2 and text2[0] in other_symbols) or (text1 and text1[-1] in other_symbols):
words_sep = ''
else:
words_sep = ' '
Expand Down
2 changes: 2 additions & 0 deletions llm_web_kit/input/pre_data_json.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,8 @@ class PreDataJsonKey:
TYPICAL_SIMPLIFIED_HTML = 'typical_simplified_html'
# 模型打标字典
LLM_RESPONSE = 'llm_response'
# 模型结果都为0
LLM_RESPONSE_EMPTY = 'llm_response_empty'
# 映射模版正文树结构的元素字典
HTML_ELEMENT_DICT = 'html_element_dict'
# 映射模版正文时的文本列表
Expand Down
2 changes: 1 addition & 1 deletion llm_web_kit/libs/version.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = '3.2.0'
__version__ = '3.2.1'
34 changes: 15 additions & 19 deletions llm_web_kit/main_html_parser/parser/layout_batch_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,8 +46,15 @@ def parse(self, pre_data: PreDataJson) -> PreDataJson:
self.dynamic_classid_enable = pre_data.get(PreDataJsonKey.DYNAMIC_CLASSID_ENABLE, False)
self.more_noise_enable = pre_data.get(PreDataJsonKey.MORE_NOISE_ENABLE, False)
self.dynamic_classid_similarity_threshold = pre_data.get(PreDataJsonKey.DYNAMIC_CLASSID_SIM_THRESH, 0.85)
response_empty = pre_data.get(PreDataJsonKey.LLM_RESPONSE_EMPTY, False)
template_data_str = pre_data[PreDataJsonKey.HTML_ELEMENT_DICT]
template_data = dict()
# 检查第0层第一个元素是否为green,如果是则返回空的HTML
if response_empty:
pre_data[PreDataJsonKey.MAIN_HTML] = ''
pre_data[PreDataJsonKey.MAIN_HTML_BODY] = ''
pre_data[PreDataJsonKey.MAIN_HTML_SUCCESS] = False
return pre_data
if isinstance(template_data_str, str):
template_data_str = json.loads(template_data_str)
for layer, layer_dict in template_data_str.items():
Expand All @@ -57,19 +64,6 @@ def parse(self, pre_data: PreDataJson) -> PreDataJson:
template_data = template_data_str
else:
raise ValueError(f'template_data 类型错误: {type(template_data_str)}')
# 检查第0层第一个元素是否为green,如果是则返回空的HTML
if 0 in template_data:
layer_0_elements = template_data[0]
if layer_0_elements:
# 获取第一个元素
first_element_info = next(iter(layer_0_elements.values()))
if isinstance(first_element_info, tuple) and len(first_element_info) > 0:
label = first_element_info[0] # 获取标签(red/green)
if label == 'green':
pre_data[PreDataJsonKey.MAIN_HTML] = ''
pre_data[PreDataJsonKey.MAIN_HTML_BODY] = ''
pre_data[PreDataJsonKey.MAIN_HTML_SUCCESS] = False
return pre_data

self.template_data = template_data
content, body = self.process(html_source, template_dict_html)
Expand Down Expand Up @@ -375,8 +369,6 @@ def __match_tag_class(self, layer_nodes, current_layer_key, parent_key, node_htm
def __match_tag(self, layer_nodes, current_layer_key, parent_key, node_html, template_doc, class_must=False,
id_exist=False):
current_norm_key = (self.normalize_key((current_layer_key[0], None, None)), parent_key)
current_norm_key_with_first_class = (
self.normalize_key((current_layer_key[0], current_layer_key[1].strip().split(' ')[0], None)), parent_key)
for ele_keyy, ele_value in layer_nodes.items():
# class id要存在
if class_must and not ele_keyy[1]:
Expand Down Expand Up @@ -405,10 +397,14 @@ def __match_tag(self, layer_nodes, current_layer_key, parent_key, node_html, tem
if template_sim >= self.dynamic_classid_similarity_threshold:
return ele_label, self.normalize_key(ele_keyy[0:3]), is_drop_tail
# first class方案
norm_ele_keyy_with_first_class = self.normalize_key((ele_keyy[0], ele_keyy[1].strip().split(' ')[0], None))
norm_ele_keyy_parent_with_first_class = (norm_ele_keyy_with_first_class, ele_parent_keyy)
if current_norm_key_with_first_class == norm_ele_keyy_parent_with_first_class:
return ele_label, self.normalize_key(ele_keyy[0:3]), is_drop_tail
if ele_keyy[1] is not None and current_layer_key[1] is not None:
current_norm_key_with_first_class = (
self.normalize_key((current_layer_key[0], current_layer_key[1].strip().split(' ')[0], None)),
parent_key)
norm_ele_keyy_with_first_class = self.normalize_key((ele_keyy[0], ele_keyy[1].strip().split(' ')[0], None))
norm_ele_keyy_parent_with_first_class = (norm_ele_keyy_with_first_class, ele_parent_keyy)
if current_norm_key_with_first_class == norm_ele_keyy_parent_with_first_class:
return ele_label, self.normalize_key(ele_keyy[0:3]), is_drop_tail

return None, None, None

Expand Down
1 change: 1 addition & 0 deletions llm_web_kit/main_html_parser/parser/tag_mapping.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,7 @@ def parse(self, pre_data: PreDataJson) -> PreDataJson:
pre_data[PreDataJsonKey.TYPICAL_DICT_HTML] = template_dict_html
pre_data[PreDataJsonKey.SIMILARITY_LAYER] = 0
pre_data[PreDataJsonKey.TYPICAL_MAIN_HTML_SUCCESS] = False
pre_data[PreDataJsonKey.LLM_RESPONSE_EMPTY] = True
return pre_data

# 模版抽取正文html
Expand Down
Loading