Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
33 commits
Select commit Hold shift + click to select a range
85dc9c0
feat:清理元素属性,保留图片的有效src(排除base64)、alt,以及所有元素的class和id"
LollipopsAndWine Jun 9, 2025
35909d6
Merge branch 'ccprocessor:dev' into dev
LollipopsAndWine Jun 9, 2025
6cffbb6
feat: 精简控制是否获取XPATH
LollipopsAndWine Jun 9, 2025
52b544b
Merge remote-tracking branch 'origin/dev' into dev
LollipopsAndWine Jun 9, 2025
c96bbf9
feat: 精简控制是否获取XPATH
LollipopsAndWine Jun 9, 2025
ceede71
Merge branch 'ccprocessor:dev' into dev
LollipopsAndWine Jun 9, 2025
460005a
Merge branch 'ccprocessor:dev' into dev
LollipopsAndWine Jun 10, 2025
c47b8c0
Merge branch 'ccprocessor:dev' into dev
LollipopsAndWine Jun 17, 2025
61fd634
feat: 自定义标签'marked-tail', 'marked-text'配置为行内标签
LollipopsAndWine Jun 17, 2025
a122c6a
Merge remote-tracking branch 'origin/dev' into dev
LollipopsAndWine Jun 17, 2025
59aff6d
Merge branch 'ccprocessor:dev' into dev
LollipopsAndWine Jun 19, 2025
a73c2b4
fix: 重命名自定义标签名称
LollipopsAndWine Jun 19, 2025
63b2467
Merge remote-tracking branch 'origin/dev' into dev
LollipopsAndWine Jun 19, 2025
87d9f82
fix: 重命名自定义标签名称
LollipopsAndWine Jun 19, 2025
ff910e9
fix: 去掉冗余代码
LollipopsAndWine Jun 19, 2025
9e8055e
Merge branch 'ccprocessor:dev' into dev
LollipopsAndWine Jun 30, 2025
049788b
Merge branch 'ccprocessor:dev' into dev
LollipopsAndWine Jul 9, 2025
d7e09d1
fix: 修复多语种拼接规则
LollipopsAndWine Jul 10, 2025
466e4aa
Merge remote-tracking branch 'origin/dev' into dev
LollipopsAndWine Jul 15, 2025
cd4f5cc
Merge branch 'ccprocessor:dev' into dev
LollipopsAndWine Jul 16, 2025
9113a0e
fix: 新增language参数
LollipopsAndWine Jul 16, 2025
0e6deee
fix: 修复无法处理xml
LollipopsAndWine Jul 16, 2025
e288989
Merge branch 'ccprocessor:dev' into dev
LollipopsAndWine Jul 22, 2025
f390206
feat: noclip管线新增预处理:删除表单交互式元素
LollipopsAndWine Jul 22, 2025
39870a7
Merge branch 'ccprocessor:dev' into dev
LollipopsAndWine Jul 24, 2025
d71155e
feat: noclip管线新增预处理配置
LollipopsAndWine Jul 24, 2025
d4b6305
Merge branch 'ccprocessor:dev' into dev
LollipopsAndWine Jul 31, 2025
2b0cdd4
fix: 修复title、list、table、text管线中换行不正确以及缺失内容
LollipopsAndWine Jul 31, 2025
38c9a0a
Merge branch 'ccprocessor:dev' into dev
LollipopsAndWine Jul 31, 2025
0fbac26
fix: 去掉冗余代码
LollipopsAndWine Jul 31, 2025
0bd9d28
fix: 增加测试用例
LollipopsAndWine Jul 31, 2025
aa9f984
Merge branch 'ccprocessor:dev' into dev
LollipopsAndWine Aug 8, 2025
12cccf6
refactor: 重构simple
LollipopsAndWine Aug 8, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion llm_web_kit/extractor/html/recognizer/text.py
Original file line number Diff line number Diff line change
Expand Up @@ -195,7 +195,7 @@ def __combine_text(self, text1:str, text2:str, lang='en') -> str:
return self.replace_entities(txt.strip(), entities_map)
else:
# 根据text1的最后一个字符和text2的第一个字符判断两个text之间的连接
if (text2[0] in string.punctuation) or (text2[0] in special_symbols) or (text2[0] in other_symbols) or (text1 and text1[-1] in other_symbols):
if (text2 and text2[0] in string.punctuation) or (text2 and text2[0] in special_symbols) or (text2 and text2[0] in other_symbols) or (text1 and text1[-1] in other_symbols):
words_sep = ''
else:
words_sep = ' '
Expand Down
51 changes: 29 additions & 22 deletions llm_web_kit/simple.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,12 +5,14 @@

from llm_web_kit.config.cfg_reader import load_pipe_tpl
from llm_web_kit.extractor.extractor_chain import ExtractSimpleFactory
from llm_web_kit.extractor.html.extractor import (
HTMLPageLayoutType, MagicHTMLFIleFormatorExtractor,
NoClipHTMLFIleFormatorExtractor)
from llm_web_kit.input.datajson import DataJson


class PipeType:
    """Names of the pipeline templates that can be passed to ``load_pipe_tpl``."""

    # Template used for the regular (clipped) HTML extraction chain.
    HTML = 'html'
    # Template used for the "no clip" HTML extraction chain.
    NOCLIP = 'noclip_html'


class ExtractorType:
HTML = 'html'
PDF = 'pdf'
Expand All @@ -19,24 +21,32 @@ class ExtractorType:

class ExtractorFactory:
    """Factory that lazily builds and caches extractor chains.

    Each HTML pipeline template gets its own class-level cache slot, so a
    chain is created at most once (via ``ExtractSimpleFactory.create`` on the
    config returned by ``load_pipe_tpl``) and reused on later calls.
    """
    # Cached chain built from the PipeType.HTML template.
    magic_html_extractor = None
    # Cached chain built from the PipeType.NOCLIP template.
    noclip_html_extractor = None
    # Reserved slots; not populated by get_extractor.
    pdf_extractor = None
    ebook_extractor = None

    @staticmethod
    def get_extractor(extractor_type: str, pipe_tpl_name: str = PipeType.HTML):
        """Return the cached extractor chain for the given type and template.

        Args:
            extractor_type: one of the ``ExtractorType`` values; only
                ``ExtractorType.HTML`` is supported here.
            pipe_tpl_name: pipeline template name, one of the ``PipeType``
                values. Defaults to ``PipeType.HTML`` so callers that only
                pass ``extractor_type`` keep working.

        Returns:
            The extractor chain created by ``ExtractSimpleFactory.create``.

        Raises:
            ValueError: if ``extractor_type`` is not supported, or if
                ``pipe_tpl_name`` is not a known HTML pipeline template.
        """
        if extractor_type != ExtractorType.HTML:
            raise ValueError(f'Invalid extractor type: {extractor_type}')

        if pipe_tpl_name == PipeType.HTML:
            if ExtractorFactory.magic_html_extractor is None:
                extractor_cfg = load_pipe_tpl(pipe_tpl_name)
                ExtractorFactory.magic_html_extractor = ExtractSimpleFactory.create(extractor_cfg)
            return ExtractorFactory.magic_html_extractor

        if pipe_tpl_name == PipeType.NOCLIP:
            if ExtractorFactory.noclip_html_extractor is None:
                extractor_cfg = load_pipe_tpl(pipe_tpl_name)
                ExtractorFactory.noclip_html_extractor = ExtractSimpleFactory.create(extractor_cfg)
            return ExtractorFactory.noclip_html_extractor

        # Fail loudly instead of falling through and returning None, which
        # would only surface later as an AttributeError in the caller.
        raise ValueError(f'Invalid pipe template name: {pipe_tpl_name}')


def __extract_main_html_by_no_clip_html(url:str, html_content: str, raw_html:str) -> DataJson:
extractor = NoClipHTMLFIleFormatorExtractor(load_pipe_tpl('noclip_html'))
extractor = ExtractorFactory.get_extractor(ExtractorType.HTML, PipeType.NOCLIP)
if raw_html == '':
raw_html = html_content
input_data_dict = {
Expand All @@ -54,14 +64,8 @@ def __extract_main_html_by_no_clip_html(url:str, html_content: str, raw_html:str
return result


def __extract_main_html_by_maigic_html(url:str, html_str: str, page_layout_type:str) -> DataJson:
magic_html_extractor = MagicHTMLFIleFormatorExtractor(load_pipe_tpl('html'))
main_html, method, title = magic_html_extractor._extract_main_html(html_str, url, page_layout_type)
return main_html, title


def __extract_html(url:str, html_content: str) -> DataJson:
extractor = ExtractorFactory.get_extractor(ExtractorType.HTML)
extractor = ExtractorFactory.get_extractor(ExtractorType.HTML, PipeType.HTML)
input_data_dict = {
'track_id': str(uuid.uuid4()),
'url': url,
Expand Down Expand Up @@ -94,7 +98,10 @@ def extract_html_to_mm_md(url:str, html_content: str, clip_html=True, raw_html='
return result.get_content_list().to_mm_md()


def extract_main_html_by_maigic_html(url:str, html_str: str, page_layout_type:str = HTMLPageLayoutType.LAYOUT_ARTICLE) -> str:
"""extract main html."""
result = __extract_main_html_by_maigic_html(url, html_str, page_layout_type)
return result[0], result[1]
def extract_main_html(url:str, html_content: str, clip_html=True, raw_html='') -> str:
    """Run HTML extraction and return the ``main_html`` field of the result.

    Args:
        url: URL of the page being extracted.
        html_content: HTML passed to the extraction pipeline.
        clip_html: when true, the regular HTML pipeline (``__extract_html``)
            is used; otherwise the no-clip pipeline is used.
        raw_html: forwarded to the no-clip pipeline only; ignored when
            ``clip_html`` is true.

    Returns:
        The value stored under ``'main_html'`` in the extraction result.
    """
    if not clip_html:
        extracted = __extract_main_html_by_no_clip_html(url, html_content, raw_html)
    else:
        extracted = __extract_html(url, html_content)
    return extracted.get('main_html')
12 changes: 8 additions & 4 deletions tests/llm_web_kit/simple/test_simple.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
import unittest

from llm_web_kit.simple import (extract_html_to_md, extract_html_to_mm_md,
extract_main_html_by_maigic_html)
extract_main_html)


class TestSimple(unittest.TestCase):
Expand Down Expand Up @@ -136,9 +136,13 @@ def test_extract_pure_html_to_mm_md(self):
mm_md = extract_html_to_mm_md(self.url, self.html_content, clip_html=True)
self.assertEqual(mm_md, '# Test Content\n\nThis is a test paragraph.\n\n![Test Image](e5db82b5bf63d49d80c5533616892d3386f43955369520986d67653c700fc53c)\n')

def test_extract_magic_html(self):
magic_html, title = extract_main_html_by_maigic_html(self.url, self.html_content)
self.assertEqual(magic_html, '<div><body><h1>Test Content</h1><p>This is a test paragraph.</p><img src="https://example.com/image.jpg" alt="Test Image"></body></div>')
def test_extract_magic_main_html(self):
    """With clip_html=True the main HTML comes back wrapped in a <div>."""
    expected_html = '<div><body><h1>Test Content</h1><p>This is a test paragraph.</p><img src="https://example.com/image.jpg" alt="Test Image"></body></div>'
    actual_html = extract_main_html(self.url, self.html_content, clip_html=True)
    self.assertEqual(actual_html, expected_html)

def test_extract_noclip_main_html(self):
    """With clip_html=False the main HTML keeps its <html> root element."""
    expected_html = '<html><body><h1>Test Content</h1><p>This is a test paragraph.</p><img src="https://example.com/image.jpg" alt="Test Image"></body></html>'
    noclip_main_html = extract_main_html(
        self.url,
        self.html_content,
        clip_html=False,
        raw_html=self.html_content,
    )
    self.assertEqual(noclip_main_html, expected_html)

def test_extract_real_html_to_md(self):
md = extract_html_to_md(self.url, self.real_html_content, clip_html=False)
Expand Down