From 6650241b33b151510d3ad7b4098489376b7ff321 Mon Sep 17 00:00:00 2001
From: yxq <94103889+snahualimi@users.noreply.github.com>
Date: Mon, 11 Aug 2025 11:12:04 +0800
Subject: [PATCH 01/12] Add files via upload

---
 .../parser/Parser_Dotsocr/fig_recognize.py    | 184 ++++++++++++++++++
 1 file changed, 184 insertions(+)
 create mode 100644 rag_factory/parser/Parser_Dotsocr/fig_recognize.py

diff --git a/rag_factory/parser/Parser_Dotsocr/fig_recognize.py b/rag_factory/parser/Parser_Dotsocr/fig_recognize.py
new file mode 100644
index 0000000..ee8fe20
--- /dev/null
+++ b/rag_factory/parser/Parser_Dotsocr/fig_recognize.py
@@ -0,0 +1,184 @@
+import os 
+import glob
+import json
+import re
+import fitz
+from PIL import Image
+from tqdm import tqdm
+from dashscope import MultiModalConversation
+import argparse
+from pathlib import Path
+
+def fig_understand(fig_path):
+    # prompt = '请给出图像中具体内容信息，并用json格式输出,仅输出json格式数据，其中，图片类型请从["chart","knowladge_map","other"]中选择'
+    prompt = '''
+你是一个图像内容理解专家，任务是读取图像内容并生成结构化 JSON 数据。请遵循以下规则：
+
+1. **仅输出 JSON 数据**，不要添加任何解释、前缀或后缀文字。
+2. JSON 格式中必须包含两个字段：
+   - "type": 图像类型，只能从 ["chart", "knowladge_map", "other"] 中选择。
+   - "content": 图像的具体结构化内容描述。
+3. 如果图像类型是：
+   - "chart": 请提取图表的标题、坐标轴标签、图例、系列等结构信息。
+   - "knowladge_map": 输出树状结构，所有节点使用 {"name": xxx, "children": [...]} 格式。
+   - "other": 尽可能准确描述图像的主要元素。
+
+以下是几个示例，请模仿格式输出。
+
+---
+
+### 示例1（chart）：
+输入图像：柱状图，标题为“年度销售统计”，X轴为月份，Y轴为销售额，图例为“产品A”和“产品B”。
+
+输出：
+```json
+{
+  "type": "chart",
+  "content": {
+  
+    "title": "年度销售统计",
+    "x_axis": "月份",
+    "y_axis": "销售额",
+    "legend": ["产品A", "产品B"],
+    "series": [
+      {"name": "产品A", "data": [100, 120, 130]},
+      {"name": "产品B", "data": [80, 90, 100]}
+    ]
+  }
+}
+示例2（knowladge_map）：
+输入图像：知识图谱，核心为“机器学习”，子节点有“监督学习”和“无监督学习”，监督学习下有“回归”和“分类”。
+
+输出：
+{
+  "type": "knowladge_map",
+  "content": {
+    "name": "机器学习",
+    "children": [
+      {
+        "name": "监督学习",
+        "children": [
+          {"name": "回归"},
+          {"name": "分类"}
+        ]
+      },
+      {
+        "name": "无监督学习"
+      }
+    ]
+  }
+}
+示例3（other）：
+输入图像：一张会议室内多个人开会的场景。
+
+输出：
+{
+  "type": "other",
+  "content": "一个会议室中有5个人正在围绕会议桌讨论，桌上有笔记本电脑和文件。"
+}
+
+请根据上面的示例输出格式，严格输出图像的内容识别结果，只返回符合格式的 JSON 数据。
+
+'''
+    messages=    [
+    {
+        "role": "user",
+        "content": [
+            {"image": f"file://{fig_path}"},
+            {"text": prompt}
+        ]
+    }
+]
+
+    response = MultiModalConversation.call(
+        api_key="sk-e696b88412204f6f8b747afe92c6e45a",
+        model="qwen-vl-plus",  
+        messages=messages,
+    )
+    # The reply text sits at output -> choices[0] -> message.content[0]["text"]; Markdown code fences are stripped.
+    # NOTE(review): the response is indexed without checking that the call succeeded - confirm the failure path.
+    return response["output"]["choices"][0]["message"].content[0]["text"].replace("```json",'').replace("```",'').strip()
+
+def save_fig(file_path, page_no, index, bbox, scale):
+    """Crop the bbox region of one PDF page to a PNG and return (png_path, fig_understand text)."""
+    file_name = os.path.splitext(os.path.basename(file_path))[0]
+    file_name = file_name.replace('_layout', '')
+    base_dir = os.path.dirname(file_path)
+    # The source PDF is looked up next to the layout JSON as "<name>_original.pdf".
+    pdf_file = os.path.join(base_dir, f"{file_name}_original.pdf")
+    doc = fitz.open(pdf_file)
+    page = doc.load_page(page_no)
+
+    # scale[1] pairs with the page width and scale[0] with the page height, so the
+    # ratios below map bbox back into PDF page coordinates.
+    # NOTE(review): presumably scale is the (height, width) of the layout image - confirm.
+    scale_x = scale[1] / page.rect.width
+    scale_y = scale[0] / page.rect.height
+    pdf_bbox = fitz.Rect(
+        bbox[0] / scale_x, bbox[1] / scale_y,
+        bbox[2] / scale_x, bbox[3] / scale_y,
+    )
+    zoom = 300 / 72  # render at 300 DPI (PDF points are 1/72 inch)
+    matrix = fitz.Matrix(zoom, zoom)
+    img = page.get_pixmap(matrix=matrix, clip=pdf_bbox, alpha=False)
+
+    save_dir = os.path.join(base_dir, f"{file_name}/image")
+    # makedirs, not mkdir: the parent "<name>/" directory may not exist yet, and
+    # exist_ok avoids the check-then-create race of a separate exists() test.
+    os.makedirs(save_dir, exist_ok=True)
+
+    text = ''
+    save_path = os.path.join(save_dir, f'page_{page_no}_{index}.png')
+    if img is not None:
+        img.save(save_path)
+        # Only a crop that was written to disk is sent to the vision model.
+        text = fig_understand(save_path)
+
+    return save_path, text
+
+def process_one_file(json_file):
+    """Describe each large 'Picture' row of a layout JSON via save_fig and write the img_content JSON copy."""
+    file_name = os.path.basename(json_file)
+    output_path = os.path.join(os.path.dirname(json_file), file_name.replace("layout", "img_content"))  # rename the file only, never its directories
+    with open(json_file, 'r', encoding='utf-8') as f:
+        json_data = json.load(f)
+    print(f"Processing file: {file_name}")
+    for row in tqdm(json_data):
+        if row.get('category','') == 'Picture':
+            bbox = row['bbox']
+            if (bbox[2]-bbox[0])*(bbox[3]-bbox[1]) < 52000:
+                row['text'] = ""  # bbox area under the 52000 threshold: no crop, no model call
+                continue
+            _, text = save_fig(json_file, page_no=row['page_no'], index=row['index'], bbox=bbox, scale=row['scale'])
+            try:
+                row['text'] = json.loads(text)
+            except json.JSONDecodeError:
+                row['text'] = text  # keep the raw reply rather than abort the whole file on malformed JSON
+    with open(output_path, 'w', encoding='utf-8') as f:
+        json.dump(json_data, f, ensure_ascii=False, indent=4)
+    return json_data
+
+def main():
+    """Parse --output and run process_one_file on one file or on each *_layout.json in a directory."""
+    parser = argparse.ArgumentParser(description="Use vlm to get parsed figure content.")
+    parser.add_argument(
+        "--output", type=str, default="output",
+        help="Output parsed directory  (default: output)"
+    )
+    args = parser.parse_args()
+    # --output is the path that is read: a directory is scanned non-recursively in sorted order, a file is used as is.
+    if os.path.isdir(args.output):
+        for file in sorted(Path(args.output).glob('*_layout.json')):
+            data = process_one_file(file)
+    elif os.path.isfile(args.output):
+        data = process_one_file(args.output)
+    else:
+        print(f"'{args.output}' no exist")
+
+if __name__ == "__main__":
+    # files = sorted(glob.glob('/home/yangcehao/doc_analysis/Parser_Dotsocr/output'+'/*_layout.json'))
+    # for file in files:
+    #     data = process_one_file(file)
+    main()
+
+

From 84aa5980d570c14907733b6ff6cd912b2cd5be70 Mon Sep 17 00:00:00 2001
From: yxq <94103889+snahualimi@users.noreply.github.com>
Date: Mon, 11 Aug 2025 11:12:40 +0800
Subject: [PATCH 02/12] Add files via upload

---
 rag_factory/parser/Parser_Dotsocr/readme.md | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/rag_factory/parser/Parser_Dotsocr/readme.md b/rag_factory/parser/Parser_Dotsocr/readme.md
index 35f5294..9b86b6a 100644
--- a/rag_factory/parser/Parser_Dotsocr/readme.md
+++ b/rag_factory/parser/Parser_Dotsocr/readme.md
@@ -20,7 +20,7 @@ snapshot_download(repo_id="rednote-hilab/dots.ocr", local_dir="Parser_Dotsocr/we
 or
 
 from modelscope import snapshot_download
-snapshot_download(repo_id="rednote-hilab/dots.ocr", local_dir="Parser_Dotsocr/weights/DotsOCR")
+snapshot_download(repo_id="rednote-hilab/dots.ocr", local_dir=model_dir)
 ```
 
 ## 2. vLLM inference
@@ -38,3 +38,9 @@ python parser.py pdf_path.pdf
 ```
 
 If you want to parse document with transformers，add `--use_hf=True`
+
+## 4. figure understand
+
+Use vl model to understand content in parsed picture. Please obtain pdf layout parsed result first.
+
+python fig_recognize.py --output_path

From 2dddf94d9766953f3c5c19ed4c5d69fdb0de6c3d Mon Sep 17 00:00:00 2001
From: yxq <94103889+snahualimi@users.noreply.github.com>
Date: Mon, 11 Aug 2025 11:14:25 +0800
Subject: [PATCH 03/12] Add files via upload

---
 rag_factory/parser/Parser_Dotsocr/readme.md | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/rag_factory/parser/Parser_Dotsocr/readme.md b/rag_factory/parser/Parser_Dotsocr/readme.md
index 9b86b6a..b732edd 100644
--- a/rag_factory/parser/Parser_Dotsocr/readme.md
+++ b/rag_factory/parser/Parser_Dotsocr/readme.md
@@ -42,5 +42,6 @@ If you want to parse document with transformers，add `--use_hf=True`
 ## 4. figure understand
 
 Use vl model to understand content in parsed picture. Please obtain pdf layout parsed result first.
-
-python fig_recognize.py --output_path
+```
+python fig_recognize.py --output output
+```
\ No newline at end of file

From 8a494573789d5a6d9305ceea436e1e9528060f42 Mon Sep 17 00:00:00 2001
From: yxq <94103889+snahualimi@users.noreply.github.com>
Date: Mon, 11 Aug 2025 11:15:13 +0800
Subject: [PATCH 04/12] Update requirements.txt

---
 rag_factory/parser/Parser_Dotsocr/requirements.txt | 1 +
 1 file changed, 1 insertion(+)

diff --git a/rag_factory/parser/Parser_Dotsocr/requirements.txt b/rag_factory/parser/Parser_Dotsocr/requirements.txt
index 8159c46..65a6d2b 100644
--- a/rag_factory/parser/Parser_Dotsocr/requirements.txt
+++ b/rag_factory/parser/Parser_Dotsocr/requirements.txt
@@ -10,3 +10,4 @@ modelscope
 flash-attn==2.8.0.post2
 # for GLIBC 2.31, please use flash-attn==2.7.4.post1 instead of flash-attn==2.8.0.post2
 accelerate
+dashscope

From 24bf66b609ba8926d60b86d3bf73a42693908a8f Mon Sep 17 00:00:00 2001
From: yxq <94103889+snahualimi@users.noreply.github.com>
Date: Mon, 11 Aug 2025 11:17:12 +0800
Subject: [PATCH 05/12] Update readme.md

---
 rag_factory/parser/Parser_Dotsocr/readme.md | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/rag_factory/parser/Parser_Dotsocr/readme.md b/rag_factory/parser/Parser_Dotsocr/readme.md
index b732edd..d31f5ae 100644
--- a/rag_factory/parser/Parser_Dotsocr/readme.md
+++ b/rag_factory/parser/Parser_Dotsocr/readme.md
@@ -25,16 +25,16 @@ snapshot_download(repo_id="rednote-hilab/dots.ocr", local_dir=model_dir)
 
 ## 2. vLLM inference
 
-Using vLLM for faster paser speed  ( based on vllm==0.9.1 )
+Using vLLM for faster speed  ( based on vllm==0.9.1 )
 
 ```
-python vllm_launch.py --model_path dots_model_path
+python vllm_launch.py --model_path weights/DotsOCR
 ```
 
 ## 3. Document Parse
 
 ```
-python parser.py pdf_path.pdf 
+python parser.py  pdf_path.pdf (or pdfs_dir)
 ```
 
 If you want to parse document with transformers，add `--use_hf=True`
@@ -44,4 +44,4 @@ If you want to parse document with transformers，add `--use_hf=True`
 Use vl model to understand content in parsed picture. Please obtain pdf layout parsed result first.
 ```
 python fig_recognize.py --output output
-```
\ No newline at end of file
+```

From 4ea26f50ce934ddffb35e156224f9cb2553999b4 Mon Sep 17 00:00:00 2001
From: yxq <94103889+snahualimi@users.noreply.github.com>
Date: Mon, 11 Aug 2025 11:18:25 +0800
Subject: [PATCH 06/12] Update readme.md

---
 rag_factory/parser/Parser_Dotsocr/readme.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/rag_factory/parser/Parser_Dotsocr/readme.md b/rag_factory/parser/Parser_Dotsocr/readme.md
index d31f5ae..212658c 100644
--- a/rag_factory/parser/Parser_Dotsocr/readme.md
+++ b/rag_factory/parser/Parser_Dotsocr/readme.md
@@ -31,7 +31,7 @@ Using vLLM for faster speed  ( based on vllm==0.9.1 )
 python vllm_launch.py --model_path weights/DotsOCR
 ```
 
-## 3. Document Parse
+## 3. Document parse
 
 ```
 python parser.py  pdf_path.pdf (or pdfs_dir)

From 9c2b2765d621774f659ab2d18367de43f3552fa7 Mon Sep 17 00:00:00 2001
From: yxq <94103889+snahualimi@users.noreply.github.com>
Date: Mon, 11 Aug 2025 11:18:39 +0800
Subject: [PATCH 07/12] Update readme.md

---
 rag_factory/parser/Parser_Dotsocr/readme.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/rag_factory/parser/Parser_Dotsocr/readme.md b/rag_factory/parser/Parser_Dotsocr/readme.md
index 212658c..d83b416 100644
--- a/rag_factory/parser/Parser_Dotsocr/readme.md
+++ b/rag_factory/parser/Parser_Dotsocr/readme.md
@@ -39,7 +39,7 @@ python parser.py  pdf_path.pdf (or pdfs_dir)
 
 If you want to parse document with transformers，add `--use_hf=True`
 
-## 4. figure understand
+## 4. Figure understanding
 
 Use vl model to understand content in parsed picture. Please obtain pdf layout parsed result first.
 ```

From a52d371a1feffe1967e01e045e6bde2d463999e4 Mon Sep 17 00:00:00 2001
From: yxq <94103889+snahualimi@users.noreply.github.com>
Date: Mon, 11 Aug 2025 11:29:33 +0800
Subject: [PATCH 08/12] Update fig_recognize.py

---
 rag_factory/parser/Parser_Dotsocr/fig_recognize.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/rag_factory/parser/Parser_Dotsocr/fig_recognize.py b/rag_factory/parser/Parser_Dotsocr/fig_recognize.py
index ee8fe20..9381d6c 100644
--- a/rag_factory/parser/Parser_Dotsocr/fig_recognize.py
+++ b/rag_factory/parser/Parser_Dotsocr/fig_recognize.py
@@ -9,6 +9,8 @@
 import argparse
 from pathlib import Path
 
+os.environ["DASHSCOPE_API_KEY"] = "your api key"
+
 def fig_understand(fig_path):
     # prompt = '请给出图像中具体内容信息，并用json格式输出,仅输出json格式数据，其中，图片类型请从["chart","knowladge_map","other"]中选择'
     prompt = '''
@@ -91,7 +93,7 @@ def fig_understand(fig_path):
 ]
 
     response = MultiModalConversation.call(
-        api_key="sk-e696b88412204f6f8b747afe92c6e45a",
+        api_key=os.environ.get('DASHSCOPE_API_KEY'),
         model="qwen-vl-plus",  
         messages=messages,
     )

From 25f74eb7d6406276fdacba19379220b897d7f01d Mon Sep 17 00:00:00 2001
From: yxq <94103889+snahualimi@users.noreply.github.com>
Date: Mon, 11 Aug 2025 11:30:17 +0800
Subject: [PATCH 09/12] Update fig_recognize.py

---
 rag_factory/parser/Parser_Dotsocr/fig_recognize.py | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/rag_factory/parser/Parser_Dotsocr/fig_recognize.py b/rag_factory/parser/Parser_Dotsocr/fig_recognize.py
index 9381d6c..175529e 100644
--- a/rag_factory/parser/Parser_Dotsocr/fig_recognize.py
+++ b/rag_factory/parser/Parser_Dotsocr/fig_recognize.py
@@ -9,7 +9,6 @@
 import argparse
 from pathlib import Path
 
-os.environ["DASHSCOPE_API_KEY"] = "your api key"
 
 def fig_understand(fig_path):
     # prompt = '请给出图像中具体内容信息，并用json格式输出,仅输出json格式数据，其中，图片类型请从["chart","knowladge_map","other"]中选择'
@@ -178,9 +177,7 @@ def main():
         print(f"'{args.output}' no exist")
 
 if __name__ == "__main__":
-    # files = sorted(glob.glob('/home/yangcehao/doc_analysis/Parser_Dotsocr/output'+'/*_layout.json'))
-    # for file in files:
-    #     data = process_one_file(file)
+    os.environ["DASHSCOPE_API_KEY"] = "your api key"
     main()
 
 

From f3d1ad2feba4a5ca3a4e44256e3bcd8211dcc25a Mon Sep 17 00:00:00 2001
From: yxq <94103889+snahualimi@users.noreply.github.com>
Date: Mon, 11 Aug 2025 15:43:30 +0800
Subject: [PATCH 10/12] Update vllm_launch.py

---
 rag_factory/parser/Parser_Dotsocr/vllm_launch.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/rag_factory/parser/Parser_Dotsocr/vllm_launch.py b/rag_factory/parser/Parser_Dotsocr/vllm_launch.py
index d79bc38..95d6fbc 100644
--- a/rag_factory/parser/Parser_Dotsocr/vllm_launch.py
+++ b/rag_factory/parser/Parser_Dotsocr/vllm_launch.py
@@ -4,7 +4,7 @@
 from pathlib import Path
 import argparse
 
-def launch_vllm_server(hf_model_path="/home/yangcehao/doc_analysis/dots.ocr/weights/DotsOCR", num_gpus="0", gpu_memory_utilization=0.95, port=8001):
+def launch_vllm_server(hf_model_path="weights/DotsOCR", num_gpus="0", gpu_memory_utilization=0.95, port=8001):
     # 1. 检查模型路径
     model_path = Path(hf_model_path).resolve()
     if not model_path.exists():

From a87c27d8ab5ede44fe6495ef0c3664ad3f490141 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E9=99=88=E6=98=8E=E7=A5=AF?= <chenmingzhen@idea.edu.cn>
Date: Mon, 11 Aug 2025 23:01:28 +0800
Subject: [PATCH 11/12] Move Parser_Dotsocr requirements.txt to root dir

---
 rag_factory/parser/Parser_Dotsocr/requirements.txt | 13 -------------
 1 file changed, 13 deletions(-)
 delete mode 100644 rag_factory/parser/Parser_Dotsocr/requirements.txt

diff --git a/rag_factory/parser/Parser_Dotsocr/requirements.txt b/rag_factory/parser/Parser_Dotsocr/requirements.txt
deleted file mode 100644
index 65a6d2b..0000000
--- a/rag_factory/parser/Parser_Dotsocr/requirements.txt
+++ /dev/null
@@ -1,13 +0,0 @@
-# streamlit 
-gradio
-gradio_image_annotation
-PyMuPDF
-openai
-qwen_vl_utils
-transformers==4.51.3
-huggingface_hub
-modelscope
-flash-attn==2.8.0.post2
-# for GLIBC 2.31, please use flash-attn==2.7.4.post1 instead of flash-attn==2.8.0.post2
-accelerate
-dashscope

From 9027b5470bf47bc1584d58549b6e283b3f7a97c5 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E9=99=88=E6=98=8E=E7=A5=AF?= <chenmingzhen@idea.edu.cn>
Date: Mon, 11 Aug 2025 23:02:01 +0800
Subject: [PATCH 12/12] add dots.ocr requirement

---
 requirements.txt | 20 +++++++++++++++++++-
 1 file changed, 19 insertions(+), 1 deletion(-)

diff --git a/requirements.txt b/requirements.txt
index ac808e5..823e164 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -12,4 +12,22 @@ neo4j
 aioboto3
 llama-index
 llama-index-core
-peewee
\ No newline at end of file
+peewee
+
+mineru[core]
+rank_bm25
+faiss_gpu
+
+
+
+# streamlit 
+PyMuPDF
+openai
+qwen_vl_utils
+transformers==4.51.3
+huggingface_hub
+modelscope
+flash-attn==2.8.0.post2
+# for GLIBC 2.31, please use flash-attn==2.7.4.post1 instead of flash-attn==2.8.0.post2
+accelerate
+dashscope