mirror of
https://github.com/opendatalab/MinerU.git
synced 2026-04-02 05:58:34 +07:00
Compare commits
18 Commits
mineru-3.0
...
master
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
ede8d95bf1 | ||
|
|
54b68d4bf1 | ||
|
|
1b478c24cf | ||
|
|
39b62cc76a | ||
|
|
13465ff43f | ||
|
|
d18b7df766 | ||
|
|
a97753c86f | ||
|
|
a3b65470cf | ||
|
|
bd7118a58d | ||
|
|
e3f8fb12ea | ||
|
|
4cfebebcb5 | ||
|
|
ee218993ca | ||
|
|
5f4d6a0cbb | ||
|
|
d2c5a29efd | ||
|
|
00e5a93ec3 | ||
|
|
69c39f9be2 | ||
|
|
887758e99d | ||
|
|
31f368ab85 |
Binary file not shown.
@@ -43,12 +43,12 @@ If you need to adjust parsing options through custom parameters, you can also ch
|
||||
>- API outputs are controlled by the server and written to `./output` by default
|
||||
>- Uploads currently support `PDF`, image, and `DOCX` files
|
||||
>
|
||||
>`POST /tasks` returns immediately with a `task_id`. `POST /file_parse` uses the same task manager internally, waits for the task to finish, and then returns the final result synchronously.
|
||||
>When a task is waiting in the queue, both the submission response and task-status response may include `queued_ahead` to indicate how many tasks are ahead of it.
|
||||
>Tasks are tracked only in-process for a single `mineru-api` instance. Task status is not preserved across service restarts, `--reload`, or multi-process deployments.
|
||||
>Completed or failed tasks are retained for 24 hours by default, then their task state and output directory are cleaned automatically. After cleanup, task status and result endpoints return `404`.
|
||||
>Use `MINERU_API_TASK_RETENTION_SECONDS` and `MINERU_API_TASK_CLEANUP_INTERVAL_SECONDS` to adjust retention and cleanup polling intervals.
|
||||
>Use `--enable-vlm-preload true` to warm up the local VLM model during service startup instead of waiting for the first VLM or hybrid request.
|
||||
>- `POST /tasks` returns immediately with a `task_id`. `POST /file_parse` uses the same task manager internally, waits for the task to finish, and then returns the final result synchronously.
|
||||
>- When a task is waiting in the queue, both the submission response and task-status response may include `queued_ahead` to indicate how many tasks are ahead of it.
|
||||
>- Tasks are tracked only in-process for a single `mineru-api` instance. Task status is not preserved across service restarts, `--reload`, or multi-process deployments.
|
||||
>- Completed or failed tasks are retained for 24 hours by default, then their task state and output directory are cleaned automatically. After cleanup, task status and result endpoints return `404`.
|
||||
>- Use `MINERU_API_TASK_RETENTION_SECONDS` and `MINERU_API_TASK_CLEANUP_INTERVAL_SECONDS` to adjust retention and cleanup polling intervals.
|
||||
>- Use `--enable-vlm-preload true` to warm up the local VLM model during service startup instead of waiting for the first VLM or hybrid request.
|
||||
>
|
||||
>Asynchronous task submission example:
|
||||
>```bash
|
||||
|
||||
@@ -43,12 +43,12 @@ mineru -p <input_path> -o <output_path>
|
||||
>- API 输出目录由服务端固定控制,默认写入 `./output`
|
||||
>- 上传文件当前支持 `PDF`、图片与 `DOCX`
|
||||
>
|
||||
>`POST /tasks` 会立即返回 `task_id`;`POST /file_parse` 会在内部提交到同一个任务管理器,等待任务完成后同步返回最终结果。
|
||||
>当任务处于排队状态时,任务提交结果和状态查询结果中可能会返回 `queued_ahead` 字段,用于表示前方排队任务数。
|
||||
>任务为单进程、进程内状态实现,服务重启、`--reload` 热重载或多进程部署后不保证仍可查询历史任务状态。
|
||||
>默认任务完成或失败后保留 24 小时,随后自动清理任务状态和输出目录;清理后访问任务状态或结果会返回 `404`。
|
||||
>可通过环境变量 `MINERU_API_TASK_RETENTION_SECONDS` 和 `MINERU_API_TASK_CLEANUP_INTERVAL_SECONDS` 调整保留时长与清理轮询间隔。
|
||||
>可通过 `--enable-vlm-preload true` 在服务启动阶段预热本地 VLM 模型,避免首次 VLM 或 hybrid 请求时再初始化。
|
||||
>- `POST /tasks` 会立即返回 `task_id`;`POST /file_parse` 会在内部提交到同一个任务管理器,等待任务完成后同步返回最终结果。
|
||||
>- 当任务处于排队状态时,任务提交结果和状态查询结果中可能会返回 `queued_ahead` 字段,用于表示前方排队任务数。
|
||||
>- 任务为单进程、进程内状态实现,服务重启、`--reload` 热重载或多进程部署后不保证仍可查询历史任务状态。
|
||||
>- 默认任务完成或失败后保留 24 小时,随后自动清理任务状态和输出目录;清理后访问任务状态或结果会返回 `404`。
|
||||
>- 可通过环境变量 `MINERU_API_TASK_RETENTION_SECONDS` 和 `MINERU_API_TASK_CLEANUP_INTERVAL_SECONDS` 调整保留时长与清理轮询间隔。
|
||||
>- 可通过 `--enable-vlm-preload true` 在服务启动阶段预热本地 VLM 模型,避免首次 VLM 或 hybrid 请求时再初始化。
|
||||
>
|
||||
>异步任务提交示例:
|
||||
>```bash
|
||||
|
||||
@@ -26,6 +26,7 @@ OFFICE_STYLE_RENDER_MODE_ENV = 'MINERU_OFFICE_STYLE_RENDER_MODE'
|
||||
OFFICE_STYLE_RENDER_MODE_HTML = 'html'
|
||||
OFFICE_STYLE_RENDER_MODE_MARKDOWN = 'markdown'
|
||||
OFFICE_MARKDOWN_WRAPPER_STYLES = {'bold', 'italic', 'strikethrough'}
|
||||
UNDERSCORE_THEMATIC_BREAK_RE = re.compile(r'^[ \t]{0,3}(?:_[ \t]*){3,}$')
|
||||
|
||||
|
||||
def _apply_markdown_style(content: str, style: list) -> str:
|
||||
@@ -159,6 +160,21 @@ def _build_media_path(img_buket_path: str, image_path: str) -> str:
|
||||
return f"{img_buket_path}/{image_path}"
|
||||
|
||||
|
||||
def _escape_underscore_thematic_break(content: str) -> str:
    """Backslash-escape an underscore-only line so it is not a thematic break.

    The stripped text is tested against ``UNDERSCORE_THEMATIC_BREAK_RE``
    (three or more underscores, optionally separated by spaces or tabs).
    On a match, the first underscore in ``content`` is replaced with ``\\_``;
    surrounding whitespace is left untouched. Falsy input and any text that
    does not match are returned unchanged.
    """
    looks_like_break = bool(content) and (
        UNDERSCORE_THEMATIC_BREAK_RE.fullmatch(content.strip()) is not None
    )
    if not looks_like_break:
        return content

    # Only the first underscore is escaped; the rest of the run stays as-is.
    return content.replace('_', r'\_', 1)
|
||||
|
||||
|
||||
def get_title_level(para_block):
    """Return the block's ``'level'`` entry, or 2 when the key is missing."""
    return para_block.get('level', 2)
|
||||
@@ -268,11 +284,12 @@ def _join_rendered_parts(parts: list[dict]) -> str:
|
||||
|
||||
|
||||
def _append_text_part(parts: list[dict], original_content: str, span_style: list):
|
||||
content_stripped = original_content.strip()
|
||||
escaped_content = _escape_underscore_thematic_break(original_content)
|
||||
content_stripped = escaped_content.strip()
|
||||
if content_stripped:
|
||||
styled = _apply_configured_style(content_stripped, span_style)
|
||||
leading = original_content[:len(original_content) - len(original_content.lstrip())]
|
||||
trailing = original_content[len(original_content.rstrip()):]
|
||||
leading = escaped_content[:len(escaped_content) - len(escaped_content.lstrip())]
|
||||
trailing = escaped_content[len(escaped_content.rstrip()):]
|
||||
parts.append(
|
||||
_make_rendered_part(
|
||||
ContentType.TEXT,
|
||||
@@ -683,7 +700,8 @@ def mk_blocks_to_markdown(para_blocks, make_mode, img_buket_path='', page_idx=No
|
||||
if para_text.strip() == '':
|
||||
continue
|
||||
else:
|
||||
page_markdown.append(para_text.strip())
|
||||
# page_markdown.append(para_text.strip())
|
||||
page_markdown.append(para_text.strip('\r\n'))
|
||||
|
||||
return page_markdown
|
||||
|
||||
|
||||
@@ -2,7 +2,7 @@ import re
|
||||
import zipfile
|
||||
from io import BytesIO
|
||||
from pathlib import Path
|
||||
from typing import BinaryIO, Optional, Union, Any, Final
|
||||
from typing import BinaryIO, Optional, Union, Any, Final, Iterator
|
||||
|
||||
import pandas as pd
|
||||
from PIL import Image, ImageDraw, ImageFont
|
||||
@@ -38,6 +38,15 @@ class DocxConverter:
|
||||
"a14": "http://schemas.microsoft.com/office/drawing/2010/main",
|
||||
"c": "http://schemas.openxmlformats.org/drawingml/2006/chart",
|
||||
}
|
||||
_PARAGRAPH_TRANSPARENT_INLINE_CONTAINERS: Final = {
|
||||
"bdo",
|
||||
"customXml",
|
||||
"dir",
|
||||
"fldSimple",
|
||||
"ins",
|
||||
"moveTo",
|
||||
"smartTag",
|
||||
}
|
||||
"""
|
||||
Word 文档中使用的 XML 命名空间映射。
|
||||
|
||||
@@ -1025,9 +1034,10 @@ class DocxConverter:
|
||||
is_section_end = True
|
||||
paragraph = Paragraph(element, self.docx_obj)
|
||||
paragraph_elements = self._get_paragraph_elements(paragraph)
|
||||
paragraph_text = self._get_paragraph_text(paragraph)
|
||||
paragraph_anchor = self._extract_paragraph_bookmark(element)
|
||||
text, equations = self._handle_equations_in_text(
|
||||
element=element, text=paragraph.text
|
||||
element=element, text=paragraph_text
|
||||
)
|
||||
|
||||
if text is None:
|
||||
@@ -1135,7 +1145,7 @@ class DocxConverter:
|
||||
self.cur_page.append(h_block)
|
||||
|
||||
elif len(equations) > 0:
|
||||
if (paragraph.text is None or len(paragraph.text.strip()) == 0) and len(
|
||||
if (paragraph_text is None or len(paragraph_text.strip()) == 0) and len(
|
||||
text
|
||||
) > 0:
|
||||
# 独立公式
|
||||
@@ -1304,14 +1314,17 @@ class DocxConverter:
|
||||
段落元素列表,每个元素包含文本、格式和超链接信息
|
||||
"""
|
||||
|
||||
inner_contents = list(self._iter_paragraph_inner_content(paragraph))
|
||||
paragraph_text = self._get_paragraph_text_from_contents(inner_contents)
|
||||
|
||||
# 目前保留空段落以保持向后兼容性:
|
||||
if paragraph.text.strip() == "":
|
||||
if paragraph_text.strip() == "":
|
||||
# 检查是否存在带可见样式(下划线或删除线)的空白文本 run。
|
||||
# 有可见样式的空白文本(如带下划线的空格)在视觉上是可见的,应予保留,
|
||||
# 因此跳过提前返回,交由后续完整 run 处理流程处理。
|
||||
has_visible_style_run = any(
|
||||
isinstance(c, Run) and c.text and self._has_visible_style(self._get_format_from_run(c))
|
||||
for c in paragraph.iter_inner_content()
|
||||
for c in inner_contents
|
||||
)
|
||||
if not has_visible_style_run:
|
||||
return [("", None, None)]
|
||||
@@ -1331,7 +1344,7 @@ class DocxConverter:
|
||||
_field_acc_format = None # 首个显示 run 的格式
|
||||
|
||||
# 遍历段落的 runs 并按格式分组
|
||||
for c in paragraph.iter_inner_content():
|
||||
for c in inner_contents:
|
||||
if isinstance(c, Hyperlink):
|
||||
# 若地址为 URL(含 ://),直接保留字符串,避免 Path 将 // 规范化为 /
|
||||
address = c.address
|
||||
@@ -1466,6 +1479,51 @@ class DocxConverter:
|
||||
|
||||
return paragraph_elements
|
||||
|
||||
def _iter_paragraph_inner_content(
    self,
    paragraph: Paragraph,
    container: Optional[BaseOxmlElement] = None,
) -> Iterator[Union[Run, Hyperlink]]:
    """Yield the paragraph's runs and hyperlinks in document order.

    python-docx only walks direct ``w:r`` and ``w:hyperlink`` children of
    ``w:p``, so inline ``w:sdt`` content controls are dropped from both
    ``paragraph.text`` and ``paragraph.iter_inner_content()``. This walker
    descends into ``w:sdt`` (through its ``w:sdtContent`` child) and into the
    wrapper tags listed in ``_PARAGRAPH_TRANSPARENT_INLINE_CONTAINERS``, and
    wraps every ``r`` / ``hyperlink`` element it finds in the existing
    ``Run`` / ``Hyperlink`` proxies.

    Args:
        paragraph: Paragraph whose inline content is walked; it is also
            passed as the parent of every yielded proxy.
        container: Element whose children are scanned. Defaults to the
            paragraph's own XML element; set on recursive calls.

    Yields:
        ``Run`` or ``Hyperlink`` objects. Children with any other tag are
        skipped.
    """
    if container is None:
        container = paragraph._element

    # Loop-invariant: fully qualified name of the w:sdtContent child.
    sdt_content_tag = (
        "{http://schemas.openxmlformats.org/wordprocessingml/2006/main}sdtContent"
    )

    for child in container:
        # XML comments and processing instructions have a non-string ``tag``
        # and make ``etree.QName`` raise ValueError; they carry no visible
        # text, so skip them instead of aborting the whole paragraph.
        if not isinstance(child.tag, str):
            continue

        tag_name = etree.QName(child).localname

        if tag_name == "r":
            yield Run(child, paragraph)
        elif tag_name == "hyperlink":
            yield Hyperlink(child, paragraph)
        elif tag_name == "sdt":
            sdt_content = child.find(sdt_content_tag)
            if sdt_content is not None:
                yield from self._iter_paragraph_inner_content(paragraph, sdt_content)
        elif tag_name in self._PARAGRAPH_TRANSPARENT_INLINE_CONTAINERS:
            yield from self._iter_paragraph_inner_content(paragraph, child)
|
||||
|
||||
@staticmethod
def _get_paragraph_text_from_contents(
    inner_contents: list[Union[Run, Hyperlink]],
) -> str:
    """Join the ``text`` of every inline item into one plain string.

    Items whose ``text`` is ``None`` or empty contribute nothing.
    """
    pieces = []
    for item in inner_contents:
        item_text = item.text
        pieces.append(item_text if item_text else "")
    return "".join(pieces)
|
||||
|
||||
def _get_paragraph_text(self, paragraph: Paragraph) -> str:
    """Return the paragraph's plain text, including inline ``w:sdt`` content.

    The text is rebuilt from the items produced by
    ``_iter_paragraph_inner_content`` rather than read from ``paragraph.text``.
    """
    inline_items = list(self._iter_paragraph_inner_content(paragraph))
    return self._get_paragraph_text_from_contents(inline_items)
|
||||
|
||||
@classmethod
|
||||
def _resolve_style_chain_bool(
|
||||
cls,
|
||||
@@ -1922,7 +1980,7 @@ class DocxConverter:
|
||||
self.pre_ilevel = ilevel
|
||||
|
||||
# 情况 4: 同级列表项(相同缩进)
|
||||
elif self.pre_num_id == numid or self.pre_ilevel == ilevel:
|
||||
elif self.pre_num_id == numid and self.pre_ilevel == ilevel:
|
||||
# 获取栈顶的列表块
|
||||
list_block = self.list_block_stack[-1]
|
||||
|
||||
@@ -1933,6 +1991,14 @@ class DocxConverter:
|
||||
}
|
||||
list_block["content"].append(list_item)
|
||||
|
||||
else:
|
||||
logger.warning(
|
||||
"Unexpected DOCX list state in _add_list_item: "
|
||||
f"pre_num_id={self.pre_num_id}, numid={numid}, "
|
||||
f"pre_ilevel={self.pre_ilevel}, ilevel={ilevel}, "
|
||||
f"stack_depth={len(self.list_block_stack)}. "
|
||||
)
|
||||
|
||||
def _detect_heading_list_numids(self) -> set:
|
||||
"""
|
||||
预扫描文档,检测用作章节标题的列表numId。
|
||||
@@ -1962,7 +2028,7 @@ class DocxConverter:
|
||||
numid, ilevel = self._get_numId_and_ilvl(paragraph)
|
||||
if numid == 0:
|
||||
numid = None
|
||||
text = paragraph.text.strip() if paragraph.text else ""
|
||||
text = self._get_paragraph_text(paragraph).strip()
|
||||
except Exception:
|
||||
continue
|
||||
|
||||
@@ -2439,8 +2505,9 @@ class DocxConverter:
|
||||
str: 处理后的文本内容(包含公式标记和超链接格式)
|
||||
"""
|
||||
paragraph_elements = self._get_paragraph_elements(paragraph)
|
||||
paragraph_text = self._get_paragraph_text(paragraph)
|
||||
text, equations = self._handle_equations_in_text(
|
||||
element=paragraph._element, text=paragraph.text
|
||||
element=paragraph._element, text=paragraph_text
|
||||
)
|
||||
|
||||
if text is None:
|
||||
@@ -2635,7 +2702,7 @@ class DocxConverter:
|
||||
for p, position in all_paragraphs:
|
||||
# 创建 Paragraph 对象以获取文本内容
|
||||
paragraph = Paragraph(p, self.docx_obj)
|
||||
text_content = paragraph.text
|
||||
text_content = self._get_paragraph_text(paragraph)
|
||||
|
||||
# 基于内容和位置创建唯一标识
|
||||
paragraph_id = (text_content, position)
|
||||
|
||||
@@ -1 +1 @@
|
||||
__version__ = "3.0.4"
|
||||
__version__ = "3.0.7"
|
||||
|
||||
Reference in New Issue
Block a user