Compare commits

...

18 Commits

Author SHA1 Message Date
myhloli
ede8d95bf1 Update version.py with new version 2026-04-01 13:20:54 +00:00
Xiaomeng Zhao
54b68d4bf1 Merge pull request #4718 from opendatalab/dev
3.0.7
2026-04-01 21:14:24 +08:00
Xiaomeng Zhao
1b478c24cf Merge pull request #4717 from myhloli/dev
fix: strip newline characters from paragraph text in office_middle_json_mkcontent
2026-04-01 21:13:47 +08:00
myhloli
39b62cc76a fix: strip newline characters from paragraph text in office_middle_json_mkcontent 2026-04-01 21:12:14 +08:00
Xiaomeng Zhao
13465ff43f Merge pull request #4716 from opendatalab/master
master->dev
2026-04-01 20:53:24 +08:00
myhloli
d18b7df766 Update version.py with new version 2026-04-01 12:52:13 +00:00
Xiaomeng Zhao
a97753c86f Merge pull request #4715 from myhloli/dev
fix: correct formatting of usage instructions in quick_usage.md
2026-04-01 20:51:04 +08:00
myhloli
a3b65470cf fix: correct formatting of usage instructions in quick_usage.md 2026-04-01 20:50:07 +08:00
Xiaomeng Zhao
bd7118a58d Merge pull request #4714 from opendatalab/dev
3.0.6
2026-04-01 20:47:20 +08:00
Xiaomeng Zhao
e3f8fb12ea Merge pull request #4713 from myhloli/dev
Dev
2026-04-01 20:46:40 +08:00
myhloli
4cfebebcb5 fix: add logging for unexpected DOCX list states in _add_list_item 2026-04-01 20:37:57 +08:00
myhloli
ee218993ca fix: correct logical condition for handling same-level list items in docx_converter 2026-04-01 20:32:49 +08:00
Xiaomeng Zhao
5f4d6a0cbb Merge pull request #4712 from myhloli/dev
Dev
2026-04-01 20:21:51 +08:00
myhloli
d2c5a29efd feat: add underscore thematic break escaping to Markdown processing 2026-04-01 20:16:40 +08:00
myhloli
00e5a93ec3 fix: correct paragraph text extraction by removing unnecessary stripping 2026-04-01 20:08:18 +08:00
myhloli
69c39f9be2 feat: enhance paragraph text extraction to include inline content controls 2026-04-01 16:29:08 +08:00
Xiaomeng Zhao
887758e99d Merge pull request #4707 from opendatalab/master
master->dev
2026-04-01 03:45:01 +08:00
myhloli
31f368ab85 Update version.py with new version 2026-03-31 19:34:02 +00:00
6 changed files with 112 additions and 27 deletions

Binary file not shown.

View File

@@ -43,12 +43,12 @@ If you need to adjust parsing options through custom parameters, you can also ch
>- API outputs are controlled by the server and written to `./output` by default
>- Uploads currently support `PDF`, image, and `DOCX` files
>
>`POST /tasks` returns immediately with a `task_id`. `POST /file_parse` uses the same task manager internally, waits for the task to finish, and then returns the final result synchronously.
>When a task is waiting in the queue, both the submission response and task-status response may include `queued_ahead` to indicate how many tasks are ahead of it.
>Tasks are tracked only in-process for a single `mineru-api` instance. Task status is not preserved across service restarts, `--reload`, or multi-process deployments.
>Completed or failed tasks are retained for 24 hours by default, then their task state and output directory are cleaned automatically. After cleanup, task status and result endpoints return `404`.
>Use `MINERU_API_TASK_RETENTION_SECONDS` and `MINERU_API_TASK_CLEANUP_INTERVAL_SECONDS` to adjust retention and cleanup polling intervals.
>Use `--enable-vlm-preload true` to warm up the local VLM model during service startup instead of waiting for the first VLM or hybrid request.
>- `POST /tasks` returns immediately with a `task_id`. `POST /file_parse` uses the same task manager internally, waits for the task to finish, and then returns the final result synchronously.
>- When a task is waiting in the queue, both the submission response and task-status response may include `queued_ahead` to indicate how many tasks are ahead of it.
>- Tasks are tracked only in-process for a single `mineru-api` instance. Task status is not preserved across service restarts, `--reload`, or multi-process deployments.
>- Completed or failed tasks are retained for 24 hours by default, then their task state and output directory are cleaned automatically. After cleanup, task status and result endpoints return `404`.
>- Use `MINERU_API_TASK_RETENTION_SECONDS` and `MINERU_API_TASK_CLEANUP_INTERVAL_SECONDS` to adjust retention and cleanup polling intervals.
>- Use `--enable-vlm-preload true` to warm up the local VLM model during service startup instead of waiting for the first VLM or hybrid request.
>
>Asynchronous task submission example:
>```bash

View File

@@ -43,12 +43,12 @@ mineru -p <input_path> -o <output_path>
>- API 输出目录由服务端固定控制,默认写入 `./output`
>- 上传文件当前支持 `PDF`、图片与 `DOCX`
>
>`POST /tasks` 会立即返回 `task_id`;`POST /file_parse` 会在内部提交到同一个任务管理器,等待任务完成后同步返回最终结果。
>当任务处于排队状态时,任务提交结果和状态查询结果中可能会返回 `queued_ahead` 字段,用于表示前方排队任务数。
>任务为单进程、进程内状态实现,服务重启、`--reload` 热重载或多进程部署后不保证仍可查询历史任务状态。
>默认任务完成或失败后保留 24 小时,随后自动清理任务状态和输出目录;清理后访问任务状态或结果会返回 `404`。
>可通过环境变量 `MINERU_API_TASK_RETENTION_SECONDS` 和 `MINERU_API_TASK_CLEANUP_INTERVAL_SECONDS` 调整保留时长与清理轮询间隔。
>可通过 `--enable-vlm-preload true` 在服务启动阶段预热本地 VLM 模型,避免首次 VLM 或 hybrid 请求时再初始化。
>- `POST /tasks` 会立即返回 `task_id`;`POST /file_parse` 会在内部提交到同一个任务管理器,等待任务完成后同步返回最终结果。
>- 当任务处于排队状态时,任务提交结果和状态查询结果中可能会返回 `queued_ahead` 字段,用于表示前方排队任务数。
>- 任务为单进程、进程内状态实现,服务重启、`--reload` 热重载或多进程部署后不保证仍可查询历史任务状态。
>- 默认任务完成或失败后保留 24 小时,随后自动清理任务状态和输出目录;清理后访问任务状态或结果会返回 `404`。
>- 可通过环境变量 `MINERU_API_TASK_RETENTION_SECONDS` 和 `MINERU_API_TASK_CLEANUP_INTERVAL_SECONDS` 调整保留时长与清理轮询间隔。
>- 可通过 `--enable-vlm-preload true` 在服务启动阶段预热本地 VLM 模型,避免首次 VLM 或 hybrid 请求时再初始化。
>
>异步任务提交示例:
>```bash

View File

@@ -26,6 +26,7 @@ OFFICE_STYLE_RENDER_MODE_ENV = 'MINERU_OFFICE_STYLE_RENDER_MODE'
OFFICE_STYLE_RENDER_MODE_HTML = 'html'
OFFICE_STYLE_RENDER_MODE_MARKDOWN = 'markdown'
OFFICE_MARKDOWN_WRAPPER_STYLES = {'bold', 'italic', 'strikethrough'}
# Matches text that consists solely of three or more underscores, each
# optionally followed by spaces/tabs, after at most three leading spaces/tabs.
# Used to detect content that Markdown would render as a thematic break.
UNDERSCORE_THEMATIC_BREAK_RE = re.compile(r'^[ \t]{0,3}(?:_[ \t]*){3,}$')
def _apply_markdown_style(content: str, style: list) -> str:
@@ -159,6 +160,21 @@ def _build_media_path(img_buket_path: str, image_path: str) -> str:
return f"{img_buket_path}/{image_path}"
def _escape_underscore_thematic_break(content: str) -> str:
    """Neutralise underscore-only text that Markdown would read as a thematic break.

    The surrounding whitespace is ignored when testing the text against
    ``UNDERSCORE_THEMATIC_BREAK_RE``. When it matches, only the first
    underscore is prefixed with a backslash; everything else, including the
    original leading/trailing whitespace, is returned untouched.

    Args:
        content: Raw span text. Falsy values are returned as-is.

    Returns:
        ``content`` with its first underscore backslash-escaped when the
        stripped text is an underscore run, otherwise ``content`` unchanged.
    """
    looks_like_break = bool(content) and (
        UNDERSCORE_THEMATIC_BREAK_RE.fullmatch(content.strip()) is not None
    )
    if not looks_like_break:
        return content

    # Escaping a single underscore is enough to break the run; count=1 keeps
    # the remaining characters exactly as they were.
    return content.replace('_', r'\_', 1)
def get_title_level(para_block):
    """Return the ``level`` entry of ``para_block``, or 2 when the key is absent."""
    return para_block.get('level', 2)
@@ -268,11 +284,12 @@ def _join_rendered_parts(parts: list[dict]) -> str:
def _append_text_part(parts: list[dict], original_content: str, span_style: list):
content_stripped = original_content.strip()
escaped_content = _escape_underscore_thematic_break(original_content)
content_stripped = escaped_content.strip()
if content_stripped:
styled = _apply_configured_style(content_stripped, span_style)
leading = original_content[:len(original_content) - len(original_content.lstrip())]
trailing = original_content[len(original_content.rstrip()):]
leading = escaped_content[:len(escaped_content) - len(escaped_content.lstrip())]
trailing = escaped_content[len(escaped_content.rstrip()):]
parts.append(
_make_rendered_part(
ContentType.TEXT,
@@ -683,7 +700,8 @@ def mk_blocks_to_markdown(para_blocks, make_mode, img_buket_path='', page_idx=No
if para_text.strip() == '':
continue
else:
page_markdown.append(para_text.strip())
# page_markdown.append(para_text.strip())
page_markdown.append(para_text.strip('\r\n'))
return page_markdown

View File

@@ -2,7 +2,7 @@ import re
import zipfile
from io import BytesIO
from pathlib import Path
from typing import BinaryIO, Optional, Union, Any, Final
from typing import BinaryIO, Optional, Union, Any, Final, Iterator
import pandas as pd
from PIL import Image, ImageDraw, ImageFont
@@ -38,6 +38,15 @@ class DocxConverter:
"a14": "http://schemas.microsoft.com/office/drawing/2010/main",
"c": "http://schemas.openxmlformats.org/drawingml/2006/chart",
}
# Local (namespace-less) tag names of inline wrapper elements that
# `_iter_paragraph_inner_content` recurses into as pass-through containers
# while walking a paragraph's children.
# NOTE(review): this assignment sits between the namespace mapping above and
# the string literal that follows it, which describes that mapping — confirm
# the placement is intended.
_PARAGRAPH_TRANSPARENT_INLINE_CONTAINERS: Final = {
"bdo",
"customXml",
"dir",
"fldSimple",
"ins",
"moveTo",
"smartTag",
}
"""
Word 文档中使用的 XML 命名空间映射。
@@ -1025,9 +1034,10 @@ class DocxConverter:
is_section_end = True
paragraph = Paragraph(element, self.docx_obj)
paragraph_elements = self._get_paragraph_elements(paragraph)
paragraph_text = self._get_paragraph_text(paragraph)
paragraph_anchor = self._extract_paragraph_bookmark(element)
text, equations = self._handle_equations_in_text(
element=element, text=paragraph.text
element=element, text=paragraph_text
)
if text is None:
@@ -1135,7 +1145,7 @@ class DocxConverter:
self.cur_page.append(h_block)
elif len(equations) > 0:
if (paragraph.text is None or len(paragraph.text.strip()) == 0) and len(
if (paragraph_text is None or len(paragraph_text.strip()) == 0) and len(
text
) > 0:
# 独立公式
@@ -1304,14 +1314,17 @@ class DocxConverter:
段落元素列表,每个元素包含文本、格式和超链接信息
"""
inner_contents = list(self._iter_paragraph_inner_content(paragraph))
paragraph_text = self._get_paragraph_text_from_contents(inner_contents)
# 目前保留空段落以保持向后兼容性:
if paragraph.text.strip() == "":
if paragraph_text.strip() == "":
# 检查是否存在带可见样式(下划线或删除线)的空白文本 run。
# 有可见样式的空白文本(如带下划线的空格)在视觉上是可见的,应予保留,
# 因此跳过提前返回,交由后续完整 run 处理流程处理。
has_visible_style_run = any(
isinstance(c, Run) and c.text and self._has_visible_style(self._get_format_from_run(c))
for c in paragraph.iter_inner_content()
for c in inner_contents
)
if not has_visible_style_run:
return [("", None, None)]
@@ -1331,7 +1344,7 @@ class DocxConverter:
_field_acc_format = None # 首个显示 run 的格式
# 遍历段落的 runs 并按格式分组
for c in paragraph.iter_inner_content():
for c in inner_contents:
if isinstance(c, Hyperlink):
# 若地址为 URL(含 ://),直接保留字符串,避免 Path 将 // 规范化为 /
address = c.address
@@ -1466,6 +1479,51 @@ class DocxConverter:
return paragraph_elements
def _iter_paragraph_inner_content(
    self,
    paragraph: Paragraph,
    container: Optional[BaseOxmlElement] = None,
) -> Iterator[Union[Run, Hyperlink]]:
    """Yield visible paragraph inline containers in document order.

    python-docx only walks direct ``w:r`` and ``w:hyperlink`` children of ``w:p``.
    Inline ``w:sdt`` content controls are skipped entirely, which drops their text
    from both ``paragraph.text`` and ``paragraph.iter_inner_content()``. This walker
    treats ``w:sdt`` and a few transparent wrapper nodes as pass-through containers
    and reuses the existing Run/Hyperlink wrappers for the actual visible content.

    Args:
        paragraph: Paragraph being walked; also passed as the parent of every
            yielded ``Run`` / ``Hyperlink`` wrapper.
        container: Element whose children are inspected. Defaults to the
            paragraph's own ``_element``; recursive calls pass the nested
            wrapper element instead.

    Yields:
        ``Run`` for each ``r`` child and ``Hyperlink`` for each ``hyperlink``
        child, descending through ``sdt``/``sdtContent`` and the tag names in
        ``_PARAGRAPH_TRANSPARENT_INLINE_CONTAINERS``. Other children are ignored.
    """
    if container is None:
        container = paragraph._element

    # Fully-qualified tag of the w:sdtContent child; built once per call
    # instead of once per w:sdt element.
    sdt_content_tag = (
        "{http://schemas.openxmlformats.org/wordprocessingml/2006/main}sdtContent"
    )

    for child in container:
        # Child iteration can also yield comments and processing instructions.
        # Their ``tag`` is not a string and ``etree.QName`` rejects them, so
        # skip anything that is not a real element.
        if not isinstance(child.tag, str):
            continue

        tag_name = etree.QName(child).localname
        if tag_name == "r":
            yield Run(child, paragraph)
        elif tag_name == "hyperlink":
            yield Hyperlink(child, paragraph)
        elif tag_name == "sdt":
            sdt_content = child.find(sdt_content_tag)
            if sdt_content is not None:
                yield from self._iter_paragraph_inner_content(paragraph, sdt_content)
        elif tag_name in self._PARAGRAPH_TRANSPARENT_INLINE_CONTAINERS:
            yield from self._iter_paragraph_inner_content(paragraph, child)
@staticmethod
def _get_paragraph_text_from_contents(
inner_contents: list[Union[Run, Hyperlink]],
) -> str:
"""Rebuild paragraph plain text from visible inline containers."""
return "".join(content.text or "" for content in inner_contents)
def _get_paragraph_text(self, paragraph: Paragraph) -> str:
    """Build the paragraph's plain text, inline ``w:sdt`` content included.

    Collects everything ``_iter_paragraph_inner_content`` yields for the
    paragraph and joins it via ``_get_paragraph_text_from_contents``.
    """
    inline_items = [*self._iter_paragraph_inner_content(paragraph)]
    return self._get_paragraph_text_from_contents(inline_items)
@classmethod
def _resolve_style_chain_bool(
cls,
@@ -1922,7 +1980,7 @@ class DocxConverter:
self.pre_ilevel = ilevel
# 情况 4: 同级列表项(相同缩进)
elif self.pre_num_id == numid or self.pre_ilevel == ilevel:
elif self.pre_num_id == numid and self.pre_ilevel == ilevel:
# 获取栈顶的列表块
list_block = self.list_block_stack[-1]
@@ -1933,6 +1991,14 @@ class DocxConverter:
}
list_block["content"].append(list_item)
else:
logger.warning(
"Unexpected DOCX list state in _add_list_item: "
f"pre_num_id={self.pre_num_id}, numid={numid}, "
f"pre_ilevel={self.pre_ilevel}, ilevel={ilevel}, "
f"stack_depth={len(self.list_block_stack)}. "
)
def _detect_heading_list_numids(self) -> set:
"""
预扫描文档检测用作章节标题的列表numId。
@@ -1962,7 +2028,7 @@ class DocxConverter:
numid, ilevel = self._get_numId_and_ilvl(paragraph)
if numid == 0:
numid = None
text = paragraph.text.strip() if paragraph.text else ""
text = self._get_paragraph_text(paragraph).strip()
except Exception:
continue
@@ -2439,8 +2505,9 @@ class DocxConverter:
str: 处理后的文本内容(包含公式标记和超链接格式)
"""
paragraph_elements = self._get_paragraph_elements(paragraph)
paragraph_text = self._get_paragraph_text(paragraph)
text, equations = self._handle_equations_in_text(
element=paragraph._element, text=paragraph.text
element=paragraph._element, text=paragraph_text
)
if text is None:
@@ -2635,7 +2702,7 @@ class DocxConverter:
for p, position in all_paragraphs:
# 创建 Paragraph 对象以获取文本内容
paragraph = Paragraph(p, self.docx_obj)
text_content = paragraph.text
text_content = self._get_paragraph_text(paragraph)
# 基于内容和位置创建唯一标识
paragraph_id = (text_content, position)

View File

@@ -1 +1 @@
__version__ = "3.0.4"
__version__ = "3.0.7"