mirror of
https://github.com/opendatalab/MinerU.git
synced 2026-03-27 11:08:32 +07:00
feat: update API client to support maximum concurrent requests and improve logging
This commit is contained in:
@@ -47,6 +47,7 @@ TASK_RESULT_TIMEOUT_SECONDS = 600
|
||||
LOCAL_API_STARTUP_TIMEOUT_SECONDS = 30
|
||||
LOCAL_API_CLEANUP_RETRIES = 8
|
||||
LOCAL_API_CLEANUP_RETRY_INTERVAL_SECONDS = 0.25
|
||||
LOCAL_API_MAX_CONCURRENT_REQUESTS = 1
|
||||
|
||||
|
||||
@dataclass(frozen=True)
|
||||
@@ -95,12 +96,9 @@ class LocalAPIServer:
|
||||
self.temp_dir = tempfile.TemporaryDirectory(prefix="mineru-api-client-")
|
||||
self.temp_root = Path(self.temp_dir.name)
|
||||
self.output_root = self.temp_root / "output"
|
||||
self.log_path = self.temp_root / "mineru-api.log"
|
||||
self.base_url: str | None = None
|
||||
self.process: subprocess.Popen[bytes] | None = None
|
||||
self._log_handle = None
|
||||
self._atexit_registered = False
|
||||
self._cleanup_enabled = True
|
||||
|
||||
def start(self) -> str:
|
||||
if self.process is not None:
|
||||
@@ -110,9 +108,12 @@ class LocalAPIServer:
|
||||
self.base_url = f"http://127.0.0.1:{port}"
|
||||
env = os.environ.copy()
|
||||
env["MINERU_API_OUTPUT_ROOT"] = str(self.output_root)
|
||||
env["MINERU_API_MAX_CONCURRENT_REQUESTS"] = str(
|
||||
LOCAL_API_MAX_CONCURRENT_REQUESTS
|
||||
)
|
||||
env["MINERU_API_DISABLE_ACCESS_LOG"] = "1"
|
||||
self.output_root.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
self._log_handle = open(self.log_path, "wb")
|
||||
self.process = subprocess.Popen(
|
||||
[
|
||||
sys.executable,
|
||||
@@ -123,8 +124,6 @@ class LocalAPIServer:
|
||||
"--port",
|
||||
str(port),
|
||||
],
|
||||
stdout=self._log_handle,
|
||||
stderr=subprocess.STDOUT,
|
||||
cwd=os.getcwd(),
|
||||
env=env,
|
||||
)
|
||||
@@ -134,8 +133,7 @@ class LocalAPIServer:
|
||||
self._atexit_registered = True
|
||||
return self.base_url
|
||||
|
||||
def stop(self, preserve_logs: bool = False) -> None:
|
||||
self._cleanup_enabled = not preserve_logs
|
||||
def stop(self) -> None:
|
||||
process = self.process
|
||||
self.process = None
|
||||
try:
|
||||
@@ -147,17 +145,13 @@ class LocalAPIServer:
|
||||
process.kill()
|
||||
process.wait(timeout=5)
|
||||
finally:
|
||||
if self._log_handle is not None:
|
||||
self._log_handle.close()
|
||||
self._log_handle = None
|
||||
if self._atexit_registered:
|
||||
try:
|
||||
atexit.unregister(self.stop)
|
||||
except Exception:
|
||||
pass
|
||||
self._atexit_registered = False
|
||||
if self._cleanup_enabled:
|
||||
self._cleanup_temp_dir()
|
||||
self._cleanup_temp_dir()
|
||||
|
||||
def _cleanup_temp_dir(self) -> None:
|
||||
last_error: Exception | None = None
|
||||
@@ -180,15 +174,6 @@ class LocalAPIServer:
|
||||
last_error,
|
||||
)
|
||||
|
||||
def read_log_tail(self, max_chars: int = 4000) -> str:
|
||||
if not self.log_path.exists():
|
||||
return ""
|
||||
content = self.log_path.read_text(encoding="utf-8", errors="ignore")
|
||||
if len(content) <= max_chars:
|
||||
return content
|
||||
return content[-max_chars:]
|
||||
|
||||
|
||||
def find_free_port() -> int:
|
||||
with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sock:
|
||||
sock.bind(("127.0.0.1", 0))
|
||||
@@ -282,10 +267,8 @@ async def wait_for_local_api_ready(
|
||||
while asyncio.get_running_loop().time() < deadline:
|
||||
process = local_server.process
|
||||
if process is not None and process.poll() is not None:
|
||||
log_tail = local_server.read_log_tail()
|
||||
raise click.ClickException(
|
||||
f"Local mineru-api exited before becoming healthy. "
|
||||
f"Log file: {local_server.log_path}\n{log_tail}"
|
||||
"Local mineru-api exited before becoming healthy."
|
||||
)
|
||||
try:
|
||||
return await fetch_server_health(client, local_server.base_url)
|
||||
@@ -295,11 +278,10 @@ async def wait_for_local_api_ready(
|
||||
last_error = str(exc)
|
||||
await asyncio.sleep(TASK_STATUS_POLL_INTERVAL_SECONDS)
|
||||
|
||||
log_tail = local_server.read_log_tail()
|
||||
raise click.ClickException(
|
||||
f"Timed out waiting for local mineru-api to become healthy. "
|
||||
f"Log file: {local_server.log_path}\n{last_error or ''}\n{log_tail}"
|
||||
)
|
||||
message = "Timed out waiting for local mineru-api to become healthy."
|
||||
if last_error:
|
||||
message = f"{message} {last_error}"
|
||||
raise click.ClickException(message)
|
||||
|
||||
|
||||
def probe_pdf_effective_pages(
|
||||
@@ -780,8 +762,7 @@ async def run_orchestrated_cli(
|
||||
)
|
||||
finally:
|
||||
if local_server is not None:
|
||||
preserve_logs = sys.exc_info()[0] is not None
|
||||
local_server.stop(preserve_logs=preserve_logs)
|
||||
local_server.stop()
|
||||
|
||||
|
||||
@click.command()
|
||||
|
||||
@@ -173,7 +173,7 @@ def _process_output(
|
||||
json.dumps(model_output, ensure_ascii=False, indent=4),
|
||||
)
|
||||
|
||||
logger.info(f"local output dir is {local_md_dir}")
|
||||
logger.debug(f"local output dir is {local_md_dir}")
|
||||
|
||||
|
||||
def _process_pipeline(
|
||||
|
||||
@@ -64,7 +64,7 @@ DEFAULT_TASK_RETENTION_SECONDS = 24 * 60 * 60
|
||||
DEFAULT_TASK_CLEANUP_INTERVAL_SECONDS = 5 * 60
|
||||
DEFAULT_OUTPUT_ROOT = "./output"
|
||||
ALLOWED_PARSE_METHODS = {"auto", "txt", "ocr"}
|
||||
DEFAULT_MAX_CONCURRENT_REQUESTS = 3
|
||||
DEFAULT_MAX_CONCURRENT_REQUESTS = 1
|
||||
FILE_PARSE_TASK_ID_HEADER = "X-MinerU-Task-Id"
|
||||
FILE_PARSE_TASK_STATUS_HEADER = "X-MinerU-Task-Status"
|
||||
FILE_PARSE_TASK_STATUS_URL_HEADER = "X-MinerU-Task-Status-Url"
|
||||
@@ -76,6 +76,13 @@ _configured_max_concurrent_requests = 0
|
||||
_mps_parse_lock = threading.Lock()
|
||||
|
||||
|
||||
def env_flag_enabled(name: str, default: bool = False) -> bool:
|
||||
value = os.getenv(name)
|
||||
if value is None:
|
||||
return default
|
||||
return value.lower() in ("1", "true", "yes", "on")
|
||||
|
||||
|
||||
@dataclass
|
||||
class ParseRequestOptions:
|
||||
files: list[UploadFile]
|
||||
@@ -170,11 +177,7 @@ async def lifespan(app: FastAPI):
|
||||
def create_app():
|
||||
# By default, the OpenAPI documentation endpoints (openapi_url, docs_url, redoc_url) are enabled.
|
||||
# To disable the FastAPI docs and schema endpoints, set the environment variable MINERU_API_ENABLE_FASTAPI_DOCS=0.
|
||||
enable_docs = str(os.getenv("MINERU_API_ENABLE_FASTAPI_DOCS", "1")).lower() in (
|
||||
"1",
|
||||
"true",
|
||||
"yes",
|
||||
)
|
||||
enable_docs = env_flag_enabled("MINERU_API_ENABLE_FASTAPI_DOCS", default=True)
|
||||
app = FastAPI(
|
||||
openapi_url="/openapi.json" if enable_docs else None,
|
||||
docs_url="/docs" if enable_docs else None,
|
||||
@@ -1338,26 +1341,21 @@ def main(ctx, host, port, reload, **kwargs):
|
||||
kwargs.update(arg_parse(ctx))
|
||||
|
||||
app.state.config = kwargs
|
||||
|
||||
try:
|
||||
mcr = int(
|
||||
kwargs.get(
|
||||
"mineru_api_max_concurrent_requests",
|
||||
DEFAULT_MAX_CONCURRENT_REQUESTS,
|
||||
)
|
||||
or DEFAULT_MAX_CONCURRENT_REQUESTS
|
||||
)
|
||||
except ValueError:
|
||||
mcr = DEFAULT_MAX_CONCURRENT_REQUESTS
|
||||
os.environ["MINERU_API_MAX_CONCURRENT_REQUESTS"] = str(mcr)
|
||||
access_log = not env_flag_enabled("MINERU_API_DISABLE_ACCESS_LOG")
|
||||
|
||||
print(f"Start MinerU FastAPI Service: http://{host}:{port}")
|
||||
print(f"API documentation: http://{host}:{port}/docs")
|
||||
|
||||
if reload:
|
||||
uvicorn.run("mineru.cli.fast_api:app", host=host, port=port, reload=True)
|
||||
uvicorn.run(
|
||||
"mineru.cli.fast_api:app",
|
||||
host=host,
|
||||
port=port,
|
||||
reload=True,
|
||||
access_log=access_log,
|
||||
)
|
||||
else:
|
||||
uvicorn.run(app, host=host, port=port, reload=False)
|
||||
uvicorn.run(app, host=host, port=port, reload=False, access_log=access_log)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
||||
Reference in New Issue
Block a user