feat: update API client to support maximum concurrent requests and improve logging

This commit is contained in:
myhloli
2026-03-26 00:34:33 +08:00
parent 053ae8eb24
commit fe257fe6a3
3 changed files with 32 additions and 53 deletions

View File

@@ -47,6 +47,7 @@ TASK_RESULT_TIMEOUT_SECONDS = 600
LOCAL_API_STARTUP_TIMEOUT_SECONDS = 30
LOCAL_API_CLEANUP_RETRIES = 8
LOCAL_API_CLEANUP_RETRY_INTERVAL_SECONDS = 0.25
LOCAL_API_MAX_CONCURRENT_REQUESTS = 1
@dataclass(frozen=True)
@@ -95,12 +96,9 @@ class LocalAPIServer:
self.temp_dir = tempfile.TemporaryDirectory(prefix="mineru-api-client-")
self.temp_root = Path(self.temp_dir.name)
self.output_root = self.temp_root / "output"
self.log_path = self.temp_root / "mineru-api.log"
self.base_url: str | None = None
self.process: subprocess.Popen[bytes] | None = None
self._log_handle = None
self._atexit_registered = False
self._cleanup_enabled = True
def start(self) -> str:
if self.process is not None:
@@ -110,9 +108,12 @@ class LocalAPIServer:
self.base_url = f"http://127.0.0.1:{port}"
env = os.environ.copy()
env["MINERU_API_OUTPUT_ROOT"] = str(self.output_root)
env["MINERU_API_MAX_CONCURRENT_REQUESTS"] = str(
LOCAL_API_MAX_CONCURRENT_REQUESTS
)
env["MINERU_API_DISABLE_ACCESS_LOG"] = "1"
self.output_root.mkdir(parents=True, exist_ok=True)
self._log_handle = open(self.log_path, "wb")
self.process = subprocess.Popen(
[
sys.executable,
@@ -123,8 +124,6 @@ class LocalAPIServer:
"--port",
str(port),
],
stdout=self._log_handle,
stderr=subprocess.STDOUT,
cwd=os.getcwd(),
env=env,
)
@@ -134,8 +133,7 @@ class LocalAPIServer:
self._atexit_registered = True
return self.base_url
def stop(self, preserve_logs: bool = False) -> None:
self._cleanup_enabled = not preserve_logs
def stop(self) -> None:
process = self.process
self.process = None
try:
@@ -147,17 +145,13 @@ class LocalAPIServer:
process.kill()
process.wait(timeout=5)
finally:
if self._log_handle is not None:
self._log_handle.close()
self._log_handle = None
if self._atexit_registered:
try:
atexit.unregister(self.stop)
except Exception:
pass
self._atexit_registered = False
if self._cleanup_enabled:
self._cleanup_temp_dir()
self._cleanup_temp_dir()
def _cleanup_temp_dir(self) -> None:
last_error: Exception | None = None
@@ -180,15 +174,6 @@ class LocalAPIServer:
last_error,
)
def read_log_tail(self, max_chars: int = 4000) -> str:
    """Return at most the last *max_chars* characters of the log file.

    Args:
        max_chars: Upper bound on the number of characters returned.
            Zero or a negative value yields an empty string.

    Returns:
        The tail of ``self.log_path`` decoded as UTF-8 (undecodable bytes
        are dropped), or ``""`` when the file does not exist.
    """
    # Guard non-positive limits explicitly: ``content[-0:]`` is the whole
    # string and a negative limit would slice from the front instead.
    if max_chars <= 0 or not self.log_path.exists():
        return ""
    content = self.log_path.read_text(encoding="utf-8", errors="ignore")
    # A negative-start slice already returns everything when the content is
    # shorter than the limit, so no separate length check is needed.
    return content[-max_chars:]
def find_free_port() -> int:
with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sock:
sock.bind(("127.0.0.1", 0))
@@ -282,10 +267,8 @@ async def wait_for_local_api_ready(
while asyncio.get_running_loop().time() < deadline:
process = local_server.process
if process is not None and process.poll() is not None:
log_tail = local_server.read_log_tail()
raise click.ClickException(
f"Local mineru-api exited before becoming healthy. "
f"Log file: {local_server.log_path}\n{log_tail}"
"Local mineru-api exited before becoming healthy."
)
try:
return await fetch_server_health(client, local_server.base_url)
@@ -295,11 +278,10 @@ async def wait_for_local_api_ready(
last_error = str(exc)
await asyncio.sleep(TASK_STATUS_POLL_INTERVAL_SECONDS)
log_tail = local_server.read_log_tail()
raise click.ClickException(
f"Timed out waiting for local mineru-api to become healthy. "
f"Log file: {local_server.log_path}\n{last_error or ''}\n{log_tail}"
)
message = "Timed out waiting for local mineru-api to become healthy."
if last_error:
message = f"{message} {last_error}"
raise click.ClickException(message)
def probe_pdf_effective_pages(
@@ -780,8 +762,7 @@ async def run_orchestrated_cli(
)
finally:
if local_server is not None:
preserve_logs = sys.exc_info()[0] is not None
local_server.stop(preserve_logs=preserve_logs)
local_server.stop()
@click.command()

View File

@@ -173,7 +173,7 @@ def _process_output(
json.dumps(model_output, ensure_ascii=False, indent=4),
)
logger.info(f"local output dir is {local_md_dir}")
logger.debug(f"local output dir is {local_md_dir}")
def _process_pipeline(

View File

@@ -64,7 +64,7 @@ DEFAULT_TASK_RETENTION_SECONDS = 24 * 60 * 60
DEFAULT_TASK_CLEANUP_INTERVAL_SECONDS = 5 * 60
DEFAULT_OUTPUT_ROOT = "./output"
ALLOWED_PARSE_METHODS = {"auto", "txt", "ocr"}
DEFAULT_MAX_CONCURRENT_REQUESTS = 3
DEFAULT_MAX_CONCURRENT_REQUESTS = 1
FILE_PARSE_TASK_ID_HEADER = "X-MinerU-Task-Id"
FILE_PARSE_TASK_STATUS_HEADER = "X-MinerU-Task-Status"
FILE_PARSE_TASK_STATUS_URL_HEADER = "X-MinerU-Task-Status-Url"
@@ -76,6 +76,13 @@ _configured_max_concurrent_requests = 0
_mps_parse_lock = threading.Lock()
def env_flag_enabled(name: str, default: bool = False) -> bool:
    """Return whether the environment variable *name* holds a truthy flag.

    Args:
        name: Environment variable to read.
        default: Result when the variable is not set at all.

    Returns:
        ``default`` if the variable is unset. Otherwise ``True`` only when
        the value, ignoring case and surrounding whitespace, is one of
        ``1``, ``true``, ``yes`` or ``on``; any other value (including an
        empty string) is ``False``.
    """
    value = os.getenv(name)
    if value is None:
        return default
    # Strip first: values set from shells or .env files often carry stray
    # spaces or a trailing newline, which would otherwise read as disabled.
    return value.strip().lower() in ("1", "true", "yes", "on")
@dataclass
class ParseRequestOptions:
files: list[UploadFile]
@@ -170,11 +177,7 @@ async def lifespan(app: FastAPI):
def create_app():
# By default, the OpenAPI documentation endpoints (openapi_url, docs_url, redoc_url) are enabled.
# To disable the FastAPI docs and schema endpoints, set the environment variable MINERU_API_ENABLE_FASTAPI_DOCS=0.
enable_docs = str(os.getenv("MINERU_API_ENABLE_FASTAPI_DOCS", "1")).lower() in (
"1",
"true",
"yes",
)
enable_docs = env_flag_enabled("MINERU_API_ENABLE_FASTAPI_DOCS", default=True)
app = FastAPI(
openapi_url="/openapi.json" if enable_docs else None,
docs_url="/docs" if enable_docs else None,
@@ -1338,26 +1341,21 @@ def main(ctx, host, port, reload, **kwargs):
kwargs.update(arg_parse(ctx))
app.state.config = kwargs
try:
mcr = int(
kwargs.get(
"mineru_api_max_concurrent_requests",
DEFAULT_MAX_CONCURRENT_REQUESTS,
)
or DEFAULT_MAX_CONCURRENT_REQUESTS
)
except ValueError:
mcr = DEFAULT_MAX_CONCURRENT_REQUESTS
os.environ["MINERU_API_MAX_CONCURRENT_REQUESTS"] = str(mcr)
access_log = not env_flag_enabled("MINERU_API_DISABLE_ACCESS_LOG")
print(f"Start MinerU FastAPI Service: http://{host}:{port}")
print(f"API documentation: http://{host}:{port}/docs")
if reload:
uvicorn.run("mineru.cli.fast_api:app", host=host, port=port, reload=True)
uvicorn.run(
"mineru.cli.fast_api:app",
host=host,
port=port,
reload=True,
access_log=access_log,
)
else:
uvicorn.run(app, host=host, port=port, reload=False)
uvicorn.run(app, host=host, port=port, reload=False, access_log=access_log)
if __name__ == "__main__":