mirror of
https://github.com/opendatalab/MinerU.git
synced 2026-03-27 11:08:32 +07:00
Compare commits
79 Commits
magic_pdf-
...
magic_pdf-
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
f06a32133c | ||
|
|
59b0b0c3da | ||
|
|
8a2736a53f | ||
|
|
0b35b73c64 | ||
|
|
9bd6294bfa | ||
|
|
8a52ada337 | ||
|
|
26c2378271 | ||
|
|
b6f051d88f | ||
|
|
85587b257b | ||
|
|
b560c18fb4 | ||
|
|
1cb79e7df0 | ||
|
|
8f2640826f | ||
|
|
21cfaf4c0d | ||
|
|
6f7aa890c2 | ||
|
|
f9310954bd | ||
|
|
64d67b5c58 | ||
|
|
f5dc261da5 | ||
|
|
1f468bed0a | ||
|
|
32fd7f958c | ||
|
|
86dc22caf3 | ||
|
|
07abba71c1 | ||
|
|
6396910992 | ||
|
|
61405b8af8 | ||
|
|
6d728ddad0 | ||
|
|
5b86ca82fe | ||
|
|
7512baaaa3 | ||
|
|
59d817b1bf | ||
|
|
070139a5bc | ||
|
|
2611e85349 | ||
|
|
43581df62b | ||
|
|
61a0c62c46 | ||
|
|
1b4af400cc | ||
|
|
2e32ecfeab | ||
|
|
6b401112e3 | ||
|
|
f31117deba | ||
|
|
ec1a6ef716 | ||
|
|
3c8b254574 | ||
|
|
5513e48ab8 | ||
|
|
f9f36c10cf | ||
|
|
514a571836 | ||
|
|
1d59509d9d | ||
|
|
94a7ba3d23 | ||
|
|
9cc53a5e57 | ||
|
|
da5091430b | ||
|
|
8a860e32e1 | ||
|
|
047a8df2ac | ||
|
|
83deab214e | ||
|
|
72b7894c70 | ||
|
|
012a8f1567 | ||
|
|
a5f8de9882 | ||
|
|
c38c784eaa | ||
|
|
68e83c124f | ||
|
|
17b09f7177 | ||
|
|
864e95355f | ||
|
|
0c279ffccd | ||
|
|
f9bd004048 | ||
|
|
f62d1aa781 | ||
|
|
388223f2e0 | ||
|
|
fcea39d36b | ||
|
|
00f3e329d9 | ||
|
|
caa1588a92 | ||
|
|
a0be4652e6 | ||
|
|
701f384994 | ||
|
|
2e487cac34 | ||
|
|
846dbecf45 | ||
|
|
a706743372 | ||
|
|
7242a4a76e | ||
|
|
6cbf7fabcf | ||
|
|
5b9fa871bd | ||
|
|
71a042d9fc | ||
|
|
bc339320ab | ||
|
|
779d2e8aaf | ||
|
|
05c328d0e3 | ||
|
|
719fb172c8 | ||
|
|
feb97f7f1b | ||
|
|
c9c14beab3 | ||
|
|
9fe81795bc | ||
|
|
85489f6c42 | ||
|
|
7a0d80760e |
17
.github/workflows/python-package.yml
vendored
17
.github/workflows/python-package.yml
vendored
@@ -20,21 +20,29 @@ jobs:
|
||||
python-version: ["3.10"]
|
||||
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
- name: Checkout code
|
||||
uses: actions/checkout@v4
|
||||
with:
|
||||
fetch-depth: 0
|
||||
|
||||
- name: Set up Python ${{ matrix.python-version }}
|
||||
uses: actions/setup-python@v5
|
||||
with:
|
||||
python-version: ${{ matrix.python-version }}
|
||||
|
||||
- name: Install dependencies
|
||||
run: |
|
||||
python -m pip install --upgrade pip
|
||||
if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
|
||||
|
||||
- name: Install wheel
|
||||
run: |
|
||||
python -m pip install wheel
|
||||
|
||||
- name: Build wheel
|
||||
run: |
|
||||
python setup.py bdist_wheel
|
||||
|
||||
- name: Upload artifact
|
||||
uses: actions/upload-artifact@v4
|
||||
with:
|
||||
@@ -61,4 +69,9 @@ jobs:
|
||||
with:
|
||||
files: './dist/*.whl'
|
||||
env:
|
||||
GITHUB_TOKEN: ${{ secrets.RELEASE_TOKEN }}
|
||||
GITHUB_TOKEN: ${{ secrets.RELEASE_TOKEN }}
|
||||
# - name: Publish to PyPI
|
||||
# uses: pypa/gh-action-pypi-publish@release/v1
|
||||
# with:
|
||||
# user: __token__
|
||||
# password: ${{ secrets.PYPI_TOKEN }}
|
||||
|
||||
661
LICENSE.md
Normal file
661
LICENSE.md
Normal file
@@ -0,0 +1,661 @@
|
||||
GNU AFFERO GENERAL PUBLIC LICENSE
|
||||
Version 3, 19 November 2007
|
||||
|
||||
Copyright (C) 2007 Free Software Foundation, Inc. <https://fsf.org/>
|
||||
Everyone is permitted to copy and distribute verbatim copies
|
||||
of this license document, but changing it is not allowed.
|
||||
|
||||
Preamble
|
||||
|
||||
The GNU Affero General Public License is a free, copyleft license for
|
||||
software and other kinds of works, specifically designed to ensure
|
||||
cooperation with the community in the case of network server software.
|
||||
|
||||
The licenses for most software and other practical works are designed
|
||||
to take away your freedom to share and change the works. By contrast,
|
||||
our General Public Licenses are intended to guarantee your freedom to
|
||||
share and change all versions of a program--to make sure it remains free
|
||||
software for all its users.
|
||||
|
||||
When we speak of free software, we are referring to freedom, not
|
||||
price. Our General Public Licenses are designed to make sure that you
|
||||
have the freedom to distribute copies of free software (and charge for
|
||||
them if you wish), that you receive source code or can get it if you
|
||||
want it, that you can change the software or use pieces of it in new
|
||||
free programs, and that you know you can do these things.
|
||||
|
||||
Developers that use our General Public Licenses protect your rights
|
||||
with two steps: (1) assert copyright on the software, and (2) offer
|
||||
you this License which gives you legal permission to copy, distribute
|
||||
and/or modify the software.
|
||||
|
||||
A secondary benefit of defending all users' freedom is that
|
||||
improvements made in alternate versions of the program, if they
|
||||
receive widespread use, become available for other developers to
|
||||
incorporate. Many developers of free software are heartened and
|
||||
encouraged by the resulting cooperation. However, in the case of
|
||||
software used on network servers, this result may fail to come about.
|
||||
The GNU General Public License permits making a modified version and
|
||||
letting the public access it on a server without ever releasing its
|
||||
source code to the public.
|
||||
|
||||
The GNU Affero General Public License is designed specifically to
|
||||
ensure that, in such cases, the modified source code becomes available
|
||||
to the community. It requires the operator of a network server to
|
||||
provide the source code of the modified version running there to the
|
||||
users of that server. Therefore, public use of a modified version, on
|
||||
a publicly accessible server, gives the public access to the source
|
||||
code of the modified version.
|
||||
|
||||
An older license, called the Affero General Public License and
|
||||
published by Affero, was designed to accomplish similar goals. This is
|
||||
a different license, not a version of the Affero GPL, but Affero has
|
||||
released a new version of the Affero GPL which permits relicensing under
|
||||
this license.
|
||||
|
||||
The precise terms and conditions for copying, distribution and
|
||||
modification follow.
|
||||
|
||||
TERMS AND CONDITIONS
|
||||
|
||||
0. Definitions.
|
||||
|
||||
"This License" refers to version 3 of the GNU Affero General Public License.
|
||||
|
||||
"Copyright" also means copyright-like laws that apply to other kinds of
|
||||
works, such as semiconductor masks.
|
||||
|
||||
"The Program" refers to any copyrightable work licensed under this
|
||||
License. Each licensee is addressed as "you". "Licensees" and
|
||||
"recipients" may be individuals or organizations.
|
||||
|
||||
To "modify" a work means to copy from or adapt all or part of the work
|
||||
in a fashion requiring copyright permission, other than the making of an
|
||||
exact copy. The resulting work is called a "modified version" of the
|
||||
earlier work or a work "based on" the earlier work.
|
||||
|
||||
A "covered work" means either the unmodified Program or a work based
|
||||
on the Program.
|
||||
|
||||
To "propagate" a work means to do anything with it that, without
|
||||
permission, would make you directly or secondarily liable for
|
||||
infringement under applicable copyright law, except executing it on a
|
||||
computer or modifying a private copy. Propagation includes copying,
|
||||
distribution (with or without modification), making available to the
|
||||
public, and in some countries other activities as well.
|
||||
|
||||
To "convey" a work means any kind of propagation that enables other
|
||||
parties to make or receive copies. Mere interaction with a user through
|
||||
a computer network, with no transfer of a copy, is not conveying.
|
||||
|
||||
An interactive user interface displays "Appropriate Legal Notices"
|
||||
to the extent that it includes a convenient and prominently visible
|
||||
feature that (1) displays an appropriate copyright notice, and (2)
|
||||
tells the user that there is no warranty for the work (except to the
|
||||
extent that warranties are provided), that licensees may convey the
|
||||
work under this License, and how to view a copy of this License. If
|
||||
the interface presents a list of user commands or options, such as a
|
||||
menu, a prominent item in the list meets this criterion.
|
||||
|
||||
1. Source Code.
|
||||
|
||||
The "source code" for a work means the preferred form of the work
|
||||
for making modifications to it. "Object code" means any non-source
|
||||
form of a work.
|
||||
|
||||
A "Standard Interface" means an interface that either is an official
|
||||
standard defined by a recognized standards body, or, in the case of
|
||||
interfaces specified for a particular programming language, one that
|
||||
is widely used among developers working in that language.
|
||||
|
||||
The "System Libraries" of an executable work include anything, other
|
||||
than the work as a whole, that (a) is included in the normal form of
|
||||
packaging a Major Component, but which is not part of that Major
|
||||
Component, and (b) serves only to enable use of the work with that
|
||||
Major Component, or to implement a Standard Interface for which an
|
||||
implementation is available to the public in source code form. A
|
||||
"Major Component", in this context, means a major essential component
|
||||
(kernel, window system, and so on) of the specific operating system
|
||||
(if any) on which the executable work runs, or a compiler used to
|
||||
produce the work, or an object code interpreter used to run it.
|
||||
|
||||
The "Corresponding Source" for a work in object code form means all
|
||||
the source code needed to generate, install, and (for an executable
|
||||
work) run the object code and to modify the work, including scripts to
|
||||
control those activities. However, it does not include the work's
|
||||
System Libraries, or general-purpose tools or generally available free
|
||||
programs which are used unmodified in performing those activities but
|
||||
which are not part of the work. For example, Corresponding Source
|
||||
includes interface definition files associated with source files for
|
||||
the work, and the source code for shared libraries and dynamically
|
||||
linked subprograms that the work is specifically designed to require,
|
||||
such as by intimate data communication or control flow between those
|
||||
subprograms and other parts of the work.
|
||||
|
||||
The Corresponding Source need not include anything that users
|
||||
can regenerate automatically from other parts of the Corresponding
|
||||
Source.
|
||||
|
||||
The Corresponding Source for a work in source code form is that
|
||||
same work.
|
||||
|
||||
2. Basic Permissions.
|
||||
|
||||
All rights granted under this License are granted for the term of
|
||||
copyright on the Program, and are irrevocable provided the stated
|
||||
conditions are met. This License explicitly affirms your unlimited
|
||||
permission to run the unmodified Program. The output from running a
|
||||
covered work is covered by this License only if the output, given its
|
||||
content, constitutes a covered work. This License acknowledges your
|
||||
rights of fair use or other equivalent, as provided by copyright law.
|
||||
|
||||
You may make, run and propagate covered works that you do not
|
||||
convey, without conditions so long as your license otherwise remains
|
||||
in force. You may convey covered works to others for the sole purpose
|
||||
of having them make modifications exclusively for you, or provide you
|
||||
with facilities for running those works, provided that you comply with
|
||||
the terms of this License in conveying all material for which you do
|
||||
not control copyright. Those thus making or running the covered works
|
||||
for you must do so exclusively on your behalf, under your direction
|
||||
and control, on terms that prohibit them from making any copies of
|
||||
your copyrighted material outside their relationship with you.
|
||||
|
||||
Conveying under any other circumstances is permitted solely under
|
||||
the conditions stated below. Sublicensing is not allowed; section 10
|
||||
makes it unnecessary.
|
||||
|
||||
3. Protecting Users' Legal Rights From Anti-Circumvention Law.
|
||||
|
||||
No covered work shall be deemed part of an effective technological
|
||||
measure under any applicable law fulfilling obligations under article
|
||||
11 of the WIPO copyright treaty adopted on 20 December 1996, or
|
||||
similar laws prohibiting or restricting circumvention of such
|
||||
measures.
|
||||
|
||||
When you convey a covered work, you waive any legal power to forbid
|
||||
circumvention of technological measures to the extent such circumvention
|
||||
is effected by exercising rights under this License with respect to
|
||||
the covered work, and you disclaim any intention to limit operation or
|
||||
modification of the work as a means of enforcing, against the work's
|
||||
users, your or third parties' legal rights to forbid circumvention of
|
||||
technological measures.
|
||||
|
||||
4. Conveying Verbatim Copies.
|
||||
|
||||
You may convey verbatim copies of the Program's source code as you
|
||||
receive it, in any medium, provided that you conspicuously and
|
||||
appropriately publish on each copy an appropriate copyright notice;
|
||||
keep intact all notices stating that this License and any
|
||||
non-permissive terms added in accord with section 7 apply to the code;
|
||||
keep intact all notices of the absence of any warranty; and give all
|
||||
recipients a copy of this License along with the Program.
|
||||
|
||||
You may charge any price or no price for each copy that you convey,
|
||||
and you may offer support or warranty protection for a fee.
|
||||
|
||||
5. Conveying Modified Source Versions.
|
||||
|
||||
You may convey a work based on the Program, or the modifications to
|
||||
produce it from the Program, in the form of source code under the
|
||||
terms of section 4, provided that you also meet all of these conditions:
|
||||
|
||||
a) The work must carry prominent notices stating that you modified
|
||||
it, and giving a relevant date.
|
||||
|
||||
b) The work must carry prominent notices stating that it is
|
||||
released under this License and any conditions added under section
|
||||
7. This requirement modifies the requirement in section 4 to
|
||||
"keep intact all notices".
|
||||
|
||||
c) You must license the entire work, as a whole, under this
|
||||
License to anyone who comes into possession of a copy. This
|
||||
License will therefore apply, along with any applicable section 7
|
||||
additional terms, to the whole of the work, and all its parts,
|
||||
regardless of how they are packaged. This License gives no
|
||||
permission to license the work in any other way, but it does not
|
||||
invalidate such permission if you have separately received it.
|
||||
|
||||
d) If the work has interactive user interfaces, each must display
|
||||
Appropriate Legal Notices; however, if the Program has interactive
|
||||
interfaces that do not display Appropriate Legal Notices, your
|
||||
work need not make them do so.
|
||||
|
||||
A compilation of a covered work with other separate and independent
|
||||
works, which are not by their nature extensions of the covered work,
|
||||
and which are not combined with it such as to form a larger program,
|
||||
in or on a volume of a storage or distribution medium, is called an
|
||||
"aggregate" if the compilation and its resulting copyright are not
|
||||
used to limit the access or legal rights of the compilation's users
|
||||
beyond what the individual works permit. Inclusion of a covered work
|
||||
in an aggregate does not cause this License to apply to the other
|
||||
parts of the aggregate.
|
||||
|
||||
6. Conveying Non-Source Forms.
|
||||
|
||||
You may convey a covered work in object code form under the terms
|
||||
of sections 4 and 5, provided that you also convey the
|
||||
machine-readable Corresponding Source under the terms of this License,
|
||||
in one of these ways:
|
||||
|
||||
a) Convey the object code in, or embodied in, a physical product
|
||||
(including a physical distribution medium), accompanied by the
|
||||
Corresponding Source fixed on a durable physical medium
|
||||
customarily used for software interchange.
|
||||
|
||||
b) Convey the object code in, or embodied in, a physical product
|
||||
(including a physical distribution medium), accompanied by a
|
||||
written offer, valid for at least three years and valid for as
|
||||
long as you offer spare parts or customer support for that product
|
||||
model, to give anyone who possesses the object code either (1) a
|
||||
copy of the Corresponding Source for all the software in the
|
||||
product that is covered by this License, on a durable physical
|
||||
medium customarily used for software interchange, for a price no
|
||||
more than your reasonable cost of physically performing this
|
||||
conveying of source, or (2) access to copy the
|
||||
Corresponding Source from a network server at no charge.
|
||||
|
||||
c) Convey individual copies of the object code with a copy of the
|
||||
written offer to provide the Corresponding Source. This
|
||||
alternative is allowed only occasionally and noncommercially, and
|
||||
only if you received the object code with such an offer, in accord
|
||||
with subsection 6b.
|
||||
|
||||
d) Convey the object code by offering access from a designated
|
||||
place (gratis or for a charge), and offer equivalent access to the
|
||||
Corresponding Source in the same way through the same place at no
|
||||
further charge. You need not require recipients to copy the
|
||||
Corresponding Source along with the object code. If the place to
|
||||
copy the object code is a network server, the Corresponding Source
|
||||
may be on a different server (operated by you or a third party)
|
||||
that supports equivalent copying facilities, provided you maintain
|
||||
clear directions next to the object code saying where to find the
|
||||
Corresponding Source. Regardless of what server hosts the
|
||||
Corresponding Source, you remain obligated to ensure that it is
|
||||
available for as long as needed to satisfy these requirements.
|
||||
|
||||
e) Convey the object code using peer-to-peer transmission, provided
|
||||
you inform other peers where the object code and Corresponding
|
||||
Source of the work are being offered to the general public at no
|
||||
charge under subsection 6d.
|
||||
|
||||
A separable portion of the object code, whose source code is excluded
|
||||
from the Corresponding Source as a System Library, need not be
|
||||
included in conveying the object code work.
|
||||
|
||||
A "User Product" is either (1) a "consumer product", which means any
|
||||
tangible personal property which is normally used for personal, family,
|
||||
or household purposes, or (2) anything designed or sold for incorporation
|
||||
into a dwelling. In determining whether a product is a consumer product,
|
||||
doubtful cases shall be resolved in favor of coverage. For a particular
|
||||
product received by a particular user, "normally used" refers to a
|
||||
typical or common use of that class of product, regardless of the status
|
||||
of the particular user or of the way in which the particular user
|
||||
actually uses, or expects or is expected to use, the product. A product
|
||||
is a consumer product regardless of whether the product has substantial
|
||||
commercial, industrial or non-consumer uses, unless such uses represent
|
||||
the only significant mode of use of the product.
|
||||
|
||||
"Installation Information" for a User Product means any methods,
|
||||
procedures, authorization keys, or other information required to install
|
||||
and execute modified versions of a covered work in that User Product from
|
||||
a modified version of its Corresponding Source. The information must
|
||||
suffice to ensure that the continued functioning of the modified object
|
||||
code is in no case prevented or interfered with solely because
|
||||
modification has been made.
|
||||
|
||||
If you convey an object code work under this section in, or with, or
|
||||
specifically for use in, a User Product, and the conveying occurs as
|
||||
part of a transaction in which the right of possession and use of the
|
||||
User Product is transferred to the recipient in perpetuity or for a
|
||||
fixed term (regardless of how the transaction is characterized), the
|
||||
Corresponding Source conveyed under this section must be accompanied
|
||||
by the Installation Information. But this requirement does not apply
|
||||
if neither you nor any third party retains the ability to install
|
||||
modified object code on the User Product (for example, the work has
|
||||
been installed in ROM).
|
||||
|
||||
The requirement to provide Installation Information does not include a
|
||||
requirement to continue to provide support service, warranty, or updates
|
||||
for a work that has been modified or installed by the recipient, or for
|
||||
the User Product in which it has been modified or installed. Access to a
|
||||
network may be denied when the modification itself materially and
|
||||
adversely affects the operation of the network or violates the rules and
|
||||
protocols for communication across the network.
|
||||
|
||||
Corresponding Source conveyed, and Installation Information provided,
|
||||
in accord with this section must be in a format that is publicly
|
||||
documented (and with an implementation available to the public in
|
||||
source code form), and must require no special password or key for
|
||||
unpacking, reading or copying.
|
||||
|
||||
7. Additional Terms.
|
||||
|
||||
"Additional permissions" are terms that supplement the terms of this
|
||||
License by making exceptions from one or more of its conditions.
|
||||
Additional permissions that are applicable to the entire Program shall
|
||||
be treated as though they were included in this License, to the extent
|
||||
that they are valid under applicable law. If additional permissions
|
||||
apply only to part of the Program, that part may be used separately
|
||||
under those permissions, but the entire Program remains governed by
|
||||
this License without regard to the additional permissions.
|
||||
|
||||
When you convey a copy of a covered work, you may at your option
|
||||
remove any additional permissions from that copy, or from any part of
|
||||
it. (Additional permissions may be written to require their own
|
||||
removal in certain cases when you modify the work.) You may place
|
||||
additional permissions on material, added by you to a covered work,
|
||||
for which you have or can give appropriate copyright permission.
|
||||
|
||||
Notwithstanding any other provision of this License, for material you
|
||||
add to a covered work, you may (if authorized by the copyright holders of
|
||||
that material) supplement the terms of this License with terms:
|
||||
|
||||
a) Disclaiming warranty or limiting liability differently from the
|
||||
terms of sections 15 and 16 of this License; or
|
||||
|
||||
b) Requiring preservation of specified reasonable legal notices or
|
||||
author attributions in that material or in the Appropriate Legal
|
||||
Notices displayed by works containing it; or
|
||||
|
||||
c) Prohibiting misrepresentation of the origin of that material, or
|
||||
requiring that modified versions of such material be marked in
|
||||
reasonable ways as different from the original version; or
|
||||
|
||||
d) Limiting the use for publicity purposes of names of licensors or
|
||||
authors of the material; or
|
||||
|
||||
e) Declining to grant rights under trademark law for use of some
|
||||
trade names, trademarks, or service marks; or
|
||||
|
||||
f) Requiring indemnification of licensors and authors of that
|
||||
material by anyone who conveys the material (or modified versions of
|
||||
it) with contractual assumptions of liability to the recipient, for
|
||||
any liability that these contractual assumptions directly impose on
|
||||
those licensors and authors.
|
||||
|
||||
All other non-permissive additional terms are considered "further
|
||||
restrictions" within the meaning of section 10. If the Program as you
|
||||
received it, or any part of it, contains a notice stating that it is
|
||||
governed by this License along with a term that is a further
|
||||
restriction, you may remove that term. If a license document contains
|
||||
a further restriction but permits relicensing or conveying under this
|
||||
License, you may add to a covered work material governed by the terms
|
||||
of that license document, provided that the further restriction does
|
||||
not survive such relicensing or conveying.
|
||||
|
||||
If you add terms to a covered work in accord with this section, you
|
||||
must place, in the relevant source files, a statement of the
|
||||
additional terms that apply to those files, or a notice indicating
|
||||
where to find the applicable terms.
|
||||
|
||||
Additional terms, permissive or non-permissive, may be stated in the
|
||||
form of a separately written license, or stated as exceptions;
|
||||
the above requirements apply either way.
|
||||
|
||||
8. Termination.
|
||||
|
||||
You may not propagate or modify a covered work except as expressly
|
||||
provided under this License. Any attempt otherwise to propagate or
|
||||
modify it is void, and will automatically terminate your rights under
|
||||
this License (including any patent licenses granted under the third
|
||||
paragraph of section 11).
|
||||
|
||||
However, if you cease all violation of this License, then your
|
||||
license from a particular copyright holder is reinstated (a)
|
||||
provisionally, unless and until the copyright holder explicitly and
|
||||
finally terminates your license, and (b) permanently, if the copyright
|
||||
holder fails to notify you of the violation by some reasonable means
|
||||
prior to 60 days after the cessation.
|
||||
|
||||
Moreover, your license from a particular copyright holder is
|
||||
reinstated permanently if the copyright holder notifies you of the
|
||||
violation by some reasonable means, this is the first time you have
|
||||
received notice of violation of this License (for any work) from that
|
||||
copyright holder, and you cure the violation prior to 30 days after
|
||||
your receipt of the notice.
|
||||
|
||||
Termination of your rights under this section does not terminate the
|
||||
licenses of parties who have received copies or rights from you under
|
||||
this License. If your rights have been terminated and not permanently
|
||||
reinstated, you do not qualify to receive new licenses for the same
|
||||
material under section 10.
|
||||
|
||||
9. Acceptance Not Required for Having Copies.
|
||||
|
||||
You are not required to accept this License in order to receive or
|
||||
run a copy of the Program. Ancillary propagation of a covered work
|
||||
occurring solely as a consequence of using peer-to-peer transmission
|
||||
to receive a copy likewise does not require acceptance. However,
|
||||
nothing other than this License grants you permission to propagate or
|
||||
modify any covered work. These actions infringe copyright if you do
|
||||
not accept this License. Therefore, by modifying or propagating a
|
||||
covered work, you indicate your acceptance of this License to do so.
|
||||
|
||||
10. Automatic Licensing of Downstream Recipients.
|
||||
|
||||
Each time you convey a covered work, the recipient automatically
|
||||
receives a license from the original licensors, to run, modify and
|
||||
propagate that work, subject to this License. You are not responsible
|
||||
for enforcing compliance by third parties with this License.
|
||||
|
||||
An "entity transaction" is a transaction transferring control of an
|
||||
organization, or substantially all assets of one, or subdividing an
|
||||
organization, or merging organizations. If propagation of a covered
|
||||
work results from an entity transaction, each party to that
|
||||
transaction who receives a copy of the work also receives whatever
|
||||
licenses to the work the party's predecessor in interest had or could
|
||||
give under the previous paragraph, plus a right to possession of the
|
||||
Corresponding Source of the work from the predecessor in interest, if
|
||||
the predecessor has it or can get it with reasonable efforts.
|
||||
|
||||
You may not impose any further restrictions on the exercise of the
|
||||
rights granted or affirmed under this License. For example, you may
|
||||
not impose a license fee, royalty, or other charge for exercise of
|
||||
rights granted under this License, and you may not initiate litigation
|
||||
(including a cross-claim or counterclaim in a lawsuit) alleging that
|
||||
any patent claim is infringed by making, using, selling, offering for
|
||||
sale, or importing the Program or any portion of it.
|
||||
|
||||
11. Patents.
|
||||
|
||||
A "contributor" is a copyright holder who authorizes use under this
|
||||
License of the Program or a work on which the Program is based. The
|
||||
work thus licensed is called the contributor's "contributor version".
|
||||
|
||||
A contributor's "essential patent claims" are all patent claims
|
||||
owned or controlled by the contributor, whether already acquired or
|
||||
hereafter acquired, that would be infringed by some manner, permitted
|
||||
by this License, of making, using, or selling its contributor version,
|
||||
but do not include claims that would be infringed only as a
|
||||
consequence of further modification of the contributor version. For
|
||||
purposes of this definition, "control" includes the right to grant
|
||||
patent sublicenses in a manner consistent with the requirements of
|
||||
this License.
|
||||
|
||||
Each contributor grants you a non-exclusive, worldwide, royalty-free
|
||||
patent license under the contributor's essential patent claims, to
|
||||
make, use, sell, offer for sale, import and otherwise run, modify and
|
||||
propagate the contents of its contributor version.
|
||||
|
||||
In the following three paragraphs, a "patent license" is any express
|
||||
agreement or commitment, however denominated, not to enforce a patent
|
||||
(such as an express permission to practice a patent or covenant not to
|
||||
sue for patent infringement). To "grant" such a patent license to a
|
||||
party means to make such an agreement or commitment not to enforce a
|
||||
patent against the party.
|
||||
|
||||
If you convey a covered work, knowingly relying on a patent license,
|
||||
and the Corresponding Source of the work is not available for anyone
|
||||
to copy, free of charge and under the terms of this License, through a
|
||||
publicly available network server or other readily accessible means,
|
||||
then you must either (1) cause the Corresponding Source to be so
|
||||
available, or (2) arrange to deprive yourself of the benefit of the
|
||||
patent license for this particular work, or (3) arrange, in a manner
|
||||
consistent with the requirements of this License, to extend the patent
|
||||
license to downstream recipients. "Knowingly relying" means you have
|
||||
actual knowledge that, but for the patent license, your conveying the
|
||||
covered work in a country, or your recipient's use of the covered work
|
||||
in a country, would infringe one or more identifiable patents in that
|
||||
country that you have reason to believe are valid.
|
||||
|
||||
If, pursuant to or in connection with a single transaction or
|
||||
arrangement, you convey, or propagate by procuring conveyance of, a
|
||||
covered work, and grant a patent license to some of the parties
|
||||
receiving the covered work authorizing them to use, propagate, modify
|
||||
or convey a specific copy of the covered work, then the patent license
|
||||
you grant is automatically extended to all recipients of the covered
|
||||
work and works based on it.
|
||||
|
||||
A patent license is "discriminatory" if it does not include within
|
||||
the scope of its coverage, prohibits the exercise of, or is
|
||||
conditioned on the non-exercise of one or more of the rights that are
|
||||
specifically granted under this License. You may not convey a covered
|
||||
work if you are a party to an arrangement with a third party that is
|
||||
in the business of distributing software, under which you make payment
|
||||
to the third party based on the extent of your activity of conveying
|
||||
the work, and under which the third party grants, to any of the
|
||||
parties who would receive the covered work from you, a discriminatory
|
||||
patent license (a) in connection with copies of the covered work
|
||||
conveyed by you (or copies made from those copies), or (b) primarily
|
||||
for and in connection with specific products or compilations that
|
||||
contain the covered work, unless you entered into that arrangement,
|
||||
or that patent license was granted, prior to 28 March 2007.
|
||||
|
||||
Nothing in this License shall be construed as excluding or limiting
|
||||
any implied license or other defenses to infringement that may
|
||||
otherwise be available to you under applicable patent law.
|
||||
|
||||
12. No Surrender of Others' Freedom.
|
||||
|
||||
If conditions are imposed on you (whether by court order, agreement or
|
||||
otherwise) that contradict the conditions of this License, they do not
|
||||
excuse you from the conditions of this License. If you cannot convey a
|
||||
covered work so as to satisfy simultaneously your obligations under this
|
||||
License and any other pertinent obligations, then as a consequence you may
|
||||
not convey it at all. For example, if you agree to terms that obligate you
|
||||
to collect a royalty for further conveying from those to whom you convey
|
||||
the Program, the only way you could satisfy both those terms and this
|
||||
License would be to refrain entirely from conveying the Program.
|
||||
|
||||
13. Remote Network Interaction; Use with the GNU General Public License.
|
||||
|
||||
Notwithstanding any other provision of this License, if you modify the
|
||||
Program, your modified version must prominently offer all users
|
||||
interacting with it remotely through a computer network (if your version
|
||||
supports such interaction) an opportunity to receive the Corresponding
|
||||
Source of your version by providing access to the Corresponding Source
|
||||
from a network server at no charge, through some standard or customary
|
||||
means of facilitating copying of software. This Corresponding Source
|
||||
shall include the Corresponding Source for any work covered by version 3
|
||||
of the GNU General Public License that is incorporated pursuant to the
|
||||
following paragraph.
|
||||
|
||||
Notwithstanding any other provision of this License, you have
|
||||
permission to link or combine any covered work with a work licensed
|
||||
under version 3 of the GNU General Public License into a single
|
||||
combined work, and to convey the resulting work. The terms of this
|
||||
License will continue to apply to the part which is the covered work,
|
||||
but the work with which it is combined will remain governed by version
|
||||
3 of the GNU General Public License.
|
||||
|
||||
14. Revised Versions of this License.
|
||||
|
||||
The Free Software Foundation may publish revised and/or new versions of
|
||||
the GNU Affero General Public License from time to time. Such new versions
|
||||
will be similar in spirit to the present version, but may differ in detail to
|
||||
address new problems or concerns.
|
||||
|
||||
Each version is given a distinguishing version number. If the
|
||||
Program specifies that a certain numbered version of the GNU Affero General
|
||||
Public License "or any later version" applies to it, you have the
|
||||
option of following the terms and conditions either of that numbered
|
||||
version or of any later version published by the Free Software
|
||||
Foundation. If the Program does not specify a version number of the
|
||||
GNU Affero General Public License, you may choose any version ever published
|
||||
by the Free Software Foundation.
|
||||
|
||||
If the Program specifies that a proxy can decide which future
|
||||
versions of the GNU Affero General Public License can be used, that proxy's
|
||||
public statement of acceptance of a version permanently authorizes you
|
||||
to choose that version for the Program.
|
||||
|
||||
Later license versions may give you additional or different
|
||||
permissions. However, no additional obligations are imposed on any
|
||||
author or copyright holder as a result of your choosing to follow a
|
||||
later version.
|
||||
|
||||
15. Disclaimer of Warranty.
|
||||
|
||||
THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY
|
||||
APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT
|
||||
HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY
|
||||
OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO,
|
||||
THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||||
PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM
|
||||
IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF
|
||||
ALL NECESSARY SERVICING, REPAIR OR CORRECTION.
|
||||
|
||||
16. Limitation of Liability.
|
||||
|
||||
IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
|
||||
WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS
|
||||
THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY
|
||||
GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE
|
||||
USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF
|
||||
DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD
|
||||
PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS),
|
||||
EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF
|
||||
SUCH DAMAGES.
|
||||
|
||||
17. Interpretation of Sections 15 and 16.
|
||||
|
||||
If the disclaimer of warranty and limitation of liability provided
|
||||
above cannot be given local legal effect according to their terms,
|
||||
reviewing courts shall apply local law that most closely approximates
|
||||
an absolute waiver of all civil liability in connection with the
|
||||
Program, unless a warranty or assumption of liability accompanies a
|
||||
copy of the Program in return for a fee.
|
||||
|
||||
END OF TERMS AND CONDITIONS
|
||||
|
||||
How to Apply These Terms to Your New Programs
|
||||
|
||||
If you develop a new program, and you want it to be of the greatest
|
||||
possible use to the public, the best way to achieve this is to make it
|
||||
free software which everyone can redistribute and change under these terms.
|
||||
|
||||
To do so, attach the following notices to the program. It is safest
|
||||
to attach them to the start of each source file to most effectively
|
||||
state the exclusion of warranty; and each file should have at least
|
||||
the "copyright" line and a pointer to where the full notice is found.
|
||||
|
||||
<one line to give the program's name and a brief idea of what it does.>
|
||||
Copyright (C) <year> <name of author>
|
||||
|
||||
This program is free software: you can redistribute it and/or modify
|
||||
it under the terms of the GNU Affero General Public License as published
|
||||
by the Free Software Foundation, either version 3 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU Affero General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU Affero General Public License
|
||||
along with this program. If not, see <https://www.gnu.org/licenses/>.
|
||||
|
||||
Also add information on how to contact you by electronic and paper mail.
|
||||
|
||||
If your software can interact with users remotely through a computer
|
||||
network, you should also make sure that it provides a way for users to
|
||||
get its source. For example, if your program is a web application, its
|
||||
interface could display a "Source" link that leads users to an archive
|
||||
of the code. There are many ways you could offer source, and different
|
||||
solutions will be better for different programs; see section 13 for the
|
||||
specific requirements.
|
||||
|
||||
You should also get your employer (if you work as a programmer) or school,
|
||||
if any, to sign a "copyright disclaimer" for the program, if necessary.
|
||||
For more information on this, and how to apply and follow the GNU AGPL, see
|
||||
<https://www.gnu.org/licenses/>.
|
||||
44
README.md
44
README.md
@@ -1,19 +1,43 @@
|
||||
# pdf_toolbox
|
||||
pdf 解析基础函数
|
||||
|
||||
|
||||
## pdf是否是文字类型/扫描类型的区分
|
||||
# Magic-PDF
|
||||
|
||||
```shell
|
||||
cat s3_pdf_path.example.pdf | parallel --colsep ' ' -j 10 "python pdf_meta_scan.py --s3-pdf-path {2} --s3-profile {1} >> {/}.jsonl"
|
||||
便捷、准确的将PDF转换成Markdown文档
|
||||
|
||||
find dir/to/jsonl/ -type f -name "*.jsonl" | parallel -j 10 "python pdf_classfy_by_type.py --json_file {} >> {/}.jsonl"
|
||||
|
||||
### 上手指南
|
||||
|
||||
###### 开发前的配置要求
|
||||
|
||||
python 3.9+
|
||||
|
||||
###### **安装步骤**
|
||||
|
||||
1.Clone the repo
|
||||
|
||||
```sh
|
||||
git clone https://github.com/myhloli/Magic-PDF.git
|
||||
```
|
||||
|
||||
```shell
|
||||
# 如果单独运行脚本,合并到code-clean之后需要运行,参考如下:
|
||||
python -m pdf_meta_scan --s3-pdf-path "D:\pdf_files\内容排序测试_pdf\p3_图文混排 5.pdf" --s3-profile s2
|
||||
2.Install the requirements
|
||||
|
||||
```sh
|
||||
pip install -r requirements.txt
|
||||
```
|
||||
|
||||
## pdf
|
||||
3.Run the main script
|
||||
|
||||
```sh
|
||||
use demo/demo_test.py
|
||||
```
|
||||
|
||||
### 版权说明
|
||||
|
||||
[LICENSE.md](https://github.com/myhloli/Magic-PDF/blob/master/LICENSE.md)
|
||||
|
||||
### 鸣谢
|
||||
|
||||
- [PyMuPDF](https://github.com/pymupdf/PyMuPDF)
|
||||
|
||||
|
||||
|
||||
|
||||
163
demo/demo_test.py
Normal file
163
demo/demo_test.py
Normal file
@@ -0,0 +1,163 @@
|
||||
import json
|
||||
import os
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
import click
|
||||
|
||||
from magic_pdf.pipeline import (
|
||||
meta_scan,
|
||||
classify_by_type,
|
||||
parse_pdf,
|
||||
pdf_intermediate_dict_to_markdown,
|
||||
save_tables_to_s3,
|
||||
)
|
||||
from magic_pdf.libs.commons import join_path, read_file, json_dump_path
|
||||
from app.common.s3 import get_s3_config
|
||||
from loguru import logger
|
||||
|
||||
|
||||
local_json_path = "Z:/format.json"
|
||||
local_jsonl_path = "Z:/format.jsonl"
|
||||
|
||||
|
||||
def get_json_from_local_or_s3(book_name=None):
|
||||
if book_name is None:
|
||||
with open(local_json_path, "r", encoding="utf-8") as json_file:
|
||||
json_line = json_file.read()
|
||||
json_object = json.loads(json_line)
|
||||
else:
|
||||
# error_log_path & json_dump_path
|
||||
# 可配置从上述两个地址获取源json
|
||||
json_path = join_path(json_dump_path, book_name + ".json")
|
||||
s3_config = get_s3_config(json_path)
|
||||
file_content = read_file(json_path, s3_config)
|
||||
json_str = file_content.decode("utf-8")
|
||||
logger.info(json_str)
|
||||
json_object = json.loads(json_str)
|
||||
return json_object
|
||||
|
||||
|
||||
def write_json_to_local(jso, book_name=None):
|
||||
if book_name is None:
|
||||
with open(local_json_path, "w", encoding="utf-8") as file:
|
||||
file.write(json.dumps(jso, ensure_ascii=False))
|
||||
else:
|
||||
pass
|
||||
|
||||
|
||||
def demo_parse_pdf(book_name=None, start_page_id=0, debug_mode=True):
|
||||
json_object = get_json_from_local_or_s3(book_name)
|
||||
|
||||
jso = parse_pdf(json_object, start_page_id=start_page_id, debug_mode=debug_mode)
|
||||
logger.info(f"pdf_parse_time: {jso['parse_time']}")
|
||||
|
||||
write_json_to_local(jso, book_name)
|
||||
|
||||
jso_md = pdf_intermediate_dict_to_markdown(jso, debug_mode=debug_mode)
|
||||
md_content = jso_md.get("content")
|
||||
if book_name is not None:
|
||||
save_tmp_path = os.path.join(os.path.dirname(__file__), "../..", "tmp", "unittest")
|
||||
markdown_save_path = join_path(save_tmp_path, "md", book_name + ".md")
|
||||
with open(markdown_save_path, "w", encoding="utf-8") as f:
|
||||
f.write(md_content)
|
||||
else:
|
||||
logger.info(md_content)
|
||||
|
||||
|
||||
def demo_save_tables(book_name=None, start_page_id=0, debug_mode=True):
|
||||
json_object = get_json_from_local_or_s3(book_name)
|
||||
|
||||
jso = parse_pdf(json_object, start_page_id=start_page_id, debug_mode=debug_mode)
|
||||
logger.info(f"pdf_parse_time: {jso['parse_time']}")
|
||||
|
||||
write_json_to_local(jso, book_name)
|
||||
|
||||
save_tables_to_s3(jso, debug_mode=debug_mode)
|
||||
|
||||
|
||||
def demo_classify_by_type(book_name=None, debug_mode=True):
|
||||
json_object = get_json_from_local_or_s3(book_name)
|
||||
|
||||
jso = classify_by_type(json_object, debug_mode=debug_mode)
|
||||
|
||||
logger.info(json.dumps(jso, ensure_ascii=False))
|
||||
logger.info(f"classify_time: {jso['classify_time']}")
|
||||
write_json_to_local(jso, book_name)
|
||||
|
||||
|
||||
def demo_meta_scan(book_name=None, debug_mode=True):
|
||||
json_object = get_json_from_local_or_s3(book_name)
|
||||
|
||||
# doc_layout_check=False
|
||||
jso = meta_scan(json_object, doc_layout_check=True)
|
||||
|
||||
logger.info(json.dumps(jso, ensure_ascii=False))
|
||||
logger.info(f"meta_scan_time: {jso['meta_scan_time']}")
|
||||
write_json_to_local(jso, book_name)
|
||||
|
||||
|
||||
def demo_meta_scan_from_jsonl():
|
||||
with open(local_jsonl_path, "r", encoding="utf-8") as jsonl_file:
|
||||
for line in jsonl_file:
|
||||
jso = json.loads(line)
|
||||
jso = meta_scan(jso)
|
||||
logger.info(f"pdf_path: {jso['content']['pdf_path']}")
|
||||
logger.info(f"read_file_time: {jso['read_file_time']}")
|
||||
logger.info(f"meta_scan_time: {jso['meta_scan_time']}")
|
||||
|
||||
|
||||
def demo_test5():
|
||||
with open(local_json_path, "r", encoding="utf-8") as json_file:
|
||||
json_line = json_file.read()
|
||||
jso = json.loads(json_line)
|
||||
img_list_len = len(jso["content"]["image_info_per_page"])
|
||||
logger.info(f"img_list_len: {img_list_len}")
|
||||
|
||||
|
||||
|
||||
def read_more_para_test_samples(type="scihub"):
|
||||
# 读取多段落测试样本
|
||||
curr_dir = Path(__file__).parent
|
||||
files_path = ""
|
||||
if type == "gift":
|
||||
relative_path = "../tests/assets/more_para_test_samples/gift_files.txt"
|
||||
files_path = os.path.join(curr_dir, relative_path)
|
||||
|
||||
if type == "scihub":
|
||||
relative_path = "../tests/assets/more_para_test_samples/scihub_files.txt"
|
||||
files_path = os.path.join(curr_dir, relative_path)
|
||||
|
||||
if type == "zlib":
|
||||
relative_path = "../tests/assets/more_para_test_samples/zlib_files.txt"
|
||||
files_path = os.path.join(curr_dir, relative_path)
|
||||
|
||||
# check if file exists
|
||||
if not os.path.exists(files_path):
|
||||
print("File not exist!")
|
||||
sys.exit(0)
|
||||
|
||||
with open(files_path, "r", encoding="utf-8") as f:
|
||||
lines = f.readlines()
|
||||
# print("lines", lines)
|
||||
|
||||
return lines
|
||||
|
||||
|
||||
def batch_test_more_para(type="scihub"):
|
||||
# 批量测试多段落
|
||||
para_test_files = read_more_para_test_samples(type)
|
||||
for file in para_test_files:
|
||||
file = file.strip()
|
||||
print(file)
|
||||
demo_parse_pdf(book_name=file)
|
||||
|
||||
|
||||
@click.command()
|
||||
@click.option("--book-name", help="s3上pdf文件的路径")
|
||||
def main(book_name: str):
|
||||
demo_parse_pdf(book_name, start_page_id=0)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
@@ -2,7 +2,7 @@ import json
|
||||
import os
|
||||
from tqdm import tqdm
|
||||
|
||||
from magic_pdf.libs import join_path
|
||||
from magic_pdf.libs.commons import join_path
|
||||
|
||||
with open('/mnt/petrelfs/share_data/ouyanglinke/OCR/OCR_validation_dataset.json', 'r') as f:
|
||||
samples = json.load(f)
|
||||
|
||||
@@ -1,20 +0,0 @@
|
||||
from magic_pdf.libs import fitz # PyMuPDF
|
||||
|
||||
# PDF文件路径
|
||||
pdf_path = "D:\\project\\20231108code-clean\\code-clean\\tmp\\unittest\\download-pdfs\\scihub\\scihub_53700000\\libgen.scimag53724000-53724999.zip_10.1097\\00129191-200509000-00018.pdf"
|
||||
|
||||
doc = fitz.open(pdf_path) # Open the PDF
|
||||
# 你的数据
|
||||
data = [[[-2, 0, 603, 80, 24]], [[-3, 0, 602, 80, 24]]]
|
||||
|
||||
# 对每个页面进行处理
|
||||
for i, page in enumerate(doc):
|
||||
# 获取当前页面的数据
|
||||
page_data = data[i]
|
||||
for img in page_data:
|
||||
x0, y0, x1, y1, _ = img
|
||||
rect_coords = fitz.Rect(x0, y0, x1, y1) # Define the rectangle
|
||||
page.draw_rect(rect_coords, color=(1, 0, 0), fill=None, width=1.5, overlay=True) # Draw the rectangle
|
||||
|
||||
# Save the PDF
|
||||
doc.save("D:\\project\\20231108code-clean\\code-clean\\tmp\\unittest\\download-pdfs\\scihub\\scihub_53700000\\libgen.scimag53724000-53724999.zip_10.1097\\00129191-200509000-00018_new.pdf")
|
||||
68
demo/ocr_demo.py
Normal file
68
demo/ocr_demo.py
Normal file
@@ -0,0 +1,68 @@
|
||||
import json
|
||||
import os
|
||||
|
||||
from loguru import logger
|
||||
from pathlib import Path
|
||||
|
||||
from magic_pdf.dict2md.ocr_mkcontent import ocr_mk_nlp_markdown, ocr_mk_mm_markdown
|
||||
from magic_pdf.libs.commons import join_path
|
||||
from magic_pdf.pdf_parse_by_ocr import parse_pdf_by_ocr
|
||||
|
||||
|
||||
def save_markdown(markdown_text, input_filepath):
|
||||
# 获取输入文件的目录
|
||||
directory = os.path.dirname(input_filepath)
|
||||
# 获取输入文件的文件名(不带扩展名)
|
||||
base_name = os.path.basename(input_filepath)
|
||||
file_name_without_ext = os.path.splitext(base_name)[0]
|
||||
# 定义输出文件的路径
|
||||
output_filepath = os.path.join(directory, f"{file_name_without_ext}.md")
|
||||
|
||||
# 将Markdown文本写入.md文件
|
||||
with open(output_filepath, 'w', encoding='utf-8') as file:
|
||||
file.write(markdown_text)
|
||||
|
||||
|
||||
def read_json_file(file_path):
|
||||
with open(file_path, 'r') as f:
|
||||
data = json.load(f)
|
||||
return data
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
ocr_pdf_path = r"D:\project\20231108code-clean\ocr\new\双栏\s0043-1354(02)00581-x.pdf"
|
||||
ocr_json_file_path = r"D:\project\20231108code-clean\ocr\new\双栏\s0043-1354(02)00581-x.json"
|
||||
# ocr_pdf_path = r"D:\project\20231108code-clean\ocr\new\双栏\j.1540-627x.2006.00176.x.pdf"
|
||||
# ocr_json_file_path = r"D:\project\20231108code-clean\ocr\new\双栏\j.1540-627x.2006.00176.x.json"
|
||||
# ocr_pdf_path = r"D:\project\20231108code-clean\ocr\new\demo_4\ocr_demo\ocr_1_org.pdf"
|
||||
# ocr_json_file_path = r"D:\project\20231108code-clean\ocr\new\demo_4\ocr_demo\ocr_1_fix.json"
|
||||
try:
|
||||
ocr_pdf_model_info = read_json_file(ocr_json_file_path)
|
||||
pth = Path(ocr_json_file_path)
|
||||
book_name = pth.name
|
||||
save_tmp_path = os.path.join(os.path.dirname(__file__), "../..", "tmp", "unittest")
|
||||
save_path = join_path(save_tmp_path, "md")
|
||||
save_path_with_bookname = os.path.join(save_path, book_name)
|
||||
text_content_save_path = f"{save_path_with_bookname}/book.md"
|
||||
pdf_info_dict = parse_pdf_by_ocr(
|
||||
ocr_pdf_path,
|
||||
None,
|
||||
ocr_pdf_model_info,
|
||||
save_path,
|
||||
book_name,
|
||||
debug_mode=True)
|
||||
|
||||
parent_dir = os.path.dirname(text_content_save_path)
|
||||
if not os.path.exists(parent_dir):
|
||||
os.makedirs(parent_dir)
|
||||
|
||||
# markdown_content = ocr_mk_nlp_markdown(pdf_info_dict)
|
||||
markdown_content = ocr_mk_mm_markdown(pdf_info_dict)
|
||||
|
||||
with open(text_content_save_path, "w", encoding="utf-8") as f:
|
||||
f.write(markdown_content)
|
||||
|
||||
# logger.info(markdown_content)
|
||||
# save_markdown(markdown_text, ocr_json_file_path)
|
||||
except Exception as e:
|
||||
logger.exception(e)
|
||||
@@ -5,7 +5,7 @@ from pathlib import Path
|
||||
import click
|
||||
from loguru import logger
|
||||
|
||||
from magic_pdf.libs import join_path
|
||||
from magic_pdf.libs.commons import join_path
|
||||
from magic_pdf.dict2md.mkcontent import mk_mm_markdown
|
||||
from magic_pdf.pipeline import parse_pdf_by_model
|
||||
|
||||
|
||||
63
magic_pdf/dict2md/ocr_mkcontent.py
Normal file
63
magic_pdf/dict2md/ocr_mkcontent.py
Normal file
@@ -0,0 +1,63 @@
|
||||
from magic_pdf.libs.markdown_utils import ocr_escape_special_markdown_char
|
||||
from magic_pdf.libs.ocr_content_type import ContentType
|
||||
|
||||
|
||||
def ocr_mk_nlp_markdown(pdf_info_dict: dict):
|
||||
markdown = []
|
||||
|
||||
for _, page_info in pdf_info_dict.items():
|
||||
blocks = page_info.get("preproc_blocks")
|
||||
if not blocks:
|
||||
continue
|
||||
for block in blocks:
|
||||
for line in block['lines']:
|
||||
line_text = ''
|
||||
for span in line['spans']:
|
||||
if not span.get('content'):
|
||||
continue
|
||||
content = ocr_escape_special_markdown_char(span['content']) # 转义特殊符号
|
||||
if span['type'] == ContentType.InlineEquation:
|
||||
content = f"${content}$"
|
||||
elif span['type'] == ContentType.InterlineEquation:
|
||||
content = f"$$\n{content}\n$$"
|
||||
line_text += content + ' '
|
||||
# 在行末添加两个空格以强制换行
|
||||
markdown.append(line_text.strip() + ' ')
|
||||
return '\n'.join(markdown)
|
||||
|
||||
|
||||
def ocr_mk_mm_markdown(pdf_info_dict: dict):
|
||||
|
||||
markdown = []
|
||||
|
||||
for _, page_info in pdf_info_dict.items():
|
||||
blocks = page_info.get("preproc_blocks")
|
||||
if not blocks:
|
||||
continue
|
||||
for block in blocks:
|
||||
for line in block['lines']:
|
||||
line_text = ''
|
||||
for span in line['spans']:
|
||||
if not span.get('content'):
|
||||
if not span.get('image_path'):
|
||||
continue
|
||||
else:
|
||||
content = f""
|
||||
else:
|
||||
content = ocr_escape_special_markdown_char(span['content']) # 转义特殊符号
|
||||
if span['type'] == ContentType.InlineEquation:
|
||||
content = f"${content}$"
|
||||
elif span['type'] == ContentType.InterlineEquation:
|
||||
content = f"$$\n{content}\n$$"
|
||||
line_text += content + ' '
|
||||
# 在行末添加两个空格以强制换行
|
||||
markdown.append(line_text.strip() + ' ')
|
||||
return '\n'.join(markdown)
|
||||
|
||||
def ocr_mk_mm_standard_format():
|
||||
'''
|
||||
content_list
|
||||
type string image/text/table/equation(行间的单独拿出来,行内的和text合并)
|
||||
|
||||
'''
|
||||
pass
|
||||
@@ -119,6 +119,20 @@ def _is_left_overlap(box1, box2,):
|
||||
return x0_1<=x0_2<=x1_1 and vertical_overlap_cond
|
||||
|
||||
|
||||
def __is_overlaps_y_exceeds_threshold(bbox1, bbox2, overlap_ratio_threshold=0.8):
|
||||
"""检查两个bbox在y轴上是否有重叠,并且该重叠区域的高度占两个bbox高度更低的那个超过80%"""
|
||||
_, y0_1, _, y1_1 = bbox1
|
||||
_, y0_2, _, y1_2 = bbox2
|
||||
|
||||
overlap = max(0, min(y1_1, y1_2) - max(y0_1, y0_2))
|
||||
height1, height2 = y1_1 - y0_1, y1_2 - y0_2
|
||||
max_height = max(height1, height2)
|
||||
min_height = min(height1, height2)
|
||||
|
||||
return (overlap / min_height) > overlap_ratio_threshold
|
||||
|
||||
|
||||
|
||||
def calculate_iou(bbox1, bbox2):
|
||||
# Determine the coordinates of the intersection rectangle
|
||||
x_left = max(bbox1[0], bbox2[0])
|
||||
@@ -163,7 +177,47 @@ def calculate_overlap_area_2_minbox_area_ratio(bbox1, bbox2):
|
||||
else:
|
||||
return intersection_area / min_box_area
|
||||
|
||||
|
||||
def calculate_overlap_area_in_bbox1_area_ratio(bbox1, bbox2):
|
||||
"""
|
||||
计算box1和box2的重叠面积占bbox1的比例
|
||||
"""
|
||||
# Determine the coordinates of the intersection rectangle
|
||||
x_left = max(bbox1[0], bbox2[0])
|
||||
y_top = max(bbox1[1], bbox2[1])
|
||||
x_right = min(bbox1[2], bbox2[2])
|
||||
y_bottom = min(bbox1[3], bbox2[3])
|
||||
|
||||
if x_right < x_left or y_bottom < y_top:
|
||||
return 0.0
|
||||
|
||||
# The area of overlap area
|
||||
intersection_area = (x_right - x_left) * (y_bottom - y_top)
|
||||
bbox1_area = (bbox1[2]-bbox1[0])*(bbox1[3]-bbox1[1])
|
||||
if bbox1_area == 0:
|
||||
return 0
|
||||
else:
|
||||
return intersection_area / bbox1_area
|
||||
|
||||
|
||||
def get_minbox_if_overlap_by_ratio(bbox1, bbox2, ratio):
|
||||
"""
|
||||
通过calculate_overlap_area_2_minbox_area_ratio计算两个bbox重叠的面积占最小面积的box的比例
|
||||
如果比例大于ratio,则返回小的那个bbox,
|
||||
否则返回None
|
||||
"""
|
||||
x1_min, y1_min, x1_max, y1_max = bbox1
|
||||
x2_min, y2_min, x2_max, y2_max = bbox2
|
||||
area1 = (x1_max - x1_min) * (y1_max - y1_min)
|
||||
area2 = (x2_max - x2_min) * (y2_max - y2_min)
|
||||
overlap_ratio = calculate_overlap_area_2_minbox_area_ratio(bbox1, bbox2)
|
||||
if overlap_ratio > ratio:
|
||||
if area1 <= area2:
|
||||
return bbox1
|
||||
else:
|
||||
return bbox2
|
||||
else:
|
||||
return None
|
||||
|
||||
def get_bbox_in_boundry(bboxes:list, boundry:tuple)-> list:
|
||||
x0, y0, x1, y1 = boundry
|
||||
new_boxes = [box for box in bboxes if box[0] >= x0 and box[1] >= y0 and box[2] <= x1 and box[3] <= y1]
|
||||
|
||||
@@ -1,4 +1,5 @@
|
||||
import datetime
|
||||
import json
|
||||
import os, re, configparser
|
||||
import time
|
||||
|
||||
@@ -115,6 +116,34 @@ def read_file(pdf_path: str, s3_profile):
|
||||
with open(pdf_path, "rb") as f:
|
||||
return f.read()
|
||||
|
||||
|
||||
def get_docx_model_output(pdf_model_output, pdf_model_s3_profile, page_id):
|
||||
if isinstance(pdf_model_output, str):
|
||||
model_output_json_path = join_path(pdf_model_output, f"page_{page_id + 1}.json") # 模型输出的页面编号从1开始的
|
||||
if os.path.exists(model_output_json_path):
|
||||
json_from_docx = read_file(model_output_json_path, pdf_model_s3_profile)
|
||||
model_output_json = json.loads(json_from_docx)
|
||||
else:
|
||||
try:
|
||||
model_output_json_path = join_path(pdf_model_output, "model.json")
|
||||
with open(model_output_json_path, "r", encoding="utf-8") as f:
|
||||
model_output_json = json.load(f)
|
||||
model_output_json = model_output_json["doc_layout_result"][page_id]
|
||||
except:
|
||||
s3_model_output_json_path = join_path(pdf_model_output, f"page_{page_id + 1}.json")
|
||||
s3_model_output_json_path = join_path(pdf_model_output, f"{page_id}.json")
|
||||
#s3_model_output_json_path = join_path(pdf_model_output, f"page_{page_id }.json")
|
||||
# logger.warning(f"model_output_json_path: {model_output_json_path} not found. try to load from s3: {s3_model_output_json_path}")
|
||||
|
||||
s = read_file(s3_model_output_json_path, pdf_model_s3_profile)
|
||||
return json.loads(s)
|
||||
|
||||
elif isinstance(pdf_model_output, list):
|
||||
model_output_json = pdf_model_output[page_id]
|
||||
|
||||
return model_output_json
|
||||
|
||||
|
||||
def list_dir(dir_path:str, s3_profile:str):
|
||||
"""
|
||||
列出dir_path下的所有文件
|
||||
|
||||
9
magic_pdf/libs/coordinate_transform.py
Normal file
9
magic_pdf/libs/coordinate_transform.py
Normal file
@@ -0,0 +1,9 @@
|
||||
def get_scale_ratio(ocr_page_info, page):
|
||||
pix = page.get_pixmap(dpi=72)
|
||||
pymu_width = int(pix.w)
|
||||
pymu_height = int(pix.h)
|
||||
width_from_json = ocr_page_info['page_info']['width']
|
||||
height_from_json = ocr_page_info['page_info']['height']
|
||||
horizontal_scale_ratio = width_from_json / pymu_width
|
||||
vertical_scale_ratio = height_from_json / pymu_height
|
||||
return horizontal_scale_ratio, vertical_scale_ratio
|
||||
80
magic_pdf/libs/draw_bbox.py
Normal file
80
magic_pdf/libs/draw_bbox.py
Normal file
@@ -0,0 +1,80 @@
|
||||
from magic_pdf.libs.commons import fitz # PyMuPDF
|
||||
from magic_pdf.libs.ocr_content_type import ContentType
|
||||
|
||||
|
||||
def draw_bbox_without_number(i, bbox_list, page, rgb_config):
|
||||
new_rgb = []
|
||||
for item in rgb_config:
|
||||
item = float(item) / 255
|
||||
new_rgb.append(item)
|
||||
page_data = bbox_list[i]
|
||||
for bbox in page_data:
|
||||
x0, y0, x1, y1 = bbox
|
||||
rect_coords = fitz.Rect(x0, y0, x1, y1) # Define the rectangle
|
||||
page.draw_rect(rect_coords, color=new_rgb, fill=None, width=0.5, overlay=True) # Draw the rectangle
|
||||
|
||||
|
||||
def draw_bbox_with_number(i, bbox_list, page, rgb_config):
|
||||
new_rgb = []
|
||||
for item in rgb_config:
|
||||
item = float(item) / 255
|
||||
new_rgb.append(item)
|
||||
page_data = bbox_list[i]
|
||||
for j, bbox in enumerate(page_data):
|
||||
x0, y0, x1, y1 = bbox
|
||||
rect_coords = fitz.Rect(x0, y0, x1, y1) # Define the rectangle
|
||||
page.draw_rect(rect_coords, color=new_rgb, fill=None, width=0.5, overlay=True) # Draw the rectangle
|
||||
page.insert_text((x0, y0), str(j + 1), fontsize=10, color=new_rgb) # Insert the index at the top left corner of the rectangle
|
||||
|
||||
|
||||
def draw_layout_bbox(pdf_info_dict, input_path, out_path):
|
||||
layout_bbox_list = []
|
||||
dropped_bbox_list = []
|
||||
for page in pdf_info_dict.values():
|
||||
page_layout_list = []
|
||||
page_dropped_list = []
|
||||
for layout in page['layout_bboxes']:
|
||||
page_layout_list.append(layout['layout_bbox'])
|
||||
layout_bbox_list.append(page_layout_list)
|
||||
for drop_tag, dropped_bboxes in page['droped_bboxes'].items():
|
||||
for dropped_bbox in dropped_bboxes:
|
||||
page_dropped_list.append(dropped_bbox)
|
||||
dropped_bbox_list.append(page_dropped_list)
|
||||
|
||||
doc = fitz.open(input_path)
|
||||
for i, page in enumerate(doc):
|
||||
draw_bbox_with_number(i, layout_bbox_list, page, [255, 0, 0])
|
||||
draw_bbox_without_number(i, dropped_bbox_list, page, [0, 255, 0])
|
||||
# Save the PDF
|
||||
doc.save(f"{out_path}/layout.pdf")
|
||||
|
||||
def draw_text_bbox(pdf_info_dict, input_path, out_path):
|
||||
text_list = []
|
||||
inline_equation_list = []
|
||||
interline_equation_list = []
|
||||
for page in pdf_info_dict.values():
|
||||
page_text_list = []
|
||||
page_inline_equation_list = []
|
||||
page_interline_equation_list = []
|
||||
for block in page['preproc_blocks']:
|
||||
for line in block['lines']:
|
||||
for span in line['spans']:
|
||||
if span['type'] == ContentType.Text:
|
||||
page_text_list.append(span['bbox'])
|
||||
elif span['type'] == ContentType.InlineEquation:
|
||||
page_inline_equation_list.append(span['bbox'])
|
||||
elif span['type'] == ContentType.InterlineEquation:
|
||||
page_interline_equation_list.append(span['bbox'])
|
||||
text_list.append(page_text_list)
|
||||
inline_equation_list.append(page_inline_equation_list)
|
||||
interline_equation_list.append(page_interline_equation_list)
|
||||
|
||||
doc = fitz.open(input_path)
|
||||
for i, page in enumerate(doc):
|
||||
# 获取当前页面的数据
|
||||
draw_bbox_without_number(i, text_list, page, [255, 0, 0])
|
||||
draw_bbox_without_number(i, inline_equation_list, page, [0, 255, 0])
|
||||
draw_bbox_without_number(i, interline_equation_list, page, [0, 0, 255])
|
||||
|
||||
# Save the PDF
|
||||
doc.save(f"{out_path}/text.pdf")
|
||||
@@ -18,3 +18,14 @@ def escape_special_markdown_char(pymu_blocks):
|
||||
span['text'] = span['text'].replace(char, "\\" + char)
|
||||
|
||||
return pymu_blocks
|
||||
|
||||
|
||||
def ocr_escape_special_markdown_char(content):
|
||||
"""
|
||||
转义正文里对markdown语法有特殊意义的字符
|
||||
"""
|
||||
special_chars = ["*", "`", "~", "$"]
|
||||
for char in special_chars:
|
||||
content = content.replace(char, "\\" + char)
|
||||
|
||||
return content
|
||||
|
||||
7
magic_pdf/libs/ocr_content_type.py
Normal file
7
magic_pdf/libs/ocr_content_type.py
Normal file
@@ -0,0 +1,7 @@
|
||||
class ContentType:
|
||||
Image = "image"
|
||||
Table = "table"
|
||||
Text = "text"
|
||||
InlineEquation = "inline_equation"
|
||||
InterlineEquation = "interline_equation"
|
||||
|
||||
@@ -2,7 +2,7 @@ import time
|
||||
|
||||
# from anyio import Path
|
||||
|
||||
from magic_pdf.libs.commons import fitz, get_delta_time, get_img_s3_client
|
||||
from magic_pdf.libs.commons import fitz, get_delta_time, get_img_s3_client, get_docx_model_output
|
||||
import json
|
||||
import os
|
||||
import math
|
||||
@@ -59,6 +59,7 @@ from magic_pdf.post_proc.pdf_post_filter import pdf_post_filter
|
||||
from magic_pdf.pre_proc.remove_rotate_bbox import get_side_boundry, remove_rotate_side_textblock, remove_side_blank_block
|
||||
from magic_pdf.pre_proc.resolve_bbox_conflict import check_text_block_horizontal_overlap, resolve_bbox_overlap_conflict
|
||||
from magic_pdf.pre_proc.fix_table import fix_table_text_block, fix_tables, include_table_title
|
||||
from magic_pdf.pre_proc.solve_line_alien import solve_inline_too_large_interval
|
||||
|
||||
denseSingleLineBlockException_msg = DenseSingleLineBlockException().message
|
||||
titleDetectionException_msg = TitleDetectionException().message
|
||||
@@ -67,31 +68,6 @@ paraSplitException_msg = ParaSplitException().message
|
||||
paraMergeException_msg = ParaMergeException().message
|
||||
|
||||
|
||||
def get_docx_model_output(pdf_model_output, pdf_model_s3_profile, page_id):
|
||||
if isinstance(pdf_model_output, str):
|
||||
model_output_json_path = join_path(pdf_model_output, f"page_{page_id + 1}.json") # 模型输出的页面编号从1开始的
|
||||
if os.path.exists(model_output_json_path):
|
||||
json_from_docx = read_file(model_output_json_path, pdf_model_s3_profile)
|
||||
model_output_json = json.loads(json_from_docx)
|
||||
else:
|
||||
try:
|
||||
model_output_json_path = join_path(pdf_model_output, "model.json")
|
||||
with open(model_output_json_path, "r", encoding="utf-8") as f:
|
||||
model_output_json = json.load(f)
|
||||
model_output_json = model_output_json["doc_layout_result"][page_id]
|
||||
except:
|
||||
s3_model_output_json_path = join_path(pdf_model_output, f"page_{page_id + 1}.json")
|
||||
s3_model_output_json_path = join_path(pdf_model_output, f"{page_id}.json")
|
||||
#s3_model_output_json_path = join_path(pdf_model_output, f"page_{page_id }.json")
|
||||
# logger.warning(f"model_output_json_path: {model_output_json_path} not found. try to load from s3: {s3_model_output_json_path}")
|
||||
|
||||
s = read_file(s3_model_output_json_path, pdf_model_s3_profile)
|
||||
return json.loads(s)
|
||||
|
||||
elif isinstance(pdf_model_output, list):
|
||||
model_output_json = pdf_model_output[page_id]
|
||||
|
||||
return model_output_json
|
||||
|
||||
|
||||
def parse_pdf_by_model(
|
||||
@@ -446,6 +422,10 @@ def parse_pdf_by_model(
|
||||
==================================================================================================================================
|
||||
进入段落处理-2阶段
|
||||
"""
|
||||
|
||||
# 处理行内文字间距较大问题
|
||||
pdf_info_dict = solve_inline_too_large_interval(pdf_info_dict)
|
||||
|
||||
start_time = time.time()
|
||||
|
||||
para_process_pipeline = ParaProcessPipeline()
|
||||
|
||||
261
magic_pdf/pdf_parse_by_ocr.py
Normal file
261
magic_pdf/pdf_parse_by_ocr.py
Normal file
@@ -0,0 +1,261 @@
|
||||
import json
|
||||
import os
|
||||
import time
|
||||
|
||||
from loguru import logger
|
||||
|
||||
from magic_pdf.libs.draw_bbox import draw_layout_bbox, draw_text_bbox
|
||||
from magic_pdf.libs.commons import (
|
||||
read_file,
|
||||
join_path,
|
||||
fitz,
|
||||
get_img_s3_client,
|
||||
get_delta_time,
|
||||
get_docx_model_output,
|
||||
)
|
||||
from magic_pdf.libs.coordinate_transform import get_scale_ratio
|
||||
from magic_pdf.libs.ocr_content_type import ContentType
|
||||
from magic_pdf.libs.safe_filename import sanitize_filename
|
||||
from magic_pdf.pre_proc.detect_footer_by_model import parse_footers
|
||||
from magic_pdf.pre_proc.detect_footnote import parse_footnotes_by_model
|
||||
from magic_pdf.pre_proc.detect_header import parse_headers
|
||||
from magic_pdf.pre_proc.detect_page_number import parse_pageNos
|
||||
from magic_pdf.pre_proc.ocr_cut_image import cut_image_and_table
|
||||
from magic_pdf.pre_proc.ocr_detect_layout import layout_detect
|
||||
from magic_pdf.pre_proc.ocr_dict_merge import (
|
||||
merge_spans_to_line_by_layout, merge_lines_to_block,
|
||||
)
|
||||
from magic_pdf.pre_proc.ocr_span_list_modify import remove_spans_by_bboxes, remove_overlaps_min_spans, \
|
||||
adjust_bbox_for_standalone_block, modify_y_axis, modify_inline_equation, get_qa_need_list, \
|
||||
remove_spans_by_bboxes_dict
|
||||
from magic_pdf.pre_proc.remove_bbox_overlap import remove_overlap_between_bbox
|
||||
|
||||
|
||||
def construct_page_component(blocks, layout_bboxes, page_id, page_w, page_h, layout_tree,
|
||||
images, tables, interline_equations, inline_equations,
|
||||
dropped_text_block, dropped_image_block, dropped_table_block,
|
||||
need_remove_spans_bboxes_dict):
|
||||
return_dict = {
|
||||
'preproc_blocks': blocks,
|
||||
'layout_bboxes': layout_bboxes,
|
||||
'page_idx': page_id,
|
||||
'page_size': [page_w, page_h],
|
||||
'_layout_tree': layout_tree,
|
||||
'images': images,
|
||||
'tables': tables,
|
||||
'interline_equations': interline_equations,
|
||||
'inline_equations': inline_equations,
|
||||
'droped_text_block': dropped_text_block,
|
||||
'droped_image_block': dropped_image_block,
|
||||
'droped_table_block': dropped_table_block,
|
||||
'droped_bboxes': need_remove_spans_bboxes_dict,
|
||||
}
|
||||
return return_dict
|
||||
|
||||
|
||||
def parse_pdf_by_ocr(
|
||||
pdf_path,
|
||||
s3_pdf_profile,
|
||||
pdf_model_output,
|
||||
save_path,
|
||||
book_name,
|
||||
pdf_model_profile=None,
|
||||
image_s3_config=None,
|
||||
start_page_id=0,
|
||||
end_page_id=None,
|
||||
debug_mode=False,
|
||||
):
|
||||
pdf_bytes = read_file(pdf_path, s3_pdf_profile)
|
||||
save_tmp_path = os.path.join(os.path.dirname(__file__), "../..", "tmp", "unittest")
|
||||
book_name = sanitize_filename(book_name)
|
||||
md_bookname_save_path = ""
|
||||
if debug_mode:
|
||||
save_path = join_path(save_tmp_path, "md")
|
||||
pdf_local_path = join_path(save_tmp_path, "download-pdfs", book_name)
|
||||
|
||||
if not os.path.exists(os.path.dirname(pdf_local_path)):
|
||||
# 如果目录不存在,创建它
|
||||
os.makedirs(os.path.dirname(pdf_local_path))
|
||||
|
||||
md_bookname_save_path = join_path(save_tmp_path, "md", book_name)
|
||||
if not os.path.exists(md_bookname_save_path):
|
||||
# 如果目录不存在,创建它
|
||||
os.makedirs(md_bookname_save_path)
|
||||
|
||||
with open(pdf_local_path + ".pdf", "wb") as pdf_file:
|
||||
pdf_file.write(pdf_bytes)
|
||||
|
||||
pdf_docs = fitz.open("pdf", pdf_bytes)
|
||||
# 初始化空的pdf_info_dict
|
||||
pdf_info_dict = {}
|
||||
img_s3_client = get_img_s3_client(save_path, image_s3_config)
|
||||
|
||||
start_time = time.time()
|
||||
|
||||
|
||||
end_page_id = end_page_id if end_page_id else len(pdf_docs) - 1
|
||||
for page_id in range(start_page_id, end_page_id + 1):
|
||||
|
||||
# 获取当前页的page对象
|
||||
page = pdf_docs[page_id]
|
||||
# 获取当前页的宽高
|
||||
page_w = page.rect.width
|
||||
page_h = page.rect.height
|
||||
|
||||
if debug_mode:
|
||||
time_now = time.time()
|
||||
logger.info(
|
||||
f"page_id: {page_id}, last_page_cost_time: {get_delta_time(start_time)}"
|
||||
)
|
||||
start_time = time_now
|
||||
|
||||
# 获取当前页的模型数据
|
||||
ocr_page_info = get_docx_model_output(
|
||||
pdf_model_output, pdf_model_profile, page_id
|
||||
)
|
||||
|
||||
"""从json中获取每页的页码、页眉、页脚的bbox"""
|
||||
page_no_bboxes = parse_pageNos(page_id, page, ocr_page_info)
|
||||
header_bboxes = parse_headers(page_id, page, ocr_page_info)
|
||||
footer_bboxes = parse_footers(page_id, page, ocr_page_info)
|
||||
footnote_bboxes = parse_footnotes_by_model(
|
||||
page_id, page, ocr_page_info, md_bookname_save_path, debug_mode=debug_mode
|
||||
)
|
||||
|
||||
# 构建需要remove的bbox列表
|
||||
# need_remove_spans_bboxes = []
|
||||
# need_remove_spans_bboxes.extend(page_no_bboxes)
|
||||
# need_remove_spans_bboxes.extend(header_bboxes)
|
||||
# need_remove_spans_bboxes.extend(footer_bboxes)
|
||||
# need_remove_spans_bboxes.extend(footnote_bboxes)
|
||||
|
||||
# 构建需要remove的bbox字典
|
||||
need_remove_spans_bboxes_dict = {
|
||||
"page_no": page_no_bboxes,
|
||||
"header": header_bboxes,
|
||||
"footer": footer_bboxes,
|
||||
"footnote": footnote_bboxes,
|
||||
}
|
||||
|
||||
layout_dets = ocr_page_info["layout_dets"]
|
||||
spans = []
|
||||
|
||||
# 计算模型坐标和pymu坐标的缩放比例
|
||||
horizontal_scale_ratio, vertical_scale_ratio = get_scale_ratio(
|
||||
ocr_page_info, page
|
||||
)
|
||||
|
||||
for layout_det in layout_dets:
|
||||
category_id = layout_det["category_id"]
|
||||
allow_category_id_list = [1, 7, 13, 14, 15]
|
||||
if category_id in allow_category_id_list:
|
||||
x0, y0, _, _, x1, y1, _, _ = layout_det["poly"]
|
||||
bbox = [
|
||||
int(x0 / horizontal_scale_ratio),
|
||||
int(y0 / vertical_scale_ratio),
|
||||
int(x1 / horizontal_scale_ratio),
|
||||
int(y1 / vertical_scale_ratio),
|
||||
]
|
||||
# 删除高度或者宽度为0的spans
|
||||
if bbox[2] - bbox[0] == 0 or bbox[3] - bbox[1] == 0:
|
||||
continue
|
||||
"""要删除的"""
|
||||
# 3: 'header', # 页眉
|
||||
# 4: 'page number', # 页码
|
||||
# 5: 'footnote', # 脚注
|
||||
# 6: 'footer', # 页脚
|
||||
"""当成span拼接的"""
|
||||
# 1: 'image', # 图片
|
||||
# 7: 'table', # 表格
|
||||
# 13: 'inline_equation', # 行内公式
|
||||
# 14: 'interline_equation', # 行间公式
|
||||
# 15: 'text', # ocr识别文本
|
||||
"""layout信息"""
|
||||
# 11: 'full column', # 单栏
|
||||
# 12: 'sub column', # 多栏
|
||||
span = {
|
||||
"bbox": bbox,
|
||||
}
|
||||
if category_id == 1:
|
||||
span["type"] = ContentType.Image
|
||||
|
||||
elif category_id == 7:
|
||||
span["type"] = ContentType.Table
|
||||
|
||||
elif category_id == 13:
|
||||
span["content"] = layout_det["latex"]
|
||||
span["type"] = ContentType.InlineEquation
|
||||
elif category_id == 14:
|
||||
span["content"] = layout_det["latex"]
|
||||
span["type"] = ContentType.InterlineEquation
|
||||
elif category_id == 15:
|
||||
span["content"] = layout_det["text"]
|
||||
span["type"] = ContentType.Text
|
||||
# print(span)
|
||||
spans.append(span)
|
||||
else:
|
||||
continue
|
||||
|
||||
|
||||
|
||||
|
||||
# 删除重叠spans中较小的那些
|
||||
spans = remove_overlaps_min_spans(spans)
|
||||
|
||||
# 删除remove_span_block_bboxes中的bbox
|
||||
# spans = remove_spans_by_bboxes(spans, need_remove_spans_bboxes)
|
||||
# 按qa要求,增加drop相关数据
|
||||
spans, dropped_text_block, dropped_image_block, dropped_table_block = remove_spans_by_bboxes_dict(spans, need_remove_spans_bboxes_dict)
|
||||
|
||||
# 对image和table截图
|
||||
spans = cut_image_and_table(spans, page, page_id, book_name, save_path, img_s3_client)
|
||||
|
||||
# 行内公式调整, 高度调整至与同行文字高度一致(优先左侧, 其次右侧)
|
||||
displayed_list = []
|
||||
text_inline_lines = []
|
||||
modify_y_axis(spans, displayed_list, text_inline_lines)
|
||||
# 模型识别错误的行间公式, type类型转换成行内公式
|
||||
spans = modify_inline_equation(spans, displayed_list, text_inline_lines)
|
||||
|
||||
# bbox去除粘连
|
||||
spans = remove_overlap_between_bbox(spans)
|
||||
|
||||
# 对tpye=["interline_equation", "image", "table"]进行额外处理,如果左边有字的话,将该span的bbox中y0调整至不高于文字的y0
|
||||
spans = adjust_bbox_for_standalone_block(spans)
|
||||
|
||||
|
||||
# 从ocr_page_info中解析layout信息(按自然阅读方向排序,并修复重叠和交错的bad case)
|
||||
layout_bboxes, layout_tree = layout_detect(ocr_page_info['subfield_dets'], page, ocr_page_info)
|
||||
|
||||
# 将spans合并成line(在layout内,从上到下,从左到右)
|
||||
lines = merge_spans_to_line_by_layout(spans, layout_bboxes)
|
||||
|
||||
# 将lines合并成block
|
||||
blocks = merge_lines_to_block(lines)
|
||||
|
||||
# 根据block合并段落
|
||||
|
||||
|
||||
# 获取QA需要外置的list
|
||||
images, tables, interline_equations, inline_equations = get_qa_need_list(blocks)
|
||||
|
||||
# 构造pdf_info_dict
|
||||
page_info = construct_page_component(blocks, layout_bboxes, page_id, page_w, page_h, layout_tree,
|
||||
images, tables, interline_equations, inline_equations,
|
||||
dropped_text_block, dropped_image_block, dropped_table_block,
|
||||
need_remove_spans_bboxes_dict)
|
||||
pdf_info_dict[f"page_{page_id}"] = page_info
|
||||
|
||||
# 在测试时,保存调试信息
|
||||
if debug_mode:
|
||||
params_file_save_path = join_path(
|
||||
save_tmp_path, "md", book_name, "preproc_out.json"
|
||||
)
|
||||
with open(params_file_save_path, "w", encoding="utf-8") as f:
|
||||
json.dump(pdf_info_dict, f, ensure_ascii=False, indent=4)
|
||||
|
||||
# drow_bbox
|
||||
draw_layout_bbox(pdf_info_dict, pdf_path, md_bookname_save_path)
|
||||
draw_text_bbox(pdf_info_dict, pdf_path, md_bookname_save_path)
|
||||
|
||||
return pdf_info_dict
|
||||
@@ -3,6 +3,7 @@ import sys
|
||||
import time
|
||||
from urllib.parse import quote
|
||||
|
||||
from magic_pdf.dict2md.ocr_mkcontent import ocr_mk_nlp_markdown, ocr_mk_mm_markdown
|
||||
from magic_pdf.libs.commons import read_file, join_path, parse_bucket_key, formatted_time
|
||||
from magic_pdf.libs.drop_reason import DropReason
|
||||
from magic_pdf.libs.json_compressor import JsonCompressor
|
||||
@@ -13,6 +14,7 @@ from magic_pdf.filter.pdf_meta_scan import pdf_meta_scan
|
||||
from loguru import logger
|
||||
|
||||
from app.common.s3 import get_s3_config, get_s3_client
|
||||
from magic_pdf.pdf_parse_by_ocr import parse_pdf_by_ocr
|
||||
|
||||
|
||||
def exception_handler(jso: dict, e):
|
||||
@@ -23,6 +25,27 @@ def exception_handler(jso: dict, e):
|
||||
return jso
|
||||
|
||||
|
||||
def get_data_type(jso: dict):
|
||||
data_type = jso.get('data_type')
|
||||
if data_type is None:
|
||||
data_type = jso.get('file_type')
|
||||
return data_type
|
||||
|
||||
|
||||
def get_bookid(jso: dict):
|
||||
book_id = jso.get('bookid')
|
||||
if book_id is None:
|
||||
book_id = jso.get('original_file_id')
|
||||
return book_id
|
||||
|
||||
|
||||
def get_data_source(jso: dict):
|
||||
data_source = jso.get('data_source')
|
||||
if data_source is None:
|
||||
data_source = jso.get('file_source')
|
||||
return data_source
|
||||
|
||||
|
||||
def meta_scan(jso: dict, doc_layout_check=True) -> dict:
|
||||
s3_pdf_path = jso.get('file_location')
|
||||
s3_config = get_s3_config(s3_pdf_path)
|
||||
@@ -32,7 +55,7 @@ def meta_scan(jso: dict, doc_layout_check=True) -> dict:
|
||||
jso['drop_reason'] = DropReason.MISS_DOC_LAYOUT_RESULT
|
||||
return jso
|
||||
try:
|
||||
data_source = jso.get('data_source')
|
||||
data_source = get_data_source(jso)
|
||||
file_id = jso.get('file_id')
|
||||
book_name = data_source + "/" + file_id
|
||||
|
||||
@@ -78,7 +101,7 @@ def classify_by_type(jso: dict, debug_mode=False) -> dict:
|
||||
# 开始正式逻辑
|
||||
try:
|
||||
pdf_meta = jso.get('pdf_meta')
|
||||
data_source = jso.get('data_source')
|
||||
data_source = get_data_source(jso)
|
||||
file_id = jso.get('file_id')
|
||||
book_name = data_source + "/" + file_id
|
||||
total_page = pdf_meta["total_page"]
|
||||
@@ -140,11 +163,11 @@ def save_tables_to_s3(jso: dict, debug_mode=False) -> dict:
|
||||
pass
|
||||
else:# 如果debug没开,则检测是否有needdrop字段
|
||||
if jso.get('need_drop', False):
|
||||
logger.info(f"book_name is:{jso['data_source']}/{jso['file_id']} need drop", file=sys.stderr)
|
||||
logger.info(f"book_name is:{get_data_source(jso)}/{jso['file_id']} need drop", file=sys.stderr)
|
||||
jso["dropped"] = True
|
||||
return jso
|
||||
try:
|
||||
data_source = jso.get('data_source')
|
||||
data_source = get_data_source(jso)
|
||||
file_id = jso.get('file_id')
|
||||
book_name = data_source + "/" + file_id
|
||||
title = jso.get('title')
|
||||
@@ -195,7 +218,7 @@ def save_tables_to_s3(jso: dict, debug_mode=False) -> dict:
|
||||
|
||||
def drop_needdrop_pdf(jso: dict) -> dict:
|
||||
if jso.get('need_drop', False):
|
||||
logger.info(f"book_name is:{jso['data_source']}/{jso['file_id']} need drop", file=sys.stderr)
|
||||
logger.info(f"book_name is:{get_data_source(jso)}/{jso['file_id']} need drop", file=sys.stderr)
|
||||
jso["dropped"] = True
|
||||
return jso
|
||||
|
||||
@@ -206,7 +229,7 @@ def pdf_intermediate_dict_to_markdown(jso: dict, debug_mode=False) -> dict:
|
||||
pass
|
||||
else:# 如果debug没开,则检测是否有needdrop字段
|
||||
if jso.get('need_drop', False):
|
||||
book_name = join_path(jso['data_source'], jso['file_id'])
|
||||
book_name = join_path(get_data_source(jso), jso['file_id'])
|
||||
logger.info(f"book_name is:{book_name} need drop", file=sys.stderr)
|
||||
jso["dropped"] = True
|
||||
return jso
|
||||
@@ -216,7 +239,7 @@ def pdf_intermediate_dict_to_markdown(jso: dict, debug_mode=False) -> dict:
|
||||
pdf_intermediate_dict = JsonCompressor.decompress_json(pdf_intermediate_dict)
|
||||
markdown_content = mk_nlp_markdown(pdf_intermediate_dict)
|
||||
jso["content"] = markdown_content
|
||||
logger.info(f"book_name is:{jso['data_source']}/{jso['file_id']},markdown content length is {len(markdown_content)}", file=sys.stderr)
|
||||
logger.info(f"book_name is:{get_data_source(jso)}/{jso['file_id']},markdown content length is {len(markdown_content)}", file=sys.stderr)
|
||||
# 把无用的信息清空
|
||||
jso["doc_layout_result"] = ""
|
||||
jso["pdf_intermediate_dict"] = ""
|
||||
@@ -237,7 +260,7 @@ def parse_pdf(jso: dict, start_page_id=0, debug_mode=False) -> dict:
|
||||
s3_pdf_path = jso.get('file_location')
|
||||
s3_config = get_s3_config(s3_pdf_path)
|
||||
model_output_json_list = jso.get('doc_layout_result')
|
||||
data_source = jso.get('data_source')
|
||||
data_source = get_data_source(jso)
|
||||
file_id = jso.get('file_id')
|
||||
book_name = data_source + "/" + file_id
|
||||
|
||||
@@ -290,5 +313,79 @@ def parse_pdf(jso: dict, start_page_id=0, debug_mode=False) -> dict:
|
||||
return jso
|
||||
|
||||
|
||||
def ocr_parse_pdf(jso: dict, start_page_id=0, debug_mode=False) -> dict:
|
||||
# 检测debug开关
|
||||
if debug_mode:
|
||||
pass
|
||||
else: # 如果debug没开,则检测是否有needdrop字段
|
||||
if jso.get('need_drop', False):
|
||||
return jso
|
||||
|
||||
s3_pdf_path = jso.get('file_location')
|
||||
s3_config = get_s3_config(s3_pdf_path)
|
||||
model_output_json_list = jso.get('doc_layout_result')
|
||||
data_source = get_data_source(jso)
|
||||
file_id = jso.get('file_id')
|
||||
book_name = data_source + "/" + file_id
|
||||
try:
|
||||
save_path = "s3://mllm-raw-media/pdf2md_img/"
|
||||
image_s3_config = get_s3_config(save_path)
|
||||
start_time = time.time() # 记录开始时间
|
||||
# 先打印一下book_name和解析开始的时间
|
||||
logger.info(f"book_name is:{book_name},start_time is:{formatted_time(start_time)}", file=sys.stderr)
|
||||
pdf_info_dict = parse_pdf_by_ocr(
|
||||
s3_pdf_path,
|
||||
s3_config,
|
||||
model_output_json_list,
|
||||
save_path,
|
||||
book_name,
|
||||
pdf_model_profile=None,
|
||||
image_s3_config=image_s3_config,
|
||||
start_page_id=start_page_id,
|
||||
debug_mode=debug_mode
|
||||
)
|
||||
if pdf_info_dict.get('need_drop', False): # 如果返回的字典里有need_drop,则提取drop_reason并跳过本次解析
|
||||
jso['need_drop'] = True
|
||||
jso['drop_reason'] = pdf_info_dict["drop_reason"]
|
||||
else: # 正常返回,将 pdf_info_dict 压缩并存储
|
||||
pdf_info_dict = JsonCompressor.compress_json(pdf_info_dict)
|
||||
jso['pdf_intermediate_dict'] = pdf_info_dict
|
||||
end_time = time.time() # 记录完成时间
|
||||
parse_time = int(end_time - start_time) # 计算执行时间
|
||||
# 解析完成后打印一下book_name和耗时
|
||||
logger.info(f"book_name is:{book_name},end_time is:{formatted_time(end_time)},cost_time is:{parse_time}",
|
||||
file=sys.stderr)
|
||||
jso['parse_time'] = parse_time
|
||||
except Exception as e:
|
||||
jso = exception_handler(jso, e)
|
||||
return jso
|
||||
|
||||
|
||||
def ocr_pdf_intermediate_dict_to_markdown(jso: dict, debug_mode=False) -> dict:
|
||||
|
||||
if debug_mode:
|
||||
pass
|
||||
else: # 如果debug没开,则检测是否有needdrop字段
|
||||
if jso.get('need_drop', False):
|
||||
book_name = join_path(get_data_source(jso), jso['file_id'])
|
||||
logger.info(f"book_name is:{book_name} need drop", file=sys.stderr)
|
||||
jso["dropped"] = True
|
||||
return jso
|
||||
try:
|
||||
pdf_intermediate_dict = jso['pdf_intermediate_dict']
|
||||
# 将 pdf_intermediate_dict 解压
|
||||
pdf_intermediate_dict = JsonCompressor.decompress_json(pdf_intermediate_dict)
|
||||
markdown_content = ocr_mk_mm_markdown(pdf_intermediate_dict)
|
||||
jso["content"] = markdown_content
|
||||
logger.info(f"book_name is:{get_data_source(jso)}/{jso['file_id']},markdown content length is {len(markdown_content)}", file=sys.stderr)
|
||||
# 把无用的信息清空
|
||||
jso["doc_layout_result"] = ""
|
||||
jso["pdf_intermediate_dict"] = ""
|
||||
jso["pdf_meta"] = ""
|
||||
except Exception as e:
|
||||
jso = exception_handler(jso, e)
|
||||
return jso
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
pass
|
||||
|
||||
@@ -75,7 +75,8 @@ def merge_footnote_blocks(page_info, main_text_font):
|
||||
is_below(block['bbox'], footnote_bbox) and
|
||||
sum([size >= main_text_size,
|
||||
len(block['lines']) >= 5,
|
||||
block_font == main_text_font]) >= 2]
|
||||
block_font == main_text_font])
|
||||
>= 2]
|
||||
# 如果main_text_bboxes_below不为空,说明footnote下面有正文block,这个footnote不成立,跳过
|
||||
if len(main_text_bboxes_below) > 0:
|
||||
continue
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
from magic_pdf.libs.boxbase import _is_in # 正则
|
||||
from magic_pdf.libs.boxbase import _is_in, calculate_overlap_area_2_minbox_area_ratio # 正则
|
||||
from magic_pdf.libs.commons import fitz # pyMuPDF库
|
||||
|
||||
|
||||
@@ -18,7 +18,16 @@ def __solve_contain_bboxs(all_bbox_list: list):
|
||||
dump_list.append(all_bbox_list[i])
|
||||
elif _is_in(bbox2, bbox1):
|
||||
dump_list.append(all_bbox_list[j])
|
||||
|
||||
else:
|
||||
ratio = calculate_overlap_area_2_minbox_area_ratio(bbox1, bbox2)
|
||||
if ratio > 0.7:
|
||||
s1 = (bbox1[2] - bbox1[0]) * (bbox1[3] - bbox1[1])
|
||||
s2 = (bbox2[2] - bbox2[0]) * (bbox2[3] - bbox2[1])
|
||||
if s2 > s1:
|
||||
dump_list.append(all_bbox_list[i])
|
||||
else:
|
||||
dump_list.append(all_bbox_list[i])
|
||||
|
||||
# 遍历需要删除的列表中的每个元素
|
||||
for item in dump_list:
|
||||
|
||||
|
||||
@@ -1,4 +1,5 @@
|
||||
from magic_pdf.libs.commons import fitz # pyMuPDF库
|
||||
from magic_pdf.libs.coordinate_transform import get_scale_ratio
|
||||
|
||||
|
||||
def parse_footers(page_ID: int, page: fitz.Page, json_from_DocXchain_obj: dict):
|
||||
@@ -8,23 +9,12 @@ def parse_footers(page_ID: int, page: fitz.Page, json_from_DocXchain_obj: dict):
|
||||
:param res_dir_path: str类型,是每一个pdf文档,在当前.py文件的目录下生成一个与pdf文档同名的文件夹,res_dir_path就是文件夹的dir
|
||||
:param json_from_DocXchain_obj: dict类型,把pdf文档送入DocXChain模型中后,提取bbox,结果保存到pdf文档同名文件夹下的 page_ID.json文件中了。json_from_DocXchain_obj就是打开后的dict
|
||||
"""
|
||||
DPI = 72 # use this resolution
|
||||
pix = page.get_pixmap(dpi=DPI)
|
||||
pageL = 0
|
||||
pageR = int(pix.w)
|
||||
pageU = 0
|
||||
pageD = int(pix.h)
|
||||
|
||||
|
||||
#--------- 通过json_from_DocXchain来获取 footer ---------#
|
||||
footer_bbox_from_DocXChain = []
|
||||
|
||||
|
||||
xf_json = json_from_DocXchain_obj
|
||||
width_from_json = xf_json['page_info']['width']
|
||||
height_from_json = xf_json['page_info']['height']
|
||||
LR_scaleRatio = width_from_json / (pageR - pageL)
|
||||
UD_scaleRatio = height_from_json / (pageD - pageU)
|
||||
horizontal_scale_ratio, vertical_scale_ratio = get_scale_ratio(xf_json, page)
|
||||
|
||||
# {0: 'title', # 标题
|
||||
# 1: 'figure', # 图片
|
||||
@@ -42,10 +32,10 @@ def parse_footers(page_ID: int, page: fitz.Page, json_from_DocXchain_obj: dict):
|
||||
# 13: 'embedding', # 嵌入公式
|
||||
# 14: 'isolated'} # 单行公式
|
||||
for xf in xf_json['layout_dets']:
|
||||
L = xf['poly'][0] / LR_scaleRatio
|
||||
U = xf['poly'][1] / UD_scaleRatio
|
||||
R = xf['poly'][2] / LR_scaleRatio
|
||||
D = xf['poly'][5] / UD_scaleRatio
|
||||
L = xf['poly'][0] / horizontal_scale_ratio
|
||||
U = xf['poly'][1] / vertical_scale_ratio
|
||||
R = xf['poly'][2] / horizontal_scale_ratio
|
||||
D = xf['poly'][5] / vertical_scale_ratio
|
||||
# L += pageL # 有的页面,artBox偏移了。不在(0,0)
|
||||
# R += pageL
|
||||
# U += pageU
|
||||
|
||||
@@ -1,5 +1,6 @@
|
||||
from collections import Counter
|
||||
from magic_pdf.libs.commons import fitz # pyMuPDF库
|
||||
from magic_pdf.libs.coordinate_transform import get_scale_ratio
|
||||
|
||||
|
||||
def parse_footnotes_by_model(page_ID: int, page: fitz.Page, json_from_DocXchain_obj: dict, md_bookname_save_path, debug_mode=False):
|
||||
@@ -9,22 +10,12 @@ def parse_footnotes_by_model(page_ID: int, page: fitz.Page, json_from_DocXchain_
|
||||
:param res_dir_path: str类型,是每一个pdf文档,在当前.py文件的目录下生成一个与pdf文档同名的文件夹,res_dir_path就是文件夹的dir
|
||||
:param json_from_DocXchain_obj: dict类型,把pdf文档送入DocXChain模型中后,提取bbox,结果保存到pdf文档同名文件夹下的 page_ID.json文件中了。json_from_DocXchain_obj就是打开后的dict
|
||||
"""
|
||||
DPI = 72 # use this resolution
|
||||
pix = page.get_pixmap(dpi=DPI)
|
||||
pageL = 0
|
||||
pageR = int(pix.w)
|
||||
pageU = 0
|
||||
pageD = int(pix.h)
|
||||
|
||||
|
||||
#--------- 通过json_from_DocXchain来获取 footnote ---------#
|
||||
footnote_bbox_from_DocXChain = []
|
||||
|
||||
xf_json = json_from_DocXchain_obj
|
||||
width_from_json = xf_json['page_info']['width']
|
||||
height_from_json = xf_json['page_info']['height']
|
||||
LR_scaleRatio = width_from_json / (pageR - pageL)
|
||||
UD_scaleRatio = height_from_json / (pageD - pageU)
|
||||
horizontal_scale_ratio, vertical_scale_ratio = get_scale_ratio(xf_json, page)
|
||||
|
||||
# {0: 'title', # 标题
|
||||
# 1: 'figure', # 图片
|
||||
@@ -42,10 +33,10 @@ def parse_footnotes_by_model(page_ID: int, page: fitz.Page, json_from_DocXchain_
|
||||
# 13: 'embedding', # 嵌入公式
|
||||
# 14: 'isolated'} # 单行公式
|
||||
for xf in xf_json['layout_dets']:
|
||||
L = xf['poly'][0] / LR_scaleRatio
|
||||
U = xf['poly'][1] / UD_scaleRatio
|
||||
R = xf['poly'][2] / LR_scaleRatio
|
||||
D = xf['poly'][5] / UD_scaleRatio
|
||||
L = xf['poly'][0] / horizontal_scale_ratio
|
||||
U = xf['poly'][1] / vertical_scale_ratio
|
||||
R = xf['poly'][2] / horizontal_scale_ratio
|
||||
D = xf['poly'][5] / vertical_scale_ratio
|
||||
# L += pageL # 有的页面,artBox偏移了。不在(0,0)
|
||||
# R += pageL
|
||||
# U += pageU
|
||||
@@ -104,7 +95,8 @@ def parse_footnotes_by_rule(remain_text_blocks, page_height, page_id, main_text_
|
||||
list: 符合规则的脚注文本块的边界框列表。
|
||||
|
||||
"""
|
||||
if page_id > 20:
|
||||
# if page_id > 20:
|
||||
if page_id > 2: # 为保证精确度,先只筛选前3页
|
||||
return []
|
||||
else:
|
||||
# 存储每一行的文本块大小的列表
|
||||
@@ -128,7 +120,7 @@ def parse_footnotes_by_rule(remain_text_blocks, page_height, page_id, main_text_
|
||||
block_line_sizes.append(line_size)
|
||||
span_font = [(span['font'], len(span['text'])) for span in line['spans'] if 'font' in span and len(span['text']) > 0]
|
||||
if span_font:
|
||||
# # todo main_text_font应该用基于字数最多的字体而不是span级别的统计
|
||||
# main_text_font应该用基于字数最多的字体而不是span级别的统计
|
||||
# font_names.append(font_name for font_name in span_font)
|
||||
# block_fonts.append(font_name for font_name in span_font)
|
||||
for font, count in span_font:
|
||||
@@ -158,9 +150,17 @@ def parse_footnotes_by_rule(remain_text_blocks, page_height, page_id, main_text_
|
||||
# and len(block['lines']) < 5]
|
||||
footnote_bboxes = [block['bbox'] for block, block_size, block_font in block_sizes if
|
||||
block['bbox'][1] > page_height * 0.6 and
|
||||
sum([block_size < main_text_size,
|
||||
len(block['lines']) < 5,
|
||||
block_font != main_text_font]) >= 2]
|
||||
# 较为严格的规则
|
||||
block_size < main_text_size and
|
||||
(len(block['lines']) < 5 or
|
||||
block_font != main_text_font)]
|
||||
|
||||
# 较为宽松的规则
|
||||
# sum([block_size < main_text_size,
|
||||
# len(block['lines']) < 5,
|
||||
# block_font != main_text_font])
|
||||
# >= 2]
|
||||
|
||||
|
||||
return footnote_bboxes
|
||||
else:
|
||||
|
||||
@@ -1,4 +1,5 @@
|
||||
from magic_pdf.libs.commons import fitz # pyMuPDF库
|
||||
from magic_pdf.libs.coordinate_transform import get_scale_ratio
|
||||
|
||||
|
||||
def parse_headers(page_ID: int, page: fitz.Page, json_from_DocXchain_obj: dict):
|
||||
@@ -8,22 +9,12 @@ def parse_headers(page_ID: int, page: fitz.Page, json_from_DocXchain_obj: dict):
|
||||
:param res_dir_path: str类型,是每一个pdf文档,在当前.py文件的目录下生成一个与pdf文档同名的文件夹,res_dir_path就是文件夹的dir
|
||||
:param json_from_DocXchain_obj: dict类型,把pdf文档送入DocXChain模型中后,提取bbox,结果保存到pdf文档同名文件夹下的 page_ID.json文件中了。json_from_DocXchain_obj就是打开后的dict
|
||||
"""
|
||||
DPI = 72 # use this resolution
|
||||
pix = page.get_pixmap(dpi=DPI)
|
||||
pageL = 0
|
||||
pageR = int(pix.w)
|
||||
pageU = 0
|
||||
pageD = int(pix.h)
|
||||
|
||||
|
||||
#--------- 通过json_from_DocXchain来获取 header ---------#
|
||||
header_bbox_from_DocXChain = []
|
||||
|
||||
xf_json = json_from_DocXchain_obj
|
||||
width_from_json = xf_json['page_info']['width']
|
||||
height_from_json = xf_json['page_info']['height']
|
||||
LR_scaleRatio = width_from_json / (pageR - pageL)
|
||||
UD_scaleRatio = height_from_json / (pageD - pageU)
|
||||
horizontal_scale_ratio, vertical_scale_ratio = get_scale_ratio(xf_json, page)
|
||||
|
||||
# {0: 'title', # 标题
|
||||
# 1: 'figure', # 图片
|
||||
@@ -41,10 +32,10 @@ def parse_headers(page_ID: int, page: fitz.Page, json_from_DocXchain_obj: dict):
|
||||
# 13: 'embedding', # 嵌入公式
|
||||
# 14: 'isolated'} # 单行公式
|
||||
for xf in xf_json['layout_dets']:
|
||||
L = xf['poly'][0] / LR_scaleRatio
|
||||
U = xf['poly'][1] / UD_scaleRatio
|
||||
R = xf['poly'][2] / LR_scaleRatio
|
||||
D = xf['poly'][5] / UD_scaleRatio
|
||||
L = xf['poly'][0] / horizontal_scale_ratio
|
||||
U = xf['poly'][1] / vertical_scale_ratio
|
||||
R = xf['poly'][2] / horizontal_scale_ratio
|
||||
D = xf['poly'][5] / vertical_scale_ratio
|
||||
# L += pageL # 有的页面,artBox偏移了。不在(0,0)
|
||||
# R += pageL
|
||||
# U += pageU
|
||||
|
||||
@@ -1,4 +1,5 @@
|
||||
from magic_pdf.libs.commons import fitz # pyMuPDF库
|
||||
from magic_pdf.libs.coordinate_transform import get_scale_ratio
|
||||
|
||||
|
||||
def parse_pageNos(page_ID: int, page: fitz.Page, json_from_DocXchain_obj: dict):
|
||||
@@ -8,22 +9,12 @@ def parse_pageNos(page_ID: int, page: fitz.Page, json_from_DocXchain_obj: dict):
|
||||
:param res_dir_path: str类型,是每一个pdf文档,在当前.py文件的目录下生成一个与pdf文档同名的文件夹,res_dir_path就是文件夹的dir
|
||||
:param json_from_DocXchain_obj: dict类型,把pdf文档送入DocXChain模型中后,提取bbox,结果保存到pdf文档同名文件夹下的 page_ID.json文件中了。json_from_DocXchain_obj就是打开后的dict
|
||||
"""
|
||||
DPI = 72 # use this resolution
|
||||
pix = page.get_pixmap(dpi=DPI)
|
||||
pageL = 0
|
||||
pageR = int(pix.w)
|
||||
pageU = 0
|
||||
pageD = int(pix.h)
|
||||
|
||||
|
||||
#--------- 通过json_from_DocXchain来获取 pageNo ---------#
|
||||
pageNo_bbox_from_DocXChain = []
|
||||
|
||||
xf_json = json_from_DocXchain_obj
|
||||
width_from_json = xf_json['page_info']['width']
|
||||
height_from_json = xf_json['page_info']['height']
|
||||
LR_scaleRatio = width_from_json / (pageR - pageL)
|
||||
UD_scaleRatio = height_from_json / (pageD - pageU)
|
||||
horizontal_scale_ratio, vertical_scale_ratio = get_scale_ratio(xf_json, page)
|
||||
|
||||
# {0: 'title', # 标题
|
||||
# 1: 'figure', # 图片
|
||||
@@ -41,10 +32,10 @@ def parse_pageNos(page_ID: int, page: fitz.Page, json_from_DocXchain_obj: dict):
|
||||
# 13: 'embedding', # 嵌入公式
|
||||
# 14: 'isolated'} # 单行公式
|
||||
for xf in xf_json['layout_dets']:
|
||||
L = xf['poly'][0] / LR_scaleRatio
|
||||
U = xf['poly'][1] / UD_scaleRatio
|
||||
R = xf['poly'][2] / LR_scaleRatio
|
||||
D = xf['poly'][5] / UD_scaleRatio
|
||||
L = xf['poly'][0] / horizontal_scale_ratio
|
||||
U = xf['poly'][1] / vertical_scale_ratio
|
||||
R = xf['poly'][2] / horizontal_scale_ratio
|
||||
D = xf['poly'][5] / vertical_scale_ratio
|
||||
# L += pageL # 有的页面,artBox偏移了。不在(0,0)
|
||||
# R += pageL
|
||||
# U += pageU
|
||||
|
||||
20
magic_pdf/pre_proc/ocr_cut_image.py
Normal file
20
magic_pdf/pre_proc/ocr_cut_image.py
Normal file
@@ -0,0 +1,20 @@
|
||||
from magic_pdf.libs.commons import join_path
|
||||
from magic_pdf.libs.ocr_content_type import ContentType
|
||||
from magic_pdf.libs.pdf_image_tools import cut_image
|
||||
|
||||
|
||||
def cut_image_and_table(spans, page, page_id, book_name, save_path, img_s3_client):
|
||||
def s3_return_path(type):
|
||||
return join_path(book_name, type)
|
||||
|
||||
def img_save_path(type):
|
||||
return join_path(save_path, s3_return_path(type))
|
||||
|
||||
for span in spans:
|
||||
span_type = span['type']
|
||||
if span_type == ContentType.Image:
|
||||
span['image_path'] = cut_image(span['bbox'], page_id, page, img_save_path('images'), s3_return_path=s3_return_path('images'), img_s3_client=img_s3_client)
|
||||
elif span_type == ContentType.Table:
|
||||
span['image_path'] = cut_image(span['bbox'], page_id, page, img_save_path('tables'), s3_return_path=s3_return_path('tables'), img_s3_client=img_s3_client)
|
||||
|
||||
return spans
|
||||
133
magic_pdf/pre_proc/ocr_detect_layout.py
Normal file
133
magic_pdf/pre_proc/ocr_detect_layout.py
Normal file
@@ -0,0 +1,133 @@
|
||||
import fitz
|
||||
|
||||
from magic_pdf.layout.layout_sort import get_bboxes_layout
|
||||
from magic_pdf.libs.boxbase import _is_part_overlap, _is_in
|
||||
from magic_pdf.libs.coordinate_transform import get_scale_ratio
|
||||
|
||||
|
||||
def get_center_point(bbox):
|
||||
"""
|
||||
根据边界框坐标信息,计算出该边界框的中心点坐标。
|
||||
Args:
|
||||
bbox (list): 边界框坐标信息,包含四个元素,分别为左上角x坐标、左上角y坐标、右下角x坐标、右下角y坐标。
|
||||
Returns:
|
||||
list: 中心点坐标信息,包含两个元素,分别为x坐标和y坐标。
|
||||
"""
|
||||
return [(bbox[0] + bbox[2]) / 2, (bbox[1] + bbox[3]) / 2]
|
||||
|
||||
|
||||
def get_area(bbox):
|
||||
"""
|
||||
根据边界框坐标信息,计算出该边界框的面积。
|
||||
Args:
|
||||
bbox (list): 边界框坐标信息,包含四个元素,分别为左上角x坐标、左上角y坐标、右下角x坐标、右下角y坐标。
|
||||
Returns:
|
||||
float: 该边界框的面积。
|
||||
"""
|
||||
return (bbox[2] - bbox[0]) * (bbox[3] - bbox[1])
|
||||
|
||||
|
||||
def adjust_layouts(layout_bboxes, page_boundry, page_id):
|
||||
# 遍历所有布局框
|
||||
for i in range(len(layout_bboxes)):
|
||||
# 遍历当前布局框之后的布局框
|
||||
for j in range(i + 1, len(layout_bboxes)):
|
||||
# 判断两个布局框是否重叠
|
||||
if _is_part_overlap(layout_bboxes[i], layout_bboxes[j]):
|
||||
# 计算每个布局框的中心点坐标和面积
|
||||
area_i = get_area(layout_bboxes[i])
|
||||
area_j = get_area(layout_bboxes[j])
|
||||
|
||||
# 较大布局框和较小布局框的赋值
|
||||
if area_i > area_j:
|
||||
larger_layout, smaller_layout = layout_bboxes[i], layout_bboxes[j]
|
||||
else:
|
||||
larger_layout, smaller_layout = layout_bboxes[j], layout_bboxes[i]
|
||||
|
||||
center_large = get_center_point(larger_layout)
|
||||
center_small = get_center_point(smaller_layout)
|
||||
# 计算横向和纵向的距离差
|
||||
distance_x = center_large[0] - center_small[0]
|
||||
distance_y = center_large[1] - center_small[1]
|
||||
|
||||
# 根据距离差判断重叠方向并修正边界
|
||||
if abs(distance_x) > abs(distance_y): # 左右重叠
|
||||
if distance_x > 0 and larger_layout[0] < smaller_layout[2]:
|
||||
larger_layout[0] = smaller_layout[2]+1
|
||||
if distance_x < 0 and larger_layout[2] > smaller_layout[0]:
|
||||
larger_layout[2] = smaller_layout[0]-1
|
||||
else: # 上下重叠
|
||||
if distance_y > 0 and larger_layout[1] < smaller_layout[3]:
|
||||
larger_layout[1] = smaller_layout[3]+1
|
||||
if distance_y < 0 and larger_layout[3] > smaller_layout[1]:
|
||||
larger_layout[3] = smaller_layout[1]-1
|
||||
# 排序调整布局边界框列表
|
||||
new_bboxes = []
|
||||
for layout_bbox in layout_bboxes:
|
||||
new_bboxes.append([layout_bbox[0], layout_bbox[1], layout_bbox[2], layout_bbox[3], None, None, None, None, None, None, None, None, None])
|
||||
|
||||
layout_bboxes, layout_tree = get_bboxes_layout(new_bboxes, page_boundry, page_id)
|
||||
|
||||
# 返回排序调整后的布局边界框列表
|
||||
return layout_bboxes, layout_tree
|
||||
|
||||
|
||||
def layout_detect(layout_info, page: fitz.Page, ocr_page_info):
    """Convert model layout output into page-coordinate boxes and order them.

    Args:
        layout_info: list of sub-layout dicts, each carrying a 'poly' field
            with the 8 polygon coordinates of the detected region.
        page: the fitz page the layout belongs to (used for its dimensions).
        ocr_page_info: OCR metadata providing the page number and scale.

    Returns:
        (layout_bboxes, layout_tree) as produced by adjust_layouts.
    """
    page_id = ocr_page_info['page_info']['page_no'] - 1
    horizontal_scale_ratio, vertical_scale_ratio = get_scale_ratio(ocr_page_info, page)

    # Map each detected polygon to a page-space [x0, y0, x1, y1] box.
    layout_bboxes = []
    for sub_layout in layout_info:
        x0, y0, _, _, x1, y1, _, _ = sub_layout['poly']
        layout_bboxes.append([
            int(x0 / horizontal_scale_ratio),
            int(y0 / vertical_scale_ratio),
            int(x1 / horizontal_scale_ratio),
            int(y1 / vertical_scale_ratio),
        ])

    # Drop every box that is fully contained inside some other box.
    total = len(layout_bboxes)
    new_layout_bboxes = [
        box for pos, box in enumerate(layout_bboxes)
        if not any(pos != other and _is_in(box, layout_bboxes[other]) for other in range(total))
    ]

    # Order the surviving boxes within the full page boundary.
    page_boundry = [0, 0, page.rect.width, page.rect.height]
    layout_bboxes, layout_tree = adjust_layouts(new_layout_bboxes, page_boundry, page_id)

    return layout_bboxes, layout_tree
|
||||
100
magic_pdf/pre_proc/ocr_dict_merge.py
Normal file
100
magic_pdf/pre_proc/ocr_dict_merge.py
Normal file
@@ -0,0 +1,100 @@
|
||||
from loguru import logger
|
||||
|
||||
from magic_pdf.libs.boxbase import __is_overlaps_y_exceeds_threshold, get_minbox_if_overlap_by_ratio, \
|
||||
calculate_overlap_area_in_bbox1_area_ratio
|
||||
from magic_pdf.libs.ocr_content_type import ContentType
|
||||
|
||||
|
||||
# 将每一个line中的span从左到右排序
|
||||
def line_sort_spans_by_left_to_right(lines):
    """Order each line's spans left-to-right and wrap them with a line bbox.

    Each input line is a list of span dicts carrying 'bbox' [x0, y0, x1, y1].
    Returns a list of {'bbox': line_bbox, 'spans': spans} dicts, where
    line_bbox is the tight union of the span bboxes. Spans are sorted in place.
    """
    result = []
    for spans in lines:
        # Left edge (x0) determines reading order within a line.
        spans.sort(key=lambda s: s['bbox'][0])
        union_bbox = [
            min(s['bbox'][0] for s in spans),  # x0
            min(s['bbox'][1] for s in spans),  # y0
            max(s['bbox'][2] for s in spans),  # x1
            max(s['bbox'][3] for s in spans),  # y1
        ]
        result.append({"bbox": union_bbox, "spans": spans})
    return result
|
||||
|
||||
def merge_spans_to_line(spans):
    """Group spans into visual lines, scanning top-to-bottom.

    A span of type interline-equation / image / table always forms a line of
    its own (and terminates any line in progress); other spans join the
    current line while their bbox still overlaps it vertically past the
    threshold of __is_overlaps_y_exceeds_threshold.

    Args:
        spans: list of span dicts with 'bbox' and 'type'; sorted in place by y0.

    Returns:
        list of lines, each a list of span dicts. Empty input yields [].
    """
    # Robustness fix: the original indexed spans[0] and raised IndexError
    # on an empty span list.
    if len(spans) == 0:
        return []

    standalone_types = [ContentType.InterlineEquation, ContentType.Image, ContentType.Table]

    # Assemble lines top-to-bottom.
    spans.sort(key=lambda span: span['bbox'][1])

    lines = []
    current_line = [spans[0]]
    for span in spans[1:]:
        # Standalone content (or a line already holding some) forces a line break.
        if span['type'] in standalone_types or any(s['type'] in standalone_types for s in current_line):
            lines.append(current_line)
            current_line = [span]
            continue

        # Same line while the vertical overlap with the last span is large enough.
        if __is_overlaps_y_exceeds_threshold(span['bbox'], current_line[-1]['bbox']):
            current_line.append(span)
        else:
            lines.append(current_line)
            current_line = [span]

    # Flush the last line.
    if current_line:
        lines.append(current_line)

    return lines
|
||||
|
||||
def merge_spans_to_line_by_layout(spans, layout_bboxes):
    """Assign spans to layout regions, then merge each region's spans into lines.

    A span belongs to a layout when more than 65% of its area falls inside
    the layout's bbox; matched spans are consumed from *spans* in layout
    order, so later layouts cannot claim them again. The resulting lines are
    sorted left-to-right via line_sort_spans_by_left_to_right.
    """
    lines = []
    grouped = []
    for layout in layout_bboxes:
        region = layout['layout_bbox']
        # Collect every remaining span that mostly sits inside this region.
        matched = [
            span for span in spans
            if calculate_overlap_area_in_bbox1_area_ratio(span['bbox'], region) > 0.65
        ]
        if len(matched) > 0:
            grouped.append(matched)
            # Remove the claimed spans from the shared pool.
            for span in matched:
                spans.remove(span)

    if len(grouped) > 0:
        for region_spans in grouped:
            lines.extend(merge_spans_to_line(region_spans))

        # Order the spans inside every line from left to right.
        lines = line_sort_spans_by_left_to_right(lines)

    return lines
|
||||
|
||||
|
||||
def merge_lines_to_block(lines):
    """Wrap each line in a single-line block.

    Real block merging is not implemented yet: every block holds exactly one
    line and inherits that line's bbox.
    """
    return [{"bbox": line["bbox"], "lines": [line]} for line in lines]
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
207
magic_pdf/pre_proc/ocr_span_list_modify.py
Normal file
207
magic_pdf/pre_proc/ocr_span_list_modify.py
Normal file
@@ -0,0 +1,207 @@
|
||||
from loguru import logger
|
||||
|
||||
from magic_pdf.libs.boxbase import calculate_overlap_area_in_bbox1_area_ratio, get_minbox_if_overlap_by_ratio, \
|
||||
__is_overlaps_y_exceeds_threshold
|
||||
from magic_pdf.libs.ocr_content_type import ContentType
|
||||
|
||||
|
||||
def remove_overlaps_min_spans(spans):
    """Drop the smaller span of every pair whose overlap ratio exceeds 0.65.

    get_minbox_if_overlap_by_ratio returns the bbox of the smaller box when
    the pair overlaps enough; the span carrying that bbox is removed from
    *spans* in place. Returns the (mutated) span list.
    """
    for first in spans.copy():
        # Re-snapshot the list each pass so removals are reflected.
        for second in spans.copy():
            if first == second:
                continue
            smaller_bbox = get_minbox_if_overlap_by_ratio(first['bbox'], second['bbox'], 0.65)
            if smaller_bbox is None:
                continue
            # Locate the span that owns the smaller bbox and drop it.
            victim = next((s for s in spans if s['bbox'] == smaller_bbox), None)
            if victim is not None:
                spans.remove(victim)
    return spans
|
||||
|
||||
|
||||
def remove_spans_by_bboxes(spans, need_remove_spans_bboxes):
    """Remove every span that mostly lies inside one of the removal bboxes.

    A span is dropped when more than 50% of its area overlaps any bbox in
    need_remove_spans_bboxes. Mutates *spans* in place and returns it.
    """
    doomed = [
        span for span in spans
        if any(
            calculate_overlap_area_in_bbox1_area_ratio(span['bbox'], removed_bbox) > 0.5
            for removed_bbox in need_remove_spans_bboxes
        )
    ]

    for span in doomed:
        spans.remove(span)

    return spans
|
||||
|
||||
|
||||
def remove_spans_by_bboxes_dict(spans, need_remove_spans_bboxes_dict):
    """Remove spans covered by tagged removal regions and bucket them by type.

    need_remove_spans_bboxes_dict maps a drop tag to a list of bboxes; any
    span with more than 50% of its area inside one of those bboxes is pulled
    out of *spans* (in place), tagged with the drop tag, and sorted into a
    text / image / table drop bucket.

    Returns (spans, dropped_text_block, dropped_image_block, dropped_table_block).
    """
    dropped_text_block = []
    dropped_image_block = []
    dropped_table_block = []

    for drop_tag, removed_bboxes in need_remove_spans_bboxes_dict.items():
        # Spans that mostly fall inside any bbox of this tag.
        doomed = [
            span for span in spans
            if any(
                calculate_overlap_area_in_bbox1_area_ratio(span['bbox'], bbox) > 0.5
                for bbox in removed_bboxes
            )
        ]

        for span in doomed:
            spans.remove(span)
            span['tag'] = drop_tag
            if span['type'] in [ContentType.Text, ContentType.InlineEquation, ContentType.InterlineEquation]:
                dropped_text_block.append(span)
            elif span['type'] == ContentType.Image:
                dropped_image_block.append(span)
            elif span['type'] == ContentType.Table:
                dropped_table_block.append(span)

    return spans, dropped_text_block, dropped_image_block, dropped_table_block
|
||||
|
||||
|
||||
def adjust_bbox_for_standalone_block(spans):
    """Raise standalone blocks to the top of text sitting on their left.

    For each interline-equation / image / table span whose vertical extent
    fully covers a text (or inline-equation) span located to its left, the
    standalone span's y0 is lowered to match that text span's y0.
    Mutates *spans* in place and returns it.
    """
    for block_span in spans:
        if block_span['type'] not in [ContentType.InterlineEquation, ContentType.Image, ContentType.Table]:
            continue
        for text_span in spans:
            if text_span['type'] not in [ContentType.Text, ContentType.InlineEquation]:
                continue
            # Does the standalone block vertically cover the text span?
            covers = (block_span['bbox'][1] < text_span['bbox'][1]
                      and block_span['bbox'][3] > text_span['bbox'][3])
            # And does the text sit to the block's left?
            if covers and text_span['bbox'][0] < block_span['bbox'][0]:
                # Align the block's top edge with the text's top edge.
                block_span['bbox'][1] = text_span['bbox'][1]
    return spans
|
||||
|
||||
|
||||
def modify_y_axis(spans: list, displayed_list: list, text_inline_lines: list):
    """Group spans into lines and align each text line to shared y bounds.

    Works entirely through side effects (returns None):
      * spans is sorted in place by top edge (y0);
      * displayed_list receives every interline-equation / image / table span;
      * text_inline_lines receives (line, (y0, y1)) records for text lines;
        afterwards every span in those lines has its bbox y0/y1 rewritten to
        the recorded shared bounds.

    NOTE(review): assumes spans is non-empty — spans[0] is read unconditionally.
    """
    spans.sort(key=lambda span: span['bbox'][1])

    standalone = [ContentType.InterlineEquation, ContentType.Image, ContentType.Table]
    textual = [ContentType.Text, ContentType.InlineEquation]

    lines = []
    current_line = [spans[0]]
    if spans[0]["type"] in standalone:
        displayed_list.append(spans[0])

    # y bounds of the text row currently being assembled.
    line_first_y0 = spans[0]["bbox"][1]
    line_first_y = spans[0]["bbox"][3]

    for span in spans[1:]:
        # Standalone content (or a line already holding some) ends the line.
        if span['type'] in standalone or any(s['type'] in standalone for s in current_line):
            if span["type"] in standalone:
                displayed_list.append(span)
            lines.append(current_line)
            # Record the finished line for y alignment when it is textual
            # (multi-span, or a single text/inline-equation span).
            if len(current_line) > 1 or current_line[0]["type"] in textual:
                text_inline_lines.append((current_line, (line_first_y0, line_first_y)))
            current_line = [span]
            line_first_y0 = span["bbox"][1]
            line_first_y = span["bbox"][3]
            continue

        # Same line while the vertical overlap with the last span suffices.
        if __is_overlaps_y_exceeds_threshold(span['bbox'], current_line[-1]['bbox']):
            # Track the y bounds of the latest plain-text span in the line.
            if span["type"] == "text":
                line_first_y0 = span["bbox"][1]
                line_first_y = span["bbox"][3]
            current_line.append(span)
        else:
            lines.append(current_line)
            text_inline_lines.append((current_line, (line_first_y0, line_first_y)))
            current_line = [span]
            line_first_y0 = span["bbox"][1]
            line_first_y = span["bbox"][3]

    # Flush the last line.
    if current_line:
        lines.append(current_line)
        if len(current_line) > 1 or current_line[0]["type"] in textual:
            text_inline_lines.append((current_line, (line_first_y0, line_first_y)))

    # Sort the spans inside each recorded text line from left to right.
    for line_record in text_inline_lines:
        line_record[0].sort(key=lambda span: span['bbox'][0])

    # Snap every span in a text line to that line's shared y bounds.
    for line_spans, (y_top, y_bottom) in text_inline_lines:
        for span in line_spans:
            span["bbox"][1] = y_top
            span["bbox"][3] = y_bottom
|
||||
|
||||
|
||||
def modify_inline_equation(spans: list, displayed_list: list, text_inline_lines: list):
    """Demote wrongly classified interline equations to inline equations.

    A displayed (interline) equation whose vertical extent actually sits on a
    single text line is relabelled ContentType.InlineEquation and its bbox is
    snapped to that line's y bounds. Both lists are assumed sorted top-down;
    the text-line cursor only ever moves forward. Returns *spans* (mutated in
    place through the shared span dicts).
    """
    j = 0
    for displayed_span in displayed_list:
        span_y0, span_y = displayed_span["bbox"][1], displayed_span["bbox"][3]

        while j < len(text_inline_lines):
            y0, y1 = text_inline_lines[j][1]

            # Does the equation cross this line's top, bottom, or span it whole?
            crosses = (span_y0 < y0 and span_y > y0
                       or span_y0 < y1 and span_y > y1
                       or span_y0 < y0 and span_y > y1)
            if crosses and __is_overlaps_y_exceeds_threshold(displayed_span['bbox'], (0, y0, 0, y1)):
                if displayed_span["type"] == ContentType.InterlineEquation:
                    if j + 1 >= len(text_inline_lines):
                        # Last text line on the page: convert unconditionally.
                        displayed_span["type"] = ContentType.InlineEquation
                        displayed_span["bbox"][1] = y0
                        displayed_span["bbox"][3] = y1
                    else:
                        # Keep as interline when it also overlaps the next
                        # line, or is 3x (or more) taller than this line.
                        y0_next, y1_next = text_inline_lines[j + 1][1]
                        if (not __is_overlaps_y_exceeds_threshold(displayed_span['bbox'], (0, y0_next, 0, y1_next))
                                and 3 * (y1 - y0) > span_y - span_y0):
                            displayed_span["type"] = ContentType.InlineEquation
                            displayed_span["bbox"][1] = y0
                            displayed_span["bbox"][3] = y1
                break
            elif (span_y < y0
                  or span_y0 < y0 and span_y > y0
                  and not __is_overlaps_y_exceeds_threshold(displayed_span['bbox'], (0, y0, 0, y1))):
                # The equation lies above this line (or barely clips it):
                # move on to the next displayed span without advancing j.
                break
            else:
                j += 1

    return spans
|
||||
|
||||
|
||||
def get_qa_need_list(blocks):
    """Collect image / table / equation spans from the block structure.

    Walks blocks -> lines -> spans and sorts each span into a bucket by type;
    spans of any other type are skipped. The returned lists reference the
    span dicts themselves, not copies.

    Returns:
        (images, tables, interline_equations, inline_equations)
    """
    images = []
    tables = []
    interline_equations = []
    inline_equations = []

    for block in blocks:
        for line in block["lines"]:
            for span in line["spans"]:
                # Dispatch by content type; unknown types fall through.
                bucket = {
                    ContentType.Image: images,
                    ContentType.Table: tables,
                    ContentType.InlineEquation: inline_equations,
                    ContentType.InterlineEquation: interline_equations,
                }.get(span["type"])
                if bucket is not None:
                    bucket.append(span)

    return images, tables, interline_equations, inline_equations
|
||||
43
magic_pdf/pre_proc/remove_bbox_overlap.py
Normal file
43
magic_pdf/pre_proc/remove_bbox_overlap.py
Normal file
@@ -0,0 +1,43 @@
|
||||
from magic_pdf.libs.boxbase import _is_in_or_part_overlap, _is_in
|
||||
|
||||
|
||||
def _remove_overlap_between_bbox(spans):
|
||||
res = []
|
||||
for v in spans:
|
||||
for i in range(len(res)):
|
||||
if _is_in(res[i]["bbox"], v["bbox"]):
|
||||
continue
|
||||
if _is_in_or_part_overlap(res[i]["bbox"], v["bbox"]):
|
||||
ix0, iy0, ix1, iy1 = res[i]["bbox"]
|
||||
x0, y0, x1, y1 = v["bbox"]
|
||||
|
||||
diff_x = min(x1, ix1) - max(x0, ix0)
|
||||
diff_y = min(y1, iy1) - max(y0, iy0)
|
||||
|
||||
if diff_y > diff_x:
|
||||
if x1 >= ix1:
|
||||
mid = (x0 + ix1) // 2
|
||||
ix1 = min(mid, ix1)
|
||||
x0 = max(mid + 1, x0)
|
||||
else:
|
||||
mid = (ix0 + x1) // 2
|
||||
ix0 = max(mid + 1, ix0)
|
||||
x1 = min(mid, x1)
|
||||
else:
|
||||
if y1 >= iy1:
|
||||
mid = (y0 + iy1) // 2
|
||||
y0 = max(mid + 1, y0)
|
||||
iy1 = min(iy1, mid)
|
||||
else:
|
||||
mid = (iy0 + y1) // 2
|
||||
y1 = min(y1, mid)
|
||||
iy0 = max(mid + 1, iy0)
|
||||
res[i]["bbox"] = [ix0, iy0, ix1, iy1]
|
||||
v["bbox"] = [x0, y0, x1, y1]
|
||||
|
||||
res.append(v)
|
||||
return res
|
||||
|
||||
|
||||
def remove_overlap_between_bbox(spans):
    """Public entry point: resolve partial bbox overlaps between spans.

    Thin wrapper over _remove_overlap_between_bbox; see that helper for the
    shrinking strategy. Span bboxes are mutated in place.
    """
    return _remove_overlap_between_bbox(spans)
|
||||
29
magic_pdf/pre_proc/solve_line_alien.py
Normal file
29
magic_pdf/pre_proc/solve_line_alien.py
Normal file
@@ -0,0 +1,29 @@
|
||||
def solve_inline_too_large_interval(pdf_info_dict: dict) -> dict:  # text_block -> preproc_block in the JSON
    """Fix overly large horizontal gaps inside a visual text row.

    When consecutive lines of a block share the same integer vertical extent
    they belong to one visual row, so a space is prefixed to the first span
    of each continuation line. Mutates and returns pdf_info_dict.

    NOTE(review): assumes pages are keyed 'page_0'..'page_{n-1}' and that
    every line has at least one span with a 'text' field — confirm upstream.
    """
    for page_index in range(len(pdf_info_dict)):

        text_blocks = pdf_info_dict[f'page_{page_index}']['preproc_blocks']

        for block in text_blocks:

            prev_bbox = (0, 0, 0, 0)

            for line in block['lines']:

                _, y_top, _, y_bottom = line['bbox']
                # Same integer y-range as the previous line => same visual row.
                if int(y_top) == int(prev_bbox[1]) and int(y_bottom) == int(prev_bbox[3]):
                    line['spans'][0]['text'] = ' ' + line['spans'][0]['text']

                prev_bbox = line['bbox']

    return pdf_info_dict
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
19
others/README.md
Normal file
19
others/README.md
Normal file
@@ -0,0 +1,19 @@
|
||||
# pdf_toolbox
|
||||
pdf 解析基础函数
|
||||
|
||||
|
||||
## pdf是否是文字类型/扫描类型的区分
|
||||
|
||||
```shell
|
||||
cat s3_pdf_path.example.pdf | parallel --colsep ' ' -j 10 "python pdf_meta_scan.py --s3-pdf-path {2} --s3-profile {1} >> {/}.jsonl"
|
||||
|
||||
find dir/to/jsonl/ -type f -name "*.jsonl" | parallel -j 10 "python pdf_classify_by_type.py --json_file {} >> {/}.jsonl"
|
||||
|
||||
```
|
||||
|
||||
```shell
|
||||
# 如果单独运行脚本,合并到code-clean之后需要运行,参考如下:
|
||||
python -m pdf_meta_scan --s3-pdf-path "D:\pdf_files\内容排序测试_pdf\p3_图文混排 5.pdf" --s3-profile s2
|
||||
```
|
||||
|
||||
## pdf
|
||||
@@ -1,16 +1,15 @@
|
||||
boto3==1.34.52
|
||||
botocore==1.34.52
|
||||
Brotli==1.1.0
|
||||
click==8.1.7
|
||||
Distance==0.1.3
|
||||
PyMuPDF==1.23.25
|
||||
loguru==0.7.2
|
||||
matplotlib==3.8.3
|
||||
numpy==1.26.4
|
||||
pandas==2.2.1
|
||||
pycld2==0.41
|
||||
regex==2023.12.25
|
||||
spacy==3.7.4
|
||||
termcolor==2.4.0
|
||||
boto3>=1.28.43
|
||||
Brotli>=1.1.0
|
||||
click>=8.1.7
|
||||
Distance>=0.1.3
|
||||
PyMuPDF>=1.23.26
|
||||
loguru>=0.6.0
|
||||
matplotlib>=3.8.3
|
||||
numpy>=1.21.6
|
||||
pandas>=1.3.5
|
||||
pycld2>=0.41
|
||||
regex>=2023.12.25
|
||||
spacy>=3.7.4
|
||||
termcolor>=2.4.0
|
||||
en_core_web_sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl
|
||||
zh_core_web_sm @ https://github.com/explosion/spacy-models/releases/download/zh_core_web_sm-3.7.0/zh_core_web_sm-3.7.0-py3-none-any.whl
|
||||
18
setup.py
18
setup.py
@@ -1,5 +1,5 @@
|
||||
from setuptools import setup, find_packages
|
||||
|
||||
import subprocess
|
||||
def parse_requirements(filename):
|
||||
with open(filename) as f:
|
||||
lines = f.read().splitlines()
|
||||
@@ -15,12 +15,26 @@ def parse_requirements(filename):
|
||||
|
||||
return requires
|
||||
|
||||
def get_version():
    """Derive the package version from the latest git tag.

    Expects tags of the form ``magic_pdf-<version>-released`` and returns the
    ``<version>`` part. When git is unavailable, the repo has no tags, or the
    tag does not match the expected format, the reason is printed and the
    fallback version "0.0.0" is returned.
    """
    command = ["git", "describe", "--tags"]
    try:
        # Only the subprocess call can legitimately fail here.
        version = subprocess.check_output(command).decode().strip()
    except Exception as e:  # git missing, not a repo, no tags reachable, ...
        print(e)
        return "0.0.0"

    # Fix: the original raised a ValueError only to catch it in the same
    # function — plain validation avoids exception-as-control-flow while
    # printing the same message and returning the same fallback.
    version_parts = version.split("-")
    if len(version_parts) > 1 and version_parts[0].startswith("magic_pdf"):
        return version_parts[1]
    print(f"Invalid version tag {version}. Expected format is magic_pdf-<version>-released.")
    return "0.0.0"
|
||||
|
||||
|
||||
requires = parse_requirements('requirements.txt')
|
||||
|
||||
setup(
|
||||
name="magic_pdf", # 项目名
|
||||
version="0.1.0", # 版本号
|
||||
# version="0.1.3", # 版本号
|
||||
version=get_version(), # 自动从tag中获取版本号
|
||||
packages=find_packages(), # 包含所有的包
|
||||
install_requires=requires, # 项目依赖的第三方库
|
||||
python_requires=">=3.9", # 项目依赖的 Python 版本
|
||||
|
||||
@@ -1,10 +1,10 @@
|
||||
import io
|
||||
import json
|
||||
import os
|
||||
from magic_pdf.libs import fitz
|
||||
from magic_pdf.libs.commons import fitz
|
||||
|
||||
from app.common.s3 import get_s3_config, get_s3_client
|
||||
from magic_pdf.libs import join_path, json_dump_path, read_file, parse_bucket_key
|
||||
from magic_pdf.libs.commons import join_path, json_dump_path, read_file, parse_bucket_key
|
||||
from loguru import logger
|
||||
|
||||
test_pdf_dir_path = "s3://llm-pdf-text/unittest/pdf/"
|
||||
|
||||
@@ -2,10 +2,10 @@ import os
|
||||
|
||||
import pytest
|
||||
|
||||
from magic_pdf.filter import classify_by_area, classify_by_text_len, classify_by_avg_words, \
|
||||
from magic_pdf.filter.pdf_classify_by_type import classify_by_area, classify_by_text_len, classify_by_avg_words, \
|
||||
classify_by_img_num, classify_by_text_layout, classify_by_img_narrow_strips
|
||||
from magic_pdf.filter.pdf_meta_scan import get_pdf_page_size_pts, get_pdf_textlen_per_page, get_imgs_per_page
|
||||
from test.test_commons import get_docs_from_test_pdf, get_test_json_data
|
||||
from tests.test_commons import get_docs_from_test_pdf, get_test_json_data
|
||||
|
||||
# 获取当前目录
|
||||
current_directory = os.path.dirname(os.path.abspath(__file__))
|
||||
|
||||
@@ -2,7 +2,7 @@ import os
|
||||
|
||||
import pytest
|
||||
from magic_pdf.filter.pdf_meta_scan import get_pdf_page_size_pts, get_image_info, get_pdf_text_layout_per_page, get_language
|
||||
from test.test_commons import get_docs_from_test_pdf, get_test_json_data
|
||||
from tests.test_commons import get_docs_from_test_pdf, get_test_json_data
|
||||
|
||||
# 获取当前目录
|
||||
current_directory = os.path.dirname(os.path.abspath(__file__))
|
||||
|
||||
@@ -11,21 +11,21 @@ Execute the following command to run the tests under directory code-clean:
|
||||
|
||||
"""
|
||||
|
||||
from test.test_para.test_pdf2text_recogPara_Common import (
|
||||
from tests.test_para.test_pdf2text_recogPara_Common import (
|
||||
TestIsBboxOverlap,
|
||||
TestIsInBbox,
|
||||
TestIsBboxOverlap,
|
||||
TestIsLineLeftAlignedFromNeighbors,
|
||||
TestIsLineRightAlignedFromNeighbors,
|
||||
)
|
||||
from test.test_para.test_pdf2text_recogPara_EquationsProcessor import TestCalcOverlapPct
|
||||
from test.test_para.test_pdf2text_recogPara_BlockInnerParasProcessor import TestIsConsistentLines
|
||||
from test.test_para.test_pdf2text_recogPara_BlockContinuationProcessor import (
|
||||
from tests.test_para.test_pdf2text_recogPara_EquationsProcessor import TestCalcOverlapPct
|
||||
from tests.test_para.test_pdf2text_recogPara_BlockInnerParasProcessor import TestIsConsistentLines
|
||||
from tests.test_para.test_pdf2text_recogPara_BlockContinuationProcessor import (
|
||||
TestIsAlphabetChar,
|
||||
TestIsChineseChar,
|
||||
TestIsOtherLetterChar,
|
||||
)
|
||||
from test.test_para.test_pdf2text_recogPara_TitleProcessor import TestTitleProcessor
|
||||
from tests.test_para.test_pdf2text_recogPara_TitleProcessor import TestTitleProcessor
|
||||
|
||||
|
||||
# Test suite
|
||||
|
||||
Reference in New Issue
Block a user