Fix: PDF parsing doesn't support partially numbered lists (#1525)
* Fix: PDF parsing doesn't support partially numbered lists * Refactor: Move import of PARTIAL_NUMBERING_PATTERN to the top of the test file * Refactor: Improve assertion formatting in partial numbering tests
This commit is contained in:
@@ -1,4 +1,4 @@
|
|||||||
# SPDX-FileCopyrightText: 2024-present Adam Fourney <adamfo@microsoft.com>
|
# SPDX-FileCopyrightText: 2024-present Adam Fourney <adamfo@microsoft.com>
|
||||||
#
|
#
|
||||||
# SPDX-License-Identifier: MIT
|
# SPDX-License-Identifier: MIT
|
||||||
__version__ = "0.1.4"
|
__version__ = "0.1.5b1"
|
||||||
|
|||||||
@@ -1,11 +1,62 @@
|
|||||||
import sys
|
import sys
|
||||||
import io
|
import io
|
||||||
|
import re
|
||||||
from typing import BinaryIO, Any
|
from typing import BinaryIO, Any
|
||||||
|
|
||||||
from .._base_converter import DocumentConverter, DocumentConverterResult
|
from .._base_converter import DocumentConverter, DocumentConverterResult
|
||||||
from .._stream_info import StreamInfo
|
from .._stream_info import StreamInfo
|
||||||
from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE
|
from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE
|
||||||
|
|
||||||
|
# Pattern for MasterFormat-style partial numbering (e.g., ".1", ".2", ".10")
|
||||||
|
PARTIAL_NUMBERING_PATTERN = re.compile(r"^\.\d+$")
|
||||||
|
|
||||||
|
|
||||||
|
def _merge_partial_numbering_lines(text: str) -> str:
|
||||||
|
"""
|
||||||
|
Post-process extracted text to merge MasterFormat-style partial numbering
|
||||||
|
with the following text line.
|
||||||
|
|
||||||
|
MasterFormat documents use partial numbering like:
|
||||||
|
.1 The intent of this Request for Proposal...
|
||||||
|
.2 Available information relative to...
|
||||||
|
|
||||||
|
Some PDF extractors split these into separate lines:
|
||||||
|
.1
|
||||||
|
The intent of this Request for Proposal...
|
||||||
|
|
||||||
|
This function merges them back together.
|
||||||
|
"""
|
||||||
|
lines = text.split("\n")
|
||||||
|
result_lines: list[str] = []
|
||||||
|
i = 0
|
||||||
|
|
||||||
|
while i < len(lines):
|
||||||
|
line = lines[i]
|
||||||
|
stripped = line.strip()
|
||||||
|
|
||||||
|
# Check if this line is ONLY a partial numbering
|
||||||
|
if PARTIAL_NUMBERING_PATTERN.match(stripped):
|
||||||
|
# Look for the next non-empty line to merge with
|
||||||
|
j = i + 1
|
||||||
|
while j < len(lines) and not lines[j].strip():
|
||||||
|
j += 1
|
||||||
|
|
||||||
|
if j < len(lines):
|
||||||
|
# Merge the partial numbering with the next line
|
||||||
|
next_line = lines[j].strip()
|
||||||
|
result_lines.append(f"{stripped} {next_line}")
|
||||||
|
i = j + 1 # Skip past the merged line
|
||||||
|
else:
|
||||||
|
# No next line to merge with, keep as is
|
||||||
|
result_lines.append(line)
|
||||||
|
i += 1
|
||||||
|
else:
|
||||||
|
result_lines.append(line)
|
||||||
|
i += 1
|
||||||
|
|
||||||
|
return "\n".join(result_lines)
|
||||||
|
|
||||||
|
|
||||||
# Load dependencies
|
# Load dependencies
|
||||||
_dependency_exc_info = None
|
_dependency_exc_info = None
|
||||||
try:
|
try:
|
||||||
@@ -117,6 +168,14 @@ def _extract_form_content_from_words(page: Any) -> str | None:
|
|||||||
# Determine row type
|
# Determine row type
|
||||||
is_paragraph = line_width > page_width * 0.55 and len(combined_text) > 60
|
is_paragraph = line_width > page_width * 0.55 and len(combined_text) > 60
|
||||||
|
|
||||||
|
# Check for MasterFormat-style partial numbering (e.g., ".1", ".2")
|
||||||
|
# These should be treated as list items, not table rows
|
||||||
|
has_partial_numbering = False
|
||||||
|
if row_words:
|
||||||
|
first_word = row_words[0]["text"].strip()
|
||||||
|
if PARTIAL_NUMBERING_PATTERN.match(first_word):
|
||||||
|
has_partial_numbering = True
|
||||||
|
|
||||||
row_info.append(
|
row_info.append(
|
||||||
{
|
{
|
||||||
"y_key": y_key,
|
"y_key": y_key,
|
||||||
@@ -125,6 +184,7 @@ def _extract_form_content_from_words(page: Any) -> str | None:
|
|||||||
"x_groups": x_groups,
|
"x_groups": x_groups,
|
||||||
"is_paragraph": is_paragraph,
|
"is_paragraph": is_paragraph,
|
||||||
"num_columns": len(x_groups),
|
"num_columns": len(x_groups),
|
||||||
|
"has_partial_numbering": has_partial_numbering,
|
||||||
}
|
}
|
||||||
)
|
)
|
||||||
|
|
||||||
@@ -156,6 +216,11 @@ def _extract_form_content_from_words(page: Any) -> str | None:
|
|||||||
info["is_table_row"] = False
|
info["is_table_row"] = False
|
||||||
continue
|
continue
|
||||||
|
|
||||||
|
# Rows with partial numbering (e.g., ".1", ".2") are list items, not table rows
|
||||||
|
if info["has_partial_numbering"]:
|
||||||
|
info["is_table_row"] = False
|
||||||
|
continue
|
||||||
|
|
||||||
# Count how many global columns this row's words align with
|
# Count how many global columns this row's words align with
|
||||||
aligned_columns: set[int] = set()
|
aligned_columns: set[int] = set()
|
||||||
for word in info["words"]:
|
for word in info["words"]:
|
||||||
@@ -469,4 +534,7 @@ class PdfConverter(DocumentConverter):
|
|||||||
pdf_bytes.seek(0)
|
pdf_bytes.seek(0)
|
||||||
markdown = pdfminer.high_level.extract_text(pdf_bytes)
|
markdown = pdfminer.high_level.extract_text(pdf_bytes)
|
||||||
|
|
||||||
|
# Post-process to merge MasterFormat-style partial numbering with following text
|
||||||
|
markdown = _merge_partial_numbering_lines(markdown)
|
||||||
|
|
||||||
return DocumentConverterResult(markdown=markdown)
|
return DocumentConverterResult(markdown=markdown)
|
||||||
|
|||||||
@@ -0,0 +1,74 @@
|
|||||||
|
%PDF-1.3
|
||||||
|
%“Œ‹ž ReportLab Generated PDF document http://www.reportlab.com
|
||||||
|
1 0 obj
|
||||||
|
<<
|
||||||
|
/F1 2 0 R /F2 3 0 R
|
||||||
|
>>
|
||||||
|
endobj
|
||||||
|
2 0 obj
|
||||||
|
<<
|
||||||
|
/BaseFont /Helvetica /Encoding /WinAnsiEncoding /Name /F1 /Subtype /Type1 /Type /Font
|
||||||
|
>>
|
||||||
|
endobj
|
||||||
|
3 0 obj
|
||||||
|
<<
|
||||||
|
/BaseFont /Helvetica-Bold /Encoding /WinAnsiEncoding /Name /F2 /Subtype /Type1 /Type /Font
|
||||||
|
>>
|
||||||
|
endobj
|
||||||
|
4 0 obj
|
||||||
|
<<
|
||||||
|
/Contents 8 0 R /MediaBox [ 0 0 612 792 ] /Parent 7 0 R /Resources <<
|
||||||
|
/Font 1 0 R /ProcSet [ /PDF /Text /ImageB /ImageC /ImageI ]
|
||||||
|
>> /Rotate 0 /Trans <<
|
||||||
|
|
||||||
|
>>
|
||||||
|
/Type /Page
|
||||||
|
>>
|
||||||
|
endobj
|
||||||
|
5 0 obj
|
||||||
|
<<
|
||||||
|
/PageMode /UseNone /Pages 7 0 R /Type /Catalog
|
||||||
|
>>
|
||||||
|
endobj
|
||||||
|
6 0 obj
|
||||||
|
<<
|
||||||
|
/Author (anonymous) /CreationDate (D:20260108192537+01'00') /Creator (ReportLab PDF Library - www.reportlab.com) /Keywords () /ModDate (D:20260108192537+01'00') /Producer (ReportLab PDF Library - www.reportlab.com)
|
||||||
|
/Subject (unspecified) /Title (untitled) /Trapped /False
|
||||||
|
>>
|
||||||
|
endobj
|
||||||
|
7 0 obj
|
||||||
|
<<
|
||||||
|
/Count 1 /Kids [ 4 0 R ] /Type /Pages
|
||||||
|
>>
|
||||||
|
endobj
|
||||||
|
8 0 obj
|
||||||
|
<<
|
||||||
|
/Filter [ /ASCII85Decode /FlateDecode ] /Length 670
|
||||||
|
>>
|
||||||
|
stream
|
||||||
|
Gat$td;IYl'Rf-pcJpsZ/27V[H_WEoW#\5sVS2I3Jt]?;R+`$Ms*f.>6<=3APUNhTmQL<9F,pFup'KGk=TR,7^>/u!#kAE+l;?UQ8Fg(+-O>;^54HWJ*kXdl'VdsI]Y^$-G(GWPR)iGMeWbg3)F'+jfWpCb"rU?d?8?q_r!E2N'0sM)J>=XD.jgunBuga\Wi4MX$WV/b)1F@bC8Nj8(0*)"ZK06BSqlu1$[^37A;/aK=mfgqg$&i),2OH&%^\"B1%B\dd_V>$5OtPri4rcEe3LoBUeL6QAPnpQr+R-t0f]ZSYc?BTAKQ?A&+J#J*N*=6;'?@Cp*>auj0",hDS3bH4[hVs3O="&bk&U@>+8c1&c2iDg6R*%q%iEZq'-!FNSB8#C*'po69R8$S(:.=-$N6'!_[1/jV<$@V3Z_"gd!g!MJMT)mTUN4cWjUQQj]HT_m]0*R=YgTmcl@k>*b/SBce9?.m,bEi#?PI:=r_6G.auM&FtP,>O7T%Z<$f#=g6(2+d@;8?"$8cdI38ZZ>hq5b2_pQY:M\.Kod,pl)ZX7a7Gc'Mf_'SB1X3*L[-51a8`h4)KjJQjLfm/3TIeQY?2+?^.r^HNafjHp<5,1M=W'N>8sb=dB#FC5M`7L91"BC@CfEckPe`M5O:#!Fj$K]s(Gs8rW$>H7gK~>endstream
|
||||||
|
endobj
|
||||||
|
xref
|
||||||
|
0 9
|
||||||
|
0000000000 65535 f
|
||||||
|
0000000073 00000 n
|
||||||
|
0000000114 00000 n
|
||||||
|
0000000221 00000 n
|
||||||
|
0000000333 00000 n
|
||||||
|
0000000526 00000 n
|
||||||
|
0000000594 00000 n
|
||||||
|
0000000890 00000 n
|
||||||
|
0000000949 00000 n
|
||||||
|
trailer
|
||||||
|
<<
|
||||||
|
/ID
|
||||||
|
[<5467fcd5093f18002be6af3fb13ce6c3><5467fcd5093f18002be6af3fb13ce6c3>]
|
||||||
|
% ReportLab generated PDF document -- digest (http://www.reportlab.com)
|
||||||
|
|
||||||
|
/Info 6 0 R
|
||||||
|
/Root 5 0 R
|
||||||
|
/Size 9
|
||||||
|
>>
|
||||||
|
startxref
|
||||||
|
1709
|
||||||
|
%%EOF
|
||||||
@@ -0,0 +1,171 @@
|
|||||||
|
#!/usr/bin/env python3 -m pytest
|
||||||
|
"""Tests for MasterFormat-style partial numbering in PDF conversion."""
|
||||||
|
|
||||||
|
import os
|
||||||
|
import re
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
from markitdown import MarkItDown
|
||||||
|
from markitdown.converters._pdf_converter import PARTIAL_NUMBERING_PATTERN
|
||||||
|
|
||||||
|
TEST_FILES_DIR = os.path.join(os.path.dirname(__file__), "test_files")
|
||||||
|
|
||||||
|
|
||||||
|
class TestMasterFormatPartialNumbering:
|
||||||
|
"""Test handling of MasterFormat-style partial numbering (.1, .2, etc.)."""
|
||||||
|
|
||||||
|
def test_partial_numbering_pattern_regex(self):
|
||||||
|
"""Test that the partial numbering regex pattern correctly matches."""
|
||||||
|
|
||||||
|
# Should match partial numbering patterns
|
||||||
|
assert PARTIAL_NUMBERING_PATTERN.match(".1") is not None
|
||||||
|
assert PARTIAL_NUMBERING_PATTERN.match(".2") is not None
|
||||||
|
assert PARTIAL_NUMBERING_PATTERN.match(".10") is not None
|
||||||
|
assert PARTIAL_NUMBERING_PATTERN.match(".99") is not None
|
||||||
|
|
||||||
|
# Should NOT match other patterns
|
||||||
|
assert PARTIAL_NUMBERING_PATTERN.match("1.") is None
|
||||||
|
assert PARTIAL_NUMBERING_PATTERN.match("1.2") is None
|
||||||
|
assert PARTIAL_NUMBERING_PATTERN.match(".1.2") is None
|
||||||
|
assert PARTIAL_NUMBERING_PATTERN.match("text") is None
|
||||||
|
assert PARTIAL_NUMBERING_PATTERN.match(".a") is None
|
||||||
|
assert PARTIAL_NUMBERING_PATTERN.match("") is None
|
||||||
|
|
||||||
|
def test_masterformat_partial_numbering_not_split(self):
|
||||||
|
"""Test that MasterFormat partial numbering stays with associated text.
|
||||||
|
|
||||||
|
MasterFormat documents use partial numbering like:
|
||||||
|
.1 The intent of this Request for Proposal...
|
||||||
|
.2 Available information relative to...
|
||||||
|
|
||||||
|
These should NOT be split into separate table columns, but kept
|
||||||
|
as coherent text lines with the number followed by its description.
|
||||||
|
"""
|
||||||
|
pdf_path = os.path.join(TEST_FILES_DIR, "masterformat_partial_numbering.pdf")
|
||||||
|
|
||||||
|
markitdown = MarkItDown()
|
||||||
|
result = markitdown.convert(pdf_path)
|
||||||
|
text_content = result.text_content
|
||||||
|
|
||||||
|
# Partial numberings should NOT appear isolated on their own lines
|
||||||
|
# If they're isolated, it means the parser incorrectly split them from their text
|
||||||
|
lines = text_content.split("\n")
|
||||||
|
isolated_numberings = []
|
||||||
|
for line in lines:
|
||||||
|
stripped = line.strip()
|
||||||
|
# Check if line contains ONLY a partial numbering (with possible whitespace/pipes)
|
||||||
|
cleaned = stripped.replace("|", "").strip()
|
||||||
|
if cleaned in [".1", ".2", ".3", ".4", ".5", ".6", ".7", ".8", ".9", ".10"]:
|
||||||
|
isolated_numberings.append(stripped)
|
||||||
|
|
||||||
|
assert len(isolated_numberings) == 0, (
|
||||||
|
f"Partial numberings should not be isolated from their text. "
|
||||||
|
f"Found isolated: {isolated_numberings}"
|
||||||
|
)
|
||||||
|
|
||||||
|
# Verify that partial numberings appear WITH following text on the same line
|
||||||
|
# Look for patterns like ".1 The intent" or ".1 Some text"
|
||||||
|
partial_with_text = re.findall(r"\.\d+\s+\w+", text_content)
|
||||||
|
assert (
|
||||||
|
len(partial_with_text) > 0
|
||||||
|
), "Expected to find partial numberings followed by text on the same line"
|
||||||
|
|
||||||
|
def test_masterformat_content_preserved(self):
|
||||||
|
"""Test that MasterFormat document content is fully preserved."""
|
||||||
|
pdf_path = os.path.join(TEST_FILES_DIR, "masterformat_partial_numbering.pdf")
|
||||||
|
|
||||||
|
markitdown = MarkItDown()
|
||||||
|
result = markitdown.convert(pdf_path)
|
||||||
|
text_content = result.text_content
|
||||||
|
|
||||||
|
# Verify key content from the MasterFormat document is preserved
|
||||||
|
expected_content = [
|
||||||
|
"RFP for Construction Management Services",
|
||||||
|
"Section 00 00 43",
|
||||||
|
"Instructions to Respondents",
|
||||||
|
"Ken Sargent House",
|
||||||
|
"INTENT",
|
||||||
|
"Request for Proposal",
|
||||||
|
"KEN SARGENT HOUSE",
|
||||||
|
"GRANDE PRAIRIE, ALBERTA",
|
||||||
|
"Section 00 00 45",
|
||||||
|
]
|
||||||
|
|
||||||
|
for content in expected_content:
|
||||||
|
assert (
|
||||||
|
content in text_content
|
||||||
|
), f"Expected content '{content}' not found in extracted text"
|
||||||
|
|
||||||
|
# Verify partial numbering is followed by text on the same line
|
||||||
|
# .1 should be followed by "The intent" on the same line
|
||||||
|
assert re.search(
|
||||||
|
r"\.1\s+The intent", text_content
|
||||||
|
), "Partial numbering .1 should be followed by 'The intent' text"
|
||||||
|
|
||||||
|
# .2 should be followed by "Available information" on the same line
|
||||||
|
assert re.search(
|
||||||
|
r"\.2\s+Available information", text_content
|
||||||
|
), "Partial numbering .2 should be followed by 'Available information' text"
|
||||||
|
|
||||||
|
# Ensure text content is not empty and has reasonable length
|
||||||
|
assert (
|
||||||
|
len(text_content.strip()) > 100
|
||||||
|
), "MasterFormat document should have substantial text content"
|
||||||
|
|
||||||
|
def test_merge_partial_numbering_with_empty_lines_between(self):
|
||||||
|
"""Test that partial numberings merge correctly even with empty lines between.
|
||||||
|
|
||||||
|
When PDF extractors produce output like:
|
||||||
|
.1
|
||||||
|
|
||||||
|
The intent of this Request...
|
||||||
|
|
||||||
|
The merge logic should still combine them properly.
|
||||||
|
"""
|
||||||
|
pdf_path = os.path.join(TEST_FILES_DIR, "masterformat_partial_numbering.pdf")
|
||||||
|
|
||||||
|
markitdown = MarkItDown()
|
||||||
|
result = markitdown.convert(pdf_path)
|
||||||
|
text_content = result.text_content
|
||||||
|
|
||||||
|
# The merged result should have .1 and .2 followed by text
|
||||||
|
# Check that we don't have patterns like ".1\n\nThe intent" (unmerged)
|
||||||
|
lines = text_content.split("\n")
|
||||||
|
|
||||||
|
for i, line in enumerate(lines):
|
||||||
|
stripped = line.strip()
|
||||||
|
# If we find an isolated partial numbering, the merge failed
|
||||||
|
if stripped in [".1", ".2", ".3", ".4", ".5", ".6", ".7", ".8"]:
|
||||||
|
# Check if next non-empty line exists and wasn't merged
|
||||||
|
for j in range(i + 1, min(i + 3, len(lines))):
|
||||||
|
if lines[j].strip():
|
||||||
|
pytest.fail(
|
||||||
|
f"Partial numbering '{stripped}' on line {i} was not "
|
||||||
|
f"merged with following text '{lines[j].strip()[:30]}...'"
|
||||||
|
)
|
||||||
|
break
|
||||||
|
|
||||||
|
def test_multiple_partial_numberings_all_merged(self):
|
||||||
|
"""Test that all partial numberings in a document are properly merged."""
|
||||||
|
pdf_path = os.path.join(TEST_FILES_DIR, "masterformat_partial_numbering.pdf")
|
||||||
|
|
||||||
|
markitdown = MarkItDown()
|
||||||
|
result = markitdown.convert(pdf_path)
|
||||||
|
text_content = result.text_content
|
||||||
|
|
||||||
|
# Count occurrences of merged partial numberings (number followed by text)
|
||||||
|
merged_count = len(re.findall(r"\.\d+\s+[A-Za-z]", text_content))
|
||||||
|
|
||||||
|
# Count isolated partial numberings (number alone on a line)
|
||||||
|
isolated_count = 0
|
||||||
|
for line in text_content.split("\n"):
|
||||||
|
stripped = line.strip()
|
||||||
|
if re.match(r"^\.\d+$", stripped):
|
||||||
|
isolated_count += 1
|
||||||
|
|
||||||
|
assert (
|
||||||
|
merged_count >= 2
|
||||||
|
), f"Expected at least 2 merged partial numberings, found {merged_count}"
|
||||||
|
assert (
|
||||||
|
isolated_count == 0
|
||||||
|
), f"Found {isolated_count} isolated partial numberings that weren't merged"
|
||||||
Reference in New Issue
Block a user