Fix: PDF parsing doesn't support partially numbered lists (#1525)
* Fix: PDF parsing doesn't support partially numbered lists * Refactor: Move import of PARTIAL_NUMBERING_PATTERN to the top of the test file * Refactor: Improve assertion formatting in partial numbering tests
This commit is contained in:
@@ -1,4 +1,4 @@
|
||||
# SPDX-FileCopyrightText: 2024-present Adam Fourney <adamfo@microsoft.com>
|
||||
#
|
||||
# SPDX-License-Identifier: MIT
|
||||
__version__ = "0.1.4"
|
||||
__version__ = "0.1.5b1"
|
||||
|
||||
@@ -1,11 +1,62 @@
|
||||
import sys
|
||||
import io
|
||||
import re
|
||||
from typing import BinaryIO, Any
|
||||
|
||||
from .._base_converter import DocumentConverter, DocumentConverterResult
|
||||
from .._stream_info import StreamInfo
|
||||
from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE
|
||||
|
||||
# Pattern for MasterFormat-style partial numbering (e.g., ".1", ".2", ".10")
|
||||
PARTIAL_NUMBERING_PATTERN = re.compile(r"^\.\d+$")
|
||||
|
||||
|
||||
def _merge_partial_numbering_lines(text: str) -> str:
|
||||
"""
|
||||
Post-process extracted text to merge MasterFormat-style partial numbering
|
||||
with the following text line.
|
||||
|
||||
MasterFormat documents use partial numbering like:
|
||||
.1 The intent of this Request for Proposal...
|
||||
.2 Available information relative to...
|
||||
|
||||
Some PDF extractors split these into separate lines:
|
||||
.1
|
||||
The intent of this Request for Proposal...
|
||||
|
||||
This function merges them back together.
|
||||
"""
|
||||
lines = text.split("\n")
|
||||
result_lines: list[str] = []
|
||||
i = 0
|
||||
|
||||
while i < len(lines):
|
||||
line = lines[i]
|
||||
stripped = line.strip()
|
||||
|
||||
# Check if this line is ONLY a partial numbering
|
||||
if PARTIAL_NUMBERING_PATTERN.match(stripped):
|
||||
# Look for the next non-empty line to merge with
|
||||
j = i + 1
|
||||
while j < len(lines) and not lines[j].strip():
|
||||
j += 1
|
||||
|
||||
if j < len(lines):
|
||||
# Merge the partial numbering with the next line
|
||||
next_line = lines[j].strip()
|
||||
result_lines.append(f"{stripped} {next_line}")
|
||||
i = j + 1 # Skip past the merged line
|
||||
else:
|
||||
# No next line to merge with, keep as is
|
||||
result_lines.append(line)
|
||||
i += 1
|
||||
else:
|
||||
result_lines.append(line)
|
||||
i += 1
|
||||
|
||||
return "\n".join(result_lines)
|
||||
|
||||
|
||||
# Load dependencies
|
||||
_dependency_exc_info = None
|
||||
try:
|
||||
@@ -117,6 +168,14 @@ def _extract_form_content_from_words(page: Any) -> str | None:
|
||||
# Determine row type
|
||||
is_paragraph = line_width > page_width * 0.55 and len(combined_text) > 60
|
||||
|
||||
# Check for MasterFormat-style partial numbering (e.g., ".1", ".2")
|
||||
# These should be treated as list items, not table rows
|
||||
has_partial_numbering = False
|
||||
if row_words:
|
||||
first_word = row_words[0]["text"].strip()
|
||||
if PARTIAL_NUMBERING_PATTERN.match(first_word):
|
||||
has_partial_numbering = True
|
||||
|
||||
row_info.append(
|
||||
{
|
||||
"y_key": y_key,
|
||||
@@ -125,6 +184,7 @@ def _extract_form_content_from_words(page: Any) -> str | None:
|
||||
"x_groups": x_groups,
|
||||
"is_paragraph": is_paragraph,
|
||||
"num_columns": len(x_groups),
|
||||
"has_partial_numbering": has_partial_numbering,
|
||||
}
|
||||
)
|
||||
|
||||
@@ -156,6 +216,11 @@ def _extract_form_content_from_words(page: Any) -> str | None:
|
||||
info["is_table_row"] = False
|
||||
continue
|
||||
|
||||
# Rows with partial numbering (e.g., ".1", ".2") are list items, not table rows
|
||||
if info["has_partial_numbering"]:
|
||||
info["is_table_row"] = False
|
||||
continue
|
||||
|
||||
# Count how many global columns this row's words align with
|
||||
aligned_columns: set[int] = set()
|
||||
for word in info["words"]:
|
||||
@@ -469,4 +534,7 @@ class PdfConverter(DocumentConverter):
|
||||
pdf_bytes.seek(0)
|
||||
markdown = pdfminer.high_level.extract_text(pdf_bytes)
|
||||
|
||||
# Post-process to merge MasterFormat-style partial numbering with following text
|
||||
markdown = _merge_partial_numbering_lines(markdown)
|
||||
|
||||
return DocumentConverterResult(markdown=markdown)
|
||||
|
||||
@@ -0,0 +1,74 @@
|
||||
%PDF-1.3
|
||||
%“Œ‹ž ReportLab Generated PDF document http://www.reportlab.com
|
||||
1 0 obj
|
||||
<<
|
||||
/F1 2 0 R /F2 3 0 R
|
||||
>>
|
||||
endobj
|
||||
2 0 obj
|
||||
<<
|
||||
/BaseFont /Helvetica /Encoding /WinAnsiEncoding /Name /F1 /Subtype /Type1 /Type /Font
|
||||
>>
|
||||
endobj
|
||||
3 0 obj
|
||||
<<
|
||||
/BaseFont /Helvetica-Bold /Encoding /WinAnsiEncoding /Name /F2 /Subtype /Type1 /Type /Font
|
||||
>>
|
||||
endobj
|
||||
4 0 obj
|
||||
<<
|
||||
/Contents 8 0 R /MediaBox [ 0 0 612 792 ] /Parent 7 0 R /Resources <<
|
||||
/Font 1 0 R /ProcSet [ /PDF /Text /ImageB /ImageC /ImageI ]
|
||||
>> /Rotate 0 /Trans <<
|
||||
|
||||
>>
|
||||
/Type /Page
|
||||
>>
|
||||
endobj
|
||||
5 0 obj
|
||||
<<
|
||||
/PageMode /UseNone /Pages 7 0 R /Type /Catalog
|
||||
>>
|
||||
endobj
|
||||
6 0 obj
|
||||
<<
|
||||
/Author (anonymous) /CreationDate (D:20260108192537+01'00') /Creator (ReportLab PDF Library - www.reportlab.com) /Keywords () /ModDate (D:20260108192537+01'00') /Producer (ReportLab PDF Library - www.reportlab.com)
|
||||
/Subject (unspecified) /Title (untitled) /Trapped /False
|
||||
>>
|
||||
endobj
|
||||
7 0 obj
|
||||
<<
|
||||
/Count 1 /Kids [ 4 0 R ] /Type /Pages
|
||||
>>
|
||||
endobj
|
||||
8 0 obj
|
||||
<<
|
||||
/Filter [ /ASCII85Decode /FlateDecode ] /Length 670
|
||||
>>
|
||||
stream
|
||||
Gat$td;IYl'Rf-pcJpsZ/27V[H_WEoW#\5sVS2I3Jt]?;R+`$Ms*f.>6<=3APUNhTmQL<9F,pFup'KGk=TR,7^>/u!#kAE+l;?UQ8Fg(+-O>;^54HWJ*kXdl'VdsI]Y^$-G(GWPR)iGMeWbg3)F'+jfWpCb"rU?d?8?q_r!E2N'0sM)J>=XD.jgunBuga\Wi4MX$WV/b)1F@bC8Nj8(0*)"ZK06BSqlu1$[^37A;/aK=mfgqg$&i),2OH&%^\"B1%B\dd_V>$5OtPri4rcEe3LoBUeL6QAPnpQr+R-t0f]ZSYc?BTAKQ?A&+J#J*N*=6;'?@Cp*>auj0",hDS3bH4[hVs3O="&bk&U@>+8c1&c2iDg6R*%q%iEZq'-!FNSB8#C*'po69R8$S(:.=-$N6'!_[1/jV<$@V3Z_"gd!g!MJMT)mTUN4cWjUQQj]HT_m]0*R=YgTmcl@k>*b/SBce9?.m,bEi#?PI:=r_6G.auM&FtP,>O7T%Z<$f#=g6(2+d@;8?"$8cdI38ZZ>hq5b2_pQY:M\.Kod,pl)ZX7a7Gc'Mf_'SB1X3*L[-51a8`h4)KjJQjLfm/3TIeQY?2+?^.r^HNafjHp<5,1M=W'N>8sb=dB#FC5M`7L91"BC@CfEckPe`M5O:#!Fj$K]s(Gs8rW$>H7gK~>endstream
|
||||
endobj
|
||||
xref
|
||||
0 9
|
||||
0000000000 65535 f
|
||||
0000000073 00000 n
|
||||
0000000114 00000 n
|
||||
0000000221 00000 n
|
||||
0000000333 00000 n
|
||||
0000000526 00000 n
|
||||
0000000594 00000 n
|
||||
0000000890 00000 n
|
||||
0000000949 00000 n
|
||||
trailer
|
||||
<<
|
||||
/ID
|
||||
[<5467fcd5093f18002be6af3fb13ce6c3><5467fcd5093f18002be6af3fb13ce6c3>]
|
||||
% ReportLab generated PDF document -- digest (http://www.reportlab.com)
|
||||
|
||||
/Info 6 0 R
|
||||
/Root 5 0 R
|
||||
/Size 9
|
||||
>>
|
||||
startxref
|
||||
1709
|
||||
%%EOF
|
||||
@@ -0,0 +1,171 @@
|
||||
#!/usr/bin/env python3 -m pytest
|
||||
"""Tests for MasterFormat-style partial numbering in PDF conversion."""
|
||||
|
||||
import os
|
||||
import re
|
||||
import pytest
|
||||
|
||||
from markitdown import MarkItDown
|
||||
from markitdown.converters._pdf_converter import PARTIAL_NUMBERING_PATTERN
|
||||
|
||||
TEST_FILES_DIR = os.path.join(os.path.dirname(__file__), "test_files")
|
||||
|
||||
|
||||
class TestMasterFormatPartialNumbering:
|
||||
"""Test handling of MasterFormat-style partial numbering (.1, .2, etc.)."""
|
||||
|
||||
def test_partial_numbering_pattern_regex(self):
|
||||
"""Test that the partial numbering regex pattern correctly matches."""
|
||||
|
||||
# Should match partial numbering patterns
|
||||
assert PARTIAL_NUMBERING_PATTERN.match(".1") is not None
|
||||
assert PARTIAL_NUMBERING_PATTERN.match(".2") is not None
|
||||
assert PARTIAL_NUMBERING_PATTERN.match(".10") is not None
|
||||
assert PARTIAL_NUMBERING_PATTERN.match(".99") is not None
|
||||
|
||||
# Should NOT match other patterns
|
||||
assert PARTIAL_NUMBERING_PATTERN.match("1.") is None
|
||||
assert PARTIAL_NUMBERING_PATTERN.match("1.2") is None
|
||||
assert PARTIAL_NUMBERING_PATTERN.match(".1.2") is None
|
||||
assert PARTIAL_NUMBERING_PATTERN.match("text") is None
|
||||
assert PARTIAL_NUMBERING_PATTERN.match(".a") is None
|
||||
assert PARTIAL_NUMBERING_PATTERN.match("") is None
|
||||
|
||||
def test_masterformat_partial_numbering_not_split(self):
|
||||
"""Test that MasterFormat partial numbering stays with associated text.
|
||||
|
||||
MasterFormat documents use partial numbering like:
|
||||
.1 The intent of this Request for Proposal...
|
||||
.2 Available information relative to...
|
||||
|
||||
These should NOT be split into separate table columns, but kept
|
||||
as coherent text lines with the number followed by its description.
|
||||
"""
|
||||
pdf_path = os.path.join(TEST_FILES_DIR, "masterformat_partial_numbering.pdf")
|
||||
|
||||
markitdown = MarkItDown()
|
||||
result = markitdown.convert(pdf_path)
|
||||
text_content = result.text_content
|
||||
|
||||
# Partial numberings should NOT appear isolated on their own lines
|
||||
# If they're isolated, it means the parser incorrectly split them from their text
|
||||
lines = text_content.split("\n")
|
||||
isolated_numberings = []
|
||||
for line in lines:
|
||||
stripped = line.strip()
|
||||
# Check if line contains ONLY a partial numbering (with possible whitespace/pipes)
|
||||
cleaned = stripped.replace("|", "").strip()
|
||||
if cleaned in [".1", ".2", ".3", ".4", ".5", ".6", ".7", ".8", ".9", ".10"]:
|
||||
isolated_numberings.append(stripped)
|
||||
|
||||
assert len(isolated_numberings) == 0, (
|
||||
f"Partial numberings should not be isolated from their text. "
|
||||
f"Found isolated: {isolated_numberings}"
|
||||
)
|
||||
|
||||
# Verify that partial numberings appear WITH following text on the same line
|
||||
# Look for patterns like ".1 The intent" or ".1 Some text"
|
||||
partial_with_text = re.findall(r"\.\d+\s+\w+", text_content)
|
||||
assert (
|
||||
len(partial_with_text) > 0
|
||||
), "Expected to find partial numberings followed by text on the same line"
|
||||
|
||||
def test_masterformat_content_preserved(self):
|
||||
"""Test that MasterFormat document content is fully preserved."""
|
||||
pdf_path = os.path.join(TEST_FILES_DIR, "masterformat_partial_numbering.pdf")
|
||||
|
||||
markitdown = MarkItDown()
|
||||
result = markitdown.convert(pdf_path)
|
||||
text_content = result.text_content
|
||||
|
||||
# Verify key content from the MasterFormat document is preserved
|
||||
expected_content = [
|
||||
"RFP for Construction Management Services",
|
||||
"Section 00 00 43",
|
||||
"Instructions to Respondents",
|
||||
"Ken Sargent House",
|
||||
"INTENT",
|
||||
"Request for Proposal",
|
||||
"KEN SARGENT HOUSE",
|
||||
"GRANDE PRAIRIE, ALBERTA",
|
||||
"Section 00 00 45",
|
||||
]
|
||||
|
||||
for content in expected_content:
|
||||
assert (
|
||||
content in text_content
|
||||
), f"Expected content '{content}' not found in extracted text"
|
||||
|
||||
# Verify partial numbering is followed by text on the same line
|
||||
# .1 should be followed by "The intent" on the same line
|
||||
assert re.search(
|
||||
r"\.1\s+The intent", text_content
|
||||
), "Partial numbering .1 should be followed by 'The intent' text"
|
||||
|
||||
# .2 should be followed by "Available information" on the same line
|
||||
assert re.search(
|
||||
r"\.2\s+Available information", text_content
|
||||
), "Partial numbering .2 should be followed by 'Available information' text"
|
||||
|
||||
# Ensure text content is not empty and has reasonable length
|
||||
assert (
|
||||
len(text_content.strip()) > 100
|
||||
), "MasterFormat document should have substantial text content"
|
||||
|
||||
def test_merge_partial_numbering_with_empty_lines_between(self):
|
||||
"""Test that partial numberings merge correctly even with empty lines between.
|
||||
|
||||
When PDF extractors produce output like:
|
||||
.1
|
||||
|
||||
The intent of this Request...
|
||||
|
||||
The merge logic should still combine them properly.
|
||||
"""
|
||||
pdf_path = os.path.join(TEST_FILES_DIR, "masterformat_partial_numbering.pdf")
|
||||
|
||||
markitdown = MarkItDown()
|
||||
result = markitdown.convert(pdf_path)
|
||||
text_content = result.text_content
|
||||
|
||||
# The merged result should have .1 and .2 followed by text
|
||||
# Check that we don't have patterns like ".1\n\nThe intent" (unmerged)
|
||||
lines = text_content.split("\n")
|
||||
|
||||
for i, line in enumerate(lines):
|
||||
stripped = line.strip()
|
||||
# If we find an isolated partial numbering, the merge failed
|
||||
if stripped in [".1", ".2", ".3", ".4", ".5", ".6", ".7", ".8"]:
|
||||
# Check if next non-empty line exists and wasn't merged
|
||||
for j in range(i + 1, min(i + 3, len(lines))):
|
||||
if lines[j].strip():
|
||||
pytest.fail(
|
||||
f"Partial numbering '{stripped}' on line {i} was not "
|
||||
f"merged with following text '{lines[j].strip()[:30]}...'"
|
||||
)
|
||||
break
|
||||
|
||||
def test_multiple_partial_numberings_all_merged(self):
|
||||
"""Test that all partial numberings in a document are properly merged."""
|
||||
pdf_path = os.path.join(TEST_FILES_DIR, "masterformat_partial_numbering.pdf")
|
||||
|
||||
markitdown = MarkItDown()
|
||||
result = markitdown.convert(pdf_path)
|
||||
text_content = result.text_content
|
||||
|
||||
# Count occurrences of merged partial numberings (number followed by text)
|
||||
merged_count = len(re.findall(r"\.\d+\s+[A-Za-z]", text_content))
|
||||
|
||||
# Count isolated partial numberings (number alone on a line)
|
||||
isolated_count = 0
|
||||
for line in text_content.split("\n"):
|
||||
stripped = line.strip()
|
||||
if re.match(r"^\.\d+$", stripped):
|
||||
isolated_count += 1
|
||||
|
||||
assert (
|
||||
merged_count >= 2
|
||||
), f"Expected at least 2 merged partial numberings, found {merged_count}"
|
||||
assert (
|
||||
isolated_count == 0
|
||||
), f"Found {isolated_count} isolated partial numberings that weren't merged"
|
||||
Reference in New Issue
Block a user