Fix: PDF parsing doesn't support partially numbered lists (#1525)

* Fix: PDF parsing doesn't support partially numbered lists

* Refactor: Move import of PARTIAL_NUMBERING_PATTERN to the top of the test file

* Refactor: Improve assertion formatting in partial numbering tests
This commit is contained in:
lesyk
2026-01-09 00:15:22 +01:00
committed by GitHub
parent 251dddcf0c
commit 7fdaefb724
4 changed files with 314 additions and 1 deletions
@@ -1,4 +1,4 @@
# SPDX-FileCopyrightText: 2024-present Adam Fourney <adamfo@microsoft.com>
#
# SPDX-License-Identifier: MIT
__version__ = "0.1.4"
__version__ = "0.1.5b1"
@@ -1,11 +1,62 @@
import sys
import io
import re
from typing import BinaryIO, Any
from .._base_converter import DocumentConverter, DocumentConverterResult
from .._stream_info import StreamInfo
from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE
# Pattern for MasterFormat-style partial numbering (e.g., ".1", ".2", ".10")
PARTIAL_NUMBERING_PATTERN = re.compile(r"^\.\d+$")


def _merge_partial_numbering_lines(text: str) -> str:
    """
    Post-process extracted text to merge MasterFormat-style partial numbering
    with the following text line.

    MasterFormat documents use partial numbering like:
        .1  The intent of this Request for Proposal...
        .2  Available information relative to...

    Some PDF extractors split these into separate lines:
        .1
        The intent of this Request for Proposal...

    This function merges them back together as ".1 The intent of ...".

    Args:
        text: Extracted text, with lines separated by "\\n".

    Returns:
        The text with every line that consists solely of a partial number
        joined (by a single space) to the next non-empty line. Blank lines
        between the number and its text are dropped, and both parts are
        stripped of surrounding whitespace. A partial number is left
        untouched when nothing but blank lines follows it, or when the next
        non-empty line is itself a bare partial number (so ".1" never
        swallows ".2"). All other lines are returned unchanged.
    """
    lines = text.split("\n")
    result_lines: list[str] = []
    i = 0
    while i < len(lines):
        line = lines[i]
        stripped = line.strip()

        # Lines that are not ONLY a partial numbering pass through verbatim.
        if not PARTIAL_NUMBERING_PATTERN.match(stripped):
            result_lines.append(line)
            i += 1
            continue

        # Look for the next non-empty line to merge with.
        j = i + 1
        while j < len(lines) and not lines[j].strip():
            j += 1

        next_line = lines[j].strip() if j < len(lines) else ""

        # Keep the marker as is when there is nothing to merge with, or when
        # the candidate is another bare marker: merging would produce
        # ".1 .2" and leave the text that belongs to ".2" without its number.
        if not next_line or PARTIAL_NUMBERING_PATTERN.match(next_line):
            result_lines.append(line)
            i += 1
            continue

        # Merge the partial numbering with its text and skip past the
        # consumed line (and any blank lines in between).
        result_lines.append(f"{stripped} {next_line}")
        i = j + 1

    return "\n".join(result_lines)
# Load dependencies
_dependency_exc_info = None
try:
@@ -117,6 +168,14 @@ def _extract_form_content_from_words(page: Any) -> str | None:
# Determine row type
is_paragraph = line_width > page_width * 0.55 and len(combined_text) > 60
# Check for MasterFormat-style partial numbering (e.g., ".1", ".2")
# These should be treated as list items, not table rows
has_partial_numbering = False
if row_words:
first_word = row_words[0]["text"].strip()
if PARTIAL_NUMBERING_PATTERN.match(first_word):
has_partial_numbering = True
row_info.append(
{
"y_key": y_key,
@@ -125,6 +184,7 @@ def _extract_form_content_from_words(page: Any) -> str | None:
"x_groups": x_groups,
"is_paragraph": is_paragraph,
"num_columns": len(x_groups),
"has_partial_numbering": has_partial_numbering,
}
)
@@ -156,6 +216,11 @@ def _extract_form_content_from_words(page: Any) -> str | None:
info["is_table_row"] = False
continue
# Rows with partial numbering (e.g., ".1", ".2") are list items, not table rows
if info["has_partial_numbering"]:
info["is_table_row"] = False
continue
# Count how many global columns this row's words align with
aligned_columns: set[int] = set()
for word in info["words"]:
@@ -469,4 +534,7 @@ class PdfConverter(DocumentConverter):
pdf_bytes.seek(0)
markdown = pdfminer.high_level.extract_text(pdf_bytes)
# Post-process to merge MasterFormat-style partial numbering with following text
markdown = _merge_partial_numbering_lines(markdown)
return DocumentConverterResult(markdown=markdown)
@@ -0,0 +1,74 @@
%PDF-1.3
%“Œ‹ž ReportLab Generated PDF document http://www.reportlab.com
1 0 obj
<<
/F1 2 0 R /F2 3 0 R
>>
endobj
2 0 obj
<<
/BaseFont /Helvetica /Encoding /WinAnsiEncoding /Name /F1 /Subtype /Type1 /Type /Font
>>
endobj
3 0 obj
<<
/BaseFont /Helvetica-Bold /Encoding /WinAnsiEncoding /Name /F2 /Subtype /Type1 /Type /Font
>>
endobj
4 0 obj
<<
/Contents 8 0 R /MediaBox [ 0 0 612 792 ] /Parent 7 0 R /Resources <<
/Font 1 0 R /ProcSet [ /PDF /Text /ImageB /ImageC /ImageI ]
>> /Rotate 0 /Trans <<
>>
/Type /Page
>>
endobj
5 0 obj
<<
/PageMode /UseNone /Pages 7 0 R /Type /Catalog
>>
endobj
6 0 obj
<<
/Author (anonymous) /CreationDate (D:20260108192537+01'00') /Creator (ReportLab PDF Library - www.reportlab.com) /Keywords () /ModDate (D:20260108192537+01'00') /Producer (ReportLab PDF Library - www.reportlab.com)
/Subject (unspecified) /Title (untitled) /Trapped /False
>>
endobj
7 0 obj
<<
/Count 1 /Kids [ 4 0 R ] /Type /Pages
>>
endobj
8 0 obj
<<
/Filter [ /ASCII85Decode /FlateDecode ] /Length 670
>>
stream
Gat$td;IYl'Rf-pcJpsZ/27V[H_WEoW#\5sVS2I3Jt]?;R+`$Ms*f.>6<=3APUNhTmQL<9F,pFup'KGk=TR,7^>/u!#kAE+l;?UQ8Fg(+-O>;^54HWJ*kXdl'VdsI]Y^$-G(GWPR)iGMeWbg3)F'+jfWpCb"rU?d?8?q_r!E2N'0sM)J>=XD.jgunBuga\Wi4MX$WV/b)1F@bC8Nj8(0*)"ZK06BSqlu1$[^37A;/aK=mfgqg$&i),2OH&%^\"B1%B\dd_V>$5OtPri4rcEe3LoBUeL6QAPnpQr+R-t0f]ZSYc?BTAKQ?A&+J#J*N*=6;'?@Cp*>auj0",hDS3bH4[hVs3O="&bk&U@>+8c1&c2iDg6R*%q%iEZq'-!FNSB8#C*'po69R8$S(:.=-$N6'!_[1/jV<$@V3Z_"gd!g!MJMT)mTUN4cWjUQQj]HT_m]0*R=YgTmcl@k>*b/SBce9?.m,bEi#?PI:=r_6G.auM&FtP,>O7T%Z<$f#=g6(2+d@;8?"$8cdI38ZZ>hq5b2_pQY:M\.Kod,pl)ZX7a7Gc'Mf_'SB1X3*L[-51a8`h4)KjJQjLfm/3TIeQY?2+?^.r^HNafjHp<5,1M=W'N>8sb=dB#FC5M`7L91"BC@CfEckPe`M5O:#!Fj$K]s(Gs8rW$>H7gK~>endstream
endobj
xref
0 9
0000000000 65535 f
0000000073 00000 n
0000000114 00000 n
0000000221 00000 n
0000000333 00000 n
0000000526 00000 n
0000000594 00000 n
0000000890 00000 n
0000000949 00000 n
trailer
<<
/ID
[<5467fcd5093f18002be6af3fb13ce6c3><5467fcd5093f18002be6af3fb13ce6c3>]
% ReportLab generated PDF document -- digest (http://www.reportlab.com)
/Info 6 0 R
/Root 5 0 R
/Size 9
>>
startxref
1709
%%EOF
@@ -0,0 +1,171 @@
#!/usr/bin/env python3 -m pytest
"""Tests for MasterFormat-style partial numbering in PDF conversion."""
import os
import re
import pytest
from markitdown import MarkItDown
from markitdown.converters._pdf_converter import PARTIAL_NUMBERING_PATTERN
TEST_FILES_DIR = os.path.join(os.path.dirname(__file__), "test_files")
class TestMasterFormatPartialNumbering:
    """Test handling of MasterFormat-style partial numbering (.1, .2, etc.)."""

    @staticmethod
    def _convert_sample_pdf() -> str:
        """Convert the MasterFormat fixture PDF and return its extracted text."""
        pdf_path = os.path.join(TEST_FILES_DIR, "masterformat_partial_numbering.pdf")
        return MarkItDown().convert(pdf_path).text_content

    @staticmethod
    def _isolated_numberings(text: str) -> list[str]:
        """Return the stripped lines of *text* holding ONLY a partial numbering.

        Table pipes ("|") are ignored so that a numbering sitting alone in a
        table cell row is reported as isolated too.
        """
        isolated = []
        for line in text.split("\n"):
            stripped = line.strip()
            cleaned = stripped.replace("|", "").strip()
            if PARTIAL_NUMBERING_PATTERN.match(cleaned):
                isolated.append(stripped)
        return isolated

    def test_partial_numbering_pattern_regex(self):
        """Test that the partial numbering regex pattern correctly matches."""
        # Should match partial numbering patterns
        for candidate in (".1", ".2", ".10", ".99"):
            assert (
                PARTIAL_NUMBERING_PATTERN.match(candidate) is not None
            ), f"Expected '{candidate}' to match the partial numbering pattern"

        # Should NOT match other patterns
        for candidate in ("1.", "1.2", ".1.2", "text", ".a", ""):
            assert (
                PARTIAL_NUMBERING_PATTERN.match(candidate) is None
            ), f"Expected '{candidate}' NOT to match the partial numbering pattern"

    def test_masterformat_partial_numbering_not_split(self):
        """Test that MasterFormat partial numbering stays with associated text.

        MasterFormat documents use partial numbering like:
            .1  The intent of this Request for Proposal...
            .2  Available information relative to...

        These should NOT be split into separate table columns, but kept
        as coherent text lines with the number followed by its description.
        """
        text_content = self._convert_sample_pdf()

        # Partial numberings should NOT appear isolated on their own lines.
        # If they're isolated, the parser incorrectly split them from their text.
        isolated_numberings = self._isolated_numberings(text_content)
        assert len(isolated_numberings) == 0, (
            f"Partial numberings should not be isolated from their text. "
            f"Found isolated: {isolated_numberings}"
        )

        # Verify that partial numberings appear WITH following text on the same
        # line. Only horizontal whitespace is allowed between them: "\s" would
        # also match newlines and accept an unmerged ".1\n\nThe intent".
        partial_with_text = re.findall(r"\.\d+[ \t]+\w+", text_content)
        assert (
            len(partial_with_text) > 0
        ), "Expected to find partial numberings followed by text on the same line"

    def test_masterformat_content_preserved(self):
        """Test that MasterFormat document content is fully preserved."""
        text_content = self._convert_sample_pdf()

        # Verify key content from the MasterFormat document is preserved
        expected_content = [
            "RFP for Construction Management Services",
            "Section 00 00 43",
            "Instructions to Respondents",
            "Ken Sargent House",
            "INTENT",
            "Request for Proposal",
            "KEN SARGENT HOUSE",
            "GRANDE PRAIRIE, ALBERTA",
            "Section 00 00 45",
        ]
        for content in expected_content:
            assert (
                content in text_content
            ), f"Expected content '{content}' not found in extracted text"

        # Verify partial numbering is followed by text on the same line
        # (horizontal whitespace only, so a line break does not count).
        # .1 should be followed by "The intent" on the same line
        assert re.search(
            r"\.1[ \t]+The intent", text_content
        ), "Partial numbering .1 should be followed by 'The intent' text"

        # .2 should be followed by "Available information" on the same line
        assert re.search(
            r"\.2[ \t]+Available information", text_content
        ), "Partial numbering .2 should be followed by 'Available information' text"

        # Ensure text content is not empty and has reasonable length
        assert (
            len(text_content.strip()) > 100
        ), "MasterFormat document should have substantial text content"

    def test_merge_partial_numbering_with_empty_lines_between(self):
        """Test that partial numberings merge correctly even with empty lines between.

        When PDF extractors produce output like:
            .1

            The intent of this Request...

        The merge logic should still combine them properly.
        """
        # Imported here so the module-level imports of this file stay untouched.
        from markitdown.converters._pdf_converter import (
            _merge_partial_numbering_lines,
        )

        # Exercise the merge logic directly with one and with several blank
        # lines between the numbering and its text.
        split_text = (
            ".1\n\nThe intent of this Request...\n"
            ".2\n\n\nAvailable information..."
        )
        assert _merge_partial_numbering_lines(split_text) == (
            ".1 The intent of this Request...\n.2 Available information..."
        )

        # The converted fixture must not contain an unmerged numbering either,
        # i.e. no pattern like ".1\n\nThe intent".
        lines = self._convert_sample_pdf().split("\n")
        for i, line in enumerate(lines):
            stripped = line.strip()
            if not PARTIAL_NUMBERING_PATTERN.match(stripped):
                continue
            # An isolated numbering with ANY later non-empty line means the
            # merge failed, however many blank lines sit in between.
            following = next(
                (later.strip() for later in lines[i + 1 :] if later.strip()),
                None,
            )
            if following is not None:
                pytest.fail(
                    f"Partial numbering '{stripped}' on line {i} was not "
                    f"merged with following text '{following[:30]}...'"
                )

    def test_multiple_partial_numberings_all_merged(self):
        """Test that all partial numberings in a document are properly merged."""
        text_content = self._convert_sample_pdf()

        # Count occurrences of merged partial numberings (number followed by
        # text on the same line; a newline in between does not count).
        merged_count = len(re.findall(r"\.\d+[ \t]+[A-Za-z]", text_content))

        # Count isolated partial numberings (number alone on a line)
        isolated_count = len(self._isolated_numberings(text_content))

        assert (
            merged_count >= 2
        ), f"Expected at least 2 merged partial numberings, found {merged_count}"
        assert (
            isolated_count == 0
        ), f"Found {isolated_count} isolated partial numberings that weren't merged"