Fix: PDF parsing doesn't support partially numbered lists (#1525)

* Fix: PDF parsing doesn't support partially numbered lists * Refactor: Move import of PARTIAL_NUMBERING_PATTERN to the top of the test file * Refactor: Improve assertion formatting in partial numbering tests
2026-01-09 00:15:22 +01:00
parent 251dddcf0c
commit 7fdaefb724
4 changed files with 314 additions and 1 deletions
@@ -1,4 +1,4 @@
 # SPDX-FileCopyrightText: 2024-present Adam Fourney <adamfo@microsoft.com>
 #
 # SPDX-License-Identifier: MIT
-__version__ = "0.1.4"
+__version__ = "0.1.5b1"
@@ -1,11 +1,62 @@
 import sys
 import io
 import re
 from typing import BinaryIO, Any
 from .._base_converter import DocumentConverter, DocumentConverterResult
 from .._stream_info import StreamInfo
 from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE
 # Pattern for MasterFormat-style partial numbering (e.g., ".1", ".2", ".10")
 PARTIAL_NUMBERING_PATTERN = re.compile(r"^\.\d+$")
 def _merge_partial_numbering_lines(text: str) -> str:
    """
    Post-process extracted text to merge MasterFormat-style partial numbering
    with the following text line.
    MasterFormat documents use partial numbering like:
        .1  The intent of this Request for Proposal...
        .2  Available information relative to...
    Some PDF extractors split these into separate lines:
        .1
        The intent of this Request for Proposal...
    This function merges them back together.
    """
    lines = text.split("\n")
    result_lines: list[str] = []
    i = 0
    while i < len(lines):
        line = lines[i]
        stripped = line.strip()
        # Check if this line is ONLY a partial numbering
        if PARTIAL_NUMBERING_PATTERN.match(stripped):
            # Look for the next non-empty line to merge with
            j = i + 1
            while j < len(lines) and not lines[j].strip():
                j += 1
            if j < len(lines):
                # Merge the partial numbering with the next line
                next_line = lines[j].strip()
                result_lines.append(f"{stripped} {next_line}")
                i = j + 1  # Skip past the merged line
            else:
                # No next line to merge with, keep as is
                result_lines.append(line)
                i += 1
        else:
            result_lines.append(line)
            i += 1
    return "\n".join(result_lines)
 # Load dependencies
 _dependency_exc_info = None
 try:
@@ -117,6 +168,14 @@ def _extract_form_content_from_words(page: Any) -> str | None:
        # Determine row type
        is_paragraph = line_width > page_width * 0.55 and len(combined_text) > 60
        # Check for MasterFormat-style partial numbering (e.g., ".1", ".2")
        # These should be treated as list items, not table rows
        has_partial_numbering = False
        if row_words:
            first_word = row_words[0]["text"].strip()
            if PARTIAL_NUMBERING_PATTERN.match(first_word):
                has_partial_numbering = True
        row_info.append(
            {
                "y_key": y_key,
@@ -125,6 +184,7 @@ def _extract_form_content_from_words(page: Any) -> str | None:
                "x_groups": x_groups,
                "is_paragraph": is_paragraph,
                "num_columns": len(x_groups),
                "has_partial_numbering": has_partial_numbering,
            }
        )
@@ -156,6 +216,11 @@ def _extract_form_content_from_words(page: Any) -> str | None:
            info["is_table_row"] = False
            continue
        # Rows with partial numbering (e.g., ".1", ".2") are list items, not table rows
        if info["has_partial_numbering"]:
            info["is_table_row"] = False
            continue
        # Count how many global columns this row's words align with
        aligned_columns: set[int] = set()
        for word in info["words"]:
@@ -469,4 +534,7 @@ class PdfConverter(DocumentConverter):
            pdf_bytes.seek(0)
            markdown = pdfminer.high_level.extract_text(pdf_bytes)
        # Post-process to merge MasterFormat-style partial numbering with following text
        markdown = _merge_partial_numbering_lines(markdown)
        return DocumentConverterResult(markdown=markdown)
@@ -0,0 +1,74 @@
 %PDF-1.3
 %“Œ‹ž ReportLab Generated PDF document http://www.reportlab.com
 1 0 obj
 <<
 /F1 2 0 R /F2 3 0 R
 >>
 endobj
 2 0 obj
 <<
 /BaseFont /Helvetica /Encoding /WinAnsiEncoding /Name /F1 /Subtype /Type1 /Type /Font
 >>
 endobj
 3 0 obj
 <<
 /BaseFont /Helvetica-Bold /Encoding /WinAnsiEncoding /Name /F2 /Subtype /Type1 /Type /Font
 >>
 endobj
 4 0 obj
 <<
 /Contents 8 0 R /MediaBox [ 0 0 612 792 ] /Parent 7 0 R /Resources <<
 /Font 1 0 R /ProcSet [ /PDF /Text /ImageB /ImageC /ImageI ]
 >> /Rotate 0 /Trans <<
 >> 
  /Type /Page
 >>
 endobj
 5 0 obj
 <<
 /PageMode /UseNone /Pages 7 0 R /Type /Catalog
 >>
 endobj
 6 0 obj
 <<
 /Author (anonymous) /CreationDate (D:20260108192537+01'00') /Creator (ReportLab PDF Library - www.reportlab.com) /Keywords () /ModDate (D:20260108192537+01'00') /Producer (ReportLab PDF Library - www.reportlab.com) 
  /Subject (unspecified) /Title (untitled) /Trapped /False
 >>
 endobj
 7 0 obj
 <<
 /Count 1 /Kids [ 4 0 R ] /Type /Pages
 >>
 endobj
 8 0 obj
 <<
 /Filter [ /ASCII85Decode /FlateDecode ] /Length 670
 >>
 stream
 Gat$td;IYl'Rf-pcJpsZ/27V[H_WEoW#\5sVS2I3Jt]?;R+`$Ms*f.>6<=3APUNhTmQL<9F,pFup'KGk=TR,7^>/u!#kAE+l;?UQ8Fg(+-O>;^54HWJ*kXdl'VdsI]Y^$-G(GWPR)iGMeWbg3)F'+jfWpCb"rU?d?8?q_r!E2N'0sM)J>=XD.jgunBuga\Wi4MX$WV/b)1F@bC8Nj8(0*)"ZK06BSqlu1$[^37A;/aK=mfgqg$&i),2OH&%^\"B1%B\dd_V>$5OtPri4rcEe3LoBUeL6QAPnpQr+R-t0f]ZSYc?BTAKQ?A&+J#J*N*=6;'?@Cp*>auj0",hDS3bH4[hVs3O="&bk&U@>+8c1&c2iDg6R*%q%iEZq'-!FNSB8#C*'po69R8$S(:.=-$N6'!_[1/jV<$@V3Z_"gd!g!MJMT)mTUN4cWjUQQj]HT_m]0*R=YgTmcl@k>*b/SBce9?.m,bEi#?PI:=r_6G.auM&FtP,>O7T%Z<$f#=g6(2+d@;8?"$8cdI38ZZ>hq5b2_pQY:M\.Kod,pl)ZX7a7Gc'Mf_'SB1X3*L[-51a8`h4)KjJQjLfm/3TIeQY?2+?^.r^HNafjHp<5,1M=W'N>8sb=dB#FC5M`7L91"BC@CfEckPe`M5O:#!Fj$K]s(Gs8rW$>H7gK~>endstream
 endobj
 xref
 0 9
 0000000000 65535 f 
 0000000073 00000 n 
 0000000114 00000 n 
 0000000221 00000 n 
 0000000333 00000 n 
 0000000526 00000 n 
 0000000594 00000 n 
 0000000890 00000 n 
 0000000949 00000 n 
 trailer
 <<
 /ID 
 [<5467fcd5093f18002be6af3fb13ce6c3><5467fcd5093f18002be6af3fb13ce6c3>]
 % ReportLab generated PDF document -- digest (http://www.reportlab.com)
 /Info 6 0 R
 /Root 5 0 R
 /Size 9
 >>
 startxref
 1709
 %%EOF
@@ -0,0 +1,171 @@
 #!/usr/bin/env python3 -m pytest
 """Tests for MasterFormat-style partial numbering in PDF conversion."""
 import os
 import re
 import pytest
 from markitdown import MarkItDown
 from markitdown.converters._pdf_converter import PARTIAL_NUMBERING_PATTERN
 TEST_FILES_DIR = os.path.join(os.path.dirname(__file__), "test_files")
 class TestMasterFormatPartialNumbering:
    """Test handling of MasterFormat-style partial numbering (.1, .2, etc.)."""
    def test_partial_numbering_pattern_regex(self):
        """Test that the partial numbering regex pattern correctly matches."""
        # Should match partial numbering patterns
        assert PARTIAL_NUMBERING_PATTERN.match(".1") is not None
        assert PARTIAL_NUMBERING_PATTERN.match(".2") is not None
        assert PARTIAL_NUMBERING_PATTERN.match(".10") is not None
        assert PARTIAL_NUMBERING_PATTERN.match(".99") is not None
        # Should NOT match other patterns
        assert PARTIAL_NUMBERING_PATTERN.match("1.") is None
        assert PARTIAL_NUMBERING_PATTERN.match("1.2") is None
        assert PARTIAL_NUMBERING_PATTERN.match(".1.2") is None
        assert PARTIAL_NUMBERING_PATTERN.match("text") is None
        assert PARTIAL_NUMBERING_PATTERN.match(".a") is None
        assert PARTIAL_NUMBERING_PATTERN.match("") is None
    def test_masterformat_partial_numbering_not_split(self):
        """Test that MasterFormat partial numbering stays with associated text.
        MasterFormat documents use partial numbering like:
            .1  The intent of this Request for Proposal...
            .2  Available information relative to...
        These should NOT be split into separate table columns, but kept
        as coherent text lines with the number followed by its description.
        """
        pdf_path = os.path.join(TEST_FILES_DIR, "masterformat_partial_numbering.pdf")
        markitdown = MarkItDown()
        result = markitdown.convert(pdf_path)
        text_content = result.text_content
        # Partial numberings should NOT appear isolated on their own lines
        # If they're isolated, it means the parser incorrectly split them from their text
        lines = text_content.split("\n")
        isolated_numberings = []
        for line in lines:
            stripped = line.strip()
            # Check if line contains ONLY a partial numbering (with possible whitespace/pipes)
            cleaned = stripped.replace("|", "").strip()
            if cleaned in [".1", ".2", ".3", ".4", ".5", ".6", ".7", ".8", ".9", ".10"]:
                isolated_numberings.append(stripped)
        assert len(isolated_numberings) == 0, (
            f"Partial numberings should not be isolated from their text. "
            f"Found isolated: {isolated_numberings}"
        )
        # Verify that partial numberings appear WITH following text on the same line
        # Look for patterns like ".1 The intent" or ".1  Some text"
        partial_with_text = re.findall(r"\.\d+\s+\w+", text_content)
        assert (
            len(partial_with_text) > 0
        ), "Expected to find partial numberings followed by text on the same line"
    def test_masterformat_content_preserved(self):
        """Test that MasterFormat document content is fully preserved."""
        pdf_path = os.path.join(TEST_FILES_DIR, "masterformat_partial_numbering.pdf")
        markitdown = MarkItDown()
        result = markitdown.convert(pdf_path)
        text_content = result.text_content
        # Verify key content from the MasterFormat document is preserved
        expected_content = [
            "RFP for Construction Management Services",
            "Section 00 00 43",
            "Instructions to Respondents",
            "Ken Sargent House",
            "INTENT",
            "Request for Proposal",
            "KEN SARGENT HOUSE",
            "GRANDE PRAIRIE, ALBERTA",
            "Section 00 00 45",
        ]
        for content in expected_content:
            assert (
                content in text_content
            ), f"Expected content '{content}' not found in extracted text"
        # Verify partial numbering is followed by text on the same line
        # .1 should be followed by "The intent" on the same line
        assert re.search(
            r"\.1\s+The intent", text_content
        ), "Partial numbering .1 should be followed by 'The intent' text"
        # .2 should be followed by "Available information" on the same line
        assert re.search(
            r"\.2\s+Available information", text_content
        ), "Partial numbering .2 should be followed by 'Available information' text"
        # Ensure text content is not empty and has reasonable length
        assert (
            len(text_content.strip()) > 100
        ), "MasterFormat document should have substantial text content"
    def test_merge_partial_numbering_with_empty_lines_between(self):
        """Test that partial numberings merge correctly even with empty lines between.
        When PDF extractors produce output like:
            .1
            The intent of this Request...
        The merge logic should still combine them properly.
        """
        pdf_path = os.path.join(TEST_FILES_DIR, "masterformat_partial_numbering.pdf")
        markitdown = MarkItDown()
        result = markitdown.convert(pdf_path)
        text_content = result.text_content
        # The merged result should have .1 and .2 followed by text
        # Check that we don't have patterns like ".1\n\nThe intent" (unmerged)
        lines = text_content.split("\n")
        for i, line in enumerate(lines):
            stripped = line.strip()
            # If we find an isolated partial numbering, the merge failed
            if stripped in [".1", ".2", ".3", ".4", ".5", ".6", ".7", ".8"]:
                # Check if next non-empty line exists and wasn't merged
                for j in range(i + 1, min(i + 3, len(lines))):
                    if lines[j].strip():
                        pytest.fail(
                            f"Partial numbering '{stripped}' on line {i} was not "
                            f"merged with following text '{lines[j].strip()[:30]}...'"
                        )
                        break
    def test_multiple_partial_numberings_all_merged(self):
        """Test that all partial numberings in a document are properly merged."""
        pdf_path = os.path.join(TEST_FILES_DIR, "masterformat_partial_numbering.pdf")
        markitdown = MarkItDown()
        result = markitdown.convert(pdf_path)
        text_content = result.text_content
        # Count occurrences of merged partial numberings (number followed by text)
        merged_count = len(re.findall(r"\.\d+\s+[A-Za-z]", text_content))
        # Count isolated partial numberings (number alone on a line)
        isolated_count = 0
        for line in text_content.split("\n"):
            stripped = line.strip()
            if re.match(r"^\.\d+$", stripped):
                isolated_count += 1
        assert (
            merged_count >= 2
        ), f"Expected at least 2 merged partial numberings, found {merged_count}"
        assert (
            isolated_count == 0
        ), f"Found {isolated_count} isolated partial numberings that weren't merged"