Fix: PDF parsing doesn't support partially numbered lists (#1525)

* Fix: PDF parsing doesn't support partially numbered lists
* Refactor: Move import of PARTIAL_NUMBERING_PATTERN to the top of the test file
* Refactor: Improve assertion formatting in partial numbering tests
2026-01-09 00:15:22 +01:00
parent 251dddcf0c
commit 7fdaefb724
4 changed files with 314 additions and 1 deletions
@@ -1,4 +1,4 @@
 # SPDX-FileCopyrightText: 2024-present Adam Fourney <adamfo@microsoft.com>
 #
 # SPDX-License-Identifier: MIT
-__version__ = "0.1.4"
+__version__ = "0.1.5b1"
@@ -1,11 +1,62 @@
 import sys
 import io
+import re
 from typing import BinaryIO, Any

 from .._base_converter import DocumentConverter, DocumentConverterResult
 from .._stream_info import StreamInfo
 from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE

+# Pattern for MasterFormat-style partial numbering (e.g., ".1", ".2", ".10").
+# Anchored at both ends; callers match it against an already-stripped line or word.
+PARTIAL_NUMBERING_PATTERN = re.compile(r"^\.\d+$")
+
+
+def _merge_partial_numbering_lines(text: str) -> str:
+    """
+    Post-process extracted text to merge MasterFormat-style partial numbering
+    with the following text line.
+
+    MasterFormat documents use partial numbering like:
+        .1  The intent of this Request for Proposal...
+        .2  Available information relative to...
+
+    Some PDF extractors split these into separate lines:
+        .1
+        The intent of this Request for Proposal...
+
+    This function merges them back together.
+
+    Args:
+        text: Extracted text, with lines separated by "\\n".
+
+    Returns:
+        The text with every bare numbering line joined (single space) to the
+        next non-blank line. Blank lines between the two are dropped, and both
+        halves are stripped of surrounding whitespace. A numbering line is left
+        untouched when no non-blank line follows it, or when the next
+        non-blank line is itself a bare numbering (so ".1" / ".2" / "text"
+        yields ".1" and ".2 text" rather than ".1 .2" and "text").
+    """
+    lines = text.split("\n")
+    total = len(lines)
+    result_lines: list[str] = []
+    i = 0
+
+    while i < total:
+        line = lines[i]
+        stripped = line.strip()
+
+        # Anything that is not ONLY a partial numbering passes through unchanged
+        if not PARTIAL_NUMBERING_PATTERN.match(stripped):
+            result_lines.append(line)
+            i += 1
+            continue
+
+        # Look for the next non-empty line to merge with
+        j = i + 1
+        while j < total and not lines[j].strip():
+            j += 1
+
+        next_line = lines[j].strip() if j < total else ""
+
+        # Nothing to merge with, or the candidate is another bare numbering
+        # that must stay free to pick up its own text: keep the line as is.
+        if not next_line or PARTIAL_NUMBERING_PATTERN.match(next_line):
+            result_lines.append(line)
+            i += 1
+            continue
+
+        # Merge the partial numbering with the next line
+        result_lines.append(f"{stripped} {next_line}")
+        i = j + 1  # Skip past the merged line (and any blank lines before it)
+
+    return "\n".join(result_lines)
+
+
 # Load dependencies
 _dependency_exc_info = None
 try:
@@ -117,6 +168,14 @@ def _extract_form_content_from_words(page: Any) -> str | None:
        # Determine row type
        is_paragraph = line_width > page_width * 0.55 and len(combined_text) > 60

+        # Check for MasterFormat-style partial numbering (e.g., ".1", ".2")
+        # These should be treated as list items, not table rows
+        has_partial_numbering = False
+        if row_words:
+            first_word = row_words[0]["text"].strip()
+            if PARTIAL_NUMBERING_PATTERN.match(first_word):
+                has_partial_numbering = True
+
        row_info.append(
            {
                "y_key": y_key,
@@ -125,6 +184,7 @@ def _extract_form_content_from_words(page: Any) -> str | None:
                "x_groups": x_groups,
                "is_paragraph": is_paragraph,
                "num_columns": len(x_groups),
+                "has_partial_numbering": has_partial_numbering,
            }
        )

@@ -156,6 +216,11 @@ def _extract_form_content_from_words(page: Any) -> str | None:
            info["is_table_row"] = False
            continue

+        # Rows with partial numbering (e.g., ".1", ".2") are list items, not table rows
+        if info["has_partial_numbering"]:
+            info["is_table_row"] = False
+            continue
+
        # Count how many global columns this row's words align with
        aligned_columns: set[int] = set()
        for word in info["words"]:
@@ -469,4 +534,7 @@ class PdfConverter(DocumentConverter):
            pdf_bytes.seek(0)
            markdown = pdfminer.high_level.extract_text(pdf_bytes)

+        # Post-process to merge MasterFormat-style partial numbering with following text
+        markdown = _merge_partial_numbering_lines(markdown)
+
        return DocumentConverterResult(markdown=markdown)
@@ -0,0 +1,74 @@
+%PDF-1.3
+%“Œ‹ž ReportLab Generated PDF document http://www.reportlab.com
+1 0 obj
+<<
+/F1 2 0 R /F2 3 0 R
+>>
+endobj
+2 0 obj
+<<
+/BaseFont /Helvetica /Encoding /WinAnsiEncoding /Name /F1 /Subtype /Type1 /Type /Font
+>>
+endobj
+3 0 obj
+<<
+/BaseFont /Helvetica-Bold /Encoding /WinAnsiEncoding /Name /F2 /Subtype /Type1 /Type /Font
+>>
+endobj
+4 0 obj
+<<
+/Contents 8 0 R /MediaBox [ 0 0 612 792 ] /Parent 7 0 R /Resources <<
+/Font 1 0 R /ProcSet [ /PDF /Text /ImageB /ImageC /ImageI ]
+>> /Rotate 0 /Trans <<
+
+>> 
+  /Type /Page
+>>
+endobj
+5 0 obj
+<<
+/PageMode /UseNone /Pages 7 0 R /Type /Catalog
+>>
+endobj
+6 0 obj
+<<
+/Author (anonymous) /CreationDate (D:20260108192537+01'00') /Creator (ReportLab PDF Library - www.reportlab.com) /Keywords () /ModDate (D:20260108192537+01'00') /Producer (ReportLab PDF Library - www.reportlab.com) 
+  /Subject (unspecified) /Title (untitled) /Trapped /False
+>>
+endobj
+7 0 obj
+<<
+/Count 1 /Kids [ 4 0 R ] /Type /Pages
+>>
+endobj
+8 0 obj
+<<
+/Filter [ /ASCII85Decode /FlateDecode ] /Length 670
+>>
+stream
+Gat$td;IYl'Rf-pcJpsZ/27V[H_WEoW#\5sVS2I3Jt]?;R+`$Ms*f.>6<=3APUNhTmQL<9F,pFup'KGk=TR,7^>/u!#kAE+l;?UQ8Fg(+-O>;^54HWJ*kXdl'VdsI]Y^$-G(GWPR)iGMeWbg3)F'+jfWpCb"rU?d?8?q_r!E2N'0sM)J>=XD.jgunBuga\Wi4MX$WV/b)1F@bC8Nj8(0*)"ZK06BSqlu1$[^37A;/aK=mfgqg$&i),2OH&%^\"B1%B\dd_V>$5OtPri4rcEe3LoBUeL6QAPnpQr+R-t0f]ZSYc?BTAKQ?A&+J#J*N*=6;'?@Cp*>auj0",hDS3bH4[hVs3O="&bk&U@>+8c1&c2iDg6R*%q%iEZq'-!FNSB8#C*'po69R8$S(:.=-$N6'!_[1/jV<$@V3Z_"gd!g!MJMT)mTUN4cWjUQQj]HT_m]0*R=YgTmcl@k>*b/SBce9?.m,bEi#?PI:=r_6G.auM&FtP,>O7T%Z<$f#=g6(2+d@;8?"$8cdI38ZZ>hq5b2_pQY:M\.Kod,pl)ZX7a7Gc'Mf_'SB1X3*L[-51a8`h4)KjJQjLfm/3TIeQY?2+?^.r^HNafjHp<5,1M=W'N>8sb=dB#FC5M`7L91"BC@CfEckPe`M5O:#!Fj$K]s(Gs8rW$>H7gK~>endstream
+endobj
+xref
+0 9
+0000000000 65535 f 
+0000000073 00000 n 
+0000000114 00000 n 
+0000000221 00000 n 
+0000000333 00000 n 
+0000000526 00000 n 
+0000000594 00000 n 
+0000000890 00000 n 
+0000000949 00000 n 
+trailer
+<<
+/ID 
+[<5467fcd5093f18002be6af3fb13ce6c3><5467fcd5093f18002be6af3fb13ce6c3>]
+% ReportLab generated PDF document -- digest (http://www.reportlab.com)
+
+/Info 6 0 R
+/Root 5 0 R
+/Size 9
+>>
+startxref
+1709
+%%EOF
@@ -0,0 +1,171 @@
+#!/usr/bin/env python3 -m pytest
+"""Tests for MasterFormat-style partial numbering in PDF conversion."""
+
+import os
+import re
+import pytest
+
+from markitdown import MarkItDown
+from markitdown.converters._pdf_converter import PARTIAL_NUMBERING_PATTERN
+
+TEST_FILES_DIR = os.path.join(os.path.dirname(__file__), "test_files")
+
+
+class TestMasterFormatPartialNumbering:
+    """Test handling of MasterFormat-style partial numbering (.1, .2, etc.)."""
+
+    @staticmethod
+    def _convert_sample() -> str:
+        """Convert the MasterFormat sample PDF and return its extracted text."""
+        pdf_path = os.path.join(TEST_FILES_DIR, "masterformat_partial_numbering.pdf")
+        return MarkItDown().convert(pdf_path).text_content
+
+    def test_partial_numbering_pattern_regex(self):
+        """Test that the partial numbering regex pattern correctly matches."""
+
+        # Should match partial numbering patterns
+        assert PARTIAL_NUMBERING_PATTERN.match(".1") is not None
+        assert PARTIAL_NUMBERING_PATTERN.match(".2") is not None
+        assert PARTIAL_NUMBERING_PATTERN.match(".10") is not None
+        assert PARTIAL_NUMBERING_PATTERN.match(".99") is not None
+
+        # Should NOT match other patterns
+        assert PARTIAL_NUMBERING_PATTERN.match("1.") is None
+        assert PARTIAL_NUMBERING_PATTERN.match("1.2") is None
+        assert PARTIAL_NUMBERING_PATTERN.match(".1.2") is None
+        assert PARTIAL_NUMBERING_PATTERN.match("text") is None
+        assert PARTIAL_NUMBERING_PATTERN.match(".a") is None
+        assert PARTIAL_NUMBERING_PATTERN.match("") is None
+
+    def test_masterformat_partial_numbering_not_split(self):
+        """Test that MasterFormat partial numbering stays with associated text.
+
+        MasterFormat documents use partial numbering like:
+            .1  The intent of this Request for Proposal...
+            .2  Available information relative to...
+
+        These should NOT be split into separate table columns, but kept
+        as coherent text lines with the number followed by its description.
+        """
+        text_content = self._convert_sample()
+
+        # Partial numberings should NOT appear isolated on their own lines
+        # (ignoring whitespace and table pipes). If they're isolated, the parser
+        # incorrectly split them from their text. The shared pattern is used so
+        # that any number is caught, not just a hard-coded range.
+        isolated_numberings = [
+            line.strip()
+            for line in text_content.split("\n")
+            if PARTIAL_NUMBERING_PATTERN.match(line.replace("|", "").strip())
+        ]
+
+        assert len(isolated_numberings) == 0, (
+            f"Partial numberings should not be isolated from their text. "
+            f"Found isolated: {isolated_numberings}"
+        )
+
+        # Verify that partial numberings appear WITH following text on the same line
+        # Look for patterns like ".1 The intent" or ".1  Some text"
+        partial_with_text = re.findall(r"\.\d+\s+\w+", text_content)
+        assert (
+            len(partial_with_text) > 0
+        ), "Expected to find partial numberings followed by text on the same line"
+
+    def test_masterformat_content_preserved(self):
+        """Test that MasterFormat document content is fully preserved."""
+        text_content = self._convert_sample()
+
+        # Verify key content from the MasterFormat document is preserved
+        expected_content = [
+            "RFP for Construction Management Services",
+            "Section 00 00 43",
+            "Instructions to Respondents",
+            "Ken Sargent House",
+            "INTENT",
+            "Request for Proposal",
+            "KEN SARGENT HOUSE",
+            "GRANDE PRAIRIE, ALBERTA",
+            "Section 00 00 45",
+        ]
+
+        for content in expected_content:
+            assert (
+                content in text_content
+            ), f"Expected content '{content}' not found in extracted text"
+
+        # Verify partial numbering is followed by text on the same line
+        # .1 should be followed by "The intent" on the same line
+        assert re.search(
+            r"\.1\s+The intent", text_content
+        ), "Partial numbering .1 should be followed by 'The intent' text"
+
+        # .2 should be followed by "Available information" on the same line
+        assert re.search(
+            r"\.2\s+Available information", text_content
+        ), "Partial numbering .2 should be followed by 'Available information' text"
+
+        # Ensure text content is not empty and has reasonable length
+        assert (
+            len(text_content.strip()) > 100
+        ), "MasterFormat document should have substantial text content"
+
+    def test_merge_partial_numbering_with_empty_lines_between(self):
+        """Test that partial numberings merge correctly even with empty lines between.
+
+        When PDF extractors produce output like:
+            .1
+
+            The intent of this Request...
+
+        The merge logic should still combine them properly.
+        """
+        lines = self._convert_sample().split("\n")
+
+        # The merged result should have .1 and .2 followed by text
+        # Check that we don't have patterns like ".1\n\nThe intent" (unmerged)
+        for i, line in enumerate(lines):
+            stripped = line.strip()
+            if not PARTIAL_NUMBERING_PATTERN.match(stripped):
+                continue
+
+            # An isolated partial numbering was found: if text shows up within
+            # the next two lines, the merge failed.
+            following = next(
+                (
+                    candidate.strip()
+                    for candidate in lines[i + 1 : i + 3]
+                    if candidate.strip()
+                ),
+                None,
+            )
+            if following is not None:
+                # pytest.fail raises, so nothing after it in this loop runs
+                pytest.fail(
+                    f"Partial numbering '{stripped}' on line {i} was not "
+                    f"merged with following text '{following[:30]}...'"
+                )
+
+    def test_multiple_partial_numberings_all_merged(self):
+        """Test that all partial numberings in a document are properly merged."""
+        text_content = self._convert_sample()
+
+        # Count occurrences of merged partial numberings (number followed by text)
+        merged_count = len(re.findall(r"\.\d+\s+[A-Za-z]", text_content))
+
+        # Count isolated partial numberings (number alone on a line)
+        isolated_count = sum(
+            1
+            for line in text_content.split("\n")
+            if PARTIAL_NUMBERING_PATTERN.match(line.strip())
+        )
+
+        assert (
+            merged_count >= 2
+        ), f"Expected at least 2 merged partial numberings, found {merged_count}"
+        assert (
+            isolated_count == 0
+        ), f"Found {isolated_count} isolated partial numberings that weren't merged"