Files
markitdown/packages/markitdown/tests/test_pdf_masterformat.py
T
lesyk 7fdaefb724 Fix: PDF parsing doesn't support partially numbered lists (#1525)
* Fix: PDF parsing doesn't support partially numbered lists

* Refactor: Move import of PARTIAL_NUMBERING_PATTERN to the top of the test file

* Refactor: Improve assertion formatting in partial numbering tests
2026-01-08 15:15:22 -08:00

172 lines
7.1 KiB
Python

#!/usr/bin/env python3 -m pytest
"""Tests for MasterFormat-style partial numbering in PDF conversion."""
import os
import re
import pytest
from markitdown import MarkItDown
from markitdown.converters._pdf_converter import PARTIAL_NUMBERING_PATTERN
# Directory containing the fixture files used by these tests: the "test_files"
# folder that sits next to this test module.
TEST_FILES_DIR = os.path.join(os.path.dirname(__file__), "test_files")
class TestMasterFormatPartialNumbering:
    """Test handling of MasterFormat-style partial numbering (.1, .2, etc.)."""

    # Matches a string that consists solely of a partial numbering (".1",
    # ".12", ...). Used with ``fullmatch`` so that every number is covered
    # rather than a hard-coded list of the first few.
    _ISOLATED_NUMBERING = re.compile(r"\.\d+")

    # Regex fragment for whitespace that stays on one line. A plain ``\s+``
    # would also match newlines, so an unmerged ".1\n\nThe intent" would be
    # mistaken for a numbering followed by its text on the same line.
    _SAME_LINE_GAP = r"[^\S\r\n]+"

    @staticmethod
    def _convert_masterformat_pdf():
        """Convert the MasterFormat fixture PDF and return its text content."""
        pdf_path = os.path.join(TEST_FILES_DIR, "masterformat_partial_numbering.pdf")
        return MarkItDown().convert(pdf_path).text_content

    @classmethod
    def _isolated_numbering_lines(cls, text_content):
        """Return ``(line_index, stripped_line)`` for each isolated numbering.

        A line counts as isolated when, after removing table pipes and the
        surrounding whitespace, nothing but a partial numbering is left.
        """
        isolated = []
        for index, line in enumerate(text_content.split("\n")):
            stripped = line.strip()
            # Table rendering may wrap the numbering in "|" cell separators.
            cleaned = stripped.replace("|", "").strip()
            if cls._ISOLATED_NUMBERING.fullmatch(cleaned):
                isolated.append((index, stripped))
        return isolated

    def test_partial_numbering_pattern_regex(self):
        """Test that the partial numbering regex pattern correctly matches."""
        # Should match partial numbering patterns
        for candidate in (".1", ".2", ".10", ".99"):
            assert (
                PARTIAL_NUMBERING_PATTERN.match(candidate) is not None
            ), f"Expected '{candidate}' to match the partial numbering pattern"

        # Should NOT match other patterns
        for candidate in ("1.", "1.2", ".1.2", "text", ".a", ""):
            assert (
                PARTIAL_NUMBERING_PATTERN.match(candidate) is None
            ), f"Expected '{candidate}' not to match the partial numbering pattern"

    def test_masterformat_partial_numbering_not_split(self):
        """Test that MasterFormat partial numbering stays with associated text.

        MasterFormat documents use partial numbering like:
            .1 The intent of this Request for Proposal...
            .2 Available information relative to...

        These should NOT be split into separate table columns, but kept
        as coherent text lines with the number followed by its description.
        """
        text_content = self._convert_masterformat_pdf()

        # Partial numberings should NOT appear isolated on their own lines.
        # If they're isolated, the parser incorrectly split them from their text.
        isolated_numberings = [
            stripped for _, stripped in self._isolated_numbering_lines(text_content)
        ]
        assert len(isolated_numberings) == 0, (
            f"Partial numberings should not be isolated from their text. "
            f"Found isolated: {isolated_numberings}"
        )

        # Verify that partial numberings appear WITH following text on the same
        # line, e.g. ".1 The intent" or ".1 Some text".
        partial_with_text = re.findall(
            rf"\.\d+{self._SAME_LINE_GAP}\w+", text_content
        )
        assert (
            len(partial_with_text) > 0
        ), "Expected to find partial numberings followed by text on the same line"

    def test_masterformat_content_preserved(self):
        """Test that MasterFormat document content is fully preserved."""
        text_content = self._convert_masterformat_pdf()

        # Verify key content from the MasterFormat document is preserved
        expected_content = [
            "RFP for Construction Management Services",
            "Section 00 00 43",
            "Instructions to Respondents",
            "Ken Sargent House",
            "INTENT",
            "Request for Proposal",
            "KEN SARGENT HOUSE",
            "GRANDE PRAIRIE, ALBERTA",
            "Section 00 00 45",
        ]
        for content in expected_content:
            assert (
                content in text_content
            ), f"Expected content '{content}' not found in extracted text"

        # Verify partial numbering is followed by text on the same line
        # .1 should be followed by "The intent" on the same line
        assert re.search(
            rf"\.1{self._SAME_LINE_GAP}The intent", text_content
        ), "Partial numbering .1 should be followed by 'The intent' text"

        # .2 should be followed by "Available information" on the same line
        assert re.search(
            rf"\.2{self._SAME_LINE_GAP}Available information", text_content
        ), "Partial numbering .2 should be followed by 'Available information' text"

        # Ensure text content is not empty and has reasonable length
        assert (
            len(text_content.strip()) > 100
        ), "MasterFormat document should have substantial text content"

    def test_merge_partial_numbering_with_empty_lines_between(self):
        """Test that partial numberings merge correctly even with empty lines between.

        When PDF extractors produce output like:
            .1

            The intent of this Request...

        The merge logic should still combine them properly.
        """
        text_content = self._convert_masterformat_pdf()
        lines = text_content.split("\n")

        # Any isolated partial numbering means the merge failed, no matter how
        # many blank lines separate it from the text it belongs to.
        for index, stripped in self._isolated_numbering_lines(text_content):
            following = next(
                (later.strip() for later in lines[index + 1 :] if later.strip()),
                None,
            )
            if following is None:
                pytest.fail(
                    f"Partial numbering '{stripped}' on line {index} was left "
                    f"isolated with no following text"
                )
            pytest.fail(
                f"Partial numbering '{stripped}' on line {index} was not "
                f"merged with following text '{following[:30]}...'"
            )

    def test_multiple_partial_numberings_all_merged(self):
        """Test that all partial numberings in a document are properly merged."""
        text_content = self._convert_masterformat_pdf()

        # Count occurrences of merged partial numberings (number followed by
        # text on the same line)
        merged_count = len(
            re.findall(rf"\.\d+{self._SAME_LINE_GAP}[A-Za-z]", text_content)
        )

        # Count isolated partial numberings (number alone on a line)
        isolated_count = len(self._isolated_numbering_lines(text_content))

        assert (
            merged_count >= 2
        ), f"Expected at least 2 merged partial numberings, found {merged_count}"
        assert (
            isolated_count == 0
        ), f"Found {isolated_count} isolated partial numberings that weren't merged"