Fix O(n) memory growth in PDF conversion by calling page.close() afte… (#1612)
* Fix O(n) memory growth in PDF conversion by calling page.close() after each page * Refactor PDF memory optimization tests for improved readability and consistency * Add memory benchmarking tests for PDF conversion with page.close() fix * Remove unnecessary blank lines in PDF memory optimization tests for cleaner code * Bump version to 0.1.6b2 in __about__.py * Update PDF conversion tests to include mimetype in StreamInfo
This commit is contained in:
@@ -1,4 +1,4 @@
|
||||
# SPDX-FileCopyrightText: 2024-present Adam Fourney <adamfo@microsoft.com>
|
||||
#
|
||||
# SPDX-License-Identifier: MIT
|
||||
__version__ = "0.1.6b1"
|
||||
__version__ = "0.1.6b2"
|
||||
|
||||
@@ -536,39 +536,41 @@ class PdfConverter(DocumentConverter):
|
||||
|
||||
assert isinstance(file_stream, io.IOBase)
|
||||
|
||||
markdown_chunks: list[str] = []
|
||||
|
||||
# Read file stream into BytesIO for compatibility with pdfplumber
|
||||
pdf_bytes = io.BytesIO(file_stream.read())
|
||||
|
||||
try:
|
||||
# Track how many pages are form-style vs plain text
|
||||
form_pages = 0
|
||||
plain_pages = 0
|
||||
# Single pass: check every page for form-style content.
|
||||
# Pages with tables/forms get rich extraction; plain-text
|
||||
# pages are collected separately. page.close() is called
|
||||
# after each page to free pdfplumber's cached objects and
|
||||
# keep memory usage constant regardless of page count.
|
||||
markdown_chunks: list[str] = []
|
||||
form_page_count = 0
|
||||
plain_page_indices: list[int] = []
|
||||
|
||||
with pdfplumber.open(pdf_bytes) as pdf:
|
||||
for page in pdf.pages:
|
||||
# Try form-style word position extraction
|
||||
for page_idx, page in enumerate(pdf.pages):
|
||||
page_content = _extract_form_content_from_words(page)
|
||||
|
||||
# If extraction returns None, this page is not form-style
|
||||
if page_content is None:
|
||||
plain_pages += 1
|
||||
# Extract text using pdfplumber's basic extraction for this page
|
||||
if page_content is not None:
|
||||
form_page_count += 1
|
||||
if page_content.strip():
|
||||
markdown_chunks.append(page_content)
|
||||
else:
|
||||
plain_page_indices.append(page_idx)
|
||||
text = page.extract_text()
|
||||
if text and text.strip():
|
||||
markdown_chunks.append(text.strip())
|
||||
else:
|
||||
form_pages += 1
|
||||
if page_content.strip():
|
||||
markdown_chunks.append(page_content)
|
||||
|
||||
# If most pages are plain text, use pdfminer for better text handling
|
||||
if plain_pages > form_pages and plain_pages > 0:
|
||||
page.close() # Free cached page data immediately
|
||||
|
||||
# If no pages had form-style content, use pdfminer for
|
||||
# the whole document (better text spacing for prose).
|
||||
if form_page_count == 0:
|
||||
pdf_bytes.seek(0)
|
||||
markdown = pdfminer.high_level.extract_text(pdf_bytes)
|
||||
else:
|
||||
# Build markdown from chunks
|
||||
markdown = "\n\n".join(markdown_chunks).strip()
|
||||
|
||||
except Exception:
|
||||
|
||||
@@ -0,0 +1,364 @@
|
||||
#!/usr/bin/env python3 -m pytest
|
||||
"""Tests for PDF converter memory optimization.
|
||||
|
||||
Verifies that:
|
||||
- page.close() is called after processing each page (frees cached data)
|
||||
- Plain-text PDFs fall back to pdfminer when no form pages are found
|
||||
- Mixed PDFs use form extraction only on form-style pages
|
||||
- Memory stays constant regardless of page count
|
||||
"""
|
||||
|
||||
import gc
|
||||
import io
|
||||
import os
|
||||
import tracemalloc
|
||||
|
||||
import pytest
|
||||
from unittest.mock import patch, MagicMock
|
||||
|
||||
from markitdown import MarkItDown
|
||||
|
||||
TEST_FILES_DIR = os.path.join(os.path.dirname(__file__), "test_files")
|
||||
|
||||
|
||||
def _has_fpdf2() -> bool:
|
||||
try:
|
||||
import fpdf # noqa: F401
|
||||
|
||||
return True
|
||||
except ImportError:
|
||||
return False
|
||||
|
||||
|
||||
def _make_form_page():
|
||||
"""Create a mock page with 3-column table-like word positions."""
|
||||
page = MagicMock()
|
||||
page.width = 612
|
||||
page.close = MagicMock()
|
||||
page.extract_words.return_value = [
|
||||
{"text": "Name", "x0": 50, "x1": 100, "top": 10, "bottom": 20},
|
||||
{"text": "Value", "x0": 250, "x1": 300, "top": 10, "bottom": 20},
|
||||
{"text": "Unit", "x0": 450, "x1": 500, "top": 10, "bottom": 20},
|
||||
{"text": "Alpha", "x0": 50, "x1": 100, "top": 30, "bottom": 40},
|
||||
{"text": "100", "x0": 250, "x1": 280, "top": 30, "bottom": 40},
|
||||
{"text": "kg", "x0": 450, "x1": 470, "top": 30, "bottom": 40},
|
||||
{"text": "Beta", "x0": 50, "x1": 100, "top": 50, "bottom": 60},
|
||||
{"text": "200", "x0": 250, "x1": 280, "top": 50, "bottom": 60},
|
||||
{"text": "lb", "x0": 450, "x1": 470, "top": 50, "bottom": 60},
|
||||
]
|
||||
return page
|
||||
|
||||
|
||||
def _make_plain_page():
|
||||
"""Create a mock page with single-line paragraph (no table structure)."""
|
||||
page = MagicMock()
|
||||
page.width = 612
|
||||
page.close = MagicMock()
|
||||
page.extract_words.return_value = [
|
||||
{
|
||||
"text": "This is a long paragraph of plain text.",
|
||||
"x0": 50,
|
||||
"x1": 550,
|
||||
"top": 10,
|
||||
"bottom": 20,
|
||||
},
|
||||
]
|
||||
page.extract_text.return_value = "This is a long paragraph of plain text."
|
||||
return page
|
||||
|
||||
|
||||
def _mock_pdfplumber_open(pages):
|
||||
"""Return a mock pdfplumber.open that yields the given pages."""
|
||||
|
||||
def mock_open(stream):
|
||||
mock_pdf = MagicMock()
|
||||
mock_pdf.pages = pages
|
||||
mock_pdf.__enter__ = MagicMock(return_value=mock_pdf)
|
||||
mock_pdf.__exit__ = MagicMock(return_value=False)
|
||||
return mock_pdf
|
||||
|
||||
return mock_open
|
||||
|
||||
|
||||
class TestPdfMemoryOptimization:
|
||||
"""Test that PDF conversion cleans up per-page caches to limit memory."""
|
||||
|
||||
def test_page_close_called_on_every_page(self):
|
||||
"""Verify page.close() is called on every page during conversion.
|
||||
|
||||
This ensures cached word/layout data is freed after each page,
|
||||
preventing O(n) memory growth with page count.
|
||||
"""
|
||||
num_pages = 20
|
||||
pages = [_make_form_page() for _ in range(num_pages)]
|
||||
|
||||
with patch(
|
||||
"markitdown.converters._pdf_converter.pdfplumber"
|
||||
) as mock_pdfplumber:
|
||||
mock_pdfplumber.open.side_effect = _mock_pdfplumber_open(pages)
|
||||
|
||||
md = MarkItDown()
|
||||
buf = io.BytesIO(b"fake pdf content")
|
||||
from markitdown import StreamInfo
|
||||
|
||||
md.convert_stream(
|
||||
buf,
|
||||
stream_info=StreamInfo(extension=".pdf", mimetype="application/pdf"),
|
||||
)
|
||||
|
||||
# page.close() must be called on ALL pages
|
||||
for i, page in enumerate(pages):
|
||||
assert page.close.called, (
|
||||
f"page.close() was NOT called on page {i} — "
|
||||
"this would cause memory to accumulate"
|
||||
)
|
||||
|
||||
def test_plain_text_pdf_falls_back_to_pdfminer(self):
|
||||
"""Verify all-plain-text PDFs fall back to pdfminer.
|
||||
|
||||
When no page has form-style content, the converter should discard
|
||||
pdfplumber results and use pdfminer for the whole document (better
|
||||
text spacing for prose).
|
||||
"""
|
||||
num_pages = 50
|
||||
pages = [_make_plain_page() for _ in range(num_pages)]
|
||||
|
||||
with patch(
|
||||
"markitdown.converters._pdf_converter.pdfplumber"
|
||||
) as mock_pdfplumber, patch(
|
||||
"markitdown.converters._pdf_converter.pdfminer"
|
||||
) as mock_pdfminer:
|
||||
mock_pdfplumber.open.side_effect = _mock_pdfplumber_open(pages)
|
||||
mock_pdfminer.high_level.extract_text.return_value = "Plain text content"
|
||||
|
||||
md = MarkItDown()
|
||||
buf = io.BytesIO(b"fake pdf content")
|
||||
from markitdown import StreamInfo
|
||||
|
||||
result = md.convert_stream(
|
||||
buf,
|
||||
stream_info=StreamInfo(extension=".pdf", mimetype="application/pdf"),
|
||||
)
|
||||
|
||||
# pdfminer should be used for the final text extraction
|
||||
assert mock_pdfminer.high_level.extract_text.called, (
|
||||
"pdfminer.high_level.extract_text was not called — "
|
||||
"plain-text PDFs should fall back to pdfminer"
|
||||
)
|
||||
assert result.text_content is not None
|
||||
|
||||
def test_plain_text_pdf_still_closes_all_pages(self):
|
||||
"""Even for plain-text PDFs, page.close() must be called on every page."""
|
||||
num_pages = 30
|
||||
pages = [_make_plain_page() for _ in range(num_pages)]
|
||||
|
||||
with patch(
|
||||
"markitdown.converters._pdf_converter.pdfplumber"
|
||||
) as mock_pdfplumber, patch(
|
||||
"markitdown.converters._pdf_converter.pdfminer"
|
||||
) as mock_pdfminer:
|
||||
mock_pdfplumber.open.side_effect = _mock_pdfplumber_open(pages)
|
||||
mock_pdfminer.high_level.extract_text.return_value = "text"
|
||||
|
||||
md = MarkItDown()
|
||||
buf = io.BytesIO(b"fake pdf content")
|
||||
from markitdown import StreamInfo
|
||||
|
||||
md.convert_stream(
|
||||
buf,
|
||||
stream_info=StreamInfo(extension=".pdf", mimetype="application/pdf"),
|
||||
)
|
||||
|
||||
for i, page in enumerate(pages):
|
||||
assert (
|
||||
page.close.called
|
||||
), f"page.close() was NOT called on plain-text page {i}"
|
||||
|
||||
def test_mixed_pdf_uses_form_extraction_per_page(self):
|
||||
"""In a mixed PDF, form pages get table extraction while plain pages don't.
|
||||
|
||||
Ensures we don't miss form-style pages and don't waste work
|
||||
running form extraction on plain-text pages.
|
||||
"""
|
||||
# Pages 0,2,4 are form-style; pages 1,3 are plain text
|
||||
pages = [
|
||||
_make_form_page(), # 0 - form
|
||||
_make_plain_page(), # 1 - plain
|
||||
_make_form_page(), # 2 - form
|
||||
_make_plain_page(), # 3 - plain
|
||||
_make_form_page(), # 4 - form
|
||||
]
|
||||
|
||||
with patch(
|
||||
"markitdown.converters._pdf_converter.pdfplumber"
|
||||
) as mock_pdfplumber:
|
||||
mock_pdfplumber.open.side_effect = _mock_pdfplumber_open(pages)
|
||||
|
||||
md = MarkItDown()
|
||||
buf = io.BytesIO(b"fake pdf content")
|
||||
from markitdown import StreamInfo
|
||||
|
||||
result = md.convert_stream(
|
||||
buf,
|
||||
stream_info=StreamInfo(extension=".pdf", mimetype="application/pdf"),
|
||||
)
|
||||
|
||||
# All pages should have close() called
|
||||
for i, page in enumerate(pages):
|
||||
assert page.close.called, f"page.close() not called on page {i}"
|
||||
|
||||
# Form pages (0,2,4) should have extract_words called
|
||||
for i in [0, 2, 4]:
|
||||
assert pages[
|
||||
i
|
||||
].extract_words.called, f"extract_words not called on form page {i}"
|
||||
|
||||
# Result should contain table content from form pages
|
||||
assert result.text_content is not None
|
||||
assert (
|
||||
"|" in result.text_content
|
||||
), "Expected markdown table pipes in output from form-style pages"
|
||||
|
||||
def test_only_one_pdfplumber_open_call(self):
|
||||
"""Verify pdfplumber.open is called exactly once (single pass)."""
|
||||
pages = [_make_form_page() for _ in range(10)]
|
||||
|
||||
with patch(
|
||||
"markitdown.converters._pdf_converter.pdfplumber"
|
||||
) as mock_pdfplumber:
|
||||
mock_pdfplumber.open.side_effect = _mock_pdfplumber_open(pages)
|
||||
|
||||
md = MarkItDown()
|
||||
buf = io.BytesIO(b"fake pdf content")
|
||||
from markitdown import StreamInfo
|
||||
|
||||
md.convert_stream(
|
||||
buf,
|
||||
stream_info=StreamInfo(extension=".pdf", mimetype="application/pdf"),
|
||||
)
|
||||
|
||||
assert mock_pdfplumber.open.call_count == 1, (
|
||||
f"Expected 1 pdfplumber.open call (single pass), "
|
||||
f"got {mock_pdfplumber.open.call_count}"
|
||||
)
|
||||
|
||||
@pytest.mark.skipif(
|
||||
not os.path.exists(os.path.join(TEST_FILES_DIR, "test.pdf")),
|
||||
reason="test.pdf not available",
|
||||
)
|
||||
def test_real_pdf_page_cleanup(self):
|
||||
"""Integration test: verify page.close() is called with a real PDF."""
|
||||
import pdfplumber
|
||||
|
||||
close_call_count = 0
|
||||
original_close = pdfplumber.page.Page.close
|
||||
|
||||
def tracking_close(self):
|
||||
nonlocal close_call_count
|
||||
close_call_count += 1
|
||||
original_close(self)
|
||||
|
||||
with patch.object(pdfplumber.page.Page, "close", tracking_close):
|
||||
md = MarkItDown()
|
||||
pdf_path = os.path.join(TEST_FILES_DIR, "test.pdf")
|
||||
md.convert(pdf_path)
|
||||
|
||||
assert (
|
||||
close_call_count > 0
|
||||
), "page.close() was never called during PDF conversion"
|
||||
|
||||
|
||||
def _generate_table_pdf(num_pages: int) -> bytes:
|
||||
"""Generate a PDF with table-like content on every page."""
|
||||
from fpdf import FPDF
|
||||
|
||||
pdf = FPDF()
|
||||
pdf.set_auto_page_break(auto=False)
|
||||
for page_num in range(num_pages):
|
||||
pdf.add_page()
|
||||
pdf.set_font("Helvetica", size=10)
|
||||
pdf.set_xy(10, 10)
|
||||
pdf.cell(60, 8, "Parameter", border=1)
|
||||
pdf.cell(60, 8, "Value", border=1)
|
||||
pdf.cell(60, 8, "Unit", border=1)
|
||||
pdf.ln()
|
||||
for row in range(20):
|
||||
y = 18 + row * 8
|
||||
if y > 270:
|
||||
break
|
||||
pdf.set_xy(10, y)
|
||||
pdf.cell(60, 8, f"Param_{page_num}_{row}", border=1)
|
||||
pdf.cell(60, 8, f"{(page_num * 100 + row) * 1.23:.2f}", border=1)
|
||||
pdf.cell(60, 8, "kg/m2", border=1)
|
||||
return pdf.output()
|
||||
|
||||
|
||||
@pytest.mark.skipif(
|
||||
not _has_fpdf2(),
|
||||
reason="fpdf2 not installed",
|
||||
)
|
||||
class TestPdfMemoryBenchmark:
|
||||
"""Benchmark: verify memory stays constant with page.close() fix."""
|
||||
|
||||
def test_memory_does_not_grow_linearly(self):
|
||||
"""Peak memory for 200 pages should be far less than without the fix.
|
||||
|
||||
Without page.close(), 200 pages uses ~225 MiB (linear growth).
|
||||
With the fix, peak memory should stay under 30 MiB.
|
||||
"""
|
||||
from markitdown import StreamInfo
|
||||
|
||||
num_pages = 200
|
||||
pdf_bytes = _generate_table_pdf(num_pages)
|
||||
|
||||
gc.collect()
|
||||
tracemalloc.start()
|
||||
|
||||
md = MarkItDown()
|
||||
buf = io.BytesIO(pdf_bytes)
|
||||
md.convert_stream(buf, stream_info=StreamInfo(extension=".pdf"))
|
||||
|
||||
_, peak = tracemalloc.get_traced_memory()
|
||||
tracemalloc.stop()
|
||||
|
||||
peak_mib = peak / 1024 / 1024
|
||||
# Without the fix this would be ~225 MiB. With the fix it should
|
||||
# be well under 30 MiB. Use a generous threshold to avoid flaky
|
||||
# failures on different machines.
|
||||
assert peak_mib < 30, (
|
||||
f"Peak memory {peak_mib:.1f} MiB for {num_pages} pages is too high. "
|
||||
f"Expected < 30 MiB with page.close() fix."
|
||||
)
|
||||
|
||||
def test_memory_constant_across_page_counts(self):
|
||||
"""Peak memory should not scale linearly with page count.
|
||||
|
||||
Converts 50-page and 200-page PDFs and asserts the peak memory
|
||||
ratio is much less than the 4x page count ratio.
|
||||
"""
|
||||
from markitdown import StreamInfo
|
||||
|
||||
results = {}
|
||||
for num_pages in [50, 200]:
|
||||
pdf_bytes = _generate_table_pdf(num_pages)
|
||||
|
||||
gc.collect()
|
||||
tracemalloc.start()
|
||||
|
||||
md = MarkItDown()
|
||||
buf = io.BytesIO(pdf_bytes)
|
||||
md.convert_stream(buf, stream_info=StreamInfo(extension=".pdf"))
|
||||
|
||||
_, peak = tracemalloc.get_traced_memory()
|
||||
tracemalloc.stop()
|
||||
results[num_pages] = peak
|
||||
|
||||
ratio = results[200] / results[50]
|
||||
# With O(n) memory growth the ratio would be ~4x.
|
||||
# With the fix it should be close to 1x (well under 2x).
|
||||
assert ratio < 2.0, (
|
||||
f"Memory ratio 200p/50p = {ratio:.2f}x — "
|
||||
f"expected < 2.0x (constant memory). "
|
||||
f"50p={results[50] / 1024 / 1024:.1f} MiB, "
|
||||
f"200p={results[200] / 1024 / 1024:.1f} MiB"
|
||||
)
|
||||
Reference in New Issue
Block a user