From a6c8ac46a684bac4b4a2377d67ff615264eb8f27 Mon Sep 17 00:00:00 2001 From: lesyk Date: Mon, 16 Mar 2026 17:35:24 +0000 Subject: [PATCH] =?UTF-8?q?Fix=20O(n)=20memory=20growth=20in=20PDF=20conve?= =?UTF-8?q?rsion=20by=20calling=20page.close()=20afte=E2=80=A6=20(#1612)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Fix O(n) memory growth in PDF conversion by calling page.close() after each page * Refactor PDF memory optimization tests for improved readability and consistency * Add memory benchmarking tests for PDF conversion with page.close() fix * Remove unnecessary blank lines in PDF memory optimization tests for cleaner code * Bump version to 0.1.6b2 in __about__.py * Update PDF conversion tests to include mimetype in StreamInfo --- .../markitdown/src/markitdown/__about__.py | 2 +- .../markitdown/converters/_pdf_converter.py | 38 +- packages/markitdown/tests/test_pdf_memory.py | 364 ++++++++++++++++++ 3 files changed, 385 insertions(+), 19 deletions(-) create mode 100644 packages/markitdown/tests/test_pdf_memory.py diff --git a/packages/markitdown/src/markitdown/__about__.py b/packages/markitdown/src/markitdown/__about__.py index ff02806..538a2c1 100644 --- a/packages/markitdown/src/markitdown/__about__.py +++ b/packages/markitdown/src/markitdown/__about__.py @@ -1,4 +1,4 @@ # SPDX-FileCopyrightText: 2024-present Adam Fourney # # SPDX-License-Identifier: MIT -__version__ = "0.1.6b1" +__version__ = "0.1.6b2" diff --git a/packages/markitdown/src/markitdown/converters/_pdf_converter.py b/packages/markitdown/src/markitdown/converters/_pdf_converter.py index 9c61003..ffbcbd9 100644 --- a/packages/markitdown/src/markitdown/converters/_pdf_converter.py +++ b/packages/markitdown/src/markitdown/converters/_pdf_converter.py @@ -536,39 +536,41 @@ class PdfConverter(DocumentConverter): assert isinstance(file_stream, io.IOBase) - markdown_chunks: list[str] = [] - # Read file stream into BytesIO for compatibility with pdfplumber pdf_bytes = io.BytesIO(file_stream.read()) try: - # Track how many pages are form-style vs plain text - form_pages = 0 - plain_pages = 0 + # Single pass: check every page for form-style content. + # Pages with tables/forms get rich extraction; plain-text + # pages are collected separately. page.close() is called + # after each page to free pdfplumber's cached objects and + # keep memory usage constant regardless of page count. + markdown_chunks: list[str] = [] + form_page_count = 0 + plain_page_indices: list[int] = [] with pdfplumber.open(pdf_bytes) as pdf: - for page in pdf.pages: - # Try form-style word position extraction + for page_idx, page in enumerate(pdf.pages): page_content = _extract_form_content_from_words(page) - # If extraction returns None, this page is not form-style - if page_content is None: - plain_pages += 1 - # Extract text using pdfplumber's basic extraction for this page + if page_content is not None: + form_page_count += 1 + if page_content.strip(): + markdown_chunks.append(page_content) + else: + plain_page_indices.append(page_idx) text = page.extract_text() if text and text.strip(): markdown_chunks.append(text.strip()) - else: - form_pages += 1 - if page_content.strip(): - markdown_chunks.append(page_content) - # If most pages are plain text, use pdfminer for better text handling - if plain_pages > form_pages and plain_pages > 0: + page.close() # Free cached page data immediately + + # If no pages had form-style content, use pdfminer for + # the whole document (better text spacing for prose). + if form_page_count == 0: pdf_bytes.seek(0) markdown = pdfminer.high_level.extract_text(pdf_bytes) else: - # Build markdown from chunks markdown = "\n\n".join(markdown_chunks).strip() except Exception: diff --git a/packages/markitdown/tests/test_pdf_memory.py b/packages/markitdown/tests/test_pdf_memory.py new file mode 100644 index 0000000..1731dd6 --- /dev/null +++ b/packages/markitdown/tests/test_pdf_memory.py @@ -0,0 +1,364 @@ +#!/usr/bin/env python3 -m pytest +"""Tests for PDF converter memory optimization. + +Verifies that: +- page.close() is called after processing each page (frees cached data) +- Plain-text PDFs fall back to pdfminer when no form pages are found +- Mixed PDFs use form extraction only on form-style pages +- Memory stays constant regardless of page count +""" + +import gc +import io +import os +import tracemalloc + +import pytest +from unittest.mock import patch, MagicMock + +from markitdown import MarkItDown + +TEST_FILES_DIR = os.path.join(os.path.dirname(__file__), "test_files") + + +def _has_fpdf2() -> bool: + try: + import fpdf # noqa: F401 + + return True + except ImportError: + return False + + +def _make_form_page(): + """Create a mock page with 3-column table-like word positions.""" + page = MagicMock() + page.width = 612 + page.close = MagicMock() + page.extract_words.return_value = [ + {"text": "Name", "x0": 50, "x1": 100, "top": 10, "bottom": 20}, + {"text": "Value", "x0": 250, "x1": 300, "top": 10, "bottom": 20}, + {"text": "Unit", "x0": 450, "x1": 500, "top": 10, "bottom": 20}, + {"text": "Alpha", "x0": 50, "x1": 100, "top": 30, "bottom": 40}, + {"text": "100", "x0": 250, "x1": 280, "top": 30, "bottom": 40}, + {"text": "kg", "x0": 450, "x1": 470, "top": 30, "bottom": 40}, + {"text": "Beta", "x0": 50, "x1": 100, "top": 50, "bottom": 60}, + {"text": "200", "x0": 250, "x1": 280, "top": 50, "bottom": 60}, + {"text": "lb", "x0": 450, "x1": 470, "top": 50, "bottom": 60}, + ] + return page + + +def _make_plain_page(): + """Create a mock page with single-line paragraph (no table structure).""" + page = MagicMock() + page.width = 612 + page.close = MagicMock() + page.extract_words.return_value = [ + { + "text": "This is a long paragraph of plain text.", + "x0": 50, + "x1": 550, + "top": 10, + "bottom": 20, + }, + ] + page.extract_text.return_value = "This is a long paragraph of plain text." + return page + + +def _mock_pdfplumber_open(pages): + """Return a mock pdfplumber.open that yields the given pages.""" + + def mock_open(stream): + mock_pdf = MagicMock() + mock_pdf.pages = pages + mock_pdf.__enter__ = MagicMock(return_value=mock_pdf) + mock_pdf.__exit__ = MagicMock(return_value=False) + return mock_pdf + + return mock_open + + +class TestPdfMemoryOptimization: + """Test that PDF conversion cleans up per-page caches to limit memory.""" + + def test_page_close_called_on_every_page(self): + """Verify page.close() is called on every page during conversion. + + This ensures cached word/layout data is freed after each page, + preventing O(n) memory growth with page count. + """ + num_pages = 20 + pages = [_make_form_page() for _ in range(num_pages)] + + with patch( + "markitdown.converters._pdf_converter.pdfplumber" + ) as mock_pdfplumber: + mock_pdfplumber.open.side_effect = _mock_pdfplumber_open(pages) + + md = MarkItDown() + buf = io.BytesIO(b"fake pdf content") + from markitdown import StreamInfo + + md.convert_stream( + buf, + stream_info=StreamInfo(extension=".pdf", mimetype="application/pdf"), + ) + + # page.close() must be called on ALL pages + for i, page in enumerate(pages): + assert page.close.called, ( + f"page.close() was NOT called on page {i} — " + "this would cause memory to accumulate" + ) + + def test_plain_text_pdf_falls_back_to_pdfminer(self): + """Verify all-plain-text PDFs fall back to pdfminer. + + When no page has form-style content, the converter should discard + pdfplumber results and use pdfminer for the whole document (better + text spacing for prose). + """ + num_pages = 50 + pages = [_make_plain_page() for _ in range(num_pages)] + + with patch( + "markitdown.converters._pdf_converter.pdfplumber" + ) as mock_pdfplumber, patch( + "markitdown.converters._pdf_converter.pdfminer" + ) as mock_pdfminer: + mock_pdfplumber.open.side_effect = _mock_pdfplumber_open(pages) + mock_pdfminer.high_level.extract_text.return_value = "Plain text content" + + md = MarkItDown() + buf = io.BytesIO(b"fake pdf content") + from markitdown import StreamInfo + + result = md.convert_stream( + buf, + stream_info=StreamInfo(extension=".pdf", mimetype="application/pdf"), + ) + + # pdfminer should be used for the final text extraction + assert mock_pdfminer.high_level.extract_text.called, ( + "pdfminer.high_level.extract_text was not called — " + "plain-text PDFs should fall back to pdfminer" + ) + assert result.text_content is not None + + def test_plain_text_pdf_still_closes_all_pages(self): + """Even for plain-text PDFs, page.close() must be called on every page.""" + num_pages = 30 + pages = [_make_plain_page() for _ in range(num_pages)] + + with patch( + "markitdown.converters._pdf_converter.pdfplumber" + ) as mock_pdfplumber, patch( + "markitdown.converters._pdf_converter.pdfminer" + ) as mock_pdfminer: + mock_pdfplumber.open.side_effect = _mock_pdfplumber_open(pages) + mock_pdfminer.high_level.extract_text.return_value = "text" + + md = MarkItDown() + buf = io.BytesIO(b"fake pdf content") + from markitdown import StreamInfo + + md.convert_stream( + buf, + stream_info=StreamInfo(extension=".pdf", mimetype="application/pdf"), + ) + + for i, page in enumerate(pages): + assert ( + page.close.called + ), f"page.close() was NOT called on plain-text page {i}" + + def test_mixed_pdf_uses_form_extraction_per_page(self): + """In a mixed PDF, form pages get table extraction while plain pages don't. + + Ensures we don't miss form-style pages and don't waste work + running form extraction on plain-text pages. + """ + # Pages 0,2,4 are form-style; pages 1,3 are plain text + pages = [ + _make_form_page(), # 0 - form + _make_plain_page(), # 1 - plain + _make_form_page(), # 2 - form + _make_plain_page(), # 3 - plain + _make_form_page(), # 4 - form + ] + + with patch( + "markitdown.converters._pdf_converter.pdfplumber" + ) as mock_pdfplumber: + mock_pdfplumber.open.side_effect = _mock_pdfplumber_open(pages) + + md = MarkItDown() + buf = io.BytesIO(b"fake pdf content") + from markitdown import StreamInfo + + result = md.convert_stream( + buf, + stream_info=StreamInfo(extension=".pdf", mimetype="application/pdf"), + ) + + # All pages should have close() called + for i, page in enumerate(pages): + assert page.close.called, f"page.close() not called on page {i}" + + # Form pages (0,2,4) should have extract_words called + for i in [0, 2, 4]: + assert pages[ + i + ].extract_words.called, f"extract_words not called on form page {i}" + + # Result should contain table content from form pages + assert result.text_content is not None + assert ( + "|" in result.text_content + ), "Expected markdown table pipes in output from form-style pages" + + def test_only_one_pdfplumber_open_call(self): + """Verify pdfplumber.open is called exactly once (single pass).""" + pages = [_make_form_page() for _ in range(10)] + + with patch( + "markitdown.converters._pdf_converter.pdfplumber" + ) as mock_pdfplumber: + mock_pdfplumber.open.side_effect = _mock_pdfplumber_open(pages) + + md = MarkItDown() + buf = io.BytesIO(b"fake pdf content") + from markitdown import StreamInfo + + md.convert_stream( + buf, + stream_info=StreamInfo(extension=".pdf", mimetype="application/pdf"), + ) + + assert mock_pdfplumber.open.call_count == 1, ( + f"Expected 1 pdfplumber.open call (single pass), " + f"got {mock_pdfplumber.open.call_count}" + ) + + @pytest.mark.skipif( + not os.path.exists(os.path.join(TEST_FILES_DIR, "test.pdf")), + reason="test.pdf not available", + ) + def test_real_pdf_page_cleanup(self): + """Integration test: verify page.close() is called with a real PDF.""" + import pdfplumber + + close_call_count = 0 + original_close = pdfplumber.page.Page.close + + def tracking_close(self): + nonlocal close_call_count + close_call_count += 1 + original_close(self) + + with patch.object(pdfplumber.page.Page, "close", tracking_close): + md = MarkItDown() + pdf_path = os.path.join(TEST_FILES_DIR, "test.pdf") + md.convert(pdf_path) + + assert ( + close_call_count > 0 + ), "page.close() was never called during PDF conversion" + + +def _generate_table_pdf(num_pages: int) -> bytes: + """Generate a PDF with table-like content on every page.""" + from fpdf import FPDF + + pdf = FPDF() + pdf.set_auto_page_break(auto=False) + for page_num in range(num_pages): + pdf.add_page() + pdf.set_font("Helvetica", size=10) + pdf.set_xy(10, 10) + pdf.cell(60, 8, "Parameter", border=1) + pdf.cell(60, 8, "Value", border=1) + pdf.cell(60, 8, "Unit", border=1) + pdf.ln() + for row in range(20): + y = 18 + row * 8 + if y > 270: + break + pdf.set_xy(10, y) + pdf.cell(60, 8, f"Param_{page_num}_{row}", border=1) + pdf.cell(60, 8, f"{(page_num * 100 + row) * 1.23:.2f}", border=1) + pdf.cell(60, 8, "kg/m2", border=1) + return pdf.output() + + +@pytest.mark.skipif( + not _has_fpdf2(), + reason="fpdf2 not installed", +) +class TestPdfMemoryBenchmark: + """Benchmark: verify memory stays constant with page.close() fix.""" + + def test_memory_does_not_grow_linearly(self): + """Peak memory for 200 pages should be far less than without the fix. + + Without page.close(), 200 pages uses ~225 MiB (linear growth). + With the fix, peak memory should stay under 30 MiB. + """ + from markitdown import StreamInfo + + num_pages = 200 + pdf_bytes = _generate_table_pdf(num_pages) + + gc.collect() + tracemalloc.start() + + md = MarkItDown() + buf = io.BytesIO(pdf_bytes) + md.convert_stream(buf, stream_info=StreamInfo(extension=".pdf")) + + _, peak = tracemalloc.get_traced_memory() + tracemalloc.stop() + + peak_mib = peak / 1024 / 1024 + # Without the fix this would be ~225 MiB. With the fix it should + # be well under 30 MiB. Use a generous threshold to avoid flaky + # failures on different machines. + assert peak_mib < 30, ( + f"Peak memory {peak_mib:.1f} MiB for {num_pages} pages is too high. " + f"Expected < 30 MiB with page.close() fix." + ) + + def test_memory_constant_across_page_counts(self): + """Peak memory should not scale linearly with page count. + + Converts 50-page and 200-page PDFs and asserts the peak memory + ratio is much less than the 4x page count ratio. + """ + from markitdown import StreamInfo + + results = {} + for num_pages in [50, 200]: + pdf_bytes = _generate_table_pdf(num_pages) + + gc.collect() + tracemalloc.start() + + md = MarkItDown() + buf = io.BytesIO(pdf_bytes) + md.convert_stream(buf, stream_info=StreamInfo(extension=".pdf")) + + _, peak = tracemalloc.get_traced_memory() + tracemalloc.stop() + results[num_pages] = peak + + ratio = results[200] / results[50] + # With O(n) memory growth the ratio would be ~4x. + # With the fix it should be close to 1x (well under 2x). + assert ratio < 2.0, ( + f"Memory ratio 200p/50p = {ratio:.2f}x — " + f"expected < 2.0x (constant memory). " + f"50p={results[50] / 1024 / 1024:.1f} MiB, " + f"200p={results[200] / 1024 / 1024:.1f} MiB" + )