c6308dc822
* Add OCR test data and implement tests for various document formats - Created HTML file with multiple images for testing OCR extraction. - Added several PDF files with different layouts and image placements to validate OCR functionality. - Introduced PPTX files with complex layouts and images at various positions for comprehensive testing. - Included XLSX files with multiple images and complex layouts to ensure accurate OCR extraction. - Implemented a new test suite in `test_ocr.py` to validate OCR functionality across all document types, ensuring context preservation and accuracy. * Enhance OCR functionality and validation in document converters - Refactor image extraction and processing in PDF, PPTX, and XLSX converters for improved readability and consistency. - Implement detailed validation for OCR text positioning relative to surrounding text in test cases. - Introduce comprehensive tests for expected OCR results across various document types, ensuring no base64 images are present. - Improve error handling and logging for better debugging during OCR extraction. * Add support for scanned PDFs with full-page OCR fallback and implement tests * Bump version to 0.1.6b1 in __about__.py * Refactor OCR services to support LLM Vision, update README and tests accordingly * Add OCR-enabled converters and ensure consistent OCR format across document types * Refactor converters to improve import organization and enhance OCR functionality across DOCX, PDF, PPTX, and XLSX converters * Refactor exception imports for consistency across converters and tests * Fix OCR tests to match MockOCRService output and fix cross-platform file URI handling * Bump version to 0.1.6b1 in __about__.py * Skip DOCX/XLSX/PPTX OCR tests when optional dependencies are missing * Add comprehensive OCR test suite for various document formats - Introduced multiple test documents for PDF, DOCX, XLSX, and PPTX formats, covering scenarios with images at the start, middle, and end. 
- Implemented tests for complex layouts, multi-page documents, and documents with multiple images. - Created a new test script `test_ocr.py` to validate OCR functionality, ensuring context preservation and accurate text extraction. - Added expected OCR results for validation against ground truth. - Included tests for scanned documents to verify OCR fallback mechanisms. * Remove obsolete HTML test files and refactor test cases for file URIs and OCR format consistency - Deleted `html_image_start.html` and `html_multiple_images.html` as they are no longer needed. - Updated `test_file_uris` in `test_module_misc.py` to simplify assertions by removing unnecessary `url2pathname` usage. - Removed `test_ocr_format_consistency.py` as it is no longer relevant to the current testing framework. * Refactor OCR processing in PdfConverterWithOCR and enhance unit tests for multipage PDFs * Revert * Revert * Update READMEs * Refactor import statements for consistency and improve formatting in converter and test files
250 lines
7.7 KiB
Python
250 lines
7.7 KiB
Python
"""
|
|
Unit tests for XlsxConverterWithOCR.
|
|
|
|
For each XLSX test file: convert with a mock OCR service then compare the
|
|
full output string against the expected snapshot.
|
|
|
|
OCR block format used by the converter:
|
|
*[Image OCR]
|
|
MOCK_OCR_TEXT_12345
|
|
[End OCR]*
|
|
|
|
Images are grouped at the end of each sheet under:
|
|
### Images in this sheet:
|
|
"""
|
|
|
|
import sys
|
|
from pathlib import Path
|
|
from typing import Any
|
|
|
|
import pytest
|
|
|
|
sys.path.insert(0, str(Path(__file__).parent.parent / "src"))
|
|
|
|
from markitdown_ocr._ocr_service import OCRResult # noqa: E402
|
|
from markitdown_ocr._xlsx_converter_with_ocr import ( # noqa: E402
|
|
XlsxConverterWithOCR,
|
|
)
|
|
from markitdown import StreamInfo # noqa: E402
|
|
|
|
# Directory containing the XLSX fixture files, located beside this test module.
TEST_DATA_DIR = Path(__file__).parent.joinpath("ocr_test_data")

# Text that the mock OCR service returns for every image.
_MOCK_TEXT = "MOCK_OCR_TEXT_12345"
# OCR block wrapping the mock text (format described in the module docstring).
_OCR_BLOCK = f"*[Image OCR]\n{_MOCK_TEXT}\n[End OCR]*"
# Heading under which a sheet's OCR blocks are grouped.
_IMG_SECTION = "### Images in this sheet:"
|
|
|
|
|
|
class MockOCRService:
    """Stand-in OCR service that yields the same canned result for any image."""

    def extract_text(
        self,  # noqa: ANN101
        image_stream: Any,
        **kwargs: Any,
    ) -> OCRResult:
        """Return a fixed ``OCRResult``; ``image_stream`` and ``kwargs`` are ignored.

        The result carries ``_MOCK_TEXT`` as its text and ``"mock"`` as the
        backend name.
        """
        canned_result = OCRResult(text=_MOCK_TEXT, backend_used="mock")
        return canned_result
|
|
|
|
|
|
@pytest.fixture(scope="module")
def svc() -> MockOCRService:
    """Provide a ``MockOCRService`` instance, created once per test module."""
    mock_service = MockOCRService()
    return mock_service
|
|
|
|
|
|
def _convert(filename: str, ocr_service: MockOCRService) -> str:
    """Convert an XLSX fixture with ``XlsxConverterWithOCR`` and return its text.

    Args:
        filename: Name of the fixture file inside ``TEST_DATA_DIR``.
        ocr_service: OCR service forwarded to the converter as ``ocr_service``.

    Returns:
        The ``text_content`` of the conversion result.

    The calling test is skipped (via ``pytest.skip``) when the fixture file
    does not exist.
    """
    fixture = TEST_DATA_DIR / filename
    if not fixture.exists():
        pytest.skip(f"Test file not found: {fixture}")

    converter = XlsxConverterWithOCR()
    with fixture.open("rb") as stream:
        result = converter.convert(
            stream,
            StreamInfo(extension=".xlsx"),
            ocr_service=ocr_service,
        )
        # Read the text while the stream is still open, as the original did.
        return result.text_content
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# xlsx_image_start.xlsx
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
def test_xlsx_image_start(svc: MockOCRService) -> None:
    """Snapshot-check the Markdown produced for ``xlsx_image_start.xlsx``.

    The expected text is assembled from one part per ``##`` section
    ("Sales Q1", "Forecast Q2"); each part holds the table followed by the
    image heading and its OCR block.
    """
    sales_q1 = "".join(
        [
            "## Sales Q1\n\n",
            "| Product | Sales |\n",
            "| --- | --- |\n",
            "| Widget A | 100 |\n",
            "| Widget B | 150 |\n\n",
            "### Images in this sheet:\n\n",
            "*[Image OCR]\nMOCK_OCR_TEXT_12345\n[End OCR]*\n\n",
        ]
    )
    forecast_q2 = "".join(
        [
            "## Forecast Q2\n\n",
            "| Projected Sales | Unnamed: 1 |\n",
            "| --- | --- |\n",
            "| Widget A | 120 |\n",
            "| Widget B | 180 |\n\n",
            "### Images in this sheet:\n\n",
            "*[Image OCR]\nMOCK_OCR_TEXT_12345\n[End OCR]*",
        ]
    )

    markdown = _convert("xlsx_image_start.xlsx", svc)
    assert markdown == sales_q1 + forecast_q2
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# xlsx_image_middle.xlsx
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
def test_xlsx_image_middle(svc: MockOCRService) -> None:
    """Snapshot-check the Markdown produced for ``xlsx_image_middle.xlsx``.

    The expected text is assembled from one part per ``##`` section
    ("Revenue", "Expenses"); each part holds the table followed by the
    image heading and its OCR block.
    """
    revenue = "".join(
        [
            "## Revenue\n\n",
            "| Q1 Report | Unnamed: 1 |\n",
            "| --- | --- |\n",
            "| NaN | NaN |\n",
            "| Revenue | $50,000 |\n",
            "| NaN | NaN |\n",
            "| NaN | NaN |\n",
            "| NaN | NaN |\n",
            "| NaN | NaN |\n",
            "| Profit Margin | 40% |\n\n",
            "### Images in this sheet:\n\n",
            "*[Image OCR]\nMOCK_OCR_TEXT_12345\n[End OCR]*\n\n",
        ]
    )
    expenses = "".join(
        [
            "## Expenses\n\n",
            "| Expense Breakdown | Unnamed: 1 |\n",
            "| --- | --- |\n",
            "| NaN | NaN |\n",
            "| Expenses | $30,000 |\n",
            "| NaN | NaN |\n",
            "| NaN | NaN |\n",
            "| NaN | NaN |\n",
            "| NaN | NaN |\n",
            "| Savings | $5,000 |\n\n",
            "### Images in this sheet:\n\n",
            "*[Image OCR]\nMOCK_OCR_TEXT_12345\n[End OCR]*",
        ]
    )

    markdown = _convert("xlsx_image_middle.xlsx", svc)
    assert markdown == revenue + expenses
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# xlsx_image_end.xlsx
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
def test_xlsx_image_end(svc: MockOCRService) -> None:
    """Snapshot-check the Markdown produced for ``xlsx_image_end.xlsx``.

    The expected text is assembled from one part per ``##`` section
    ("Sheet", "Budget"); each part holds the table followed by the image
    heading and its OCR block.
    """
    summary_sheet = "".join(
        [
            "## Sheet\n\n",
            "| Financial Summary | Unnamed: 1 |\n",
            "| --- | --- |\n",
            "| Total Revenue | $500,000 |\n",
            "| Total Expenses | $300,000 |\n",
            "| Net Profit | $200,000 |\n",
            "| NaN | NaN |\n",
            "| NaN | NaN |\n",
            "| NaN | NaN |\n",
            "| NaN | NaN |\n",
            "| NaN | NaN |\n",
            "| Signature: | NaN |\n\n",
            "### Images in this sheet:\n\n",
            "*[Image OCR]\nMOCK_OCR_TEXT_12345\n[End OCR]*\n\n",
        ]
    )
    budget_sheet = "".join(
        [
            "## Budget\n\n",
            "| Budget Allocation | Unnamed: 1 |\n",
            "| --- | --- |\n",
            "| Marketing | $100,000 |\n",
            "| R&D | $150,000 |\n",
            "| Operations | $50,000 |\n",
            "| NaN | NaN |\n",
            "| NaN | NaN |\n",
            "| NaN | NaN |\n",
            "| NaN | NaN |\n",
            "| NaN | NaN |\n",
            "| Approved: | NaN |\n\n",
            "### Images in this sheet:\n\n",
            "*[Image OCR]\nMOCK_OCR_TEXT_12345\n[End OCR]*",
        ]
    )

    markdown = _convert("xlsx_image_end.xlsx", svc)
    assert markdown == summary_sheet + budget_sheet
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# xlsx_multiple_images.xlsx
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
def test_xlsx_multiple_images(svc: MockOCRService) -> None:
    """Snapshot-check the Markdown produced for ``xlsx_multiple_images.xlsx``.

    The expected text is assembled from one part per ``##`` section
    ("Overview", "Details", "Summary"). The "Overview" part expects two OCR
    blocks under its image heading; the other parts expect one each.
    """
    overview = "".join(
        [
            "## Overview\n\n",
            "| Dashboard |\n",
            "| --- |\n",
            "| Status: Active |\n",
            "| NaN |\n",
            "| NaN |\n",
            "| NaN |\n",
            "| NaN |\n",
            "| Performance Summary |\n\n",
            "### Images in this sheet:\n\n",
            "*[Image OCR]\nMOCK_OCR_TEXT_12345\n[End OCR]*\n\n",
            "*[Image OCR]\nMOCK_OCR_TEXT_12345\n[End OCR]*\n\n",
        ]
    )
    details = "".join(
        [
            "## Details\n\n",
            "| Detailed Metrics |\n",
            "| --- |\n",
            "| System Health |\n\n",
            "### Images in this sheet:\n\n",
            "*[Image OCR]\nMOCK_OCR_TEXT_12345\n[End OCR]*\n\n",
        ]
    )
    summary = "".join(
        [
            "## Summary\n\n",
            "| Quarter Summary |\n",
            "| --- |\n",
            "| Overall Performance |\n\n",
            "### Images in this sheet:\n\n",
            "*[Image OCR]\nMOCK_OCR_TEXT_12345\n[End OCR]*",
        ]
    )

    markdown = _convert("xlsx_multiple_images.xlsx", svc)
    assert markdown == overview + details + summary
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# xlsx_complex_layout.xlsx
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
def test_xlsx_complex_layout(svc: MockOCRService) -> None:
    """Snapshot-check the Markdown produced for ``xlsx_complex_layout.xlsx``.

    The expected text is assembled from one part per ``##`` section
    ("Complex Report", "Customers", "Regions"). The "Complex Report" part
    expects two OCR blocks under its image heading; the other parts expect
    one each.
    """
    complex_report = "".join(
        [
            "## Complex Report\n\n",
            "| Annual Report 2024 | Unnamed: 1 |\n",
            "| --- | --- |\n",
            "| NaN | NaN |\n",
            "| Month | Sales |\n",
            "| Jan | 1000 |\n",
            "| Feb | 1200 |\n",
            "| NaN | NaN |\n",
            "| Total | 2200 |\n\n",
            "### Images in this sheet:\n\n",
            "*[Image OCR]\nMOCK_OCR_TEXT_12345\n[End OCR]*\n\n",
            "*[Image OCR]\nMOCK_OCR_TEXT_12345\n[End OCR]*\n\n",
        ]
    )
    customers = "".join(
        [
            "## Customers\n\n",
            "| Customer Metrics | Unnamed: 1 |\n",
            "| --- | --- |\n",
            "| NaN | NaN |\n",
            "| New Customers | 250 |\n",
            "| Retention Rate | 92% |\n\n",
            "### Images in this sheet:\n\n",
            "*[Image OCR]\nMOCK_OCR_TEXT_12345\n[End OCR]*\n\n",
        ]
    )
    regions = "".join(
        [
            "## Regions\n\n",
            "| Regional Breakdown | Unnamed: 1 |\n",
            "| --- | --- |\n",
            "| NaN | NaN |\n",
            "| Region | Revenue |\n",
            "| North | $800K |\n",
            "| South | $600K |\n\n",
            "### Images in this sheet:\n\n",
            "*[Image OCR]\nMOCK_OCR_TEXT_12345\n[End OCR]*",
        ]
    )

    markdown = _convert("xlsx_complex_layout.xlsx", svc)
    assert markdown == complex_report + customers + regions
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# No OCR service — no OCR tags emitted
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
def test_xlsx_no_ocr_service_no_tags() -> None:
    """Check that no OCR markers appear when no OCR service is supplied.

    Converts ``xlsx_image_middle.xlsx`` without passing ``ocr_service`` and
    asserts that neither the opening nor the closing OCR tag is present in
    the resulting text. The test is skipped when the fixture file is missing.
    """
    fixture = TEST_DATA_DIR / "xlsx_image_middle.xlsx"
    if not fixture.exists():
        pytest.skip(f"Test file not found: {fixture}")

    converter = XlsxConverterWithOCR()
    with fixture.open("rb") as stream:
        result = converter.convert(stream, StreamInfo(extension=".xlsx"))
        markdown = result.text_content

    # Opening tag first, then closing tag, in the same order as before.
    for marker in ("*[Image OCR]", "[End OCR]*"):
        assert marker not in markdown
|