[MS] Add OCR layer service for embedded images and PDF scans (#1541)

* Add OCR test data and implement tests for various document formats

- Created HTML file with multiple images for testing OCR extraction.
- Added several PDF files with different layouts and image placements to validate OCR functionality.
- Introduced PPTX files with complex layouts and images at various positions for comprehensive testing.
- Included XLSX files with multiple images and complex layouts to ensure accurate OCR extraction.
- Implemented a new test suite in `test_ocr.py` to validate OCR functionality across all document types, ensuring context preservation and accuracy.

* Enhance OCR functionality and validation in document converters

- Refactor image extraction and processing in PDF, PPTX, and XLSX converters for improved readability and consistency.
- Implement detailed validation for OCR text positioning relative to surrounding text in test cases.
- Introduce comprehensive tests for expected OCR results across various document types, ensuring no base64 images are present.
- Improve error handling and logging for better debugging during OCR extraction.

* Add support for scanned PDFs with full-page OCR fallback and implement tests

* Bump version to 0.1.6b1 in __about__.py

* Refactor OCR services to support LLM Vision, update README and tests accordingly

* Add OCR-enabled converters and ensure consistent OCR format across document types

* Refactor converters to improve import organization and enhance OCR functionality across DOCX, PDF, PPTX, and XLSX converters

* Refactor exception imports for consistency across converters and tests

* Fix OCR tests to match MockOCRService output and fix cross-platform file URI handling

* Bump version to 0.1.6b1 in __about__.py

* Skip DOCX/XLSX/PPTX OCR tests when optional dependencies are missing

* Add comprehensive OCR test suite for various document formats

- Introduced multiple test documents for PDF, DOCX, XLSX, and PPTX formats, covering scenarios with images at the start, middle, and end.
- Implemented tests for complex layouts, multi-page documents, and documents with multiple images.
- Created a new test script `test_ocr.py` to validate OCR functionality, ensuring context preservation and accurate text extraction.
- Added expected OCR results for validation against ground truth.
- Included tests for scanned documents to verify OCR fallback mechanisms.

* Remove obsolete HTML test files and refactor test cases for file URIs and OCR format consistency

- Deleted `html_image_start.html` and `html_multiple_images.html` as they are no longer needed.
- Updated `test_file_uris` in `test_module_misc.py` to simplify assertions by removing unnecessary `url2pathname` usage.
- Removed `test_ocr_format_consistency.py` as it is no longer relevant to the current testing framework.

* Refactor OCR processing in PdfConverterWithOCR and enhance unit tests for multipage PDFs

* Revert

* Revert

* Update READMEs

* Refactor import statements for consistency and improve formatting in converter and test files
Author: lesyk
Date: 2026-03-10 16:17:17 +00:00 (committed by GitHub)
Parent: 4a5340f93b
Commit: c6308dc822
45 changed files with 5382 additions and 2 deletions
@@ -0,0 +1,4 @@
# SPDX-FileCopyrightText: 2025-present Contributors
# SPDX-License-Identifier: MIT
__version__ = "0.1.0"
@@ -0,0 +1,31 @@
# SPDX-FileCopyrightText: 2025-present Contributors
# SPDX-License-Identifier: MIT
"""
markitdown-ocr: OCR plugin for MarkItDown
Adds LLM Vision-based text extraction from images embedded in PDF, DOCX, PPTX, and XLSX files.
"""
from ._plugin import __plugin_interface_version__, register_converters
from .__about__ import __version__
from ._ocr_service import (
OCRResult,
LLMVisionOCRService,
)
from ._pdf_converter_with_ocr import PdfConverterWithOCR
from ._docx_converter_with_ocr import DocxConverterWithOCR
from ._pptx_converter_with_ocr import PptxConverterWithOCR
from ._xlsx_converter_with_ocr import XlsxConverterWithOCR
__all__ = [
"__version__",
"__plugin_interface_version__",
"register_converters",
"OCRResult",
"LLMVisionOCRService",
"PdfConverterWithOCR",
"DocxConverterWithOCR",
"PptxConverterWithOCR",
"XlsxConverterWithOCR",
]
@@ -0,0 +1,189 @@
"""
Enhanced DOCX Converter with OCR support for embedded images.
Extracts images from Word documents and performs OCR while maintaining context.
"""
import io
import re
import sys
from typing import Any, BinaryIO, Optional
from markitdown.converters import HtmlConverter
from markitdown.converter_utils.docx.pre_process import pre_process_docx
from markitdown import DocumentConverterResult, StreamInfo
from markitdown._exceptions import (
MissingDependencyException,
MISSING_DEPENDENCY_MESSAGE,
)
from ._ocr_service import LLMVisionOCRService
# Try loading dependencies
_dependency_exc_info = None
try:
import mammoth
from docx import Document
except ImportError:
_dependency_exc_info = sys.exc_info()
# Placeholder injected into mammoth's HTML output so the HTML→markdown step never sees (or escapes) the OCR markers.
# Must be a single token with no special markdown characters.
_PLACEHOLDER = "MARKITDOWNOCRBLOCK{}"
class DocxConverterWithOCR(HtmlConverter):
"""
Enhanced DOCX Converter with OCR support for embedded images.
Maintains document flow while extracting text from images inline.
"""
def __init__(self, ocr_service: Optional[LLMVisionOCRService] = None):
super().__init__()
self._html_converter = HtmlConverter()
self.ocr_service = ocr_service
def accepts(
self,
file_stream: BinaryIO,
stream_info: StreamInfo,
**kwargs: Any,
) -> bool:
mimetype = (stream_info.mimetype or "").lower()
extension = (stream_info.extension or "").lower()
if extension == ".docx":
return True
if mimetype.startswith(
"application/vnd.openxmlformats-officedocument.wordprocessingml"
):
return True
return False
def convert(
self,
file_stream: BinaryIO,
stream_info: StreamInfo,
**kwargs: Any,
) -> DocumentConverterResult:
if _dependency_exc_info is not None:
raise MissingDependencyException(
MISSING_DEPENDENCY_MESSAGE.format(
converter=type(self).__name__,
extension=".docx",
feature="docx",
)
) from _dependency_exc_info[1].with_traceback(
_dependency_exc_info[2]
) # type: ignore[union-attr]
# Get OCR service if available (from kwargs or instance)
ocr_service: Optional[LLMVisionOCRService] = (
kwargs.get("ocr_service") or self.ocr_service
)
if ocr_service:
# 1. Extract and OCR images — returns raw text per image
file_stream.seek(0)
image_ocr_map = self._extract_and_ocr_images(file_stream, ocr_service)
# 2. Convert DOCX → HTML via mammoth
file_stream.seek(0)
pre_process_stream = pre_process_docx(file_stream)
html_result = mammoth.convert_to_html(
pre_process_stream, style_map=kwargs.get("style_map")
).value
# 3. Replace <img> tags with plain placeholder tokens so that the
# HTML→markdown step never escapes our OCR markers.
html_with_placeholders, ocr_texts = self._inject_placeholders(
html_result, image_ocr_map
)
# 4. Convert HTML → markdown
md_result = self._html_converter.convert_string(
html_with_placeholders, **kwargs
)
md = md_result.markdown
# 5. Swap placeholders for the actual OCR blocks (post-conversion
# so * and _ are never escaped by the markdown converter).
for i, raw_text in enumerate(ocr_texts):
placeholder = _PLACEHOLDER.format(i)
ocr_block = f"*[Image OCR]\n{raw_text}\n[End OCR]*"
md = md.replace(placeholder, ocr_block)
return DocumentConverterResult(markdown=md)
else:
# Standard conversion without OCR
style_map = kwargs.get("style_map", None)
pre_process_stream = pre_process_docx(file_stream)
return self._html_converter.convert_string(
mammoth.convert_to_html(pre_process_stream, style_map=style_map).value,
**kwargs,
)
def _extract_and_ocr_images(
self, file_stream: BinaryIO, ocr_service: LLMVisionOCRService
) -> dict[str, str]:
"""
Extract images from DOCX and OCR them.
Returns:
Dict mapping image relationship IDs to raw OCR text (no markers).
"""
ocr_map = {}
try:
file_stream.seek(0)
doc = Document(file_stream)
for rel in doc.part.rels.values():
if "image" in rel.target_ref.lower():
try:
image_bytes = rel.target_part.blob
image_stream = io.BytesIO(image_bytes)
ocr_result = ocr_service.extract_text(image_stream)
if ocr_result.text.strip():
# Store raw text only — markers added later
ocr_map[rel.rId] = ocr_result.text.strip()
except Exception:
continue
except Exception:
pass
return ocr_map
def _inject_placeholders(
self, html: str, ocr_map: dict[str, str]
) -> tuple[str, list[str]]:
"""
Replace <img> tags with numbered placeholder tokens.
Returns:
(html_with_placeholders, ordered list of raw OCR texts)
"""
if not ocr_map:
return html, []
ocr_texts = list(ocr_map.values())
used: list[int] = []
def replace_img(match: re.Match) -> str: # type: ignore[type-arg]
for i in range(len(ocr_texts)):
if i not in used:
used.append(i)
return f"<p>{_PLACEHOLDER.format(i)}</p>"
return "" # remove image if all OCR texts already used
result = re.sub(r"<img[^>]*>", replace_img, html)
# Any OCR texts that had no matching <img> tag go at the end
for i in range(len(ocr_texts)):
if i not in used:
result += f"<p>{_PLACEHOLDER.format(i)}</p>"
return result, ocr_texts
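A usage sketch for the DOCX converter above, assuming the package installs under the import name markitdown_ocr and that mammoth and python-docx are available; the stub service and file path are placeholders (the test suite's MockOCRService plays the same role):

from markitdown import StreamInfo
from markitdown_ocr import DocxConverterWithOCR, OCRResult

class StubOCRService:
    # Stand-in for LLMVisionOCRService; the converter only calls extract_text()
    def extract_text(self, image_stream, **kwargs):
        return OCRResult(text="TEXT FROM IMAGE", backend_used="stub")

converter = DocxConverterWithOCR(ocr_service=StubOCRService())
with open("docx_image_middle.docx", "rb") as f:  # placeholder path
    result = converter.convert(f, StreamInfo(extension=".docx"))

# Each embedded image appears inline as:
# *[Image OCR]
# TEXT FROM IMAGE
# [End OCR]*
print(result.markdown)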
@@ -0,0 +1,110 @@
"""
OCR Service Layer for MarkItDown
Provides LLM Vision-based image text extraction.
"""
import base64
from typing import Any, BinaryIO
from dataclasses import dataclass
from markitdown import StreamInfo
@dataclass
class OCRResult:
"""Result from OCR extraction."""
text: str
confidence: float | None = None
backend_used: str | None = None
error: str | None = None
class LLMVisionOCRService:
"""OCR service using LLM vision models (OpenAI-compatible)."""
def __init__(
self,
client: Any,
model: str,
default_prompt: str | None = None,
) -> None:
"""
Initialize LLM Vision OCR service.
Args:
client: OpenAI-compatible client
model: Model name (e.g., 'gpt-4o', 'gemini-2.0-flash')
default_prompt: Default prompt for OCR extraction
"""
self.client = client
self.model = model
self.default_prompt = default_prompt or (
"Extract all text from this image. "
"Return ONLY the extracted text, maintaining the original "
"layout and order. Do not add any commentary or description."
)
def extract_text(
self,
image_stream: BinaryIO,
prompt: str | None = None,
stream_info: StreamInfo | None = None,
**kwargs: Any,
) -> OCRResult:
"""Extract text using LLM vision."""
if self.client is None:
return OCRResult(
text="",
backend_used="llm_vision",
error="LLM client not configured",
)
try:
image_stream.seek(0)
content_type: str | None = None
if stream_info:
content_type = stream_info.mimetype
if not content_type:
try:
from PIL import Image
image_stream.seek(0)
img = Image.open(image_stream)
fmt = img.format.lower() if img.format else "png"
content_type = f"image/{fmt}"
except Exception:
content_type = "image/png"
image_stream.seek(0)
base64_image = base64.b64encode(image_stream.read()).decode("utf-8")
data_uri = f"data:{content_type};base64,{base64_image}"
actual_prompt = prompt or self.default_prompt
response = self.client.chat.completions.create(
model=self.model,
messages=[
{
"role": "user",
"content": [
{"type": "text", "text": actual_prompt},
{
"type": "image_url",
"image_url": {"url": data_uri},
},
],
}
],
)
text = response.choices[0].message.content
return OCRResult(
text=text.strip() if text else "",
backend_used="llm_vision",
)
except Exception as e:
return OCRResult(text="", backend_used="llm_vision", error=str(e))
finally:
image_stream.seek(0)
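A minimal usage sketch for the service above, assuming the markitdown_ocr import name and an OpenAI-compatible client; the model name and image path are placeholders:

from openai import OpenAI
from markitdown_ocr import LLMVisionOCRService

ocr = LLMVisionOCRService(client=OpenAI(), model="gpt-4o")
with open("receipt.png", "rb") as image_stream:  # placeholder image
    result = ocr.extract_text(image_stream)

if result.error:  # errors are returned, not raised (see OCRResult above)
    print("OCR failed:", result.error)
else:
    print(result.text)  # raw text only; the converters add the *[Image OCR]* markers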
@@ -0,0 +1,422 @@
"""
Enhanced PDF Converter with OCR support for embedded images.
Extracts images from PDFs and performs OCR while maintaining document context.
"""
import io
import sys
from typing import Any, BinaryIO, Optional
from markitdown import DocumentConverter, DocumentConverterResult, StreamInfo
from markitdown._exceptions import (
MissingDependencyException,
MISSING_DEPENDENCY_MESSAGE,
)
from ._ocr_service import LLMVisionOCRService
# Import dependencies
_dependency_exc_info = None
try:
import pdfminer
import pdfminer.high_level
import pdfplumber
from PIL import Image
except ImportError:
_dependency_exc_info = sys.exc_info()
def _extract_images_from_page(page: Any) -> list[dict]:
"""
Extract images from a PDF page by rendering page regions.
Returns:
List of dicts with 'stream', 'bbox', 'name', 'y_pos' keys
"""
images_info = []
try:
# Try multiple methods to detect images
images = []
# Method 1: Use page.images (standard approach)
if hasattr(page, "images") and page.images:
images = page.images
# Method 2: If no images found, try underlying PDF objects
if not images and hasattr(page, "objects") and "image" in page.objects:
images = page.objects.get("image", [])
# Method 3: Try filtering all objects for image types
if not images and hasattr(page, "objects"):
all_objs = page.objects
for obj_type in all_objs.keys():
if "image" in obj_type.lower() or "xobject" in obj_type.lower():
potential_imgs = all_objs.get(obj_type, [])
if potential_imgs:
images = potential_imgs
break
for i, img_dict in enumerate(images):
try:
# Try to get the actual image stream from the PDF
img_stream = None
y_pos = 0
# Method A: If img_dict has 'stream' key, use it directly
if "stream" in img_dict and hasattr(img_dict["stream"], "get_data"):
try:
img_bytes = img_dict["stream"].get_data()
# Try to open as PIL Image to validate/decode
pil_img = Image.open(io.BytesIO(img_bytes))
# Convert to RGB if needed (handle CMYK, etc.)
if pil_img.mode not in ("RGB", "L"):
pil_img = pil_img.convert("RGB")
# Save to stream as PNG
img_stream = io.BytesIO()
pil_img.save(img_stream, format="PNG")
img_stream.seek(0)
y_pos = img_dict.get("top", 0)
except Exception:
pass
# Method B: Fallback to rendering page region
if img_stream is None:
x0 = img_dict.get("x0", 0)
y0 = img_dict.get("top", 0)
x1 = img_dict.get("x1", 0)
y1 = img_dict.get("bottom", 0)
y_pos = y0
# Check if dimensions are valid
if x1 <= x0 or y1 <= y0:
continue
# Use pdfplumber's within_bbox to crop, then render
# This preserves coordinate system correctly
bbox = (x0, y0, x1, y1)
cropped_page = page.within_bbox(bbox)
# Render at 150 DPI (balance between quality and size)
page_img = cropped_page.to_image(resolution=150)
# Save to stream
img_stream = io.BytesIO()
page_img.original.save(img_stream, format="PNG")
img_stream.seek(0)
if img_stream:
images_info.append(
{
"stream": img_stream,
"name": f"page_{page.page_number}_img_{i}",
"y_pos": y_pos,
}
)
except Exception:
continue
except Exception:
pass
return images_info
class PdfConverterWithOCR(DocumentConverter):
"""
Enhanced PDF Converter with OCR support for embedded images.
Maintains document structure while extracting text from images inline.
"""
def __init__(self, ocr_service: Optional[LLMVisionOCRService] = None):
super().__init__()
self.ocr_service = ocr_service
def accepts(
self,
file_stream: BinaryIO,
stream_info: StreamInfo,
**kwargs: Any,
) -> bool:
mimetype = (stream_info.mimetype or "").lower()
extension = (stream_info.extension or "").lower()
if extension == ".pdf":
return True
if mimetype.startswith("application/pdf") or mimetype.startswith(
"application/x-pdf"
):
return True
return False
def convert(
self,
file_stream: BinaryIO,
stream_info: StreamInfo,
**kwargs: Any,
) -> DocumentConverterResult:
if _dependency_exc_info is not None:
raise MissingDependencyException(
MISSING_DEPENDENCY_MESSAGE.format(
converter=type(self).__name__,
extension=".pdf",
feature="pdf",
)
) from _dependency_exc_info[1].with_traceback(
_dependency_exc_info[2]
) # type: ignore[union-attr]
# Get OCR service if available (from kwargs or instance)
ocr_service: LLMVisionOCRService | None = (
kwargs.get("ocr_service") or self.ocr_service
)
# Read PDF into BytesIO
file_stream.seek(0)
pdf_bytes = io.BytesIO(file_stream.read())
markdown_content = []
try:
with pdfplumber.open(pdf_bytes) as pdf:
for page_num, page in enumerate(pdf.pages, 1):
markdown_content.append(f"\n## Page {page_num}\n")
# If OCR is enabled, interleave text and images by position
if ocr_service:
images_on_page = self._extract_page_images(pdf_bytes, page_num)
if images_on_page:
# Extract text lines with Y positions
chars = page.chars
if chars:
# Group chars into lines based on Y position
lines_with_y = []
current_line = []
current_y = None
for char in sorted(
chars, key=lambda c: (c["top"], c["x0"])
):
y = char["top"]
if current_y is None:
current_y = y
elif abs(y - current_y) > 2: # New line threshold
if current_line:
text = "".join(
[c["text"] for c in current_line]
)
lines_with_y.append(
{"y": current_y, "text": text.strip()}
)
current_line = []
current_y = y
current_line.append(char)
# Add last line
if current_line:
text = "".join([c["text"] for c in current_line])
lines_with_y.append(
{"y": current_y, "text": text.strip()}
)
else:
# Fallback: use simple text extraction
text_content = page.extract_text() or ""
lines_with_y = [
{"y": i * 10, "text": line}
for i, line in enumerate(text_content.split("\n"))
]
# OCR all images
image_data = []
for img_info in images_on_page:
ocr_result = ocr_service.extract_text(
img_info["stream"]
)
if ocr_result.text.strip():
image_data.append(
{
"y_pos": img_info["y_pos"],
"name": img_info["name"],
"ocr_text": ocr_result.text,
"backend": ocr_result.backend_used,
"type": "image",
}
)
# Add text items
content_items = [
{
"y_pos": item["y"],
"text": item["text"],
"type": "text",
}
for item in lines_with_y
if item["text"]
]
content_items.extend(image_data)
# Sort all items by Y position (top to bottom)
content_items.sort(key=lambda x: x["y_pos"])
# Build markdown by interleaving text and images
for item in content_items:
if item["type"] == "text":
markdown_content.append(item["text"])
else: # image
ocr_text = item["ocr_text"]
img_marker = (
f"\n\n*[Image OCR]\n{ocr_text}\n[End OCR]*\n"
)
markdown_content.append(img_marker)
else:
# No images detected - just extract regular text
text_content = page.extract_text() or ""
if text_content.strip():
markdown_content.append(text_content.strip())
else:
# No OCR, just extract text
text_content = page.extract_text() or ""
if text_content.strip():
markdown_content.append(text_content.strip())
# Build final markdown
markdown = "\n\n".join(markdown_content).strip()
# Fallback to pdfminer if empty
if not markdown:
pdf_bytes.seek(0)
markdown = pdfminer.high_level.extract_text(pdf_bytes)
except Exception:
# Fallback to pdfminer
try:
pdf_bytes.seek(0)
markdown = pdfminer.high_level.extract_text(pdf_bytes)
except Exception:
markdown = ""
# Final fallback: If still empty/whitespace and OCR is available,
# treat as scanned PDF and OCR full pages
if ocr_service and (not markdown or not markdown.strip()):
pdf_bytes.seek(0)
markdown = self._ocr_full_pages(pdf_bytes, ocr_service)
return DocumentConverterResult(markdown=markdown)
def _extract_page_images(self, pdf_bytes: io.BytesIO, page_num: int) -> list[dict]:
"""
Extract images from a PDF page using pdfplumber.
Args:
pdf_bytes: PDF file as BytesIO
page_num: Page number (1-indexed)
Returns:
List of image info dicts with 'stream', 'bbox', 'name', 'y_pos'
"""
images = []
try:
pdf_bytes.seek(0)
with pdfplumber.open(pdf_bytes) as pdf:
if page_num <= len(pdf.pages):
page = pdf.pages[page_num - 1] # 0-indexed
images = _extract_images_from_page(page)
except Exception:
pass
# Sort by vertical position (top to bottom)
images.sort(key=lambda x: x["y_pos"])
return images
def _ocr_full_pages(
self, pdf_bytes: io.BytesIO, ocr_service: LLMVisionOCRService
) -> str:
"""
Fallback for scanned PDFs: Convert entire pages to images and OCR them.
Used when text extraction returns empty/whitespace results.
Args:
pdf_bytes: PDF file as BytesIO
ocr_service: OCR service to use
Returns:
Markdown text extracted from OCR of full pages
"""
markdown_parts = []
try:
pdf_bytes.seek(0)
with pdfplumber.open(pdf_bytes) as pdf:
for page_num, page in enumerate(pdf.pages, 1):
try:
markdown_parts.append(f"\n## Page {page_num}\n")
# Render page to image
page_img = page.to_image(resolution=300)
img_stream = io.BytesIO()
page_img.original.save(img_stream, format="PNG")
img_stream.seek(0)
# Run OCR
ocr_result = ocr_service.extract_text(img_stream)
if ocr_result.text.strip():
text = ocr_result.text.strip()
markdown_parts.append(f"*[Image OCR]\n{text}\n[End OCR]*")
else:
markdown_parts.append(
"*[No text could be extracted from this page]*"
)
except Exception as e:
markdown_parts.append(
f"*[Error processing page {page_num}: {str(e)}]*"
)
continue
except Exception:
# pdfplumber failed (e.g. malformed EOF) — try PyMuPDF for rendering
markdown_parts = []
try:
import fitz # PyMuPDF
pdf_bytes.seek(0)
doc = fitz.open(stream=pdf_bytes.read(), filetype="pdf")
for page_num in range(1, doc.page_count + 1):
try:
markdown_parts.append(f"\n## Page {page_num}\n")
page = doc[page_num - 1]
mat = fitz.Matrix(300 / 72, 300 / 72) # 300 DPI
pix = page.get_pixmap(matrix=mat)
img_stream = io.BytesIO(pix.tobytes("png"))
img_stream.seek(0)
ocr_result = ocr_service.extract_text(img_stream)
if ocr_result.text.strip():
text = ocr_result.text.strip()
markdown_parts.append(f"*[Image OCR]\n{text}\n[End OCR]*")
else:
markdown_parts.append(
"*[No text could be extracted from this page]*"
)
except Exception as e:
markdown_parts.append(
f"*[Error processing page {page_num}: {str(e)}]*"
)
continue
doc.close()
except Exception:
return "*[Error: Could not process scanned PDF]*"
return "\n\n".join(markdown_parts).strip()
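A sketch of the scanned-PDF path above, assuming the markitdown_ocr import name and that pdfplumber, pdfminer.six, and Pillow are installed; file path and client are placeholders. When no text can be extracted, _ocr_full_pages renders each page at 300 DPI and OCRs it:

from openai import OpenAI
from markitdown import StreamInfo
from markitdown_ocr import LLMVisionOCRService, PdfConverterWithOCR

ocr = LLMVisionOCRService(client=OpenAI(), model="gpt-4o")
converter = PdfConverterWithOCR(ocr_service=ocr)
with open("scanned_contract.pdf", "rb") as f:  # placeholder path
    result = converter.convert(f, StreamInfo(extension=".pdf"))

# Output: "## Page N" headings with text lines and *[Image OCR] ... [End OCR]* blocks
# interleaved by vertical position, or full-page OCR blocks for scanned pages.
print(result.markdown)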
@@ -0,0 +1,68 @@
"""
Plugin registration for markitdown-ocr.
Registers OCR-enhanced converters with priority-based replacement strategy.
"""
from typing import Any
from markitdown import MarkItDown
from ._ocr_service import LLMVisionOCRService
from ._pdf_converter_with_ocr import PdfConverterWithOCR
from ._docx_converter_with_ocr import DocxConverterWithOCR
from ._pptx_converter_with_ocr import PptxConverterWithOCR
from ._xlsx_converter_with_ocr import XlsxConverterWithOCR
__plugin_interface_version__ = 1
def register_converters(markitdown: MarkItDown, **kwargs: Any) -> None:
"""
Register OCR-enhanced converters with MarkItDown.
This plugin provides OCR support for PDF, DOCX, PPTX, and XLSX files.
The converters are registered with priority -1.0 to run BEFORE built-in
converters (which have priority 0.0), effectively replacing them when
the plugin is enabled.
Args:
markitdown: MarkItDown instance to register converters with
**kwargs: Additional keyword arguments that may include:
- llm_client: OpenAI-compatible client for LLM-based OCR (required for OCR to work)
- llm_model: Model name (e.g., 'gpt-4o')
- llm_prompt: Custom prompt for text extraction
"""
# Create OCR service — reads the same llm_client/llm_model kwargs
# that MarkItDown itself already accepts for image descriptions
llm_client = kwargs.get("llm_client")
llm_model = kwargs.get("llm_model")
llm_prompt = kwargs.get("llm_prompt")
ocr_service: LLMVisionOCRService | None = None
if llm_client and llm_model:
ocr_service = LLMVisionOCRService(
client=llm_client,
model=llm_model,
default_prompt=llm_prompt,
)
# Register converters with priority -1.0 (before built-ins at 0.0)
# This effectively "replaces" the built-in converters when plugin is installed
# Pass the OCR service to each converter's constructor
PRIORITY_OCR_ENHANCED = -1.0
markitdown.register_converter(
PdfConverterWithOCR(ocr_service=ocr_service), priority=PRIORITY_OCR_ENHANCED
)
markitdown.register_converter(
DocxConverterWithOCR(ocr_service=ocr_service), priority=PRIORITY_OCR_ENHANCED
)
markitdown.register_converter(
PptxConverterWithOCR(ocr_service=ocr_service), priority=PRIORITY_OCR_ENHANCED
)
markitdown.register_converter(
XlsxConverterWithOCR(ocr_service=ocr_service), priority=PRIORITY_OCR_ENHANCED
)
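A sketch of enabling the plugin through MarkItDown itself rather than instantiating converters directly; per the comment above, the plugin reuses the llm_client/llm_model kwargs that MarkItDown already accepts. Assumes the package is installed so its entry point is discovered; the file path is a placeholder:

from openai import OpenAI
from markitdown import MarkItDown

md = MarkItDown(
    enable_plugins=True,   # load registered plugins, including this one
    llm_client=OpenAI(),   # forwarded to register_converters(**kwargs)
    llm_model="gpt-4o",
)
# The OCR-enabled converters run at priority -1.0, ahead of the built-ins at 0.0.
result = md.convert("quarterly_report.docx")  # placeholder path
print(result.markdown)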
@@ -0,0 +1,249 @@
"""
Enhanced PPTX Converter with improved OCR support.
The built-in converter already provides LLM-based image descriptions; this version adds an LLM Vision OCR fallback for images that receive no description.
"""
import io
import sys
from typing import Any, BinaryIO, Optional
from markitdown.converters import HtmlConverter
from markitdown import DocumentConverter, DocumentConverterResult, StreamInfo
from markitdown._exceptions import (
MissingDependencyException,
MISSING_DEPENDENCY_MESSAGE,
)
from ._ocr_service import LLMVisionOCRService
_dependency_exc_info = None
try:
import pptx
except ImportError:
_dependency_exc_info = sys.exc_info()
class PptxConverterWithOCR(DocumentConverter):
"""Enhanced PPTX Converter with OCR fallback."""
def __init__(self, ocr_service: Optional[LLMVisionOCRService] = None):
super().__init__()
self._html_converter = HtmlConverter()
self.ocr_service = ocr_service
def accepts(
self,
file_stream: BinaryIO,
stream_info: StreamInfo,
**kwargs: Any,
) -> bool:
mimetype = (stream_info.mimetype or "").lower()
extension = (stream_info.extension or "").lower()
if extension == ".pptx":
return True
if mimetype.startswith(
"application/vnd.openxmlformats-officedocument.presentationml"
):
return True
return False
def convert(
self,
file_stream: BinaryIO,
stream_info: StreamInfo,
**kwargs: Any,
) -> DocumentConverterResult:
if _dependency_exc_info is not None:
raise MissingDependencyException(
MISSING_DEPENDENCY_MESSAGE.format(
converter=type(self).__name__,
extension=".pptx",
feature="pptx",
)
) from _dependency_exc_info[1].with_traceback(
_dependency_exc_info[2]
) # type: ignore[union-attr]
# Get OCR service (from kwargs or instance)
ocr_service: Optional[LLMVisionOCRService] = (
kwargs.get("ocr_service") or self.ocr_service
)
llm_client = kwargs.get("llm_client")
presentation = pptx.Presentation(file_stream)
md_content = ""
slide_num = 0
for slide in presentation.slides:
slide_num += 1
md_content += f"\n\n<!-- Slide number: {slide_num} -->\n"
title = slide.shapes.title
def get_shape_content(shape, **kwargs):
nonlocal md_content
# Pictures
if self._is_picture(shape):
# Get image data
image_stream = io.BytesIO(shape.image.blob)
# Try LLM description first if available
llm_description = ""
if llm_client and kwargs.get("llm_model"):
try:
from ._llm_caption import llm_caption
image_filename = shape.image.filename
image_extension = None
if image_filename:
import os
image_extension = os.path.splitext(image_filename)[1]
image_stream_info = StreamInfo(
mimetype=shape.image.content_type,
extension=image_extension,
filename=image_filename,
)
llm_description = llm_caption(
image_stream,
image_stream_info,
client=llm_client,
model=kwargs.get("llm_model"),
prompt=kwargs.get("llm_prompt"),
)
except Exception:
pass
# Try OCR if LLM failed or not available
ocr_text = ""
if not llm_description and ocr_service:
try:
image_stream.seek(0)
ocr_result = ocr_service.extract_text(image_stream)
if ocr_result.text.strip():
ocr_text = ocr_result.text.strip()
except Exception:
pass
# Format extracted content using unified OCR block format
content = (llm_description or ocr_text or "").strip()
if content:
md_content += f"\n*[Image OCR]\n{content}\n[End OCR]*\n"
# Tables
if self._is_table(shape):
md_content += self._convert_table_to_markdown(shape.table, **kwargs)
# Charts
if shape.has_chart:
md_content += self._convert_chart_to_markdown(shape.chart)
# Text areas
elif shape.has_text_frame:
if shape == title:
md_content += "# " + shape.text.lstrip() + "\n"
else:
md_content += shape.text + "\n"
# Group Shapes
if shape.shape_type == pptx.enum.shapes.MSO_SHAPE_TYPE.GROUP:
sorted_shapes = sorted(
shape.shapes,
key=lambda x: (
float("-inf") if not x.top else x.top,
float("-inf") if not x.left else x.left,
),
)
for subshape in sorted_shapes:
get_shape_content(subshape, **kwargs)
sorted_shapes = sorted(
slide.shapes,
key=lambda x: (
float("-inf") if not x.top else x.top,
float("-inf") if not x.left else x.left,
),
)
for shape in sorted_shapes:
get_shape_content(shape, **kwargs)
md_content = md_content.strip()
if slide.has_notes_slide:
md_content += "\n\n### Notes:\n"
notes_frame = slide.notes_slide.notes_text_frame
if notes_frame is not None:
md_content += notes_frame.text
md_content = md_content.strip()
return DocumentConverterResult(markdown=md_content.strip())
def _is_picture(self, shape):
if shape.shape_type == pptx.enum.shapes.MSO_SHAPE_TYPE.PICTURE:
return True
if shape.shape_type == pptx.enum.shapes.MSO_SHAPE_TYPE.PLACEHOLDER:
if hasattr(shape, "image"):
return True
return False
def _is_table(self, shape):
if shape.shape_type == pptx.enum.shapes.MSO_SHAPE_TYPE.TABLE:
return True
return False
def _convert_table_to_markdown(self, table, **kwargs):
import html
html_table = "<html><body><table>"
first_row = True
for row in table.rows:
html_table += "<tr>"
for cell in row.cells:
if first_row:
html_table += "<th>" + html.escape(cell.text) + "</th>"
else:
html_table += "<td>" + html.escape(cell.text) + "</td>"
html_table += "</tr>"
first_row = False
html_table += "</table></body></html>"
return (
self._html_converter.convert_string(html_table, **kwargs).markdown.strip()
+ "\n"
)
def _convert_chart_to_markdown(self, chart):
try:
md = "\n\n### Chart"
if chart.has_title:
md += f": {chart.chart_title.text_frame.text}"
md += "\n\n"
data = []
category_names = [c.label for c in chart.plots[0].categories]
series_names = [s.name for s in chart.series]
data.append(["Category"] + series_names)
for idx, category in enumerate(category_names):
row = [category]
for series in chart.series:
row.append(series.values[idx])
data.append(row)
markdown_table = []
for row in data:
markdown_table.append("| " + " | ".join(map(str, row)) + " |")
header = markdown_table[0]
separator = "|" + "|".join(["---"] * len(data[0])) + "|"
return md + "\n".join([header, separator] + markdown_table[1:])
except Exception:
# Covers python-pptx's "unsupported plot type" ValueError and any other chart-parsing
# error, and avoids returning None (which would break the md_content += ... call above)
return "\n\n[unsupported chart]\n\n"
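A usage sketch for the PPTX converter above (markitdown_ocr import name and file path are placeholders). Note the precedence in get_shape_content: with llm_client/llm_model passed, pictures get an LLM caption first, and the OCR service only handles images whose caption comes back empty:

from openai import OpenAI
from markitdown import StreamInfo
from markitdown_ocr import LLMVisionOCRService, PptxConverterWithOCR

client = OpenAI()
converter = PptxConverterWithOCR(
    ocr_service=LLMVisionOCRService(client=client, model="gpt-4o")
)
with open("slides_with_images.pptx", "rb") as f:  # placeholder path
    result = converter.convert(
        f,
        StreamInfo(extension=".pptx"),
        llm_client=client,   # enables the caption path for pictures
        llm_model="gpt-4o",
    )
print(result.markdown)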
@@ -0,0 +1,225 @@
"""
Enhanced XLSX Converter with OCR support for embedded images.
Extracts images from Excel spreadsheets and performs OCR while maintaining cell context.
"""
import io
import sys
from typing import Any, BinaryIO, Optional
from markitdown.converters import HtmlConverter
from markitdown import DocumentConverter, DocumentConverterResult, StreamInfo
from markitdown._exceptions import (
MissingDependencyException,
MISSING_DEPENDENCY_MESSAGE,
)
from ._ocr_service import LLMVisionOCRService
# Try loading dependencies
_xlsx_dependency_exc_info = None
try:
import pandas as pd
from openpyxl import load_workbook
except ImportError:
_xlsx_dependency_exc_info = sys.exc_info()
class XlsxConverterWithOCR(DocumentConverter):
"""
Enhanced XLSX Converter with OCR support for embedded images.
Extracts images with their cell positions and performs OCR.
"""
def __init__(self, ocr_service: Optional[LLMVisionOCRService] = None):
super().__init__()
self._html_converter = HtmlConverter()
self.ocr_service = ocr_service
def accepts(
self,
file_stream: BinaryIO,
stream_info: StreamInfo,
**kwargs: Any,
) -> bool:
mimetype = (stream_info.mimetype or "").lower()
extension = (stream_info.extension or "").lower()
if extension == ".xlsx":
return True
if mimetype.startswith(
"application/vnd.openxmlformats-officedocument.spreadsheetml"
):
return True
return False
def convert(
self,
file_stream: BinaryIO,
stream_info: StreamInfo,
**kwargs: Any,
) -> DocumentConverterResult:
if _xlsx_dependency_exc_info is not None:
raise MissingDependencyException(
MISSING_DEPENDENCY_MESSAGE.format(
converter=type(self).__name__,
extension=".xlsx",
feature="xlsx",
)
) from _xlsx_dependency_exc_info[1].with_traceback(
_xlsx_dependency_exc_info[2]
) # type: ignore[union-attr]
# Get OCR service if available (from kwargs or instance)
ocr_service: Optional[LLMVisionOCRService] = (
kwargs.get("ocr_service") or self.ocr_service
)
if ocr_service:
# Remove ocr_service from kwargs to avoid duplicate argument error
kwargs_without_ocr = {k: v for k, v in kwargs.items() if k != "ocr_service"}
return self._convert_with_ocr(
file_stream, ocr_service, **kwargs_without_ocr
)
else:
return self._convert_standard(file_stream, **kwargs)
def _convert_standard(
self, file_stream: BinaryIO, **kwargs: Any
) -> DocumentConverterResult:
"""Standard conversion without OCR."""
file_stream.seek(0)
sheets = pd.read_excel(file_stream, sheet_name=None, engine="openpyxl")
md_content = ""
for sheet_name in sheets:
md_content += f"## {sheet_name}\n"
html_content = sheets[sheet_name].to_html(index=False)
md_content += (
self._html_converter.convert_string(
html_content, **kwargs
).markdown.strip()
+ "\n\n"
)
return DocumentConverterResult(markdown=md_content.strip())
def _convert_with_ocr(
self, file_stream: BinaryIO, ocr_service: LLMVisionOCRService, **kwargs: Any
) -> DocumentConverterResult:
"""Convert XLSX with image OCR."""
file_stream.seek(0)
wb = load_workbook(file_stream)
md_content = ""
for sheet_name in wb.sheetnames:
sheet = wb[sheet_name]
md_content += f"## {sheet_name}\n\n"
# Convert sheet data to markdown table
file_stream.seek(0)
try:
df = pd.read_excel(
file_stream, sheet_name=sheet_name, engine="openpyxl"
)
html_content = df.to_html(index=False)
md_content += (
self._html_converter.convert_string(
html_content, **kwargs
).markdown.strip()
+ "\n\n"
)
except Exception:
# If pandas fails, just skip the table
pass
# Extract and OCR images in this sheet
images_with_ocr = self._extract_and_ocr_sheet_images(sheet, ocr_service)
if images_with_ocr:
md_content += "### Images in this sheet:\n\n"
for img_info in images_with_ocr:
ocr_text = img_info["ocr_text"]
md_content += f"*[Image OCR]\n{ocr_text}\n[End OCR]*\n\n"
return DocumentConverterResult(markdown=md_content.strip())
def _extract_and_ocr_sheet_images(
self, sheet: Any, ocr_service: LLMVisionOCRService
) -> list[dict]:
"""
Extract and OCR images from an Excel sheet.
Args:
sheet: openpyxl worksheet
ocr_service: OCR service
Returns:
List of dicts with 'cell_ref' and 'ocr_text'
"""
results = []
try:
# Check if sheet has images
if hasattr(sheet, "_images"):
for img in sheet._images:
try:
# Get image data
if hasattr(img, "_data"):
image_data = img._data()
elif hasattr(img, "image"):
# Some versions store it differently
image_data = img.image
else:
continue
# Create image stream
image_stream = io.BytesIO(image_data)
# Get cell reference
cell_ref = "unknown"
if hasattr(img, "anchor"):
anchor = img.anchor
if hasattr(anchor, "_from"):
from_cell = anchor._from
if hasattr(from_cell, "col") and hasattr(
from_cell, "row"
):
# Convert column number to letter
col_letter = self._column_number_to_letter(
from_cell.col
)
cell_ref = f"{col_letter}{from_cell.row + 1}"
# Perform OCR
ocr_result = ocr_service.extract_text(image_stream)
if ocr_result.text.strip():
results.append(
{
"cell_ref": cell_ref,
"ocr_text": ocr_result.text.strip(),
"backend": ocr_result.backend_used,
}
)
except Exception:
continue
except Exception:
pass
return results
@staticmethod
def _column_number_to_letter(n: int) -> str:
"""Convert column number to Excel column letter (0-indexed)."""
result = ""
n = n + 1 # Make 1-indexed
while n > 0:
n -= 1
result = chr(65 + (n % 26)) + result
n //= 26
return result
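A quick check of the column-letter helper above, using the 0-indexed contract from its docstring (values worked out by hand):

# Expected: 0 -> A, 25 -> Z, 26 -> AA, 701 -> ZZ, 702 -> AAA
for n in (0, 25, 26, 701, 702):
    print(n, XlsxConverterWithOCR._column_number_to_letter(n))

_extract_and_ocr_sheet_images combines this letter with the anchor row plus one to build cell references such as "B3", although the current markdown output lists the OCR blocks under "Images in this sheet" without them.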