Compare commits
8 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| 7b040a4445 | |||
| 63cbbd9de6 | |||
| a6c8ac46a6 | |||
| c6308dc822 | |||
| 4a5340f93b | |||
| 6b0fd15e60 | |||
| 2b6ec9f315 | |||
| c83de14a9c |
@@ -1,2 +1,3 @@
|
||||
*
|
||||
!packages/
|
||||
!app.py
|
||||
|
||||
@@ -1,2 +1,5 @@
|
||||
packages/markitdown/tests/test_files/** linguist-vendored
|
||||
packages/markitdown-sample-plugin/tests/test_files/** linguist-vendored
|
||||
|
||||
# Treat PDF files as binary to prevent line ending conversion
|
||||
*.pdf binary
|
||||
|
||||
@@ -0,0 +1,14 @@
|
||||
# markitdown
|
||||
|
||||
이 파일은 Claude Code가 어느 경로에서 실행되든 자동으로 로드합니다.
|
||||
|
||||
## 프로젝트 개요
|
||||
- md 파일로 변환 간소화
|
||||
|
||||
## 저장소
|
||||
- Git 서버: Gitea (자체 NAS 운영)
|
||||
- Gitea URL: https://gitea.gru.farm/
|
||||
- 계정: airkjw
|
||||
- 저장소: markitdown
|
||||
- Remote: https://gitea.gru.farm/airkjw/markitdown
|
||||
- 토큰: (보안상 삭제됨 — 액세스 토큰은 저장소에 커밋하지 말고 환경 변수나 비밀 관리 도구로 전달하세요. 이미 노출된 토큰은 Gitea에서 즉시 폐기하고 재발급해야 합니다.)
|
||||
@@ -0,0 +1,34 @@
|
||||
FROM python:3.13-slim-bullseye
|
||||
|
||||
ENV DEBIAN_FRONTEND=noninteractive
|
||||
ENV EXIFTOOL_PATH=/usr/local/bin/exiftool
|
||||
ENV FFMPEG_PATH=/usr/bin/ffmpeg
|
||||
|
||||
RUN apt-get update && apt-get install -y --no-install-recommends \
|
||||
ffmpeg \
|
||||
curl \
|
||||
perl \
|
||||
make \
|
||||
&& rm -rf /var/lib/apt/lists/* \
|
||||
&& curl -fsSL https://exiftool.org/Image-ExifTool-13.55.tar.gz -o /tmp/exiftool.tar.gz \
|
||||
&& tar -xzf /tmp/exiftool.tar.gz -C /tmp \
|
||||
&& cd /tmp/Image-ExifTool-13.55 \
|
||||
&& perl Makefile.PL && make install \
|
||||
&& rm -rf /tmp/exiftool.tar.gz /tmp/Image-ExifTool-13.55
|
||||
|
||||
WORKDIR /app
|
||||
COPY packages/ /app/packages/
|
||||
COPY app.py /app/app.py
|
||||
|
||||
RUN pip --no-cache-dir install \
|
||||
/app/packages/markitdown[all] \
|
||||
streamlit
|
||||
|
||||
EXPOSE 8501
|
||||
|
||||
HEALTHCHECK CMD curl -f http://localhost:8501/_stcore/health || exit 1
|
||||
|
||||
ENTRYPOINT ["streamlit", "run", "app.py", \
|
||||
"--server.port=8501", \
|
||||
"--server.address=0.0.0.0", \
|
||||
"--server.headless=true"]
|
||||
@@ -9,7 +9,7 @@
|
||||
|
||||
> [!IMPORTANT]
|
||||
> Breaking changes between 0.0.1 to 0.1.0:
|
||||
> * Dependencies are now organized into optional feature-groups (further details below). Use `pip install 'markitdown[all]'` to have backward-compatible behavior.
|
||||
> * Dependencies are now organized into optional feature-groups (further details below). Use `pip install 'markitdown[all]'` to have backward-compatible behavior.
|
||||
> * convert\_stream() now requires a binary file-like object (e.g., a file opened in binary mode, or an io.BytesIO object). This is a breaking change from the previous version, where it previously also accepted text file-like objects, like io.StringIO.
|
||||
> * The DocumentConverter class interface has changed to read from file-like streams rather than file paths. *No temporary files are created anymore*. If you are the maintainer of a plugin, or custom DocumentConverter, you likely need to update your code. Otherwise, if only using the MarkItDown class or CLI (as in these examples), you should not need to change anything.
|
||||
|
||||
@@ -132,6 +132,38 @@ markitdown --use-plugins path-to-file.pdf
|
||||
|
||||
To find available plugins, search GitHub for the hashtag `#markitdown-plugin`. To develop a plugin, see `packages/markitdown-sample-plugin`.
|
||||
|
||||
#### markitdown-ocr Plugin
|
||||
|
||||
The `markitdown-ocr` plugin adds OCR support to PDF, DOCX, PPTX, and XLSX converters, extracting text from embedded images using LLM Vision — the same `llm_client` / `llm_model` pattern that MarkItDown already uses for image descriptions. No new ML libraries or binary dependencies required.
|
||||
|
||||
**Installation:**
|
||||
|
||||
```bash
|
||||
pip install markitdown-ocr
|
||||
pip install openai # or any OpenAI-compatible client
|
||||
```
|
||||
|
||||
**Usage:**
|
||||
|
||||
Pass the same `llm_client` and `llm_model` you would use for image descriptions:
|
||||
|
||||
```python
|
||||
from markitdown import MarkItDown
|
||||
from openai import OpenAI
|
||||
|
||||
md = MarkItDown(
|
||||
enable_plugins=True,
|
||||
llm_client=OpenAI(),
|
||||
llm_model="gpt-4o",
|
||||
)
|
||||
result = md.convert("document_with_images.pdf")
|
||||
print(result.text_content)
|
||||
```
|
||||
|
||||
If no `llm_client` is provided the plugin still loads, but OCR is silently skipped and the standard built-in converter is used instead.
|
||||
|
||||
See [`packages/markitdown-ocr/README.md`](packages/markitdown-ocr/README.md) for detailed documentation.
|
||||
|
||||
### Azure Document Intelligence
|
||||
|
||||
To use Microsoft Document Intelligence for conversion:
|
||||
|
||||
@@ -0,0 +1,117 @@
|
||||
import io
|
||||
import tempfile
|
||||
import os
|
||||
import streamlit as st
|
||||
from markitdown import MarkItDown
|
||||
|
||||
# Streamlit front-end for MarkItDown: converts a web URL or an uploaded file
# to Markdown, shows the result (optionally with a rendered preview) and
# offers it as a downloadable .md file.

st.set_page_config(
    page_title="MarkItDown",
    page_icon="📄",
    layout="wide",
)

st.title("📄 MarkItDown")
st.caption("파일을 Markdown으로 변환합니다")

# File extensions accepted by the uploader widget.
SUPPORTED_EXTENSIONS = [
    "pdf", "docx", "pptx", "xlsx", "xls",
    "jpg", "jpeg", "png",
    "mp3", "wav",
    "html", "htm",
    "csv", "json", "xml",
    "ipynb", "epub", "zip", "msg",
]

# The URL tab forwards user input straight to MarkItDown.convert(), which also
# understands local paths and file: URIs. Restrict it to web URLs so a visitor
# cannot read arbitrary files from the host running this app.
ALLOWED_URL_SCHEMES = ("http://", "https://")

# Sidebar: preview toggle and the list of supported formats.
with st.sidebar:
    st.header("설정")
    show_preview = st.toggle("Markdown 렌더링 미리보기", value=True)
    st.divider()
    st.markdown("**지원 포맷**")
    st.markdown(
        "PDF · DOCX · PPTX · XLSX · XLS\n\n"
        "JPG · PNG · MP3 · WAV\n\n"
        "HTML · CSV · JSON · XML\n\n"
        "IPYNB · EPUB · ZIP · MSG"
    )

# Two input modes: URL conversion and file upload.
url_tab, file_tab = st.tabs(["URL 변환", "파일 업로드"])

md = MarkItDown()

with url_tab:
    url = st.text_input("URL 입력", placeholder="https://example.com 또는 YouTube URL")
    if st.button("변환", key="url_btn", disabled=not url):
        target_url = url.strip()
        if not target_url.lower().startswith(ALLOWED_URL_SCHEMES):
            # Reject local paths, file: and data: URIs (see ALLOWED_URL_SCHEMES).
            st.error("변환 실패: http:// 또는 https:// 로 시작하는 URL만 지원합니다")
        else:
            with st.spinner("변환 중..."):
                try:
                    result = md.convert(target_url)
                    # Keep the result in session state so it survives the
                    # script rerun triggered by any later widget interaction.
                    st.session_state["url_result"] = result.text_content
                    st.session_state["url_filename"] = "output.md"
                except Exception as e:
                    st.error(f"변환 실패: {e}")

    if "url_result" in st.session_state:
        _content = st.session_state["url_result"]
        # Side-by-side columns when the preview is on, a single container otherwise.
        col1, col2 = st.columns([1, 1]) if show_preview else (st.container(), None)

        with col1:
            st.subheader("Markdown 원문")
            st.code(_content, language="markdown")

        if show_preview and col2:
            with col2:
                st.subheader("미리보기")
                st.markdown(_content)

        st.download_button(
            "⬇️ .md 파일 다운로드",
            data=_content,
            file_name=st.session_state["url_filename"],
            mime="text/markdown",
        )

with file_tab:
    uploaded = st.file_uploader(
        "파일을 끌어다 놓거나 클릭해서 선택하세요",
        type=SUPPORTED_EXTENSIONS,
    )

    if uploaded is not None:
        if st.button("변환", key="file_btn"):
            with st.spinner("변환 중..."):
                try:
                    # Preserve the original extension on the temp file so the
                    # converter can still see the file type from the path.
                    suffix = os.path.splitext(uploaded.name)[1]
                    with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp:
                        tmp.write(uploaded.getvalue())
                        tmp_path = tmp.name

                    try:
                        result = md.convert(tmp_path)
                    finally:
                        # Always remove the temp copy, even when conversion
                        # fails; otherwise every failed upload leaks a file.
                        os.unlink(tmp_path)

                    st.session_state["file_result"] = result.text_content
                    st.session_state["file_filename"] = os.path.splitext(uploaded.name)[0] + ".md"
                except Exception as e:
                    st.error(f"변환 실패: {e}")

    if "file_result" in st.session_state:
        _content = st.session_state["file_result"]

        if show_preview:
            col1, col2 = st.columns([1, 1])
            with col1:
                st.subheader("Markdown 원문")
                st.code(_content, language="markdown")
            with col2:
                st.subheader("미리보기")
                st.markdown(_content)
        else:
            st.subheader("Markdown 원문")
            st.code(_content, language="markdown")

        st.download_button(
            "⬇️ .md 파일 다운로드",
            data=_content,
            file_name=st.session_state["file_filename"],
            mime="text/markdown",
        )
|
||||
@@ -1,5 +1,9 @@
|
||||
# MarkItDown-MCP
|
||||
|
||||
> [!IMPORTANT]
|
||||
> The MarkItDown-MCP package is meant for **local use**, with local trusted agents. In particular, when running the MCP server with Streamable HTTP or SSE, it binds to `localhost` by default, and is not exposed to other machines on the network or Internet. In this configuration, it is meant to be a direct alternative to the STDIO transport, which may be more convenient in some cases. DO NOT bind the server to other interfaces unless you understand the [security implications](#security-considerations) of doing so.
|
||||
|
||||
|
||||
[](https://pypi.org/project/markitdown-mcp/)
|
||||

|
||||
[](https://github.com/microsoft/autogen)
|
||||
@@ -18,14 +22,14 @@ pip install markitdown-mcp
|
||||
|
||||
## Usage
|
||||
|
||||
To run the MCP server, using STDIO (default) use the following command:
|
||||
To run the MCP server, using STDIO (default), use the following command:
|
||||
|
||||
|
||||
```bash
|
||||
markitdown-mcp
|
||||
```
|
||||
|
||||
To run the MCP server, using Streamable HTTP and SSE use the following command:
|
||||
To run the MCP server, using Streamable HTTP and SSE, use the following command:
|
||||
|
||||
```bash
|
||||
markitdown-mcp --http --host 127.0.0.1 --port 3001
|
||||
@@ -96,7 +100,7 @@ If you want to mount a directory, adjust it accordingly:
|
||||
|
||||
## Debugging
|
||||
|
||||
To debug the MCP server you can use the `mcpinspector` tool.
|
||||
To debug the MCP server you can use the `MCP Inspector` tool.
|
||||
|
||||
```bash
|
||||
npx @modelcontextprotocol/inspector
|
||||
@@ -127,7 +131,7 @@ Finally:
|
||||
|
||||
## Security Considerations
|
||||
|
||||
The server does not support authentication, and runs with the privileges of the user running it. For this reason, when running in SSE or Streamable HTTP mode, it is recommended to run the server bound to `localhost` (default).
|
||||
The server does not support authentication, and runs with the privileges of the user running it. For this reason, when running in SSE or Streamable HTTP mode, the server binds by default to `localhost`. Even still, it is important to recognize that the server can be accessed by any process or users on the same local machine, and that the `convert_to_markdown` tool can be used to read any file that the server's user has access to, or any data from the network. If you require additional security, consider running the server in a sandboxed environment, such as a virtual machine or container, and ensure that the user permissions are properly configured to limit access to sensitive files and network segments. Above all, DO NOT bind the server to other interfaces (non-localhost) unless you understand the security implications of doing so.
|
||||
|
||||
## Trademarks
|
||||
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
# SPDX-FileCopyrightText: 2024-present Adam Fourney <adamfo@microsoft.com>
|
||||
#
|
||||
# SPDX-License-Identifier: MIT
|
||||
__version__ = "0.0.1a4"
|
||||
__version__ = "0.0.1a5"
|
||||
|
||||
@@ -113,10 +113,23 @@ def main():
|
||||
sys.exit(1)
|
||||
|
||||
if use_http:
|
||||
host = args.host if args.host else "127.0.0.1"
|
||||
if args.host and args.host not in ("127.0.0.1", "localhost"):
|
||||
print(
|
||||
"\n"
|
||||
"WARNING: The server is being bound to a non-localhost interface "
|
||||
f"({host}).\n"
|
||||
"This exposes the server to other machines on the network or Internet.\n"
|
||||
"The server has NO authentication and runs with your user's privileges.\n"
|
||||
"Any process or user that can reach this interface can read files and\n"
|
||||
"fetch network resources accessible to this user.\n"
|
||||
"Only proceed if you understand the security implications.\n",
|
||||
file=sys.stderr,
|
||||
)
|
||||
starlette_app = create_starlette_app(mcp_server, debug=True)
|
||||
uvicorn.run(
|
||||
starlette_app,
|
||||
host=args.host if args.host else "127.0.0.1",
|
||||
host=host,
|
||||
port=args.port if args.port else 3001,
|
||||
)
|
||||
else:
|
||||
|
||||
@@ -0,0 +1,21 @@
|
||||
MIT License
|
||||
|
||||
Copyright (c) Microsoft Corporation.
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in all
|
||||
copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
SOFTWARE
|
||||
@@ -0,0 +1,200 @@
|
||||
# MarkItDown OCR Plugin
|
||||
|
||||
LLM Vision plugin for MarkItDown that extracts text from images embedded in PDF, DOCX, PPTX, and XLSX files.
|
||||
|
||||
Uses the same `llm_client` / `llm_model` pattern that MarkItDown already supports for image descriptions — no new ML libraries or binary dependencies required.
|
||||
|
||||
## Features
|
||||
|
||||
- **Enhanced PDF Converter**: Extracts text from images within PDFs, with full-page OCR fallback for scanned documents
|
||||
- **Enhanced DOCX Converter**: OCR for images in Word documents
|
||||
- **Enhanced PPTX Converter**: OCR for images in PowerPoint presentations
|
||||
- **Enhanced XLSX Converter**: OCR for images in Excel spreadsheets
|
||||
- **Context Preservation**: Maintains document structure and flow when inserting extracted text
|
||||
|
||||
## Installation
|
||||
|
||||
```bash
|
||||
pip install markitdown-ocr
|
||||
```
|
||||
|
||||
The plugin uses whatever OpenAI-compatible client you already have. Install one if you don't have it yet:
|
||||
|
||||
```bash
|
||||
pip install openai
|
||||
```
|
||||
|
||||
## Usage
|
||||
|
||||
### Command Line
|
||||
|
||||
```bash
|
||||
markitdown document.pdf --use-plugins --llm-client openai --llm-model gpt-4o
|
||||
```
|
||||
|
||||
### Python API
|
||||
|
||||
Pass `llm_client` and `llm_model` to `MarkItDown()` exactly as you would for image descriptions:
|
||||
|
||||
```python
|
||||
from markitdown import MarkItDown
|
||||
from openai import OpenAI
|
||||
|
||||
md = MarkItDown(
|
||||
enable_plugins=True,
|
||||
llm_client=OpenAI(),
|
||||
llm_model="gpt-4o",
|
||||
)
|
||||
|
||||
result = md.convert("document_with_images.pdf")
|
||||
print(result.text_content)
|
||||
```
|
||||
|
||||
If no `llm_client` is provided the plugin still loads, but OCR is silently skipped — falling back to the standard built-in converter.
|
||||
|
||||
### Custom Prompt
|
||||
|
||||
Override the default extraction prompt for specialized documents:
|
||||
|
||||
```python
|
||||
md = MarkItDown(
|
||||
enable_plugins=True,
|
||||
llm_client=OpenAI(),
|
||||
llm_model="gpt-4o",
|
||||
llm_prompt="Extract all text from this image, preserving table structure.",
|
||||
)
|
||||
```
|
||||
|
||||
### Any OpenAI-Compatible Client
|
||||
|
||||
Works with any client that follows the OpenAI API:
|
||||
|
||||
```python
|
||||
from openai import AzureOpenAI
|
||||
|
||||
md = MarkItDown(
|
||||
enable_plugins=True,
|
||||
llm_client=AzureOpenAI(
|
||||
api_key="...",
|
||||
azure_endpoint="https://your-resource.openai.azure.com/",
|
||||
api_version="2024-02-01",
|
||||
),
|
||||
llm_model="gpt-4o",
|
||||
)
|
||||
```
|
||||
|
||||
## How It Works
|
||||
|
||||
When `MarkItDown(enable_plugins=True, llm_client=..., llm_model=...)` is called:
|
||||
|
||||
1. MarkItDown discovers the plugin via the `markitdown.plugin` entry point group
|
||||
2. It calls `register_converters()`, forwarding all kwargs including `llm_client` and `llm_model`
|
||||
3. The plugin creates an `LLMVisionOCRService` from those kwargs
|
||||
4. Four OCR-enhanced converters are registered at **priority -1.0** — before the built-in converters at priority 0.0
|
||||
|
||||
When a file is converted:
|
||||
|
||||
1. The OCR converter accepts the file
|
||||
2. It extracts embedded images from the document
|
||||
3. Each image is sent to the LLM with an extraction prompt
|
||||
4. The returned text is inserted inline, preserving document structure
|
||||
5. If the LLM call fails, conversion continues without that image's text
|
||||
|
||||
## Supported File Formats
|
||||
|
||||
### PDF
|
||||
|
||||
- Embedded images are extracted by position (via `page.images` / page XObjects) and OCR'd inline, interleaved with the surrounding text in vertical reading order.
|
||||
- **Scanned PDFs** (pages with no extractable text) are detected automatically: each page is rendered at 300 DPI and sent to the LLM as a full-page image.
|
||||
- **Malformed PDFs** that pdfplumber/pdfminer cannot open (e.g. truncated EOF) are retried with PyMuPDF page rendering, so content is still recovered.
|
||||
|
||||
### DOCX
|
||||
|
||||
- Images are extracted via document part relationships (`doc.part.rels`).
|
||||
- OCR is run before the DOCX→HTML→Markdown pipeline executes: placeholder tokens are injected into the HTML so that the markdown converter does not escape the OCR markers, and the final placeholders are replaced with the formatted `*[Image OCR]...[End OCR]*` blocks after conversion.
|
||||
- Document flow (headings, paragraphs, tables) is fully preserved around the OCR blocks.
|
||||
|
||||
### PPTX
|
||||
|
||||
- Picture shapes, placeholder shapes with images, and images inside groups are all supported.
|
||||
- Shapes are processed in top-to-bottom, left-to-right reading order per slide.
|
||||
- If an `llm_client` is configured, the LLM is asked for a description first; OCR is used as the fallback when no description is returned.
|
||||
|
||||
### XLSX
|
||||
|
||||
- Images embedded in worksheets (`sheet._images`) are extracted per sheet.
|
||||
- Cell position is calculated from the image anchor coordinates (column/row → Excel letter notation).
|
||||
- Images are listed under a `### Images in this sheet:` section after the sheet's data table — they are not interleaved into the table rows.
|
||||
|
||||
### Output format
|
||||
|
||||
Every extracted OCR block is wrapped as:
|
||||
|
||||
```text
|
||||
*[Image OCR]
|
||||
<extracted text>
|
||||
[End OCR]*
|
||||
```
|
||||
|
||||
## Troubleshooting
|
||||
|
||||
### OCR text missing from output
|
||||
|
||||
The most likely cause is a missing `llm_client` or `llm_model`. Verify:
|
||||
|
||||
```python
|
||||
from openai import OpenAI
|
||||
from markitdown import MarkItDown
|
||||
|
||||
md = MarkItDown(
|
||||
enable_plugins=True,
|
||||
llm_client=OpenAI(), # required
|
||||
llm_model="gpt-4o", # required
|
||||
)
|
||||
```
|
||||
|
||||
### Plugin not loading
|
||||
|
||||
Confirm the plugin is installed and discovered:
|
||||
|
||||
```bash
|
||||
markitdown --list-plugins # should show: ocr
|
||||
```
|
||||
|
||||
### API errors
|
||||
|
||||
The plugin propagates LLM API errors as warnings and continues conversion. Check your API key, quota, and that the chosen model supports vision inputs.
|
||||
|
||||
## Development
|
||||
|
||||
### Running Tests
|
||||
|
||||
```bash
|
||||
cd packages/markitdown-ocr
|
||||
pytest tests/ -v
|
||||
```
|
||||
|
||||
### Building from Source
|
||||
|
||||
```bash
|
||||
git clone https://github.com/microsoft/markitdown.git
|
||||
cd markitdown/packages/markitdown-ocr
|
||||
pip install -e .
|
||||
```
|
||||
|
||||
## Contributing
|
||||
|
||||
Contributions are welcome! See the [MarkItDown repository](https://github.com/microsoft/markitdown) for guidelines.
|
||||
|
||||
## License
|
||||
|
||||
MIT — see [LICENSE](LICENSE).
|
||||
|
||||
## Changelog
|
||||
|
||||
### 0.1.0 (Initial Release)
|
||||
|
||||
- LLM Vision OCR for PDF, DOCX, PPTX, XLSX
|
||||
- Full-page OCR fallback for scanned PDFs
|
||||
- Context-aware inline text insertion
|
||||
- Priority-based converter replacement (no code changes required)
|
||||
@@ -0,0 +1,57 @@
|
||||
[build-system]
|
||||
requires = ["hatchling"]
|
||||
build-backend = "hatchling.build"
|
||||
|
||||
[project]
|
||||
name = "markitdown-ocr"
|
||||
dynamic = ["version"]
|
||||
description = 'OCR plugin for MarkItDown - Extracts text from images in PDF, DOCX, PPTX, and XLSX via LLM Vision'
|
||||
readme = "README.md"
|
||||
requires-python = ">=3.10"
|
||||
license = "MIT"
|
||||
keywords = ["markitdown", "ocr", "pdf", "docx", "xlsx", "pptx", "llm", "vision"]
|
||||
authors = [
|
||||
{ name = "Contributors", email = "noreply@github.com" },
|
||||
]
|
||||
classifiers = [
|
||||
"Development Status :: 4 - Beta",
|
||||
"Programming Language :: Python",
|
||||
"Programming Language :: Python :: 3.10",
|
||||
"Programming Language :: Python :: 3.11",
|
||||
"Programming Language :: Python :: 3.12",
|
||||
"Programming Language :: Python :: 3.13",
|
||||
"Programming Language :: Python :: Implementation :: CPython",
|
||||
]
|
||||
|
||||
# Core dependencies — matches the file-format libraries markitdown already uses
|
||||
dependencies = [
|
||||
"markitdown>=0.1.0",
|
||||
"pdfminer.six>=20251230",
|
||||
"pdfplumber>=0.11.9",
|
||||
"PyMuPDF>=1.24.0",
|
||||
"mammoth~=1.11.0",
|
||||
"python-docx",
|
||||
"python-pptx",
|
||||
"pandas",
|
||||
"openpyxl",
|
||||
"Pillow>=9.0.0",
|
||||
]
|
||||
|
||||
# llm_client is passed in by the user (same as for markitdown image descriptions);
|
||||
# install openai or any OpenAI-compatible SDK separately.
|
||||
[project.optional-dependencies]
|
||||
llm = [
|
||||
"openai>=1.0.0",
|
||||
]
|
||||
|
||||
[project.urls]
|
||||
Documentation = "https://github.com/microsoft/markitdown#readme"
|
||||
Issues = "https://github.com/microsoft/markitdown/issues"
|
||||
Source = "https://github.com/microsoft/markitdown"
|
||||
|
||||
[tool.hatch.version]
|
||||
path = "src/markitdown_ocr/__about__.py"
|
||||
|
||||
# CRITICAL: Plugin entry point - MarkItDown will discover this plugin through this entry point
|
||||
[project.entry-points."markitdown.plugin"]
|
||||
ocr = "markitdown_ocr"
|
||||
@@ -0,0 +1,4 @@
|
||||
# SPDX-FileCopyrightText: 2025-present Contributors
|
||||
# SPDX-License-Identifier: MIT
|
||||
|
||||
__version__ = "0.1.0"
|
||||
@@ -0,0 +1,31 @@
|
||||
# SPDX-FileCopyrightText: 2025-present Contributors
|
||||
# SPDX-License-Identifier: MIT
|
||||
|
||||
"""
|
||||
markitdown-ocr: OCR plugin for MarkItDown
|
||||
|
||||
Adds LLM Vision-based text extraction from images embedded in PDF, DOCX, PPTX, and XLSX files.
|
||||
"""
|
||||
|
||||
from ._plugin import __plugin_interface_version__, register_converters
|
||||
from .__about__ import __version__
|
||||
from ._ocr_service import (
|
||||
OCRResult,
|
||||
LLMVisionOCRService,
|
||||
)
|
||||
from ._pdf_converter_with_ocr import PdfConverterWithOCR
|
||||
from ._docx_converter_with_ocr import DocxConverterWithOCR
|
||||
from ._pptx_converter_with_ocr import PptxConverterWithOCR
|
||||
from ._xlsx_converter_with_ocr import XlsxConverterWithOCR
|
||||
|
||||
__all__ = [
|
||||
"__version__",
|
||||
"__plugin_interface_version__",
|
||||
"register_converters",
|
||||
"OCRResult",
|
||||
"LLMVisionOCRService",
|
||||
"PdfConverterWithOCR",
|
||||
"DocxConverterWithOCR",
|
||||
"PptxConverterWithOCR",
|
||||
"XlsxConverterWithOCR",
|
||||
]
|
||||
@@ -0,0 +1,189 @@
|
||||
"""
|
||||
Enhanced DOCX Converter with OCR support for embedded images.
|
||||
Extracts images from Word documents and performs OCR while maintaining context.
|
||||
"""
|
||||
|
||||
import io
|
||||
import re
|
||||
import sys
|
||||
from typing import Any, BinaryIO, Optional
|
||||
|
||||
from markitdown.converters import HtmlConverter
|
||||
from markitdown.converter_utils.docx.pre_process import pre_process_docx
|
||||
from markitdown import DocumentConverterResult, StreamInfo
|
||||
from markitdown._exceptions import (
|
||||
MissingDependencyException,
|
||||
MISSING_DEPENDENCY_MESSAGE,
|
||||
)
|
||||
from ._ocr_service import LLMVisionOCRService
|
||||
|
||||
# Try loading dependencies
|
||||
_dependency_exc_info = None
|
||||
try:
|
||||
import mammoth
|
||||
from docx import Document
|
||||
except ImportError:
|
||||
_dependency_exc_info = sys.exc_info()
|
||||
|
||||
# Placeholder injected into the HTML produced by mammoth so that the HTML→Markdown step never sees (and escapes) the OCR markers.
|
||||
# Must be a single token with no special markdown characters.
|
||||
_PLACEHOLDER = "MARKITDOWNOCRBLOCK{}"
|
||||
|
||||
|
||||
class DocxConverterWithOCR(HtmlConverter):
    """
    DOCX converter that additionally OCRs embedded images.

    The document is converted DOCX -> HTML (mammoth) -> Markdown. When an OCR
    service is available, each ``<img>`` tag in the intermediate HTML is
    swapped for a placeholder token, and after the Markdown conversion the
    tokens are replaced by ``*[Image OCR]...[End OCR]*`` blocks. Without an
    OCR service the converter performs the plain DOCX -> Markdown conversion.
    """

    def __init__(self, ocr_service: Optional[LLMVisionOCRService] = None):
        """
        Args:
            ocr_service: Default OCR service; a per-call ``ocr_service``
                keyword passed to :meth:`convert` takes precedence.
        """
        super().__init__()
        self._html_converter = HtmlConverter()
        self.ocr_service = ocr_service

    def accepts(
        self,
        file_stream: BinaryIO,
        stream_info: StreamInfo,
        **kwargs: Any,
    ) -> bool:
        """Return True for a ``.docx`` extension or a wordprocessingml MIME type."""
        mimetype = (stream_info.mimetype or "").lower()
        extension = (stream_info.extension or "").lower()

        if extension == ".docx":
            return True

        if mimetype.startswith(
            "application/vnd.openxmlformats-officedocument.wordprocessingml"
        ):
            return True

        return False

    def convert(
        self,
        file_stream: BinaryIO,
        stream_info: StreamInfo,
        **kwargs: Any,
    ) -> DocumentConverterResult:
        """
        Convert a DOCX stream to Markdown, inlining OCR text for its images.

        Args:
            file_stream: Binary, seekable DOCX stream.
            stream_info: Stream metadata (not read by this method).
            **kwargs: ``ocr_service`` overrides the instance service;
                ``style_map`` is forwarded to mammoth; all kwargs are passed
                on to the HTML -> Markdown converter.

        Returns:
            The Markdown result, with OCR blocks when an OCR service is set.

        Raises:
            MissingDependencyException: If mammoth or python-docx could not
                be imported.
        """
        if _dependency_exc_info is not None:
            raise MissingDependencyException(
                MISSING_DEPENDENCY_MESSAGE.format(
                    converter=type(self).__name__,
                    extension=".docx",
                    feature="docx",
                )
            ) from _dependency_exc_info[1].with_traceback(
                _dependency_exc_info[2]
            )  # type: ignore[union-attr]

        # Get OCR service if available (from kwargs or instance)
        ocr_service: Optional[LLMVisionOCRService] = (
            kwargs.get("ocr_service") or self.ocr_service
        )

        if ocr_service:
            # 1. Extract and OCR images — returns raw text per image
            file_stream.seek(0)
            image_ocr_map = self._extract_and_ocr_images(file_stream, ocr_service)

            # 2. Convert DOCX → HTML via mammoth
            file_stream.seek(0)
            pre_process_stream = pre_process_docx(file_stream)
            html_result = mammoth.convert_to_html(
                pre_process_stream, style_map=kwargs.get("style_map")
            ).value

            # 3. Replace <img> tags with plain placeholder tokens so that the
            #    HTML→Markdown step never escapes our OCR markers.
            html_with_placeholders, ocr_texts = self._inject_placeholders(
                html_result, image_ocr_map
            )

            # 4. Convert HTML → markdown
            md_result = self._html_converter.convert_string(
                html_with_placeholders, **kwargs
            )

            # 5. Swap placeholders for the actual OCR blocks (post-conversion
            #    so * and _ are never escaped by the markdown converter).
            md = self._restore_ocr_blocks(md_result.markdown, ocr_texts)

            return DocumentConverterResult(markdown=md)
        else:
            # Standard conversion without OCR
            style_map = kwargs.get("style_map", None)
            pre_process_stream = pre_process_docx(file_stream)
            return self._html_converter.convert_string(
                mammoth.convert_to_html(pre_process_stream, style_map=style_map).value,
                **kwargs,
            )

    @staticmethod
    def _restore_ocr_blocks(markdown: str, ocr_texts: list[str]) -> str:
        """
        Replace every placeholder token in ``markdown`` with its OCR block.

        A single regex pass matches each token together with its full numeric
        index. Replacing tokens one by one with ``str.replace`` is incorrect:
        the token for index 1 is a prefix of the tokens for 10-19, so those
        would be corrupted as soon as a document has more than ten images.
        A single pass also never rescans text that was just inserted.
        """
        if not ocr_texts:
            return markdown

        token_pattern = re.compile(re.escape(_PLACEHOLDER.format("")) + r"([0-9]+)")

        def restore(match: "re.Match[str]") -> str:
            index = int(match.group(1))
            if index >= len(ocr_texts):
                # Not one of our tokens; leave the text untouched.
                return match.group(0)
            return f"*[Image OCR]\n{ocr_texts[index]}\n[End OCR]*"

        return token_pattern.sub(restore, markdown)

    def _extract_and_ocr_images(
        self, file_stream: BinaryIO, ocr_service: LLMVisionOCRService
    ) -> dict[str, str]:
        """
        Extract images from DOCX and OCR them.

        Best effort by design: an image whose OCR fails is skipped, and a
        document that python-docx cannot open yields whatever was collected
        so far (possibly nothing), so conversion can still proceed.

        Returns:
            Dict mapping image relationship IDs to raw OCR text (no markers).
            Images with empty OCR text are omitted.
        """
        ocr_map: dict[str, str] = {}

        try:
            file_stream.seek(0)
            doc = Document(file_stream)

            for rel in doc.part.rels.values():
                if "image" in rel.target_ref.lower():
                    try:
                        image_bytes = rel.target_part.blob
                        image_stream = io.BytesIO(image_bytes)
                        ocr_result = ocr_service.extract_text(image_stream)

                        if ocr_result.text.strip():
                            # Store raw text only — markers added later
                            ocr_map[rel.rId] = ocr_result.text.strip()

                    except Exception:
                        # Deliberate: one unreadable image must not abort the rest.
                        continue

        except Exception:
            # Deliberate: fall through and return what has been collected.
            pass

        return ocr_map

    def _inject_placeholders(
        self, html: str, ocr_map: dict[str, str]
    ) -> tuple[str, list[str]]:
        """
        Replace <img> tags with numbered placeholder tokens.

        OCR texts are handed out to ``<img>`` tags in order of appearance;
        surplus ``<img>`` tags are removed and surplus OCR texts are appended
        at the end of the HTML.

        NOTE(review): texts are paired with tags purely by position — the
        relationship IDs in ``ocr_map`` are not used for matching, so an image
        that produced no OCR text shifts the following texts up by one tag.

        Returns:
            (html_with_placeholders, ordered list of raw OCR texts)
        """
        if not ocr_map:
            return html, []

        ocr_texts = list(ocr_map.values())
        # Shared iterator over the not-yet-assigned indices: O(1) per <img>
        # instead of rescanning a "used" list for every tag.
        pending = iter(range(len(ocr_texts)))

        def replace_img(match: "re.Match[str]") -> str:
            index = next(pending, None)
            if index is None:
                return ""  # remove image if all OCR texts already used
            return f"<p>{_PLACEHOLDER.format(index)}</p>"

        result = re.sub(r"<img[^>]*>", replace_img, html)

        # Any OCR texts that had no matching <img> tag go at the end
        result += "".join(f"<p>{_PLACEHOLDER.format(index)}</p>" for index in pending)

        return result, ocr_texts
|
||||
@@ -0,0 +1,110 @@
|
||||
"""
|
||||
OCR Service Layer for MarkItDown
|
||||
Provides LLM Vision-based image text extraction.
|
||||
"""
|
||||
|
||||
import base64
|
||||
from typing import Any, BinaryIO
|
||||
from dataclasses import dataclass
|
||||
|
||||
from markitdown import StreamInfo
|
||||
|
||||
|
||||
@dataclass
|
||||
class OCRResult:
|
||||
"""Result from OCR extraction."""
|
||||
|
||||
text: str
|
||||
confidence: float | None = None
|
||||
backend_used: str | None = None
|
||||
error: str | None = None
|
||||
|
||||
|
||||
class LLMVisionOCRService:
    """Extracts text from images through an OpenAI-compatible vision model."""

    def __init__(
        self,
        client: Any,
        model: str,
        default_prompt: str | None = None,
    ) -> None:
        """
        Store the client, model name and the prompt used by default.

        Args:
            client: OpenAI-compatible client
            model: Model name (e.g., 'gpt-4o', 'gemini-2.0-flash')
            default_prompt: Prompt used when ``extract_text`` gets none;
                a built-in extraction prompt is used when this is empty.
        """
        self.client = client
        self.model = model
        if default_prompt:
            self.default_prompt = default_prompt
        else:
            self.default_prompt = (
                "Extract all text from this image. "
                "Return ONLY the extracted text, maintaining the original "
                "layout and order. Do not add any commentary or description."
            )

    def extract_text(
        self,
        image_stream: BinaryIO,
        prompt: str | None = None,
        stream_info: StreamInfo | None = None,
        **kwargs: Any,
    ) -> OCRResult:
        """
        Send the image to the model and return the text it reports.

        Never raises for failures inside the request path: any exception is
        captured in ``OCRResult.error`` with empty text. The stream is
        rewound to position 0 after every attempt.

        Args:
            image_stream: Seekable binary stream holding the image.
            prompt: Per-call prompt; falls back to ``self.default_prompt``.
            stream_info: Optional metadata; its ``mimetype`` is used for the
                data URI when set.
            **kwargs: Accepted but not used by this method.
        """
        if self.client is None:
            return OCRResult(
                text="",
                backend_used="llm_vision",
                error="LLM client not configured",
            )

        try:
            image_stream.seek(0)

            mime_type: str | None = stream_info.mimetype if stream_info else None

            if not mime_type:
                # No MIME type supplied: ask Pillow for the image format and
                # fall back to PNG when it is unavailable or cannot tell.
                try:
                    from PIL import Image

                    image_stream.seek(0)
                    detected = Image.open(image_stream).format
                    suffix = detected.lower() if detected else "png"
                    mime_type = f"image/{suffix}"
                except Exception:
                    mime_type = "image/png"

            image_stream.seek(0)
            encoded = base64.b64encode(image_stream.read()).decode("utf-8")

            user_content = [
                {"type": "text", "text": prompt or self.default_prompt},
                {
                    "type": "image_url",
                    "image_url": {"url": f"data:{mime_type};base64,{encoded}"},
                },
            ]
            response = self.client.chat.completions.create(
                model=self.model,
                messages=[{"role": "user", "content": user_content}],
            )

            extracted = response.choices[0].message.content
            cleaned = extracted.strip() if extracted else ""
            return OCRResult(text=cleaned, backend_used="llm_vision")
        except Exception as exc:
            return OCRResult(text="", backend_used="llm_vision", error=str(exc))
        finally:
            image_stream.seek(0)
|
||||
@@ -0,0 +1,422 @@
|
||||
"""
|
||||
Enhanced PDF Converter with OCR support for embedded images.
|
||||
Extracts images from PDFs and performs OCR while maintaining document context.
|
||||
"""
|
||||
|
||||
import io
|
||||
import sys
|
||||
from typing import Any, BinaryIO, Optional
|
||||
|
||||
from markitdown import DocumentConverter, DocumentConverterResult, StreamInfo
|
||||
from markitdown._exceptions import (
|
||||
MissingDependencyException,
|
||||
MISSING_DEPENDENCY_MESSAGE,
|
||||
)
|
||||
from ._ocr_service import LLMVisionOCRService
|
||||
|
||||
# Import dependencies
|
||||
_dependency_exc_info = None
|
||||
try:
|
||||
import pdfminer
|
||||
import pdfminer.high_level
|
||||
import pdfplumber
|
||||
from PIL import Image
|
||||
except ImportError:
|
||||
_dependency_exc_info = sys.exc_info()
|
||||
|
||||
|
||||
def _extract_images_from_page(page: Any) -> list[dict]:
|
||||
"""
|
||||
Extract images from a PDF page by rendering page regions.
|
||||
|
||||
Returns:
|
||||
List of dicts with 'stream', 'bbox', 'name', 'y_pos' keys
|
||||
"""
|
||||
images_info = []
|
||||
|
||||
try:
|
||||
# Try multiple methods to detect images
|
||||
images = []
|
||||
|
||||
# Method 1: Use page.images (standard approach)
|
||||
if hasattr(page, "images") and page.images:
|
||||
images = page.images
|
||||
|
||||
# Method 2: If no images found, try underlying PDF objects
|
||||
if not images and hasattr(page, "objects") and "image" in page.objects:
|
||||
images = page.objects.get("image", [])
|
||||
|
||||
# Method 3: Try filtering all objects for image types
|
||||
if not images and hasattr(page, "objects"):
|
||||
all_objs = page.objects
|
||||
for obj_type in all_objs.keys():
|
||||
if "image" in obj_type.lower() or "xobject" in obj_type.lower():
|
||||
potential_imgs = all_objs.get(obj_type, [])
|
||||
if potential_imgs:
|
||||
images = potential_imgs
|
||||
break
|
||||
|
||||
for i, img_dict in enumerate(images):
|
||||
try:
|
||||
# Try to get the actual image stream from the PDF
|
||||
img_stream = None
|
||||
y_pos = 0
|
||||
|
||||
# Method A: If img_dict has 'stream' key, use it directly
|
||||
if "stream" in img_dict and hasattr(img_dict["stream"], "get_data"):
|
||||
try:
|
||||
img_bytes = img_dict["stream"].get_data()
|
||||
|
||||
# Try to open as PIL Image to validate/decode
|
||||
pil_img = Image.open(io.BytesIO(img_bytes))
|
||||
|
||||
# Convert to RGB if needed (handle CMYK, etc.)
|
||||
if pil_img.mode not in ("RGB", "L"):
|
||||
pil_img = pil_img.convert("RGB")
|
||||
|
||||
# Save to stream as PNG
|
||||
img_stream = io.BytesIO()
|
||||
pil_img.save(img_stream, format="PNG")
|
||||
img_stream.seek(0)
|
||||
|
||||
y_pos = img_dict.get("top", 0)
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
# Method B: Fallback to rendering page region
|
||||
if img_stream is None:
|
||||
x0 = img_dict.get("x0", 0)
|
||||
y0 = img_dict.get("top", 0)
|
||||
x1 = img_dict.get("x1", 0)
|
||||
y1 = img_dict.get("bottom", 0)
|
||||
y_pos = y0
|
||||
|
||||
# Check if dimensions are valid
|
||||
if x1 <= x0 or y1 <= y0:
|
||||
continue
|
||||
|
||||
# Use pdfplumber's within_bbox to crop, then render
|
||||
# This preserves coordinate system correctly
|
||||
bbox = (x0, y0, x1, y1)
|
||||
cropped_page = page.within_bbox(bbox)
|
||||
|
||||
# Render at 150 DPI (balance between quality and size)
|
||||
page_img = cropped_page.to_image(resolution=150)
|
||||
|
||||
# Save to stream
|
||||
img_stream = io.BytesIO()
|
||||
page_img.original.save(img_stream, format="PNG")
|
||||
img_stream.seek(0)
|
||||
|
||||
if img_stream:
|
||||
images_info.append(
|
||||
{
|
||||
"stream": img_stream,
|
||||
"name": f"page_{page.page_number}_img_{i}",
|
||||
"y_pos": y_pos,
|
||||
}
|
||||
)
|
||||
|
||||
except Exception:
|
||||
continue
|
||||
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
return images_info
|
||||
|
||||
|
||||
class PdfConverterWithOCR(DocumentConverter):
    """
    Enhanced PDF Converter with OCR support for embedded images.

    Page text and OCR'd image text are interleaved by vertical position so
    the output follows the reading order of the page.
    """

    def __init__(self, ocr_service: Optional[LLMVisionOCRService] = None):
        """Store the OCR service used when none is passed to ``convert``."""
        super().__init__()
        self.ocr_service = ocr_service

    def accepts(
        self,
        file_stream: BinaryIO,
        stream_info: StreamInfo,
        **kwargs: Any,
    ) -> bool:
        """Return True for a ``.pdf`` extension or a PDF mimetype."""
        mimetype = (stream_info.mimetype or "").lower()
        extension = (stream_info.extension or "").lower()

        if extension == ".pdf":
            return True
        return mimetype.startswith(("application/pdf", "application/x-pdf"))

    def convert(
        self,
        file_stream: BinaryIO,
        stream_info: StreamInfo,
        **kwargs: Any,
    ) -> DocumentConverterResult:
        """
        Convert a PDF to markdown, OCR-ing embedded images when possible.

        Args:
            file_stream: Stream containing the PDF; read in full from offset 0
            stream_info: Stream metadata (not used by the conversion itself)
            **kwargs: May contain ``ocr_service`` to override the instance's

        Returns:
            DocumentConverterResult holding the markdown text

        Raises:
            MissingDependencyException: if the PDF dependencies failed to import
        """
        if _dependency_exc_info is not None:
            raise MissingDependencyException(
                MISSING_DEPENDENCY_MESSAGE.format(
                    converter=type(self).__name__,
                    extension=".pdf",
                    feature="pdf",
                )
            ) from _dependency_exc_info[1].with_traceback(
                _dependency_exc_info[2]
            )  # type: ignore[union-attr]

        # Get OCR service if available (from kwargs or instance)
        ocr_service: LLMVisionOCRService | None = (
            kwargs.get("ocr_service") or self.ocr_service
        )

        # Read PDF into BytesIO
        file_stream.seek(0)
        pdf_bytes = io.BytesIO(file_stream.read())

        markdown_content: list[str] = []
        # Page headers are always emitted, so emptiness has to be tracked
        # separately; otherwise the "empty" fallbacks below could never fire
        # for a PDF that has pages but no extractable content.
        has_body = False

        try:
            with pdfplumber.open(pdf_bytes) as pdf:
                for page_num, page in enumerate(pdf.pages, 1):
                    markdown_content.append(f"\n## Page {page_num}\n")
                    page_parts = self._page_markdown_parts(
                        page, page_num, pdf_bytes, ocr_service
                    )
                    markdown_content.extend(page_parts)
                    if any(part.strip() for part in page_parts):
                        has_body = True

                # Build final markdown
                markdown = (
                    "\n\n".join(markdown_content).strip() if has_body else ""
                )

                # Fallback to pdfminer if empty
                if not markdown:
                    pdf_bytes.seek(0)
                    markdown = pdfminer.high_level.extract_text(pdf_bytes)

        except Exception:
            # Fallback to pdfminer
            try:
                pdf_bytes.seek(0)
                markdown = pdfminer.high_level.extract_text(pdf_bytes)
            except Exception:
                markdown = ""

        # Final fallback: If still empty/whitespace and OCR is available,
        # treat as scanned PDF and OCR full pages
        if ocr_service and (not markdown or not markdown.strip()):
            pdf_bytes.seek(0)
            markdown = self._ocr_full_pages(pdf_bytes, ocr_service)

        return DocumentConverterResult(markdown=markdown)

    def _page_markdown_parts(
        self,
        page: Any,
        page_num: int,
        pdf_bytes: io.BytesIO,
        ocr_service: Optional[LLMVisionOCRService],
    ) -> list[str]:
        """Return one page's markdown fragments (without the page header)."""
        images_on_page = (
            self._extract_page_images(pdf_bytes, page_num) if ocr_service else []
        )

        if not images_on_page:
            # No OCR, or no images detected - just extract regular text
            text_content = (page.extract_text() or "").strip()
            return [text_content] if text_content else []

        # Text lines first, then OCR blocks; the stable sort below keeps text
        # ahead of an image that starts at the same Y position.
        positioned = [
            (line["y"], line["text"])
            for line in self._page_text_lines(page)
            if line["text"]
        ]
        for img_info in images_on_page:
            ocr_result = ocr_service.extract_text(img_info["stream"])
            if ocr_result.text.strip():
                positioned.append(
                    (
                        img_info["y_pos"],
                        f"\n\n*[Image OCR]\n{ocr_result.text}\n[End OCR]*\n",
                    )
                )

        # Sort all items by Y position (top to bottom)
        positioned.sort(key=lambda entry: entry[0])
        return [fragment for _, fragment in positioned]

    def _page_text_lines(self, page: Any) -> list[dict]:
        """Group the page's characters into lines tagged with a Y position.

        Returns a list of ``{"y": ..., "text": ...}`` dicts. When the page
        exposes no characters, the lines of ``extract_text()`` are used with
        synthetic, evenly spaced Y values.
        """
        chars = page.chars
        if not chars:
            # Fallback: use simple text extraction
            text_content = page.extract_text() or ""
            return [
                {"y": i * 10, "text": line}
                for i, line in enumerate(text_content.split("\n"))
            ]

        lines_with_y: list[dict] = []
        current_line: list[dict] = []
        current_y = None

        for char in sorted(chars, key=lambda c: (c["top"], c["x0"])):
            y = char["top"]
            if current_y is None:
                current_y = y
            elif abs(y - current_y) > 2:  # New line threshold
                if current_line:
                    lines_with_y.append(self._join_line(current_y, current_line))
                current_line = []
                current_y = y
            current_line.append(char)

        # Add last line
        if current_line:
            lines_with_y.append(self._join_line(current_y, current_line))
        return lines_with_y

    @staticmethod
    def _join_line(y: Any, line_chars: list[dict]) -> dict:
        """Build one text line from characters that share a Y band."""
        # Characters arrive ordered by `top` first, so glyphs whose tops
        # differ slightly within the band would otherwise be out of reading
        # order; re-order them left to right.
        ordered = sorted(line_chars, key=lambda c: c["x0"])
        return {"y": y, "text": "".join(c["text"] for c in ordered).strip()}

    def _extract_page_images(self, pdf_bytes: io.BytesIO, page_num: int) -> list[dict]:
        """
        Extract images from a PDF page using pdfplumber.

        The PDF is re-opened from ``pdf_bytes`` for this call. Errors are
        swallowed and yield an empty list.

        Args:
            pdf_bytes: PDF file as BytesIO
            page_num: Page number (1-indexed)

        Returns:
            List of image info dicts with 'stream', 'name', 'y_pos',
            sorted top to bottom
        """
        images = []

        try:
            pdf_bytes.seek(0)
            with pdfplumber.open(pdf_bytes) as pdf:
                if page_num <= len(pdf.pages):
                    page = pdf.pages[page_num - 1]  # 0-indexed
                    images = _extract_images_from_page(page)
        except Exception:
            pass

        # Sort by vertical position (top to bottom)
        images.sort(key=lambda x: x["y_pos"])

        return images

    @staticmethod
    def _ocr_page_markdown(
        img_stream: io.BytesIO, ocr_service: LLMVisionOCRService
    ) -> str:
        """OCR a rendered page image and wrap the outcome as a markdown block."""
        text = ocr_service.extract_text(img_stream).text.strip()
        if text:
            return f"*[Image OCR]\n{text}\n[End OCR]*"
        return "*[No text could be extracted from this page]*"

    def _ocr_full_pages(
        self, pdf_bytes: io.BytesIO, ocr_service: LLMVisionOCRService
    ) -> str:
        """
        Fallback for scanned PDFs: Convert entire pages to images and OCR them.
        Used when text extraction returns empty/whitespace results.

        Pages are rendered with pdfplumber; if pdfplumber fails as a whole,
        PyMuPDF (``fitz``) is tried instead. A failure on a single page is
        recorded inline and does not stop the remaining pages.

        Args:
            pdf_bytes: PDF file as BytesIO
            ocr_service: OCR service to use

        Returns:
            Markdown text extracted from OCR of full pages
        """
        markdown_parts: list[str] = []

        try:
            pdf_bytes.seek(0)
            with pdfplumber.open(pdf_bytes) as pdf:
                for page_num, page in enumerate(pdf.pages, 1):
                    try:
                        markdown_parts.append(f"\n## Page {page_num}\n")

                        # Render page to image
                        page_img = page.to_image(resolution=300)
                        img_stream = io.BytesIO()
                        page_img.original.save(img_stream, format="PNG")
                        img_stream.seek(0)

                        # Run OCR
                        markdown_parts.append(
                            self._ocr_page_markdown(img_stream, ocr_service)
                        )
                    except Exception as e:
                        markdown_parts.append(
                            f"*[Error processing page {page_num}: {str(e)}]*"
                        )

        except Exception:
            # pdfplumber failed (e.g. malformed EOF) — try PyMuPDF for rendering
            markdown_parts = []
            try:
                import fitz  # PyMuPDF

                pdf_bytes.seek(0)
                doc = fitz.open(stream=pdf_bytes.read(), filetype="pdf")
                try:
                    for page_num in range(1, doc.page_count + 1):
                        try:
                            markdown_parts.append(f"\n## Page {page_num}\n")
                            page = doc[page_num - 1]
                            mat = fitz.Matrix(300 / 72, 300 / 72)  # 300 DPI
                            pix = page.get_pixmap(matrix=mat)
                            img_stream = io.BytesIO(pix.tobytes("png"))
                            img_stream.seek(0)

                            markdown_parts.append(
                                self._ocr_page_markdown(img_stream, ocr_service)
                            )
                        except Exception as e:
                            markdown_parts.append(
                                f"*[Error processing page {page_num}: {str(e)}]*"
                            )
                finally:
                    # Release the document even if iteration is interrupted.
                    doc.close()
            except Exception:
                return "*[Error: Could not process scanned PDF]*"

        return "\n\n".join(markdown_parts).strip()
|
||||
@@ -0,0 +1,68 @@
|
||||
"""
|
||||
Plugin registration for markitdown-ocr.
|
||||
Registers OCR-enhanced converters with priority-based replacement strategy.
|
||||
"""
|
||||
|
||||
from typing import Any
|
||||
from markitdown import MarkItDown
|
||||
|
||||
from ._ocr_service import LLMVisionOCRService
|
||||
from ._pdf_converter_with_ocr import PdfConverterWithOCR
|
||||
from ._docx_converter_with_ocr import DocxConverterWithOCR
|
||||
from ._pptx_converter_with_ocr import PptxConverterWithOCR
|
||||
from ._xlsx_converter_with_ocr import XlsxConverterWithOCR
|
||||
|
||||
|
||||
# Plugin interface version declared by this module.
# NOTE(review): presumably read by MarkItDown's plugin loader — confirm.
__plugin_interface_version__ = 1
|
||||
|
||||
|
||||
def register_converters(markitdown: MarkItDown, **kwargs: Any) -> None:
    """
    Register the OCR-enhanced PDF, DOCX, PPTX and XLSX converters.

    Every converter is registered with priority -1.0 so that it is consulted
    ahead of the built-in converters (priority 0.0), which effectively
    replaces them while the plugin is enabled.

    Args:
        markitdown: MarkItDown instance to register converters with
        **kwargs: Additional keyword arguments that may include:
            - llm_client: OpenAI-compatible client for LLM-based OCR (required for OCR to work)
            - llm_model: Model name (e.g., 'gpt-4o')
            - llm_prompt: Custom prompt for text extraction
    """
    # The OCR service reuses the llm_client/llm_model kwargs that MarkItDown
    # itself already accepts for image descriptions. It is only created when
    # both are supplied; otherwise the converters receive None.
    client = kwargs.get("llm_client")
    model = kwargs.get("llm_model")
    prompt = kwargs.get("llm_prompt")

    ocr_service: LLMVisionOCRService | None = None
    if client and model:
        ocr_service = LLMVisionOCRService(
            client=client,
            model=model,
            default_prompt=prompt,
        )

    # Priority -1.0 sorts before the built-ins at 0.0.
    priority = -1.0
    converter_classes = (
        PdfConverterWithOCR,
        DocxConverterWithOCR,
        PptxConverterWithOCR,
        XlsxConverterWithOCR,
    )
    for converter_cls in converter_classes:
        # Each converter gets the shared OCR service through its constructor.
        markitdown.register_converter(
            converter_cls(ocr_service=ocr_service), priority=priority
        )
|
||||
@@ -0,0 +1,249 @@
|
||||
"""
|
||||
Enhanced PPTX Converter with improved OCR support.
|
||||
LLM-based image description is already supported; this converter adds an OCR fallback for when it is unavailable or fails.
|
||||
"""
|
||||
|
||||
import io
|
||||
import sys
|
||||
from typing import Any, BinaryIO, Optional
|
||||
|
||||
from typing import BinaryIO, Any, Optional
|
||||
|
||||
from markitdown.converters import HtmlConverter
|
||||
from markitdown import DocumentConverter, DocumentConverterResult, StreamInfo
|
||||
from markitdown._exceptions import (
|
||||
MissingDependencyException,
|
||||
MISSING_DEPENDENCY_MESSAGE,
|
||||
)
|
||||
from ._ocr_service import LLMVisionOCRService
|
||||
|
||||
_dependency_exc_info = None
|
||||
try:
|
||||
import pptx
|
||||
except ImportError:
|
||||
_dependency_exc_info = sys.exc_info()
|
||||
|
||||
|
||||
class PptxConverterWithOCR(DocumentConverter):
    """Enhanced PPTX Converter with OCR fallback."""

    def __init__(self, ocr_service: Optional[LLMVisionOCRService] = None):
        """Store the OCR service used when none is passed to ``convert``."""
        super().__init__()
        self._html_converter = HtmlConverter()
        self.ocr_service = ocr_service

    def accepts(
        self,
        file_stream: BinaryIO,
        stream_info: StreamInfo,
        **kwargs: Any,
    ) -> bool:
        """Return True for a ``.pptx`` extension or a presentationml mimetype."""
        mimetype = (stream_info.mimetype or "").lower()
        extension = (stream_info.extension or "").lower()

        if extension == ".pptx":
            return True
        return mimetype.startswith(
            "application/vnd.openxmlformats-officedocument.presentationml"
        )

    def convert(
        self,
        file_stream: BinaryIO,
        stream_info: StreamInfo,
        **kwargs: Any,
    ) -> DocumentConverterResult:
        """
        Convert a PPTX presentation to markdown.

        Shapes on each slide are emitted top-to-bottom, left-to-right. For
        pictures an LLM caption is tried first and OCR is used as fallback.

        Args:
            file_stream: Stream containing the presentation
            stream_info: Stream metadata (not used by the conversion itself)
            **kwargs: May contain ``ocr_service``, ``llm_client``,
                ``llm_model`` and ``llm_prompt``

        Returns:
            DocumentConverterResult holding the markdown text

        Raises:
            MissingDependencyException: if python-pptx failed to import
        """
        if _dependency_exc_info is not None:
            raise MissingDependencyException(
                MISSING_DEPENDENCY_MESSAGE.format(
                    converter=type(self).__name__,
                    extension=".pptx",
                    feature="pptx",
                )
            ) from _dependency_exc_info[1].with_traceback(
                _dependency_exc_info[2]
            )  # type: ignore[union-attr]

        # Get OCR service (from kwargs or instance)
        ocr_service: Optional[LLMVisionOCRService] = (
            kwargs.get("ocr_service") or self.ocr_service
        )

        presentation = pptx.Presentation(file_stream)
        md_content = ""

        for slide_num, slide in enumerate(presentation.slides, 1):
            # Real newlines here: the escaped form ("\\n") would put a
            # literal backslash-n into the generated markdown.
            md_content += f"\n\n<!-- Slide number: {slide_num} -->\n"

            title = slide.shapes.title
            for shape in sorted(slide.shapes, key=self._position_key):
                md_content += self._shape_markdown(shape, title, ocr_service, kwargs)

            md_content = md_content.strip()

            if slide.has_notes_slide:
                md_content += "\n\n### Notes:\n"
                notes_frame = slide.notes_slide.notes_text_frame
                if notes_frame is not None:
                    md_content += notes_frame.text
                md_content = md_content.strip()

        return DocumentConverterResult(markdown=md_content.strip())

    @staticmethod
    def _position_key(shape: Any) -> tuple:
        """Sort key ordering shapes by top then left; unset positions sort first."""
        return (
            float("-inf") if not shape.top else shape.top,
            float("-inf") if not shape.left else shape.left,
        )

    def _shape_markdown(
        self,
        shape: Any,
        title: Any,
        ocr_service: Optional[LLMVisionOCRService],
        options: dict,
    ) -> str:
        """Return the markdown for one shape, recursing into group shapes.

        ``options`` is the keyword-argument dict given to ``convert``. It is
        passed as a plain dict so its keys (e.g. ``ocr_service``) cannot
        collide with this method's own parameters.
        """
        parts: list[str] = []

        # Pictures
        if self._is_picture(shape):
            parts.append(self._picture_markdown(shape, ocr_service, options))

        # Tables
        if self._is_table(shape):
            parts.append(self._convert_table_to_markdown(shape.table, **options))

        # Charts
        if shape.has_chart:
            parts.append(self._convert_chart_to_markdown(shape.chart))
        # Text areas
        elif shape.has_text_frame:
            if shape == title:
                parts.append("# " + shape.text.lstrip() + "\n")
            else:
                parts.append(shape.text + "\n")

        # Group Shapes
        if shape.shape_type == pptx.enum.shapes.MSO_SHAPE_TYPE.GROUP:
            for subshape in sorted(shape.shapes, key=self._position_key):
                parts.append(
                    self._shape_markdown(subshape, title, ocr_service, options)
                )

        return "".join(parts)

    def _picture_markdown(
        self,
        shape: Any,
        ocr_service: Optional[LLMVisionOCRService],
        options: dict,
    ) -> str:
        """Describe a picture via LLM caption, falling back to OCR.

        Returns an ``*[Image OCR] ... [End OCR]*`` block, or an empty string
        when neither path produced any text.
        """
        # Get image data
        image_stream = io.BytesIO(shape.image.blob)
        llm_client = options.get("llm_client")

        # Try LLM description first if available
        llm_description = ""
        if llm_client and options.get("llm_model"):
            try:
                from ._llm_caption import llm_caption

                image_filename = shape.image.filename
                image_extension = None
                if image_filename:
                    import os

                    image_extension = os.path.splitext(image_filename)[1]

                image_stream_info = StreamInfo(
                    mimetype=shape.image.content_type,
                    extension=image_extension,
                    filename=image_filename,
                )

                llm_description = llm_caption(
                    image_stream,
                    image_stream_info,
                    client=llm_client,
                    model=options.get("llm_model"),
                    prompt=options.get("llm_prompt"),
                )
            except Exception:
                # Best effort: any captioning failure falls through to OCR.
                pass

        # Try OCR if LLM failed or not available
        ocr_text = ""
        if not llm_description and ocr_service:
            try:
                image_stream.seek(0)
                ocr_result = ocr_service.extract_text(image_stream)
                if ocr_result.text.strip():
                    ocr_text = ocr_result.text.strip()
            except Exception:
                pass

        # Format extracted content using unified OCR block format
        content = (llm_description or ocr_text or "").strip()
        if content:
            return f"\n*[Image OCR]\n{content}\n[End OCR]*\n"
        return ""

    def _is_picture(self, shape):
        """Return True for picture shapes and placeholders exposing ``image``."""
        if shape.shape_type == pptx.enum.shapes.MSO_SHAPE_TYPE.PICTURE:
            return True
        if shape.shape_type == pptx.enum.shapes.MSO_SHAPE_TYPE.PLACEHOLDER:
            if hasattr(shape, "image"):
                return True
        return False

    def _is_table(self, shape):
        """Return True when the shape is a table."""
        return shape.shape_type == pptx.enum.shapes.MSO_SHAPE_TYPE.TABLE

    def _convert_table_to_markdown(self, table, **kwargs):
        """Render a table as markdown via an intermediate HTML table.

        The first row is emitted as header cells; cell text is HTML-escaped.
        """
        import html

        html_table = "<html><body><table>"
        first_row = True
        for row in table.rows:
            html_table += "<tr>"
            cell_tag = "th" if first_row else "td"
            for cell in row.cells:
                html_table += f"<{cell_tag}>" + html.escape(cell.text) + f"</{cell_tag}>"
            html_table += "</tr>"
            first_row = False
        html_table += "</table></body></html>"

        return (
            self._html_converter.convert_string(html_table, **kwargs).markdown.strip()
            + "\n"
        )

    def _convert_chart_to_markdown(self, chart):
        """Render a chart's categories and series as a markdown table.

        Always returns a string; any failure while reading the chart yields
        an ``[unsupported chart]`` marker.
        """
        try:
            md = "\n\n### Chart"
            if chart.has_title:
                md += f": {chart.chart_title.text_frame.text}"
            md += "\n\n"

            category_names = [c.label for c in chart.plots[0].categories]
            series_names = [s.name for s in chart.series]
            data = [["Category"] + series_names]

            for idx, category in enumerate(category_names):
                row = [category]
                for series in chart.series:
                    row.append(series.values[idx])
                data.append(row)

            markdown_table = ["| " + " | ".join(map(str, row)) + " |" for row in data]
            header = markdown_table[0]
            separator = "|" + "|".join(["---"] * len(data[0])) + "|"
            return md + "\n".join([header, separator] + markdown_table[1:])
        except Exception:
            # Covers "unsupported plot type" ValueErrors and everything else.
            # Previously a ValueError with a different message fell through
            # and returned None, which broke the caller's string
            # concatenation.
            return "\n\n[unsupported chart]\n\n"
|
||||
@@ -0,0 +1,225 @@
|
||||
"""
|
||||
Enhanced XLSX Converter with OCR support for embedded images.
|
||||
Extracts images from Excel spreadsheets and performs OCR while maintaining cell context.
|
||||
"""
|
||||
|
||||
import io
|
||||
import sys
|
||||
from typing import Any, BinaryIO, Optional
|
||||
|
||||
from markitdown.converters import HtmlConverter
|
||||
from markitdown import DocumentConverter, DocumentConverterResult, StreamInfo
|
||||
from markitdown._exceptions import (
|
||||
MissingDependencyException,
|
||||
MISSING_DEPENDENCY_MESSAGE,
|
||||
)
|
||||
from ._ocr_service import LLMVisionOCRService
|
||||
|
||||
# Try loading dependencies
|
||||
_xlsx_dependency_exc_info = None
|
||||
try:
|
||||
import pandas as pd
|
||||
from openpyxl import load_workbook
|
||||
except ImportError:
|
||||
_xlsx_dependency_exc_info = sys.exc_info()
|
||||
|
||||
|
||||
class XlsxConverterWithOCR(DocumentConverter):
    """
    XLSX converter that can also OCR images embedded in worksheets.

    Every sheet is rendered as a markdown table; when an OCR service is
    available, the text found in the sheet's images is appended after it.
    """

    def __init__(self, ocr_service: Optional[LLMVisionOCRService] = None):
        """Store the OCR service used when none is passed to ``convert``."""
        super().__init__()
        self._html_converter = HtmlConverter()
        self.ocr_service = ocr_service

    def accepts(
        self,
        file_stream: BinaryIO,
        stream_info: StreamInfo,
        **kwargs: Any,
    ) -> bool:
        """Return True for a ``.xlsx`` extension or a spreadsheetml mimetype."""
        extension = (stream_info.extension or "").lower()
        if extension == ".xlsx":
            return True

        mimetype = (stream_info.mimetype or "").lower()
        if mimetype.startswith(
            "application/vnd.openxmlformats-officedocument.spreadsheetml"
        ):
            return True
        return False

    def convert(
        self,
        file_stream: BinaryIO,
        stream_info: StreamInfo,
        **kwargs: Any,
    ) -> DocumentConverterResult:
        """
        Convert a workbook to markdown, with image OCR when a service exists.

        Raises:
            MissingDependencyException: if pandas/openpyxl failed to import
        """
        if _xlsx_dependency_exc_info is not None:
            raise MissingDependencyException(
                MISSING_DEPENDENCY_MESSAGE.format(
                    converter=type(self).__name__,
                    extension=".xlsx",
                    feature="xlsx",
                )
            ) from _xlsx_dependency_exc_info[1].with_traceback(
                _xlsx_dependency_exc_info[2]
            )  # type: ignore[union-attr]

        # An OCR service passed per call wins over the instance's.
        ocr_service: Optional[LLMVisionOCRService] = (
            kwargs.get("ocr_service") or self.ocr_service
        )

        if not ocr_service:
            return self._convert_standard(file_stream, **kwargs)

        # ocr_service is handed over positionally, so it must not also be
        # present in the forwarded keyword arguments.
        forwarded = {
            key: value for key, value in kwargs.items() if key != "ocr_service"
        }
        return self._convert_with_ocr(file_stream, ocr_service, **forwarded)

    def _convert_standard(
        self, file_stream: BinaryIO, **kwargs: Any
    ) -> DocumentConverterResult:
        """Render every sheet as a markdown table, without OCR."""
        file_stream.seek(0)
        sheets = pd.read_excel(file_stream, sheet_name=None, engine="openpyxl")

        pieces = []
        for sheet_name, frame in sheets.items():
            pieces.append(f"## {sheet_name}\n")
            table_html = frame.to_html(index=False)
            table_md = self._html_converter.convert_string(
                table_html, **kwargs
            ).markdown.strip()
            pieces.append(table_md + "\n\n")

        return DocumentConverterResult(markdown="".join(pieces).strip())

    def _convert_with_ocr(
        self, file_stream: BinaryIO, ocr_service: LLMVisionOCRService, **kwargs: Any
    ) -> DocumentConverterResult:
        """Render every sheet as a table followed by its images' OCR text."""
        file_stream.seek(0)
        workbook = load_workbook(file_stream)

        pieces = []
        for sheet_name in workbook.sheetnames:
            worksheet = workbook[sheet_name]
            pieces.append(f"## {sheet_name}\n\n")

            # Sheet data as a markdown table; a sheet that pandas cannot read
            # contributes no table.
            file_stream.seek(0)
            try:
                frame = pd.read_excel(
                    file_stream, sheet_name=sheet_name, engine="openpyxl"
                )
                table_md = self._html_converter.convert_string(
                    frame.to_html(index=False), **kwargs
                ).markdown.strip()
                pieces.append(table_md + "\n\n")
            except Exception:
                pass

            # OCR text of the images placed on this sheet.
            ocr_entries = self._extract_and_ocr_sheet_images(worksheet, ocr_service)
            if ocr_entries:
                pieces.append("### Images in this sheet:\n\n")
                pieces.extend(
                    f"*[Image OCR]\n{entry['ocr_text']}\n[End OCR]*\n\n"
                    for entry in ocr_entries
                )

        return DocumentConverterResult(markdown="".join(pieces).strip())

    def _extract_and_ocr_sheet_images(
        self, sheet: Any, ocr_service: LLMVisionOCRService
    ) -> list[dict]:
        """
        OCR every image attached to a worksheet.

        Images that cannot be read or OCR'd are skipped, as are images whose
        OCR text is empty.

        Args:
            sheet: openpyxl worksheet
            ocr_service: OCR service

        Returns:
            List of dicts with 'cell_ref', 'ocr_text' and 'backend'
        """
        collected = []

        try:
            for img in getattr(sheet, "_images", ()):
                try:
                    # Image bytes: `_data()` when present, else the `image`
                    # attribute (some versions store it differently).
                    if hasattr(img, "_data"):
                        raw = img._data()
                    elif hasattr(img, "image"):
                        raw = img.image
                    else:
                        continue

                    image_stream = io.BytesIO(raw)
                    cell_ref = self._anchor_cell_ref(img)

                    ocr_result = ocr_service.extract_text(image_stream)
                    extracted = ocr_result.text.strip()
                    if not extracted:
                        continue

                    collected.append(
                        {
                            "cell_ref": cell_ref,
                            "ocr_text": extracted,
                            "backend": ocr_result.backend_used,
                        }
                    )
                except Exception:
                    continue
        except Exception:
            pass

        return collected

    def _anchor_cell_ref(self, img: Any) -> str:
        """Return the A1-style reference of the image's anchor cell, or "unknown"."""
        anchor = getattr(img, "anchor", None)
        if not hasattr(img, "anchor") or not hasattr(anchor, "_from"):
            return "unknown"

        from_cell = anchor._from
        if not (hasattr(from_cell, "col") and hasattr(from_cell, "row")):
            return "unknown"

        # Anchor column and row are zero-based here.
        col_letter = self._column_number_to_letter(from_cell.col)
        return f"{col_letter}{from_cell.row + 1}"

    @staticmethod
    def _column_number_to_letter(n: int) -> str:
        """Convert a 0-indexed column number to its Excel column letters."""
        letters = ""
        remaining = n + 1  # Make 1-indexed
        while remaining > 0:
            remaining, offset = divmod(remaining - 1, 26)
            letters = chr(65 + offset) + letters
        return letters
|
||||
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
@@ -0,0 +1,79 @@
|
||||
%PDF-1.3
|
||||
%“Œ‹ž ReportLab Generated PDF document http://www.reportlab.com
|
||||
1 0 obj
|
||||
<<
|
||||
/F1 2 0 R
|
||||
>>
|
||||
endobj
|
||||
2 0 obj
|
||||
<<
|
||||
/BaseFont /Helvetica /Encoding /WinAnsiEncoding /Name /F1 /Subtype /Type1 /Type /Font
|
||||
>>
|
||||
endobj
|
||||
3 0 obj
|
||||
<<
|
||||
/BitsPerComponent 8 /ColorSpace /DeviceRGB /Filter [ /ASCII85Decode /FlateDecode ] /Height 80 /Length 4282 /Subtype /Image
|
||||
/Type /XObject /Width 400
|
||||
>>
|
||||
stream
|
||||
Gb"/k$+*^]+31jd1_Sc48j,Pi+@:`R01h=9+]FPXQDmE0%*Lb4@[Wi36jU!;cssJbQ5,g%R?K'+$#.h<qu?Z`Dn#2Gqj`$$\bE9$XS)%of4Vd>cT_6mF8#7^^Y_6P]N!%L#U+j463n`f&4-XGKFgHU+bUCn#U+j463n`f&4-XGKFgHU+bUCn#U+j463n`f&4-XGKFgHU+bUCn#U+j463n`f&4-XGKFgHU+bUCn#U+j463n`f&4-XGKFgHU+bUCn#U+j4Z3bU/Gm9s<T86G'Ht,"?C)(`3j[rI\Y+2%=-fXJAVq>eJc&9=)%mQH&Yh<W#b:QHSf&hc5bR6(7?1RO+2U+,2j=JTJHV/n1*m;JAGbDu[IX.pg30l0S'.0*($<'u;[b/GRDJ[J=c-W0HaX>?jIi$uR^]%u?lJ6Z*VV,Z28T=.3[G"!N]2!6iqW[_CVOQZ9Um#Qd)&t%d!r@Y0=g5[M*c.,qcc"UaVkc?<W;kud8W>KHVDN2.L1\=-s4#kKB4PQPI/e#*[DZR^Y$^Xi6K`(0^so>.#p<K8pN:ein%PZ#Y7*R@MH861DpLo<$rl6(H_8?WBC8!u2*l:OF52PthFZhN<gX0$3^m^nt74tEo^W)lR[oP32CBgdV=rlCn/?MttP@YH\'hL=3DKjK>7hgY.jk*u=UQX(s.K*Fd*WL[eV0^25,*fS$V)Q-Z/Ii`SFMMDe;iM2HMUsg37cCLf8L+b)KY6rDWN#jR5YQ.1R1gba'M2,4kCR4578^=b\Bn#r]R"h8?'u=7,fh/#GD*$m:^@g'YaL,g&OD*(\9V3qM@J4qIp#mRhYee0oG3^KQSpS`k(a)%\\KrFo]NtW+D/curY.W3&31Xd58H(q0_cISK<s@\A@tq[3b*;)prprplo?NNWP"U[H"mT)":p2cCY.e)nGK>lIaB)]QNP9.mWE2l3Yi$/1lIFGq?b"J/%A'=e_4!7DM'qA.H+Eb6+$Wn7]h3.+R^:;FLJDKh6Z0V`KM>R1?7!q\hg`Vs6PqO%XsU&A'e9Y:\qjB8:9X.p&5omkN]TihU"VSM0Gdu%IekLW7Z.T+=gZ8?G+)N`8D1/:))EVV'V%>@o^?^e2`FI#RXkRcVk5<aX)Gb<Anp($Eo(tDRA['co9=J!r?\(k+3obVpgT(rh)[&!*=m"?fb<WboEW]&a*9n`H`'s&IkQkBo"rK"ncnu1$k!hAk*UR=.2DO6/^buFp=jJKa-\QsgioA9CY[S7mOaK!FH$moNAmYB\)0A"En%&3d-/qKIR!Etjt(g9!ZEncRkEWkI'W[)AEhgn9/$+_o'-r&1h\!<m^U0S31\a,_&Rd"r.'Np%gT9f8?\3>1O$"8"QQu<lL!5$/k#`#:KEulTq4-7/YE)\g=Sb$OCcUO)BHRhQu09oJWIZkkosFc_B+&7'c_j!")cW%]@9&7fl>'od45V^,W"[b:mfhGls_c]o[:8?WXO3sS%ABFnD/;VaJp_j5H4#BX4qPO#9Rd2UQp3!11,Hp.<#+W;VjHWUp(CD>tmalGRY\Uq<)U7[bb1;!ICRCfbRQSW`R7B!G^;uZXfo5`5U7D;,E%89G,#:1)%4QDA%S)!5IL>R;C=R4rV;kBDYiJMaSpYGjR@-K1X!l^X,hul.*@fk/SRgZtX.(?1#F1Z,3(>l3p>i<Pf+sbdtG`]h4ZQR\8ke79"3MknReA"?c^RDe@Fjk1.cu5MjEDpdNhJ+7mGf2KIY:S*Y\2Cs77pae4B]4nt\0_9hN"cX*)6Y95CMGu-b.h<l/f)o087<L+.ZYQN\"^nJd(2$BIfm6^Z>hKjO-5DXhZKVbK,R$;#1+h5rhZ?WW+cIfDMR5M:U;WJk^U1M2V=1pp;4^.2,/RU"b7N@$b8R4LOO?H"DR.Lf`L[*m,BTYmDZ_t`L-M$_)#8#p!)[O706GPi_l#Yq>cO^MHRc(Jp:hO`,H*Y]jp")!6$Iu21q$\8nLN&Ju<?TEli:_c^Fu;7mar?jW@Fi5=&+@XX3Du$Vp!Z
:kp'-MBe4(Gq5273Z*<l$oQj,ndL:>:,=6H/*LPHo45Js7W8j$_!Qm0FH1P&^"`>@W4%?`Nma<X,sJlXF*,/9?-'cJp]Gl[CD(*jN88AiD,rcf:jl=)$?G1A+QH`L1Y,qGh381N!)?4VfakRqR\de*W_P5=i_rQ88,Nf"08ju'!L3:gtBn9tR`<1O'UehuL-ao(I9mcdD[iu:\EjK;,iTiXhVd0(hgkW_rte\s*ID1Wu(.MjQ`-_-:KRW+1tA<S?3r8>E^)_qfq#N;4tr+%k&Ep8k#92@_4?NnV=N@"8F,!hg:if"abZSI*B&dFMB&j8pk=5i_MJAeY/_a-bBH!b7VKr\Kt#C"Ke<_A>`"`=AC>VJ=jpNj/XAJ.8N&11/:hfIr$D^^R2#qRLKK:(9GU8"CB@_;$5Fq-q:K0TBPN]^2`GM'aEs1Y+T=D'>N2JXWoc8.%IYO^gsm'1RJSeGm+YDRQhLku5aKi&&h'k:Ae':8oK<la[fL\k0;fH3(LIfJts]t<l4*,ri:knWWe!M_E[M,&V9JH2`"=)ml_1[8!OOU7V,rHd]X#^@U_hK>1_Fu*NH]a>^r>**\J#14;Ei@8Dd[B!VZ.j64i(icM@UQ_>]1i+QL[q8@sXNl,qq<0pH2r<c]E5`R>K@bgt+3u4X[=5N,XXpe$Pa+h/i2Ns+!9@kBH_P,uQG__S.W7M^frRPr4EZHW;p0Je?#:'3`%IWs^jMgsS>TFs]-96.iKS'H_`---RRk+q]Jr]FS(In4Pq-F!6Cm%,U[%@0.OI2<<)q%YS\L]"SQrA8jisi-Yc]j.NcUR5eZO4@bV<6:Q<7Y8Tbc.:)0RB[f;uae0#hXi-F,V+Y7!Mj#7a2'<d>UX7up@?R.l5hdJ`J2qIRW9l3nLb6mCBmOi<W\odW.t='rA%`7sRbXB5/RD_LA/<@gLr;i'i3jlV::Z3F&:]ir"sAd&Y6P"h>gnWA-O?D51eitk>F^2j&Iq6CcN2Ju0jXH_V;7Z"7$/f.cVY>Mu"+'&]*\$$EFH_au5?=QCNV/dCcC.k5.]`boT#$n8q"$7k7cbB=_S?6!sI(ERNS%rY/q#(V&?"M=dPp_pD^a<mS>iJ84-qUUOnpsEBD(@=c8&j(fD<_iW8Y:1]3'Z*lk814$BMEn>20Z3q9`%2[odf^kVG8_KfBHJTq!iP-bZf!WUjfi-Z(mjNh$1Mk%I4bUXT_KbmDgHtQ7Z[%/U=`ol;d(+7MLe^9J.%pG(>?Z7,R7p2_!_Qbsrj-nZ^jp1P_<pXaK9!2C;b6ck"=Cj_ThjdJFfoo]T[$FN[^)H53%_>QETt1O4#d3il&h>-]FM?E7.3BltXHM-bbl_r^C;;uMGdf.Kh%L(0?a^%V$SMIKn-g/OBA,Ng_8qOt.G4*;07b-d&^'[LU$f5ngd%r-XNimO'c=1SVor0:Eg?<1-k=*lR5.^@!L"%EH/XBn&hq=*'_o;%t#(A>I6JN':Wh5=&pRCU'1C"15l6HQiH<#l)E>c9A33g31NEH\$h]o'o:W53E#msr(FBMb0g*jP1nCIbQ^<-?M19Kr3mq8.j:>;q*:p4Rb"@"DU#`i.DU&`=Vn-ANGOK'T46_'jF^$R0`j>ib(E*\_<8o*cItM:B3D-9Z>Of29HcT0]Z'G'co.PNW`2:qpYXp0-36TIRP-&3V+PPe>^kkuHt*7[/f`Z?74q^`DXV.TS7@]I@7J7#?[&(&hPL%\629`r50o^;oKq?P9#!l9@Fff9p3njK2nUHBg!&A`c[uXD61%4M,a"/_P#gZUo)#L[uI,Q>:BQkk3P?Scmo]DXk])TLK"NX2u"><@[CElgT\uF2.fcn<iiPL)@TrV2\AYDo>2%@`(OZ<M6L#'7K_ZStJZ)]&Fp39s]tR`?'J?rE-I11YEH*I?3FE.#D8]B:lU#l-Q&"X6RDb@GL2>K[lYeY=buQU?HWK8[#]q-;`G(]:<Ao2d9eEHWd<E81SK4JM$5dbM`T<KnN,YjlTi4>kV>d%&i?1&P=
:i,4>V2MnI*kV+_s8='X"H,gcL;Uo:%-"-M]-mmX/gFJ;bSiNq;:Y3_r5g<a"7!Y]Bk,;T:p3c2CBn/b6lYENkm?LZ[fW1tg10cT`#9kR&4-XGKFgHU+bUCn#U+j463n`f&4-XGKFgHU+bUCn#U+j463n`f&4-XGKFgHU+bUCn#U+j463n`f&4-XGKFgHU+bUCn#U+j46GWU$GHRA>~>endstream
|
||||
endobj
|
||||
4 0 obj
|
||||
<<
|
||||
/Contents 8 0 R /MediaBox [ 0 0 612 792 ] /Parent 7 0 R /Resources <<
|
||||
/Font 1 0 R /ProcSet [ /PDF /Text /ImageB /ImageC /ImageI ] /XObject <<
|
||||
/FormXob.0315aed9f6006a101b3226a3b7404028 3 0 R
|
||||
>>
|
||||
>> /Rotate 0 /Trans <<
|
||||
|
||||
>>
|
||||
/Type /Page
|
||||
>>
|
||||
endobj
|
||||
5 0 obj
|
||||
<<
|
||||
/PageMode /UseNone /Pages 7 0 R /Type /Catalog
|
||||
>>
|
||||
endobj
|
||||
6 0 obj
|
||||
<<
|
||||
/Author (anonymous) /CreationDate (D:20260126172022+01'00') /Creator (ReportLab PDF Library - www.reportlab.com) /Keywords () /ModDate (D:20260126172022+01'00') /Producer (ReportLab PDF Library - www.reportlab.com)
|
||||
/Subject (unspecified) /Title (untitled) /Trapped /False
|
||||
>>
|
||||
endobj
|
||||
7 0 obj
|
||||
<<
|
||||
/Count 1 /Kids [ 4 0 R ] /Type /Pages
|
||||
>>
|
||||
endobj
|
||||
8 0 obj
|
||||
<<
|
||||
/Filter [ /ASCII85Decode /FlateDecode ] /Length 260
|
||||
>>
|
||||
stream
|
||||
Gas3/9kseb&-h'>I`6Z84fgHmCc;"L7g6_e&889#h,kA$Zt,m0Hdcho6>O[sLZ+YF+:QDRLY`5CAhdUI=MeslW_fp84Bms2r(UspMdQW.jtWA9rW?q[M1*5b[XIYc1kOQ$55sEf7La^q2$a/'T.)S#<V#*e,['$SVK^(f9:,Nq;AW\a?Zt7p:RM+pHF)-4F;E;l5ui'$5;T>HA_.,@?H2a/)Ol=NY+4r->>:n6'/ubPg6GC78<Gb)GJls9>QKuE<U0~>endstream
|
||||
endobj
|
||||
xref
|
||||
0 9
|
||||
0000000000 65535 f
|
||||
0000000073 00000 n
|
||||
0000000104 00000 n
|
||||
0000000211 00000 n
|
||||
0000004683 00000 n
|
||||
0000004939 00000 n
|
||||
0000005007 00000 n
|
||||
0000005303 00000 n
|
||||
0000005362 00000 n
|
||||
trailer
|
||||
<<
|
||||
/ID
|
||||
[<5d5eceaa0d906ef66e559ebcd616f18d><5d5eceaa0d906ef66e559ebcd616f18d>]
|
||||
% ReportLab generated PDF document -- digest (http://www.reportlab.com)
|
||||
|
||||
/Info 6 0 R
|
||||
/Root 5 0 R
|
||||
/Size 9
|
||||
>>
|
||||
startxref
|
||||
5712
|
||||
%%EOF
|
||||
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
@@ -0,0 +1,79 @@
|
||||
%PDF-1.3
|
||||
%“Œ‹ž ReportLab Generated PDF document http://www.reportlab.com
|
||||
1 0 obj
|
||||
<<
|
||||
/F1 2 0 R
|
||||
>>
|
||||
endobj
|
||||
2 0 obj
|
||||
<<
|
||||
/BaseFont /Helvetica /Encoding /WinAnsiEncoding /Name /F1 /Subtype /Type1 /Type /Font
|
||||
>>
|
||||
endobj
|
||||
3 0 obj
|
||||
<<
|
||||
/BitsPerComponent 8 /ColorSpace /DeviceRGB /Filter [ /ASCII85Decode /FlateDecode ] /Height 100 /Length 4720 /Subtype /Image
|
||||
/Type /XObject /Width 500
|
||||
>>
|
||||
stream
|
||||
Gb"0VH#OJ:qoA4A'Hn[Z$4u82K`j4ZR%PRX#F(qaK&V?K8<b&(;#dar'M8Oni!CfV<*>q$(4m``.P7J2`EVk#7#VtC6:(*s$)76DC7esMC3FfEP21e=J%f9>Up;d>4l&7WcZJp*DIk*ozzzzzzzzzzzzzzzzzzzzzzz!!#9dqY/lsQRl8pCtPt8mFiS/o[0Y;WL90Bj2R)5Y[N/Gfn0f!(^fES:Hsio97D?(Xn?5jq!"]KU?6X;&P"ZofW]f$p(q"VdAg3IP#\)UWg`=MO$9TD0>@3js(id(lnKeb'nVdJ7e;beRl:iu3cs8nIEn"+F\0s:]mE81*hAmoIah4b[;OfHn`%MZXAic<H:=+SS^nBEXOMIOI2D3Y<1k'jGjr"Mb7m#8djq#sB[J"W0CSVhD_EOgna,8=^]'+;Q/-Q29o7K`^M3`IrI+P7M:JgD-;8@g?QuaS(L!S'NU#&pVmQm&;+ooep7=EoT&nD(?bbsoCn2i[<[UJujmhj_M<u#m'ah(clITBmFV`P'C`bB@K_BHIO[n\%kN(^*?b\d]&LhJ.U*@&CM]/WZY.jbti9?:_k*Y>(J)7jH@XF(Q5('jYY[u"DM\[nu[VacM!sePdg%40X+-%@'f%P<0baG`E4oP$%Q/JgWmY\Um466eV$?Y)Q9)nh\c]Mq3f`',ShaFWth-oBcOd=q3cTY"+4B:C5D=,9ZUiS/hg5kVA4*K+[1fUQ5bbN_se_nC2++F"$]j*.j.s0:>;7?2OB:nigYB[?_a,Ulb<cmfe?*!8BYQ*mgYI_'G:#:9hc!%'nIu9\#K.OFt>Aq1i\Q1k(X_QRsW=DbSc%F'bgtZ/2>e+n:Kbn'oJ*sr;^;r/1Z+Vo?T!W4\7L>qdS!IH-WeB#r>ch5>%TRL&'caJA=^o?na4Y*tXgMf5H"N01jlPU;HhZ+Fp?gW#Ip5H[Y;u'ao8X`t6%]C0Gr*LD?+V"4C6Y<]^1GKRW1+$NV&M@2Zk6GD=d]J*qS.+7cN!h6O!e)cfIf(11iD*Y7*8G.!Fu"f5Q5o`Fk=$<gK"E?j,ZG(Q<S6H:\dHiQ8`a=>Zb+\X]m_AEjKB&FH%hV\FAFmKDpQr8j8Fc:%F6j4nD>GFYUI\FW`_flD4$L7>hqmCTh$Uf"^_*P&AuHA>O*GCsJP2O$EX=EQ9*O\8c#4&JZa4ARj7@85!.BB?cmQFmIWVr;9Tt>%neNS8ud$:Htthg-nk9OnPg[;$TI`>0jlL=-.a+_hJV!J^kl(gQd$+PUS\<me#jih7@a_WGZZ(.4K"nV+[.iR&jScFk0\%2K,Zhh0o%R)Nq/WF?l+\*Dd3krNhA#g[-\oC]W"'hnHd4_hLeQjHEBn&n644.4m-ZIetY!]Maa6"3/b.DR^j3BPL<$4Z,)s+'5XPm7A'P[OZn]2^N_0O[lEQSs)o18I6(3r!Qc+D!gf8aN/&O]Qq2:ob7jW;;79<m7q;t-1bA`T7?icP9s!jiRtHa:-1$`1XkWli8@t0Uu_-o6OtUq>"[U''if?a#-Wq5c7%ag\H8!"ALF'oU?-RQD7B?-GJ]">g6^'>CFf@5s8D[^<m$#K27ioe<`YMIH[l(oGML?\W`P:J[(9Uijd"PR!k=->Y$F-4CB"/,6\Z#s5Dlg2HM"F7VJkA+mbDo1>N-;k32'EW?6)(KY`B.a^\mY\PAK*gH+(7uU^F-po`(RMK06EPHHdD0;Bn\le,c[Y^V3lFa4(IC]mFtt!K@dP<o88m]iqJ+@)2E/k&hb!@XF*Gief89[^t6$$47L+p[?u]I+odK?/mT/=%`2+)fO@AogV9q*r!U1m1g?N]&Ti<e"o\RXmOcGj);^2<k\(R>\obnltl>ED4q/%Sh>o`U)Q4>YWf)aUl].\e=Ep??@2&sT>Dj(+.kQ]\905L.Bs_jQMg@#5Ad*SY8q<&o<LoLF#$U&]1eeYfg_3L=pCsBn2Zo8@8IZ+Xg>-8CKR8V]"=q/;I!IC.2@XQ-aimNpYWG+0>@4U4t=ghn%S,KY6
Ikm?-DX7<>iL_CI@3\nVAbo+bpOJCAW-_HQp]RX&=4gH(-^/Z@s5UCp8LSn\c))=o$*Q*<M^D^#4JMJtt=95Q%_uUo1-F7q-h)d`H"f/Jt%*3B9)\WKo2Erql0!qemE![bE'dH=+t)O0/c#?9IC9XEPfCe/A/_qsP1I:Gi93mAc2E?t738#p#IUW/S7Mm!5=<`aIfEM<Ys2>e&.Y0ZhJX-aq'tbOsIoYE.k=J%d;VB:aB<bI_rblEfBj^p1R=K*Hi'nV;J/+I,YT[]<3iSq0j"_fD5.GHQ9sRi?Hm3c3S-p!79pR,Q`;q!mC0XK\qU5$G/?oAYN4XKK9![O9M9;(JK$g@P<;orgiE)Wd/_e6&iR=?W_Xldsm><h,SP+L,3L,ntesb+_2WpLI=+=Q*1F"S(>qmjXiPmbHLOYe%@qT_T#OK#H+:rVJ+)kHmJLjHDqC"3ei1+I0_7b&fC4SN?G-:Hn<P`F4_m)X;X7>C^ac96r3OHOcmBeK9_BW&gqhjl7$/j4:&TqtBm]_@&#AP,Y']*Eo)'ONPAD?/7inL-[;Y?u405b]Ka3.k@s]eE(j,^\#rI[BQU.aF>#4B$F4/opmSM*F5.or8NVf4NVD[_hmc;1iLl9739e\*dBrn2.Z2*I#rOpSJRG>K>mOaX&`@A]5&)7CQehdOsNbC>B_D5cTCSXB*di92jW_WX!=EhTKNR%fVt.%Q<$j[iSM!0d6/k`CY(3)+XeI\r:.i,Kg1O$IGhnlT&gm[L0VDFcUFbqACJhEm'4S\ChfI]q:mQ"ZL[OBmJ_6*\'6h<$$-W"[^F9VSm/"@Ys!-Y/kBOeN9p]O$ui,lR;]Y'fWi?-gh&>)baIM5;iRQYFQq5Me#,uCc8P?h`0K;LQ;G)X')<<ZlIDq&LLPU^bo=&gcErEUfq:W`I+ft!GF'4,DQL*1IX_:-FmGd!pPJ9q(G?8P`t=nl4**0bl/9C1$Pk;?WMl,4lD^\UVMQ6bn$qBfs80*^[?ET$I!^-aH!XgKQFCSW7VR7m>c6G0oT/C6@E@rs_sQ]md5bQ1:uFPQQE5J6oF@\//u>D@rl6i0Db`9"CffLREne*h9ea#&lL)T6cQ+8d[]`uKp7-3LZ,`(u?(+fr>(mI*G/\k-YJ:uXP.0=t4*2mZ-eQ't.M\@&WZ\RXWp+0AS>cW33cqTe`:fXp?:r\D9j`52V-"$nNukEh^\mZGUTX8#HKq+^Sc;g;39(Dp^!D$\>%6A^eKD`,3r`Ehh.Y<QB$Hd6Dn\4V,K$O2eQ#]H'IHuY<'.PWh7M8sFIp`W<?e^(pf-sk`:cWX(0Rr5S=GEL-Ru1K?[mL]^3qom@^04`'ab3DaOa@KMi0rX@XE^O>K:6c7M&12fk$N'7q-hiZ)75?UbZH"N)8kd3WGbMX"P/.K^RR%.rqaT@h#u_AEs1)jILMOBks:._,[m*eQ/HMh/2T8\^k5p-Gbk4:UO]EUnspPj)`O0k>MR,M8scu:M"<+[OYCfCtV].VbWfJfu8q0hWVoO!s]=g_kY<KG3'BXc*o(K]QH6CDr&"TSejAI&r>p4`r`?FMo`BP7H7X"Km<=Xfhj^&%sjRIEf&?W*]uFIg@FfT]->7T*G\=GA%PJ<HdnOVSmLf.+KHmSZ$jZQ*BecC<(3<9grri,I2*)b1:SDnGU+d]Xg6MY8$T(:.4?Uk`u[BiGe0Oe2f<H[Ue1=Kh4riO*S$)8!@o+s?;W"![]<f%`Y5^Zc;'ok\LTFOfW\3I*DUf6[:**:Q92N&d_']\[d1Hqldmd(IV"Q.V:G^=4P&F/.]W6G(>cB1O#e,SV5<H8f\>>$gU<+7PBoDYDti\Up=RQ$_FGu!./Yu`]tGE[]1Xflr3@X<>TH,QPDG[k\(f5EN#4:dBj9Dtm'hE@kFId$O4XJRs#<fi\gX(P(\HEsYBB9>>%h8.(c#WXs*h$)D\#t'aEg:?XOs\SA`#9^5CU9:p<JsU>L#D+>hdhTPki]s+6c*j=3>f=V>+D"=D5gHfUbY*f!
X/5kZq(aU0s.TSSbiDpX_$Rm57L'='`/+(t;Rbo#i[rD<hl-MMd9XiU7<R]U8H\\)5nGm!GTqIWJoT^k&3K2m'gnqWfVra4/mgQ`:tY5PX.=H\ipm,paod-T=!9ie-6^rsOM%b,=gWO8LhMekGg^s513Ue4#ZT>@pYrm#1Im](Qt<AVg4pp1hYAJ<c+q=&_b]DnkJ,HRr<'>$A+9]t/CS)@C[llo,Jtei=]'$rSMO^!toPHeWb/:-J8LrU&LWJ)\^W(Lh_BC(Sq=]a:sW(9CiU>+L>jbY2:SMO\P;Zl(Q*_#4$"rP+Wa'D/kYlP9isqjdB"Z4^4CMs\(rfP=ldGLb]=a4-[40"Pb'F3QT9K-?3m2,`JlHL%\1Ij*8m=o$^)IJ``GG>8o)=:i+tB%*VOA&aJ4rTY`4<mp,AAS#lUS&fu(ONL&D/#q[E-aS3rEpZinTItFX7`LZA;mpPt<aK*M4`L.JdGj.pL%3[B<0^9=Js@ifcC6aG'JXr;^#lG4Z!G16L%!Kgc\r_t,%&S@[KH;Sbh`b-,X:&f!<?*P^juT/EcNB$W&AtP0_oZ%$360Lace*-_EY$)IJ\1l;I3ZnIJS%;Cu2i#mbPJcDt*f-eZa,X:4"Ls6%]AE=]p#qGtjbd%>DQu\n93U_d,;'5W.rd^OPtDft"Z(lDUVVUibkLA^[AGcHdpAzzzzzzzzzzzzzzzzzz!.Z!\J#>u+ci~>endstream
|
||||
endobj
|
||||
4 0 obj
|
||||
<<
|
||||
/Contents 8 0 R /MediaBox [ 0 0 612 792 ] /Parent 7 0 R /Resources <<
|
||||
/Font 1 0 R /ProcSet [ /PDF /Text /ImageB /ImageC /ImageI ] /XObject <<
|
||||
/FormXob.41b05a9cf8679f0fe6e7c30c9462b767 3 0 R
|
||||
>>
|
||||
>> /Rotate 0 /Trans <<
|
||||
|
||||
>>
|
||||
/Type /Page
|
||||
>>
|
||||
endobj
|
||||
5 0 obj
|
||||
<<
|
||||
/PageMode /UseNone /Pages 7 0 R /Type /Catalog
|
||||
>>
|
||||
endobj
|
||||
6 0 obj
|
||||
<<
|
||||
/Author (anonymous) /CreationDate (D:20260126172022+01'00') /Creator (ReportLab PDF Library - www.reportlab.com) /Keywords () /ModDate (D:20260126172022+01'00') /Producer (ReportLab PDF Library - www.reportlab.com)
|
||||
/Subject (unspecified) /Title (untitled) /Trapped /False
|
||||
>>
|
||||
endobj
|
||||
7 0 obj
|
||||
<<
|
||||
/Count 1 /Kids [ 4 0 R ] /Type /Pages
|
||||
>>
|
||||
endobj
|
||||
8 0 obj
|
||||
<<
|
||||
/Filter [ /ASCII85Decode /FlateDecode ] /Length 250
|
||||
>>
|
||||
stream
|
||||
Gas2BZ&Z[T&4Ckp`KUTrY_02PMb#<CFN=Wfj',kM@19sp55uUe"pptDD)Los"F*-#r%7t"K39EA8f/'^$OO.*D:jQe'n<f:3Cq8'p9Rm8qll,u+[sQj[W6hrFQL%\7G?"sX/%4LXYeUkIBuT`A)Y3?=ouE3GIShId3E("2qqVte.E2,r_bJ%q1G(F,@9C<XiC-L`O1W5it(MP9X]^nj..r=,_#ecrj!ceT&ATWd4)p.7/d!C@/gP%;p#~>endstream
|
||||
endobj
|
||||
xref
|
||||
0 9
|
||||
0000000000 65535 f
|
||||
0000000073 00000 n
|
||||
0000000104 00000 n
|
||||
0000000211 00000 n
|
||||
0000005122 00000 n
|
||||
0000005378 00000 n
|
||||
0000005446 00000 n
|
||||
0000005742 00000 n
|
||||
0000005801 00000 n
|
||||
trailer
|
||||
<<
|
||||
/ID
|
||||
[<38bd217c814ddf937f148e537dce51f8><38bd217c814ddf937f148e537dce51f8>]
|
||||
% ReportLab generated PDF document -- digest (http://www.reportlab.com)
|
||||
|
||||
/Info 6 0 R
|
||||
/Root 5 0 R
|
||||
/Size 9
|
||||
>>
|
||||
startxref
|
||||
6141
|
||||
%%EOF
|
||||
@@ -0,0 +1,139 @@
|
||||
%PDF-1.3
|
||||
%“Œ‹ž ReportLab Generated PDF document http://www.reportlab.com
|
||||
1 0 obj
|
||||
<<
|
||||
/F1 2 0 R
|
||||
>>
|
||||
endobj
|
||||
2 0 obj
|
||||
<<
|
||||
/BaseFont /Helvetica /Encoding /WinAnsiEncoding /Name /F1 /Subtype /Type1 /Type /Font
|
||||
>>
|
||||
endobj
|
||||
3 0 obj
|
||||
<<
|
||||
/BitsPerComponent 8 /ColorSpace /DeviceRGB /Filter [ /ASCII85Decode /FlateDecode ] /Height 80 /Length 3374 /Subtype /Image
|
||||
/Type /XObject /Width 400
|
||||
>>
|
||||
stream
|
||||
Gb"/kH&U9Q'ZR>43%?O'&jT85=3qe+N1f0n5S02hEQ*VDL]ZS)-n8)hB\WWtW#6!n.\\/*',>!9/rH$h#V%%<D1W$*'jLaXkBH(D['+neI4_Y@WVLF\J+T;ghR7Y(mQ%b#VGJrM63n`f&4-XGKFgHU+bUCn#U+j463n`f&4-XGKFgHU+bUCn#U+j463n`f&4-XGKFgHU+bUCn#U+j463n`f&4-XGKFgHU+bUCn#U+j463n`f&4-XGKFgHU+bUCn#U+j463n`f&/S]&BhHiT>FDK@e$Z6UXYgOsk+@,I0-;pc?Ln*mjmTTlQ+?K]e$#D.eB)O5NG7.u<*,RJ_p,8ck3p(&R=G"JnR/3XR:fF1m&!LSDdTjBc:RULFd1\@RVR9Y?IMP'Ajfu#dnfOdRMIRU5?(oCkRY2lY[PsI[bY!Lo7-qefWg9>U_8tF1IHqd?[hMs%mDBpEmdN`p:`8WC.(r[:;%"2F3\c4Mk792G,A7iab,>+^X:Q1B)Ct4JU?`lHM9=Q*\(H$'0?&(T*=du53O+iUIN?O=m=In-UqEE?ghWKl;bQuY*`RG[FO81(L.M1]2c_&%<`*Rq.JU[hs+4A7O4D[l.+Bm#OHs>Bg2RPN#ZQX55r(P>4!naFjcC+c`URpH'c],PFQUH`f2c]IHB5B)pF^[%Q[/+G3L2gM0I)]FA`$/.TW]/!sdN]/(^i(\!C)"CY$!K4SDm&fpLsK;:U@[oi*BN6O,DaRecA5ZZ2bqqSNigQne<3\(m/F9af2d;H'i,rO4%j7$89^HKB/%EH:d:UE-7IC?3kP08QNWFSkIn&tXU0hVtf\g_heWn%H2@\EYR$R+7l,it!qkZIsA%.8%:<b4Q+4o7Re<@/=uc<D?1CkG#t*Sj+#k=3?C8<eoR]gMmuDKs]#U,%B6^\'Q>l.UaQb$n85YGMOQX1#>:k7>o*u_[[jr:Hg5Je^Y:RKsDj_5eqt.H?@qPba^,b_QC<DQE1;H6P%jjc90Rgp0)'S2.M@\$lIp4C<@5NV3A+Sq=@L,V9GDV`O9L@%f5,%,!P;IfdpJOYbPun*G`61QOD3'0Y6GmF^2XsRCVfROl*/ge#f)W1s!?*VEG*0."1`MfU_Qj-_HcIc]o.$p?:mMIQ<MK^AHukRr/l=E7?<+:@V_=m:@ob4Q**VPHYWj[Zo=CRr=V!*BT#C(LJ_:?.qW/l#KCp@t8d\[H2o7Bt=PmC=",eKi0L-.*!_c/%pOEK43<A[3H$o[#=noHdqYr^oBKc5fjO>LsH:W;>mQ8`hXnJRbG389AnU"B(jou524Uh#I;&<U<No:XESS[)bnZpSA"f;hd8gI+P]H$/\(H<O[nan=8V^RLa-H@:Xf+/INI%?Zd4q"#jj>q*.4u=YF[nrO\>7O#of";9>"U0op9fI&cNL!f<:OL7XG#UCV^-W3n&X!D@elVYI"hK-(KVsoa8aWL<4F`IGb_t#tRF<8C]_m^L^GO6G3G5S09nd$$CBb>0uZ(bhmeXG1t'(r;5s6L_nY]J6S``aIm.70L=toNMorm#gcR5B07$ZWs&l!IuphJhN(6@caN->pY971W`M`H*G1Tioii3W&eZ2TZB^Na&P9Djoa9RTg9i]Nf@JZ^+U;<o?jn>70)4?,Z*\6\f"L%[`BJ6Kj9)%X!(RqAJ!s1(OmA61d_0J*HI^@ba8Pi</l@aVqWUPab%]CO5^)(hH3sGpX-^oTZd5(_lb]&C\k)B3IsfnOdI_nbfq2>Pl-9KMgjahN*A`OY%3@&b90[&bRjRh=*NU*X?K'!LVo@=gF%20@?O=gnO^q+Jr=$XI<l#nIRPZI$e8J>159G:[l)_5.-%,a+qd/.CDY\U4-ibD!])@[SqAD!$G2,qi7HmfIX"M?gq4^a:eU]L7T>7]\Ni+_8Hg7U"jUaRtKJ]bt'P<e&53@l:okMAKlUSY_?MI_!>3`l9op5MT]g<,Eau4CBfS9qgs8'hVO^q,7ISl%l9Re9VphQHBQpHf7;ouN+#41(!]Bpq.
a"s<1RM\\qcK'V\`Q!N1@Y46f"8:opL%80Oi]-T\^Ju'R*bNtSnP$N;[5T*[5i*NZi1b"eWK@pn&8g.BWL$qSS5iR0*5>0K<j*&mC2tplBiAcJVg9(]ip]PZ1oSVKVeSV_/S3Pg9=ab"pNZ5V'2SCk-Vb@KNuhlT6mp3R?8XX`:Q.L9H,UNhF3YacG3W(VX0*uZ8>4e?>K_G;Rn$CUR?oeU93At-Arf=,=bsA0p$p(CN!F<.%bX@`pKfj\]c&XOS1!:`do;;tZ6cVZD;+'sUf$Bt,Q7PD^1rYmq+$QG]u!IsY+87o.os_h#/VHUYNfDXl;b!f:?gYC^>D*J>\&T\cBJ-sI_$N&=^rPk>LSVt;>P.46N:frEgm*GGj4=<qT,Y(1Zc+Z]h7%8,[8^^eLRgoB$=^7=$"Xl3[>=b41/JMmG;,"j2T%QRo@!%UIU4G6f=MZjM:Z2<iT63Xu]!qai:DQ:RWOH]Hn]!\#0inkU)(>L9M]+g80^DM^g"!X,S'0pD;8jH/WctXls@'Zr*MP]h7%8,[8^^e]B13k1X#5g%F[m.h'jn0tqYq>l-H\%mM%:Cju%L]qAkq0pc,X(bDM0Q0YGKF?F$>3]@tZ>A#n)hKGBreCD[e6OjG&;>-_QZ2iafV]GHA*H^f/E.J;%Om]_HNhJJ%MZ3l1UbMX*]CbunmMSLel=3ef=.@Tn,XYJoeZ)Vm1L<aCJD&:+Ijur)RA66H:%K_'"UbD-=0FNC4t9b>F/i:Y!Za@[WkasO461hJ%d@!MX3Q&p.'mXeE"=VfNAkB83#nEb-?5c%',NbTmp@Ic`7),9GbJp)qSmsX%1!Y73W6=/j,Gsc`f7.5:+Yeeq`Ai+!fL?NOM]56n(<,D$J18;P:'SXB'[uZ2^6A$<-j66f6Jk.&&c3W_A9(cRi[QK19NKCB\1b$_[lLN+.MEm1P"`E)t++5a[#+RZP^-(?@hY,mDDVfkSlm8e\\@>q0l2NF1QP^9\%[*ac\o<7!Y^`e%EF9dL)U==0$Z?$N0G;m+M)j+f_J:*uda;lh'4kU0Eu=[1gfDqKnase*WU=mVh!(:]"OUWP//]CqZjE&P4=Fd]4EP,j2"jQH?"R,$k%R$h6Q3^%g%3\k2'Q2t#0eqVHl3N]f0YdOSV61-pC:UfT.\lB:IuJp7hZ'6FlP!LEm!e%`Z.s*j]KJu),bq<-MI+2hQBC-aRH=2j-B+b#)kL'"'pn0^RRn5.!9II/5DmNHR"$lgsSKY-\*\*MgP]XVFu.o`0C'fI6ZKFgHU+bUCn#U+j463n`f&4-XGKFgHU+bUCn#U+j463n`f&4-XGKFgHU+bUCn#U+j463n`f&4-XGKFgHU+bUCn#U+j463n`f&<`?/!KNiek5~>endstream
|
||||
endobj
|
||||
4 0 obj
|
||||
<<
|
||||
/Contents 12 0 R /MediaBox [ 0 0 612 792 ] /Parent 11 0 R /Resources <<
|
||||
/Font 1 0 R /ProcSet [ /PDF /Text /ImageB /ImageC /ImageI ] /XObject <<
|
||||
/FormXob.b6d21e33426b982eedc18c3d4e93428b 3 0 R
|
||||
>>
|
||||
>> /Rotate 0 /Trans <<
|
||||
|
||||
>>
|
||||
/Type /Page
|
||||
>>
|
||||
endobj
|
||||
5 0 obj
|
||||
<<
|
||||
/BitsPerComponent 8 /ColorSpace /DeviceRGB /Filter [ /ASCII85Decode /FlateDecode ] /Height 80 /Length 3746 /Subtype /Image
|
||||
/Type /XObject /Width 400
|
||||
>>
|
||||
stream
|
||||
Gb"/kGE<NX(WT`Jj\uYAo)R):e01]`";Q!n&d*n+.):?M$;N-L92SQib[Ni],#Y)4BR+![6n$6QZkR#<71h=R.UYRO+N",lDr8pJ4*g7;J)9%2]0:`2oQ9iO]^,RBHegiOad<J[KFgHU+bUCn#U+j463n`f&4-XGKFgHU+bUCn#U+j463n`f&4-XGKFgHU+bUCn#U+j463n`f&4-XGKFgHU+bUCn#U+j463n`f&4-XGKFgHU+bUCn#U+j463n`f&4-XGKFgJmN6a^Irj:'BVG/#9rV!+sehf4Np$4uGT6?Z%jn76cYI/bg\b0"PYFheo17N)h[bT;Qm:q@c230t>rr(GQpkKpmDTn_bcA]O+Jd#cU@+2Zm>]f;6bs;T&C"&dtb&Y8uEeu<LE-(N92GKeb>4LdJj]\*:qP]'A(VpnpR/2-8pYO1I<E>P5N\HYDRFS?rdSVTS(Rm^Cb\t8pp[.ji7rj(S`LM@bl-r9u\(u5id79+1Zkc$%??stuVZtblhmB,pZt^mu0`%Ls1i]8CHul4%*Hj.6mVDOM9@Q>X0"[KH505<^W'N@NK#fUn2VXT_I7/`G*I"#V]/HO(ZsFbo9PDDePMK]!H;tTT$fdk/bb^Xe<o%RJ;c=p@KgDW9>;u04LW/P]CXtHuJmX$+n(Vc<?4@i#F^)<je+N!;?@B4`(2I>\^&%Tk]_oPE2P5G57Z;<354V60?+`jnL(BV81&C9p'qn^>huY?a*aVo\^AQF(LMnkjY1\5I3=GilDM^Mf#@3^HP(^=%G)"Pl2nSAIgiK?0>KOPWqO$PVHF<:_o#LfLahWdh*9(3^m/EKhl,$Q3cLgQYNON\9m^^C9op:#?rd<2$Vk!&)d<th.i@gbDG"YOud]6@72asongem@R4YF`QZs^cCb&Z^>EY^Bos&AIDEpB'*`7%OW#]/JaVk$ICr,?V+Pq1smN.e[Kbq-r/47c-[@E9!u4pDp'F`gCN0YN(T*.>3lQo[*tf$^DS35K&9pYYmC(C!8KWI=YO-Yh<iYq"0n-PcX/R22T"lAiUQ?;[;g"V[q<\)6WG:J^rp+%ZAH>K@cWZ,bpLf<4]8ob:WF?31lf84UU8ba9QV_YEY=:-f*?M'pH:5PUm1J)N_lEV.S5l4J>"ICf>9o#Q>b\(rC/e:S+@s,o'A+RfKf[#p[C^,r^GPUR4gNZ=J]6kHiV:DYI45Dd,\<-li[]NSt[l+63)OsT7L1W3eYFAoO;cK:jZ>hAN$F,_R#nIAbhgQ+SUFQjta@9[u&(LAMenD)m[`R55a*Y3l"))/k=oMU)6$+i]jVa*618S=TZ_GFf=XBs_%K:K'Fo]DciT&f4e(<2n?Vtb+e^g%Nu,1VYfeOc,e:Kg5KNLk9Rcmq(6W8>+.5UZlp$,%&K@JAXf9t/kp;B>ou]%HuU9<jA3FHc!7?g+9s6HK-!P:8IcUIVq-FJPOLMNgE:%ADX":FE$rF][b6ElT3'f8;d%Jo:3eXU\iUJe7VBlWsY`p!Z_)<M"V>Q7>RO1)%6mY+XGj^Cfi\llJ`iigb(cAMjb)$@^lQ9+"%O3RN/\G-)Fg+T0@+7sBYYOlju6E^l%O_dec#[W(oiP)i[<g)DQB?>1+.GAD<*#ee)n]ZlQc:X6"mf=0EeR3-\Rc-pb@okO8@.<a?P.;n:M[q+PD?$2E+e8+3J=j@AkSTd+T3ms/eoJ)7>\^lM5KERoAErtC0R56,oZN&WoDC3T.FfcQ):>]e:aVd1keQ_tITI3"f#+HG?-@C\E:ct,l:h<Cp?GYt&rHEP%f@DuqiK3-OdO5;eandB'^*u(E>'Y7/kYTAclDW&K5RRRs"3=c::U8&qE3Bk7N4522.XFX-a_8A&BTV-MqW1^SOa5rC:q\>me%lnUEUg#`JHMa+Iu3uP#:.&O#=ggsUgq2ho1`OG(%W"^SARTTWNR+&lC)M$Q`-sKP).<Rn<-H*Y^_.@YouJ.NulS]BgMA@p*ha_HBliRAPSEe%/q,5$rS?H&(.ebb:^u]
K\M$!o#]`(^ABPX>'>!(C!`PX,)AUtl)+6md<^LC%`&<0kH\Z:!VH;uD<4`a?Bq\X!.ko\0k6B4?aIal%rsfT"YPIS4"n?"LH<iq4aDoZS1+2c#!%I<oORlE.6243F,5XrA7$Vp<=5I%YtpMPkuDakPrW:M755EjC1M+ASW1"l%6ssh".hAcrM+K$.)r`aK*P&Hs.u$/ckS5US2A?-\M(ZV;>Fn=!Y?$@qsJM8hgJQ9F[&m!?Bq\X'Z@O/_1jZPJh0WIlPaMENKSCZH_qf=)`E/ta?<^&&/;eKN[POQ>c>>*@<Qcq<skZL!t1i)YtpMPkuDakPrZ,@m=)4N13gIaPe1(Bgc3F?fe]L"FH<.+3eU/;OU:9l)jAj1eZ7B0jUb:P*^U'nk0B7LJU1=RVTVD0$LIB(o)AMa*Y*&F=l%,@f3RrO7lp:h;as9gS_OV%PBlb1<mC&X1YMQ7Z;Q.M?N'DLPOF@X4;:2e@\4k)e#VPa.Wa&'e[fnk(@eW9s8Hp3(LD&9UR,/A3p9VHP#n-p/dS?.Tbjb2E/1mX<eeWghoetsFg_!8\hVa==/BRk&$!s#r7l.`*`fG.c&hEW(G1egFu0Agn(Uo=c'TZhS"`tF_h6I>QRnu,W>Ap+0*lZ6>P/?C=#Kk>pfEI%X5o!bF40@(o?U'<]Eiua1#T-.-6qJtTfI(.G1oN.lKY+4/`*/<nC_FrBr@t'B!tXgNRb)RL+TCgW3<iX5O9#P?a!)IFFI8oG32\=#jga2H_&2^^0Kf,.OsLu_#eO08A/mmJDDtTcmr4t6O1`Dr,Q`30k4J%!o:K3@7,[V(bXXTZYYa2Cd1qoBXD(l2cQ3/<j,7X5ml5p#+n>6f$6'lUmhZt>sFRC2D);h+q6STAmNibWJLoa!_K+fl3/2UYVS=VJ+Mu+BpgT4#ns,rG4")XqHRFhp?e]lW)6<M/i!6i]r"JcHp"^;CaL.d(qc=(QVd0f"!@6_597/@.grp/FPoFQ-+&Hk[Kh<ZWObTpod[MGb+)FWKf>msB7&pCcn]iARI''Q3&WZn4`cfmXRYX$=L#_*oT6]L\$1K[YD^6ieQ7a0Sj]d/M^g5G<=fIGJCn-7*ka$`dtNA96I:UC5Sul1`\4p@]Ls&$eZG>,T=j\`^pYF(5psRDI[ZIJUoPB8(M_b:GbMQoKNEpNmMNdcINqeCi3'cEM:qF.X06_nLu%gkBg5VlBSp+B1fTm,9!@mC$E:N.o^gaK:4o/&_:c/cq+m2[#j^;N_DJlWf4=p-!+I-6h@omP!WV_Y"`IN?:CP*<`.u@#A9nDKO*4\G2pT\?kZ'Dt?,HQ7f!)_^L]dn6@h4u8d7pi9\@8;-o?'k"lKf\AD;4odYK88Q^$"H$>rOH)/`G1DGh5_0eMt&PpsLK>qs%#:d;7WAZfKR;rBm5'b+%b!VCn!a[@b*Y1e"S\)QM"QV-!NdrV>Ws'[ojRr<li1<loMopq>5.dVU]'X/_ssN#`kAmA/umL!VA/'h#6Ik/uc^1EO5Ek,(eJ=3@$n19'-D]1fkFPi7&q%$9QZs+c](m.qMB*W<LO;^VmoE[;onAu'qSYrV;=B=>iUUHSFKA3ttm5!=55Q\.qs8;OBp(ii!qKaVIY^A]oF]ER6$H=l(;gJ?HbR]':Z$q1FFKFgHU+bUCn#U+j463n`f&4-XGKFgHU+bUCn#U+j463n`f&4-XGKFgHU+bUCn#U+j463n`f&4-XGKFgHU+bUCn#U+j463n`fq"Z!rjOlg~>endstream
|
||||
endobj
|
||||
6 0 obj
|
||||
<<
|
||||
/Contents 13 0 R /MediaBox [ 0 0 612 792 ] /Parent 11 0 R /Resources <<
|
||||
/Font 1 0 R /ProcSet [ /PDF /Text /ImageB /ImageC /ImageI ] /XObject <<
|
||||
/FormXob.67f2b803142796cfcc78829acfff7782 5 0 R
|
||||
>>
|
||||
>> /Rotate 0 /Trans <<
|
||||
|
||||
>>
|
||||
/Type /Page
|
||||
>>
|
||||
endobj
|
||||
7 0 obj
|
||||
<<
|
||||
/BitsPerComponent 8 /ColorSpace /DeviceRGB /Filter [ /ASCII85Decode /FlateDecode ] /Height 80 /Length 3671 /Subtype /Image
|
||||
/Type /XObject /Width 400
|
||||
>>
|
||||
stream
|
||||
Gb"/kH&NHV(WW/Z*isSg&d2e95Yqb:,+b_Ylbt"c&J%p\W#GlYi-cjh1$F4VW);.c'TXPH7"cs*'CeNo0aOLC"P*Zn'GO!upO2q-m[9Y1pX_cEoNV.hZ!I=.X/ihg[pN.eCb@(q63n`f&4-XGKFgHU+bUCn#U+j463n`f&4-XGKFgHU+bUCn#U+j463n`f&4-XGKFgHU+bUCn#U+j463n`f&4-XGKFgHU+bUCn#U+j463n`f&4-XGKFgHU+bUCn#U+j463n`f<&gLu-H0\W/S)K\UiU/d1e=((AE1\ViZgoP7G`;;^96"eA^VhA0L.[BPc_E\[V_jF2]4eaSpjj$D"&Bno1d#['rMp*ip0m[:kfFCp?cGGD5F+!`f-%F.hd'N;+G=82r*?QTUB\d3]4;&O$@@]/UdP:"a?Mun%X(hIdfY%c!Gar_>X+@1HlIml`F@TKbp&ZMDsE$keh8Gd3hgr.uuh/jX)6'!qj^&c9!\hln?+ERl7S6Q>-NjMlpa9'WJ6Y0"C)9EqnU6a<@k<:/9+6qo^@Z'\I'[FS"YZTL.@L2sJd]G1f=ahJl&2l`Ks-M:S_+:'iN)f]\_,l;^8p>qsj0Lbs?q^q550MlTodIT_d4`uhTtM2WG=S:0CRJ?g#U89K(O`^f6E_&QM!\8c8?YcYWG^A,Rga7Gib>7NblcZ\TL@>T?RFh4gP,RLuW%NY0c='mP/s6[2ZP"RWEO$.%PqO$8NHF;:(g19^%:Vd53pXdDSgjf-D?!)SSY:tXbbWl,ljiaKo6&,Tk^%ZEq<rWC2djpdFNmk>T*#!;VcpRKUM_Ah@JTSpQ_,km?"fI5ldt/$XrDiRJ>7IaG`llTEkqIRJqXuLseKJ0E@^B[c'G&YC.*Td\lQ;0M&l<>n.Lhop@hJHBr`p<Goua16/n;ob.>5F)Zdo(A@eK#h]C[X$;ni/uM_oq\m:G*7H0QjW]3@5ik9%GVKE;f&Uf!n]DI`Nb%2C3boPu^,\d9$\T7(8@A:HcW%-c/0@u<e?e^USp7r<*.WB9Qj*&d0_/#)>2Bsh8UF<RaQg/[=u(e[]i3HG8U$[j'W<'t#[TmqC\O=RK\k4gLjcBXSgP'657?9`D%/7'<t=0\%EfUaok*dBq#D;/+Kc<ku7gob>(_H9.A]-D0Nm^Ygqir3#\O:*\f:`5QV2)9W.6-PkGX:u*s.q82:[bLF*!YSl>fWgl`93^mI>>?W-=Q[qRY)bl6lGcHZFNr),2$%FM_ALH%]g?+ZMc<[[9]ZgI@@Xj1?$uZ`fQ@E=T_=^Y)Iq>Z]n3T+,ESq+[<(gAZ#oB@"U39sQJhH7qWUVC,rjBb5Bpf01/"POnP#E1HLYF]r-FX(;KH&I":;elcp?*VMmI:t9XJ+lKojSD4*?HTLsA.bG26.GQ$>[hlK'QDo^(hmQ-cTH%506+oa4EtR1#lVq>$DdUYh/>J)/5O7A3XUoj?\?S"2`7HXiK0'jc'n-K65=5Xbo07YG+,DnJ#k)B0'A/6f!>6\]:+"l=`SG$XA)CADn0K'_K4f/f=5]6+1l9?W_X6Ot?rDnk[F$>,)cO;]%-S-9:BXpp/7pgGNTNfPU?P,hXj.lFe))E5rE0u.QQXBgC'[=5fD-+Dd7D_BiCOsStaKInr&6L*#iQ7hibR@-51:cbp\1q]mqe13@abo2#F%iXN!ou3+QMh*ChlU('RV`>Se52@/A>k8Q@LYVs5!&,>nq4C`OI,m?KN"dj,lR]4^*`J4t3RN0'e>.R)(f4&I7-<086hRGl]?\CGX4ZLuPqD?mGbY492PJuG5NhO5Rl'E?jVGU6ID)(X#%WK)7j1.B[t1a9Ju#GK#qImB754Z:n*t7Ur]Z,U]e<VdfHLtQ-nqGDhosqd,=eUe.n.A!MBr':nf?;3P9TgoKmg#m"o/ECONol,Ita.<KBmQk4*.;]m50gE0nc>k*9ufGe;5djX]Ln4+n3J_qX-GkTR1n9@0\q1VH9&8FY9fC.nhd]SpA>*.D5WHRi4b(7#_j-
Wf>90+Nlk8XG?8Zml*$eFnI4mV<54R5pU/km!].$fd-Iknlq@5,Xe$Tq95^0d<lAV<+_t?GZbWe?PK(;2'+J=hKZqF"CV9=i:=Sc3pXBNm0iUoS9^uD((5S3VosJdC=XqK(Y2"k`E5Uq'nDYoh05K4psDTXB`"b1or?HP0("#t;7\'`7fWI:CgYu@0,G<mOUj\+!#:VG'\hb3LlHB]mfcAP9ZI;M,2&bnaX]6X/Y7e('"4F+\QPhZp1:%8f78Z'U.)Ue6KD?ha_fc%12pV.ZP#..XGC/#0BU7nKDib`:Hn"\?[&('o]Qm.c(BLBHqjocLXHf@LM`\>@eFKu9Kg=%YequpeAKtGp$Y/ZWq<GeX&l?&`O&_;(00M@OlMMsp8pnZ(i0jKoBoC=e\>Nh>cB:jR9h2CeD)rjW#<;*rq4m'&E21"B#_8-[n2C1%.R]OKZKFC,\A?;GZg/0Y7QnAmMs\'7ippJ^\Xso)'/EhB"`5cHhZ>EZWOn-3.t*;K+PqCj$o$J&L5tQQ=@Q((N`qd]u'*NP.O+%`e+d_QG%XgkgA[eF6Dhl!:9<aH2$USNm2LWq28%<VYR)jaXbV4YCJ4u\jDXW7Cc+bW^I:L/(3_5/$Gmk<L%t/D89:Y9U<>n!ZO%2OGm.G+*H6L3F-L($4-&[X?+S`,)XG+'c8f#F"j,#4M.6<MY4!T\h9(FHlkcV,>FdOF]l)Y>rspUrd+UDb:`D!)fGlVc8C*chooP,GPs!_V7E]#$=\U/qWTG4Pfm%09%<@9,->21E?O4?&pP2@H$`4$?gM>J.^gGA7I8>OOjhu9o]#%SpYD:!d-.[JU5C>G.uT">4k:L+m`uN(or>=//s',t<F).:*dk3lKC#=$qW5D':K;=$Vh$e>G-.OlmLurLLn0%0^[#WL$M5f>V7B:m$9_l<#pr?m_rNDl<j,-Cn?O7'?6LH#Ilm/to:\)&a/]7'o(b@./-_E+cQ^)a4*],55ON>nbM;@Ec#YLh)mC]GG;L?q"@ifc)?NL)=1E(%%]V"'[:(Jp]+fX=<H2:\81X=InR?-T/csEXCV7l>pXRL(KCof)C>7a(8CDp=I/^[HE.V#Z]?\'*R<,cj#3Qd'QlFa+LO?d-;J@`k]u&gL%D;=rZV;%XY.6PuMmCm6;Dc%f8>TCO-`\Q1J;CGJ(.F>s_rU",SE[+39$;uD4Q!l$]cFl9np^it-Z]/KiBJ2.rd<i1"5=SRESH6hk"I1b&.Z[f2i1jldA*7J@TMK#qXgf3]<7t,gRFY%(IWDRW[lrsp-a6$p3oYdYf(Bd^OH#r>$B]'[tKFhF0C=aRdt[f,Y&iJ]18[YX+V2TDbj8FlguYXiND`1&gV9j[X(r2L6iXSoZBYBjd4#Tfh\E%5A]:QeC^]5U+TaDlRI4g@n3(]?$0/`*ag3C]s<bYFJo\=W[`.=4S7639@Ps.ou\;sq=0D>YKFND8ubs#ku->#@_ZT/BZ#nhJEtc$fKAnu*.>2c7>0%$]DfZY`<r1%=hp-6L:goFqV(ALl`"4(F?bVarqV"-(gH8)Y?-O\1!d^"N#`kAb+5=sRHml8%4?f?63n`f&4-XGKFgHU+bUCn#U+j463n`f&4-XGKFgHU+bUCn#U+j463n`f&4-XGKFgHU+bUCn#U+j463n`f&4-XGKFgHU+bUEtn)Jn'&1>[~>endstream
|
||||
endobj
|
||||
8 0 obj
|
||||
<<
|
||||
/Contents 14 0 R /MediaBox [ 0 0 612 792 ] /Parent 11 0 R /Resources <<
|
||||
/Font 1 0 R /ProcSet [ /PDF /Text /ImageB /ImageC /ImageI ] /XObject <<
|
||||
/FormXob.7ce3f428fed09445afad362830e52447 7 0 R
|
||||
>>
|
||||
>> /Rotate 0 /Trans <<
|
||||
|
||||
>>
|
||||
/Type /Page
|
||||
>>
|
||||
endobj
|
||||
9 0 obj
|
||||
<<
|
||||
/PageMode /UseNone /Pages 11 0 R /Type /Catalog
|
||||
>>
|
||||
endobj
|
||||
10 0 obj
|
||||
<<
|
||||
/Author (anonymous) /CreationDate (D:20260126185515+01'00') /Creator (ReportLab PDF Library - www.reportlab.com) /Keywords () /ModDate (D:20260126185515+01'00') /Producer (ReportLab PDF Library - www.reportlab.com)
|
||||
/Subject (unspecified) /Title (untitled) /Trapped /False
|
||||
>>
|
||||
endobj
|
||||
11 0 obj
|
||||
<<
|
||||
/Count 3 /Kids [ 4 0 R 6 0 R 8 0 R ] /Type /Pages
|
||||
>>
|
||||
endobj
|
||||
12 0 obj
|
||||
<<
|
||||
/Filter [ /ASCII85Decode /FlateDecode ] /Length 302
|
||||
>>
|
||||
stream
|
||||
Gas2D0i,\@'SL]1MAmuWBq2]41U`B=+JA;@)ETUJW/a5S$%>'U)FQlACq\foqfM9dic+ZLN<Q`tbE@K*:^spj"Oo"1`W"9\N8S]7/.WgR/fq$*$ITZl?0A3Yd+#RVYd`S"!VHM:q3ue\ZE&.5ico>/#%%PKVtVn!b+n6KWeM,?U:f@u6(=k$)>9=A;GQ#t3m&eV#g&$:bL-jnalu?/Fi#S%7?Zn?-:G9#d\O:D4D7XQ`j*RVq8@Qm.FMjt9rX$+<uAFWrR=.*pU4ORU>6iZ0lp3O3um&1LmEd6.tN*K;n6j'~>endstream
|
||||
endobj
|
||||
13 0 obj
|
||||
<<
|
||||
/Filter [ /ASCII85Decode /FlateDecode ] /Length 281
|
||||
>>
|
||||
stream
|
||||
Gas2DYti1j&;GBn`BOD:C$`F9ZVnjI&kF'G*Fh#tN?'!3]KRM,ciKk&h=0aE:V*="Q_Ne&,OhA1lR406EhL)):sXAaA0[ug"g)PsmBSG*k#J$))")C&+kr+KmIL<Brl.":L6#Q;:T1n?*25E!Zk"i,4uuBV3G4oRN56iFD+G.*U'<hlkt*7N8pVC@\#B7T'\f?qTfO:fq24F=Moh9cYOO9_Ug3_JW1$`&3Et?9G$Rf%HgIe&37c9!:H9)*A"58?9%Ib;S.e4E4@\m25^i]720%7~>endstream
|
||||
endobj
|
||||
14 0 obj
|
||||
<<
|
||||
/Filter [ /ASCII85Decode /FlateDecode ] /Length 249
|
||||
>>
|
||||
stream
|
||||
Gas2B8INBh&;BTK(%8)Q2+a"?*aMq\4K.?![FW8"e$p+akF8n$UkfH'`dI4a"80L!>ZbC60Zk4LLGE6]&5Z#qMYu/6Ns)3ldF]OCoN(cR,K(-$>Bb@Hb$Fm@B;e+Uh$?f>L6HTg25p.\@EBp=GIr"0+>.bL"Ab!5e$0H>2u,XrGS3n+\I^LXNi]kl12d&'Y,la0?'!jr\BDiS++DQrec,bZT6(6I/"hnM&*R'u?RM762ns?o2j@QC[f~>endstream
|
||||
endobj
|
||||
xref
|
||||
0 15
|
||||
0000000000 65535 f
|
||||
0000000073 00000 n
|
||||
0000000104 00000 n
|
||||
0000000211 00000 n
|
||||
0000003775 00000 n
|
||||
0000004033 00000 n
|
||||
0000007969 00000 n
|
||||
0000008227 00000 n
|
||||
0000012088 00000 n
|
||||
0000012346 00000 n
|
||||
0000012415 00000 n
|
||||
0000012712 00000 n
|
||||
0000012784 00000 n
|
||||
0000013177 00000 n
|
||||
0000013549 00000 n
|
||||
trailer
|
||||
<<
|
||||
/ID
|
||||
[<8efaabb9b9953607755769fba673a5bf><8efaabb9b9953607755769fba673a5bf>]
|
||||
% ReportLab generated PDF document -- digest (http://www.reportlab.com)
|
||||
|
||||
/Info 10 0 R
|
||||
/Root 9 0 R
|
||||
/Size 15
|
||||
>>
|
||||
startxref
|
||||
13889
|
||||
%%EOF
|
||||
@@ -0,0 +1,88 @@
|
||||
%PDF-1.3
|
||||
%東京 ReportLab Generated PDF document http://www.reportlab.com
|
||||
1 0 obj
|
||||
<<
|
||||
/F1 2 0 R
|
||||
>>
|
||||
endobj
|
||||
2 0 obj
|
||||
<<
|
||||
/BaseFont /Helvetica /Encoding /WinAnsiEncoding /Name /F1 /Subtype /Type1 /Type /Font
|
||||
>>
|
||||
endobj
|
||||
3 0 obj
|
||||
<<
|
||||
/BitsPerComponent 8 /ColorSpace /DeviceRGB /Filter [ /ASCII85Decode /FlateDecode ] /Height 80 /Length 4030 /Subtype /Image
|
||||
/Type /XObject /Width 400
|
||||
>>
|
||||
stream
|
||||
Gb"/jGApR4(<3O%]`e\8(`3O5&gh'9UL4Y/2^+lN@RPf5J1sk+NQ;)EK[=iEKil-Qc;6BQ-:LGDM9+$J',f.nAHF$>+;)0QS%B_Sc9&LGT7Z-dn+T=!BA[iT=_mDCmsXW7DDr_l&4-XGKFgHU+bUCn#U+j463n`f&4-XGKFgHU+bUCn#U+j463n`f&4-XGKFgHU+bUCn#U+j463n`f&4-XGKFgHU+bUCn#U+j463n`f&4-XGKFgHU+bUCn#U+j4;C$M\fk4SCf9@^_*csp?0E:tA:NU]#jiWj*eoUTRh)0!!`5]fOL5)!H>oFG9DVR2p+lUH`IrqKY)BXD"&`V6FB2;HMc(7';GJ3Ri.fjIH*^/fZ0Or*2Il>Q?21r^]?[Q:^FZY!?_$=YY:S0iN>)NU7,O;iAF)l:J:7R.&b*4>RZXupbn&1%rQH_7Vb5!5MER63=/j;H?_mC;n]Y$@HQVU)1)MO]*WmC]s?Bm'Eo$F't5&H1H?C?\PZOT*=k"OUBF^Z7('_KU*cW%)S*WMHX>B\o<I2:$`SBCZ%`^?q2(GB+]eubDeR*F:Vn)#4P":#10VP[s<B,;6r>eYSG,9p^:L_3O7EcSHah6r%e]`O=YOgf8dp9lDfH=\S3c8r1HgU==+3,mg#Rl>?_pYUI1'87(cTWVY:DUqL#0^"?4&$]I&kNC0`5JLC0C)C?hEoh,WmelnPO$)t=.f&emDnSAh"(gjeWnZaC>tjK_q=<Y;2^p2tgSS*;Q.a5>kWnKp@"uqSN>jhKjj-*a*6R/cmlaT]DPqQi#i_sfOu^V];l<C(qWb+)+X,K]55F9'T7-DN5.u%#%NNom8J@WaU$Ys4EcZ<p'k6@=H1U1Nf\"4dJ%Sa[;MZc\a,_<liPGacdte(N4#Ld&J7G4#qWW.gemV!7e*Ykse!lmlI6&MpTjGEYW""=AdA+bUmFtWqp@60F94#=0o#op<o8V#IIK09?Vu_>9H0E/'"u"N,<U8_fPGUCUB[J#'4)`ugV+[/l@hgImB]$Q&++O4II3)@-X=:iG5aNlrihrDt1>!G@S%i^qIJ7$3^\6AsEbQnB1qh&Ubj<rb+7oR9`51Wc:HuhG`mCl"YPN?sfuR?9TKbo,*Xsp8_GH7iUV'<j2Q"^R:?R!:`1LALo[6Bt.TJfM[:mr31c/1%Y[kk=hS"9r+/DM.<0XIVdF$A<$KL.*`4/*c#08_`F9A=5:/6g][Wq=O\\1b/3Y3fE$[VL2A^Dsg+d,P8$hWM:-_?FRhKnLi6AL;36.CAZjVR[)*:k&[VG3Q>UVt*h^pT^rHWE2?D;2KcRo]!jbUdu=J*Y^iO7NMpK&+/M?E#p8P[:&KcCI&WT>lj0hn!r'Ddu;@XC[FI)\EY_V0_d]7q%$Vah7c\%+%Y5*O#<]LtTjQE1fFkLQENDq6=GM6pd`rMIpbfSH&W.T3_Ol$<b!Gm_&PqlR5%C@JK*Ol!h2Imp7Ot.+d%:%3%3]MgkH[#Hbj+HhPP+H1'Iu;KCj>&_r2rYl%'!QJ.^n(hm(#X5AF,*LSQi,0+l+X_QCd-sXH3[JDTElP16rE1kDGP]cKR_8u#V]Y"6aMOg*%"icN@-gSBDCX=S#a-tPZm-JQ<M>nqsR%UpnUK?#%8%U]B4C%d;CZj!6'e<<Q+hZci=8cPWZ5+GD%id30L5h:gr7\PodKJi:2tP_K@\Qq+EV)a?Ul]h5_1DjI[kCso9J35<SVlOcqg71^,=fU%5!E:*Yls*-m+ARh)lulg3U$-Nba:,pm+SkJTsi93ruC,0)`CY;VB`@`"Ms*Z\d)j&;boQ1M6h3^7dju]MOg*%KdG1:BYpEDMN0Qp=1H1W:%UjRoYunt=j%eq(Yulu6#ZJULEF)i>=uH5kuE5#MQ?sdq?&m6=[kl8Tc=t$9jhO62tP_K@\QrVip+eZoCIBVVI.)e.%EMOIc+6PiK4ajcW;'k,f-)WZWXVHl1GC1G[.CW]@LAEm#oop\)2X5*2\@n4)j*X1-$m:bbYF=S8$HL
kmmrTSX5auF,e"0nU1VTA)3IC$D8-]"Dr>:d49"#,PSWbhql]_\g6^db0"bZp1dtL,AY,Hrd`/-m-ruOhB-14LQ;od4K*.0TNLGYVbWfTAdFC7kkt8JqJt7[c^Ql>:b?jjnDUs$l_[IMNbHS?"ibH+Q6-&jp=Nm3cV`0>dPSYcp:A>VG$`7Io@6oL.1Yru`9tj;1MbRC8OuD!T$h`FdRA1Y^%4"c]QM>g?3P;LgT"R'VH'Wq6-8?<USYnh?<PGk\JN:FmgdODqo^Y-[-q!`!^tV."8sD%$a=W62-nVR5dA`f_`,bBeeu3ao@Bu0gUBEIr:BIZ;o%Z0&e^p5.nho"?_Kdin'6=Th05;oSNV>N<>b"^YjQ;nmbG@*@ga"'Fml$&HKSjO@C'C@dp'!_Ffa>t?@e@l=1ULa\_XlA]C"jJ[EOb[EU<Ad0TKc?#SM"3X(Lm^XP:Ai-*%LAh7FIFeZ)VBqd3-*?CmNmaQd@AW)n[b?#",SQm'MSet=LGn*8H(EmU"ap$8frb$7:h(cmkPT'j1f=')P0OfDf-J!fq>LM]CT:sc(6S,=-4)`A+!]_LKEDDRhbe129S\o$XGLlIB_@GSM;0aiBoeY#3\$rnT",rr#-L8WThrVH3Wd>f5/m!I81VBY=an%\pL-#\@;=L#`iU,5`YFD7-fMIo%:V-`E0oiVNt"U>:mo'NpD2RN%p)fKE=Wh?#X9URXg?^k0Ij3g**Ork^G>.)NP0^Zo`HhZs,@E=NRrX<D_QiVi,Ql*A5m(A3^.6?$s:Tscqo1s(3kg6"-]oontmG$5h'tiM,?M3U6bKrXpDQZ+8OY)5\YPQ1RA1]df+(N?OL"VhJ@gqIkI.E-;o/3Qt1BZ.-^fcFk]\"'SkiU-Zp$1)VkDu]4'.6Q)Rpf>e7Rl\9C@L/t\;Z<&1+XN&%j.rRWDZ,P`5RWN'o-KfFuX0F4HJ=HdaGcm]mfpk]J#M4P%(H_.XIrZ=?Cg4eurF6+-eE^<^5E+044/<]H!b,^3/b-4Pk'SYL'H2BJX;H*1(;>.@2s+l4^Ld[GX<"m,,S8jnVW1p25U-)leT"(Rd*85eRMpFgl;HQ5B?dNZ>%sGi@`*P8u]+OF+6o8`@u[s,9A[)598-5To.NR<lP-If-_B`3<WT]66n!$kEk=@IN'deV@j'G*jECo*=n-CGGPSQS2^c0JU>Hgri>q[;4C6)9l.D<V/o>Yr;9tI4q7F2Vk[EZD9m8%k8qS#MUgZFAT/+X&c>ZakEt-!tIC@gq7p=,@:&"fuR?9TKhl$^"^,@C[&CBoS#=R9q$`uOH:#JSJ9<W:p15N\sY?eMAc-TfVUQCf[/aU7Q<+W&cX!Gg5QIU/.fucFm:(bne,@%k0<G*A&jUu)&=dF=S^2O(/T;7N!1"hV*7RChA=/aUheSb/jG#CKc/_f;X(jRo.*8Mg=Ih\Oh@5uQu:VmJml*(fb1#VW`5t>P:&Gm=.MEs`m+]EUC$a"&A7V[4&1(O+/U5tc%5l8bfgduMA7WcZLS/+L8KGT=PZX]or?B?!uj1:N/iq<5oqK2W)9>[j2YeFBE.YV?a;6JT9Q4NVaHp2:S]C*Y[u"D`JYPMk(OUXco6M[L(>@Y58HVo85]"55<n&T0HJOk!N-,QmPlFjWD]R'acb;QGO![lac[s)?XtR.?N'ae(!#%[/$O0^<q#:-:!!5#^Yc+q1Gi1@C=S]=(`^A8mFp['?G60sRthIolIN'VAu5FhcfZf/QG"1R`Q25(?i\KC4#^p(-nO-j%L*Xa(C,)gChDno6`*pRMWEi/GSmFTTK>IG+o_]R.!G(9muq-BFa8ETq]Ipg#U)AK2f>//o4/S?>UdM"B);/a.)]9Kd\TSI[X3Z=U2f//"o0>`-h5:!aHeD^"pG1@4>4BtrUnbQ\p&f=niu3sje\cKZtRhg)ldr?b(YV+-RL1_Dd!GjKFgHU+bUCn#U+j463n`f&4-XGKFgHU+bUCn#U+j463n`f&4-XGKFgHU+bUCn#U+j463n`f
&4-XGKFgHU+bUCn#U/qprr=Mu.$a~>endstream
|
||||
endobj
|
||||
4 0 obj
|
||||
<<
|
||||
/BitsPerComponent 8 /ColorSpace /DeviceRGB /Filter [ /ASCII85Decode /FlateDecode ] /Height 80 /Length 4649 /Subtype /Image
|
||||
/Type /XObject /Width 400
|
||||
>>
|
||||
stream
|
||||
Gb"/jGB=Qg)obtD(hlkN"f0(5:+-aT<2CgqGS',YK-J92</AC'U_i9g+@S\\U&raT$(ug!$4%UW-.5X!OX),AnUAfpn!Ei.ZhN>CmL:6LTAL_NG:0`'1[i!JbWE/;*Y0EI&4-XGKFgHU+bUCn#U+j463n`f&4-XGKFgHU+bUCn#U+j463n`f&4-XGKFgHU+bUCn#U+j463n`f&4-XGKFgHU+bUCn#U+j463n`f&4-XGKFgHU+bUCn#U+j4Eeer2aX7P8Gl-m;mrV!9$er./Dr&!ITgFGgA]g5nB?j\gC<`5,n"5+/EDb62SNDgA(+`SG0AE<rQgcQJG2U/e+R8qjL(>AddU%7a,tE0=*^(Ec[;Xqd5T\EOW]FtK0RnAjQS4C.$PtF;o^lOY2JjA(q!>?5rnj<G\:)&,dhUp&iW]f.:o[KoXBDknr:%Vf^;G^:].Qgi,t!Cq7>hqh;qnpMFKAOW-;:r<^AH9g>e+lD6ps06kbI-F(^N'<gi-Gcfoqm`D<`e/XBDl/[X1QKi:_Nlme*"2?`R7BfZQ0Yn][CW7>_eACt5OcbEjk(IV6iiD9J4s/kRp`OHAta@uhcDp$1sUc^m;JVdmc-<LpXGp$)c(Hk;,Z7Z;:iTQi5(W(^OWj5YQ"X&GpV4SR^\.hEaCrqG<">P%b#odXg*ftJs\9L;@@2JoU%\UmIb7]_:X"BnAg8PVo7"#l=q;QoL`\om<c?*C)#T0=:[KaS]?>+g+\Q7Q1-1hhO`F6CiVjuIe^m/?\904&j@l.#kH4Fk1D;,Pn,s$FCkgKq>QM?oqqM4!SS5Q<QQX'N=qdO.h^m&2`sM\[n]^V[k)d\U9,\%pa.@q.TDm$L"eIRL!nb*BjG)?7Bqo-V?%d\TU3:EfTg^\mXEJ,E_-&<#c6bEkHfgiJN=njqoeR5#87\R3+#(G8t>*fUYEeuW#*!X5lBcX+"oeKmkS@.(h*0mT1nrUS,bYIsE5U03`ScpJ<e8mf>^IB)[\rqY`>H2da;>5Fp[LW%H??G5X>9I<PAY[@K\1i1gkRchBYhS[+bJ,aqhT)A3+5BlPN)'4Kg>8jeZbo1@@Rl5+ugph@\]QnSZaRRJ1c_/-=odWtJ379>AFMnp+G4!`KBR8dD=K*Pa.$qbpOj^:tQl,H?<^#prXH0"q[r1$MSsu`#-;Bq^d`.=is1np^^aDK96L*.(Hg9*0T*>V9Q`\n^`L]5>i\sQ)AOEM\?DBt!8#7Y0SiiDsB23^-W`?+JZ!P-=ienX,kdX6M.]EUBN#=ETZtP"4E0#kk.nX&MZXupQJK6dnON]"CP^nh:H5s_#io8rs2G=?r4"L]CJn3h!L8UnMn3&`J&s32P.9WsPPW!4%+F!3AJ[djYeuWV-Z>A4"8En[*=5Y$_-RU/bi:`*I1]ICNmogd^&>KHo<_nI&aUW0Z4F*r.YHH(N0/])H@AVt,Qj$t>c%"]+(Grh2@hqR\Kp>)Z"qC's<2ica_Zur<m^u*Y/Q8MTPR<N]nt9$(DsPuVc&uYY%\d#Y-N4c2<Xd/X\=C<^SfiC5&Xk4B%BU5u_1MuTN[G%fFL=0<a#bps%YP9MDr1EZ\)4&m]`M"L$&W*k$l5Y3iR$k4ldeYQkic^(a@KBro!2iME/?\=G3i$/_H*tt(cQ?&U`e+$N@567Stob?D:u4k4BLb^V?V:WLl&4sV)/U+,U1BRd9b&3,)e"jWF"OBU.>-Q2/AL<_pP5LNT<5uc'*A?hCXJ.kFHh"?b\4MT7?jNW:Z<';^CI[++A`7OE8^;3KaGt`tU2'.D<$$(8lJ$qXeKD*.IYNhqsqO(qjtQ7I&aVcqsCYCu`NpZ:0D]`fV90Y<_!ZI:XrsMu4G<QsOrnk)*8Si:<@U^<s54-76lfcp@@u'Ae03>pR/S`Z+]D^@_i>hXTXH<?g3fN.t`mHFrIk+[^u,Nj?A=n'Rm8Z?>Qg<A$#ni\E8Ed[UQk3^PU.?M3aB)jcO&2:>*\=T.d1+*ZF@"?A5D+<XGJ21+oBV+^@U
l)1.3B,E[O-k`eb[<f-3'8ScY12"q)NH?`MY$JYo98TG?o]]l2K>E8\PZb2+R`274i=_&+Z-TjqgJjbPoZE^@ah=VW647kCR58IohRMC(*CR(BT4n*g4pe*Q*FX(Ze.='Up?^1I6=]+CR-!_%#,)!P&(,gr)C9gt'd@U<[_Mh<bGWalRRZ:i#nmA)7@_#-gU80l>7[.BXW9QFj@HU`+`>?^il-h`Cm[n-7Qu"^R/N<pp-\T5XtaG+fY:'fp1.8-b7N1^\)2X5)8a8-&-1WqRO?!W,`XY#S-!.OM%I+5h25bRQ9Y.]eP[9P9!<'"`J%WLB$Hd$-E,<4N*a'd,.Y1#h7D;b:aKfuJfOZ2&A>q)/q=rVY'\h65$\b8K9Z?3pKR74:;W&Vrb0'RUnf8X,*n!Vo@(0TeZW?;S5&\%q=Edol##.]0ta!D>-S?B@9uVpR#`p(A3D=o2%[df2_8e$4'f1)NRB<l#/epT=HLX<p$1'cRu'Req!dUQ^M`WY%C7F7G4"#B&i77,rq+Z8\EqkQUW6mqn`1?2:<99X*Ib9nLECu$5p=q2dRJff._W-+(3b(Yrlpuq[pdum$V%>TH'-m?jdWZp&qR5i[E?3(7'GO%!UQIuh95N^kDD#a!lS/(+69W4/mZ%*VQC%uHIj\7D6iFIm5:M9YL]ma?a!d!FZRM2L8hJQ&\WdViAY@$CLtG[U/u!RSi'E`c5,!URlB=%P(1u[;9l7Q6M'9A^A>u+KnAe0>frXsk/mMom5)EOm'BeFBCOfX;l@XR`#.>7PVnNg$Ar0C2iBc2!hXl2M:?(bVG3Z?oZE^@ah:&r%'`jC6A5c$G<'9m%\d#1i<%XtiOYApFZ#ngUWnJtG/]%:$X.K=GW,cd(,X]VC#UfU)`BPA]i2*s(/L,')$%Fg"HB/6o9V+;QA(poeWD(H%'PK'i*'^M$-GEjj5ZsajEKE@7)e`B&V9IbT7,k5*08(&Z-#ERJlX!nJ6-,qOtU0+)>0FG+$Y3Z0!U5:(0dnE2>ldh:HuO;nY0Pm-Nb)*J,HR6XB5,?i``NJa]m"YMA3m7nYoRq4LCiWU8$(:YI&.rTieR/pt(60)sl<&(qm64bGjc,Wid`T\j#uS,;!3t#ZWX@Qb]H6m+0<mq"pT?/gh,$$MM]52_Qf@HL!0M.BgGYRaS8&f<<*5L9LAQhWCm'9YO$4=E`:QN3t-8WYjUQ_:E*b:=22WPC.B]jq;$$D\t?-7XMIQbD+4-gUCs0@Q;HpGa*a;+>:1;qsHNtRn/+a^TqR>@.Xea;s?o];:@&[UK4L#Bgp,F,RsE=He%6J^-n8]*f2^ig*%<Ho%<Bl^t<YGaN-n_khWk[Q9Js,*5hZBeUD3LcIOkYKViFO>WUU9]mE:;]ttcMZNm[=\K]o`.A)aeHbb.4k%p-knF1D'??PP_$'uAW<n&=urVQ?PaH<5kR52PVqQ%ASi>.YfGi'1*4F,A`4oAd^jGb*;,,KJMg7lSZNi\i-]QnS9fD@keR"@(a"19:*>e>/RJdS>U2U)kn??q^kNZ$^Bf?AOeYG1M#F69N)YKHC81t4$<='G]b*BVjAL@1)g&>WXcmq%"$FN(@d[u(<&)jh7^9q42j;/'(@*qZ9#1_hV7kg;bg1;[eiWMc>NH^cj+,)L?e-tC8Ul?"$BW,("fP"k2kTgOS\'#P]UR$afb6UO5'fV1fm!:>pa-mFu0fN>Vk8V,EUiAp`*k=7lnd6j#G?@gXj^]4:[lhRS7^\h!<=Ohc'UIUA;HcM*b-SH\Up<+TaZX2<A99=J]8a]hLl'35R1/-l(io8r/DFn:Ul4p7&\[%C"A]pCUpdeZ(I(:I`"K>JrHeBK!>nJAaY?kLL/gnU,#h^q""dFhRG&6H/a5UeX7Z<FFkZgN[Qi[]b[qUYmnY9pRZKapTW57tpDaub.C],WPGf!!siWZ[$Gc?(sK1T!bnhJ6m\oc&$UX6PlA0M$=gp2k0U'g^N_FXLaSPN%91<W='3St=UOZ_!6o;:`G7>p6g
FeM-UOA/J@Q7cIs89qr*N`gtc.gV7W@Pgu35H&1[il-gG6ps9s12"o1k*p:dX^3kuci>4)9#`+:[3-;KGd.!54*Cm-Y<5R+fnq"UN/<B'K;7<YI,ljoR\iis3%@XTHKF\iIdi7K^8P2@-It.qj05blIf9,65(+?-)OUrW];+]CXb/IHrbtO&cAE>ejI8lFT$382/`"$_QOh#2/DLjqoXT1J5gUR^:Q73Z/%*Q8G$9Be]Pl[k/.(FEb(9d)@N&6Z]ZhSo`8H7)_IDWLQ('dTVL5SII6X+!=b>6UY]Ahta^E[MKH*pf9IX>_4DG/tD:u3@Q=+'LrO%cbH8T["^qG*h2K%:eK3(85o6G^0<BC>e=,qT0_iZI$F6Ci^qWb,K`6fP]$4[.MF'Y6B(&,:Gh,TCP2$t[aqq^Lo&44Hf_5)pDh0RQRF/\'r*qi?.M@`+%+QqN0=0@MG=&Q81PV9AI^:8FXigm1m+bV6r>dtn(*3_sE%hF_WLlbEq6:+#iY$HCPCI\XR.7d!#(c,btV+R!a:h@tE4Z#"&=0Gs$*@i:d&4-XGKFgHU+bUCn#U+j463n`f&4-XGKFgHU+bUCn#U+j463n`f&4-XGKFgHU+bUCn#U+j463n`f&4-XGKFgHU+lr@e*ukCHmf~>endstream
|
||||
endobj
|
||||
5 0 obj
|
||||
<<
|
||||
/Contents 9 0 R /MediaBox [ 0 0 612 792 ] /Parent 8 0 R /Resources <<
|
||||
/Font 1 0 R /ProcSet [ /PDF /Text /ImageB /ImageC /ImageI ] /XObject <<
|
||||
/FormXob.41b05a9cf8679f0fe6e7c30c9462b767 3 0 R /FormXob.94284ebb61fac7951963d5746d1b193a 4 0 R
|
||||
>>
|
||||
>> /Rotate 0 /Trans <<
|
||||
|
||||
>>
|
||||
/Type /Page
|
||||
>>
|
||||
endobj
|
||||
6 0 obj
|
||||
<<
|
||||
/PageMode /UseNone /Pages 8 0 R /Type /Catalog
|
||||
>>
|
||||
endobj
|
||||
7 0 obj
|
||||
<<
|
||||
/Author (anonymous) /CreationDate (D:20260126172022+01'00') /Creator (ReportLab PDF Library - www.reportlab.com) /Keywords () /ModDate (D:20260126172022+01'00') /Producer (ReportLab PDF Library - www.reportlab.com)
|
||||
/Subject (unspecified) /Title (untitled) /Trapped /False
|
||||
>>
|
||||
endobj
|
||||
8 0 obj
|
||||
<<
|
||||
/Count 1 /Kids [ 5 0 R ] /Type /Pages
|
||||
>>
|
||||
endobj
|
||||
9 0 obj
|
||||
<<
|
||||
/Filter [ /ASCII85Decode /FlateDecode ] /Length 300
|
||||
>>
|
||||
stream
|
||||
Gas3-b=]`-&-h'@TAj4XMk6`hV:j"r+Qu/5_JPH2[*jl@3?0-u[(9'GR:-/*/ft>A_=nj;i7d2;EpsDoJr<OBhVlHiq4E/El7+06*H?(h_eGnqiS:>Dgn0>N^CGqOd65m'$2XdN[8"CN<R^<p;O;.QTL>"4'o-s=`lHc!JpSi8$*d@]6l&@V%Q+V`W6/nPEL_rB?OF1iZbk.;Ju<];RLo@-9lO$dQ,9&`I`%EM@\dBr0Lf$$+R^&+/ncK?;0=7o:`];ceF"uKA7ETdrT"0YNT=QC"`>/@%I83@M@]K&@Nk~>endstream
|
||||
endobj
|
||||
xref
|
||||
0 10
|
||||
0000000000 65535 f
|
||||
0000000073 00000 n
|
||||
0000000104 00000 n
|
||||
0000000211 00000 n
|
||||
0000004431 00000 n
|
||||
0000009270 00000 n
|
||||
0000009574 00000 n
|
||||
0000009642 00000 n
|
||||
0000009938 00000 n
|
||||
0000009997 00000 n
|
||||
trailer
|
||||
<<
|
||||
/ID
|
||||
[<60f7c7338a7d1cfd54f86e6a06e41602><60f7c7338a7d1cfd54f86e6a06e41602>]
|
||||
% ReportLab generated PDF document -- digest (http://www.reportlab.com)
|
||||
|
||||
/Info 7 0 R
|
||||
/Root 6 0 R
|
||||
/Size 10
|
||||
>>
|
||||
startxref
|
||||
10387
|
||||
%%EOF
|
||||
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
@@ -0,0 +1,223 @@
|
||||
"""
|
||||
Unit tests for DocxConverterWithOCR.
|
||||
|
||||
For each DOCX test file: convert with a mock OCR service then compare the
|
||||
full output string against the expected snapshot.
|
||||
|
||||
OCR block format used by the converter:
|
||||
*[Image OCR]
|
||||
MOCK_OCR_TEXT_12345
|
||||
[End OCR]*
|
||||
"""
|
||||
|
||||
import sys
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
import pytest
|
||||
|
||||
sys.path.insert(0, str(Path(__file__).parent.parent / "src"))
|
||||
|
||||
from markitdown_ocr._ocr_service import OCRResult # noqa: E402
|
||||
from markitdown_ocr._docx_converter_with_ocr import ( # noqa: E402
|
||||
DocxConverterWithOCR,
|
||||
)
|
||||
from markitdown import StreamInfo # noqa: E402
|
||||
|
||||
TEST_DATA_DIR = Path(__file__).parent / "ocr_test_data"
|
||||
|
||||
_MOCK_TEXT = "MOCK_OCR_TEXT_12345"
|
||||
|
||||
|
||||
class MockOCRService:
    """Stand-in OCR backend that answers every request with the same text."""

    def extract_text(
        self,  # noqa: ANN101
        image_stream: Any,
        **kwargs: Any,
    ) -> OCRResult:
        """Return a fixed OCRResult; ``image_stream`` and ``kwargs`` are unused."""
        canned = OCRResult(text=_MOCK_TEXT, backend_used="mock")
        return canned
|
||||
|
||||
|
||||
@pytest.fixture(scope="module")
def svc() -> MockOCRService:
    """Module-scoped fixture supplying a MockOCRService instance."""
    service = MockOCRService()
    return service
|
||||
|
||||
|
||||
def _convert(filename: str, ocr_service: MockOCRService) -> str:
    """Convert a DOCX file from TEST_DATA_DIR and return the resulting text.

    The calling test is skipped when the file is not present on disk.
    """
    path = TEST_DATA_DIR / filename
    if not path.exists():
        pytest.skip(f"Test file not found: {path}")

    converter = DocxConverterWithOCR()
    with open(path, "rb") as handle:
        result = converter.convert(
            handle,
            StreamInfo(extension=".docx"),
            ocr_service=ocr_service,
        )
        return result.text_content
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# docx_image_start.docx
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def test_docx_image_start(svc: MockOCRService) -> None:
    """Full-output snapshot: the OCR block sits right after the title line."""
    snapshot = "".join(
        (
            "Document with Image at Start\n\n",
            "*[Image OCR]\nMOCK_OCR_TEXT_12345\n[End OCR]*\n\n",
            "This is the main content after the header image.\n\n",
            "More text content here.",
        )
    )
    actual = _convert("docx_image_start.docx", svc)
    assert actual == snapshot
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# docx_image_middle.docx
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def test_docx_image_middle(svc: MockOCRService) -> None:
    """Full-output snapshot: the OCR block sits between the two headings."""
    snapshot = "".join(
        (
            "# Introduction\n\n",
            "This is the introduction section.\n\n",
            "We will see an image below.\n\n",
            "*[Image OCR]\nMOCK_OCR_TEXT_12345\n[End OCR]*\n\n",
            "# Analysis\n\n",
            "This section comes after the image.",
        )
    )
    actual = _convert("docx_image_middle.docx", svc)
    assert actual == snapshot
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# docx_image_end.docx
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def test_docx_image_end(svc: MockOCRService) -> None:
    """Full-output snapshot: the OCR block is the last thing in the output."""
    snapshot = "".join(
        (
            "Report\n\n",
            "Main findings of the report.\n\n",
            "Details and analysis.\n\n",
            "Recommendations.\n\n",
            "*[Image OCR]\nMOCK_OCR_TEXT_12345\n[End OCR]*",
        )
    )
    actual = _convert("docx_image_end.docx", svc)
    assert actual == snapshot
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# docx_multiple_images.docx
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def test_docx_multiple_images(svc: MockOCRService) -> None:
    """Full-output snapshot: two OCR blocks, one after each section line."""
    snapshot = "".join(
        (
            "Multi-Image Document\n\n",
            "First section\n\n",
            "*[Image OCR]\nMOCK_OCR_TEXT_12345\n[End OCR]*\n\n",
            "Second section with another image\n\n",
            "*[Image OCR]\nMOCK_OCR_TEXT_12345\n[End OCR]*\n\n",
            "Conclusion",
        )
    )
    actual = _convert("docx_multiple_images.docx", svc)
    assert actual == snapshot
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# docx_multipage.docx
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def test_docx_multipage(svc: MockOCRService) -> None:
    """Full-output snapshot for the three-section document with three OCR blocks."""
    page_one = (
        "# Page 1 - Mixed Content\n\n",
        "This is the first paragraph on page 1.\n\n",
        "BEFORE IMAGE: Important content appears here.\n\n",
        "*[Image OCR]\nMOCK_OCR_TEXT_12345\n[End OCR]*\n\n",
        "AFTER IMAGE: This content follows the image.\n\n",
        "More text on page 1.\n\n",
    )
    page_two = (
        "# Page 2 - Image at End\n\n",
        "Content on page 2.\n\n",
        "Multiple paragraphs of text.\n\n",
        "Building up to the image...\n\n",
        "Final paragraph before image.\n\n",
        "*[Image OCR]\nMOCK_OCR_TEXT_12345\n[End OCR]*\n\n",
    )
    page_three = (
        "# Page 3 - Image at Start\n\n",
        "*[Image OCR]\nMOCK_OCR_TEXT_12345\n[End OCR]*\n\n",
        "Content that follows the header image.\n\n",
        "AFTER IMAGE: This text is after the image.",
    )
    snapshot = "".join(page_one + page_two + page_three)
    actual = _convert("docx_multipage.docx", svc)
    assert actual == snapshot
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# docx_complex_layout.docx
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def test_docx_complex_layout(svc: MockOCRService) -> None:
    """Full-output snapshot: a Markdown table followed by a trailing OCR block."""
    table_rows = (
        "| | |\n",
        "| --- | --- |\n",
        "| Feature | Status |\n",
        "| Authentication | Active |\n",
        "| Encryption | Enabled |\n\n",
    )
    snapshot = "".join(
        (
            "Complex Document\n\n",
            *table_rows,
            "Security notice:\n\n",
            "*[Image OCR]\nMOCK_OCR_TEXT_12345\n[End OCR]*",
        )
    )
    actual = _convert("docx_complex_layout.docx", svc)
    assert actual == snapshot
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# _inject_placeholders — internal unit tests (no file I/O)
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def test_inject_placeholders_single_image() -> None:
    """One <img> tag and one OCR entry: tag gone, token 0 present, text returned."""
    source_html = "<p>Before</p><img src='x.png'/><p>After</p>"
    ocr_by_rid = {"rId1": "TEXT"}

    result_html, texts = DocxConverterWithOCR()._inject_placeholders(
        source_html, ocr_by_rid
    )

    assert "<img" not in result_html
    assert "MARKITDOWNOCRBLOCK0" in result_html
    assert texts == ["TEXT"]
|
||||
|
||||
|
||||
def test_inject_placeholders_two_images_sequential_tokens() -> None:
    """Two images yield tokens 0 and 1, with token 0 appearing first."""
    first_token = "MARKITDOWNOCRBLOCK0"
    second_token = "MARKITDOWNOCRBLOCK1"
    source_html = "<img src='a.png'/><p>Mid</p><img src='b.png'/>"
    ocr_by_rid = {"rId1": "FIRST", "rId2": "SECOND"}

    result_html, texts = DocxConverterWithOCR()._inject_placeholders(
        source_html, ocr_by_rid
    )

    assert first_token in result_html
    assert second_token in result_html
    # Tokens must keep document order.
    assert result_html.index(first_token) < result_html.index(second_token)
    assert len(texts) == 2
|
||||
|
||||
|
||||
def test_inject_placeholders_no_img_tag_appends_at_end() -> None:
    """OCR text with no <img> tag in the HTML still produces token 0.

    Only the token's presence and the returned text list are asserted here.
    """
    source_html = "<p>No images</p>"
    ocr_by_rid = {"rId1": "ORPHAN"}

    result_html, texts = DocxConverterWithOCR()._inject_placeholders(
        source_html, ocr_by_rid
    )

    assert "MARKITDOWNOCRBLOCK0" in result_html
    assert texts == ["ORPHAN"]
|
||||
|
||||
|
||||
def test_inject_placeholders_empty_map_leaves_html_unchanged() -> None:
    """An empty OCR map returns the HTML untouched and an empty text list."""
    source_html = "<p>Content</p><img src='pic.jpg'/>"

    result_html, texts = DocxConverterWithOCR()._inject_placeholders(source_html, {})

    assert result_html == source_html
    assert texts == []
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# No OCR service — no OCR tags emitted
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def test_docx_no_ocr_service_no_tags() -> None:
    """Converting without an ocr_service argument emits no OCR markers."""
    path = TEST_DATA_DIR / "docx_image_middle.docx"
    if not path.exists():
        pytest.skip(f"Test file not found: {path}")

    converter = DocxConverterWithOCR()
    with open(path, "rb") as handle:
        result = converter.convert(handle, StreamInfo(extension=".docx"))
        md = result.text_content

    for marker in ("*[Image OCR]", "[End OCR]*"):
        assert marker not in md
|
||||
@@ -0,0 +1,234 @@
|
||||
"""
|
||||
Unit tests for PdfConverterWithOCR.
|
||||
|
||||
For each PDF test file: convert with a mock OCR service then compare the
|
||||
full output string against the expected snapshot.
|
||||
|
||||
OCR block format used by the converter:
|
||||
*[Image OCR]
|
||||
MOCK_OCR_TEXT_12345
|
||||
[End OCR]*
|
||||
"""
|
||||
|
||||
import io
|
||||
import sys
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
from unittest.mock import MagicMock, patch
|
||||
|
||||
import pytest
|
||||
|
||||
sys.path.insert(0, str(Path(__file__).parent.parent / "src"))
|
||||
|
||||
from markitdown_ocr._ocr_service import OCRResult # noqa: E402
|
||||
from markitdown_ocr._pdf_converter_with_ocr import ( # noqa: E402
|
||||
PdfConverterWithOCR,
|
||||
)
|
||||
from markitdown import StreamInfo # noqa: E402
|
||||
|
||||
TEST_DATA_DIR = Path(__file__).parent / "ocr_test_data"
|
||||
|
||||
_MOCK_TEXT = "MOCK_OCR_TEXT_12345"
|
||||
_OCR_BLOCK = f"*[Image OCR]\n{_MOCK_TEXT}\n[End OCR]*"
|
||||
_PAGE_1_SCANNED = f"## Page 1\n\n\n\n\n{_OCR_BLOCK}"
|
||||
|
||||
|
||||
class MockOCRService:
    """Stand-in OCR backend that answers every request with the same text."""

    def extract_text(
        self,  # noqa: ANN101
        image_stream: Any,
        **kwargs: Any,
    ) -> OCRResult:
        """Return a fixed OCRResult; ``image_stream`` and ``kwargs`` are unused."""
        canned = OCRResult(text=_MOCK_TEXT, backend_used="mock")
        return canned
|
||||
|
||||
|
||||
@pytest.fixture(scope="module")
def svc() -> MockOCRService:
    """Module-scoped fixture supplying a MockOCRService instance."""
    service = MockOCRService()
    return service
|
||||
|
||||
|
||||
def _convert(filename: str, ocr_service: MockOCRService) -> str:
    """Convert a PDF file from TEST_DATA_DIR and return the resulting text.

    The calling test is skipped when the file is not present on disk.
    """
    path = TEST_DATA_DIR / filename
    if not path.exists():
        pytest.skip(f"Test file not found: {path}")

    converter = PdfConverterWithOCR()
    with open(path, "rb") as handle:
        result = converter.convert(
            handle,
            StreamInfo(extension=".pdf"),
            ocr_service=ocr_service,
        )
        return result.text_content
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# pdf_image_start.pdf
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def test_pdf_image_start(svc: MockOCRService) -> None:
    """Full-output snapshot: the OCR block comes straight after the page heading."""
    snapshot = "".join(
        (
            "## Page 1\n\n\n\n\n",
            "*[Image OCR]\nMOCK_OCR_TEXT_12345\n[End OCR]*\n\n\n",
            "This is text BEFORE the image.\n\n",
            "The image should appear above this text.\n\n",
            "This is more content after the image.",
        )
    )
    actual = _convert("pdf_image_start.pdf", svc)
    assert actual == snapshot
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# pdf_image_middle.pdf
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def test_pdf_image_middle(svc: MockOCRService) -> None:
    """Full-output snapshot: the OCR block sits between the two sections."""
    snapshot = "".join(
        (
            "## Page 1\n\n\n",
            "Section 1: Introduction\n\n",
            "This document contains an image in the middle.\n\n",
            "Here is some introductory text.\n\n\n\n",
            "*[Image OCR]\nMOCK_OCR_TEXT_12345\n[End OCR]*\n\n\n",
            "Section 2: Details\n\n",
            "This text appears AFTER the image.",
        )
    )
    actual = _convert("pdf_image_middle.pdf", svc)
    assert actual == snapshot
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# pdf_image_end.pdf
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def test_pdf_image_end(svc: MockOCRService) -> None:
    """Full-output snapshot: the OCR block is the last thing on the page."""
    snapshot = "".join(
        (
            "## Page 1\n\n\n",
            "Main Content\n\n",
            "This is the main text content.\n\n",
            "The image will appear at the end.\n\n",
            "Keep reading...\n\n\n\n",
            "*[Image OCR]\nMOCK_OCR_TEXT_12345\n[End OCR]*",
        )
    )
    actual = _convert("pdf_image_end.pdf", svc)
    assert actual == snapshot
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# pdf_multiple_images.pdf
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def test_pdf_multiple_images(svc: MockOCRService) -> None:
    """Full-output snapshot: two OCR blocks separated by a line of text."""
    snapshot = "".join(
        (
            "## Page 1\n\n\n",
            "Document with Multiple Images\n\n\n\n",
            "*[Image OCR]\nMOCK_OCR_TEXT_12345\n[End OCR]*\n\n\n",
            "Text between first and second image.\n\n\n\n",
            "*[Image OCR]\nMOCK_OCR_TEXT_12345\n[End OCR]*\n\n\n",
            "Final text after all images.",
        )
    )
    actual = _convert("pdf_multiple_images.pdf", svc)
    assert actual == snapshot
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# pdf_complex_layout.pdf
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def test_pdf_complex_layout(svc: MockOCRService) -> None:
    """Full-output snapshot: the OCR block lands between the table text rows."""
    snapshot = "".join(
        (
            "## Page 1\n\n\n",
            "Complex Layout Document\n\n",
            "Table:\n\n",
            "ItemQuantity\n\n\n\n",
            "*[Image OCR]\nMOCK_OCR_TEXT_12345\n[End OCR]*\n\n\n",
            "Widget A5",
        )
    )
    actual = _convert("pdf_complex_layout.pdf", svc)
    assert actual == snapshot
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# pdf_multipage.pdf — pdfplumber/pdfminer fail (EOF); PyMuPDF fallback used
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def test_pdf_multipage(svc: MockOCRService) -> None:
    """Full-output snapshot: each of the three pages is a single OCR block.

    pdfplumber cannot open this file (Unexpected EOF), so _ocr_full_pages
    falls back to PyMuPDF for page rendering.
    """
    per_page = [
        f"## Page 1\n\n\n{_OCR_BLOCK}\n\n\n",
        f"## Page 2\n\n\n{_OCR_BLOCK}\n\n\n",
        f"## Page 3\n\n\n{_OCR_BLOCK}",
    ]
    actual = _convert("pdf_multipage.pdf", svc)
    assert actual == "".join(per_page)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# pdf_scanned_*.pdf — raster-only pages → full-page OCR
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def test_pdf_scanned_invoice(svc: MockOCRService) -> None:
    """The scanned invoice converts to exactly the single-page OCR snapshot."""
    markdown = _convert("pdf_scanned_invoice.pdf", svc)
    assert markdown == _PAGE_1_SCANNED
|
||||
|
||||
|
||||
def test_pdf_scanned_meeting_minutes(svc: MockOCRService) -> None:
    """The scanned minutes convert to exactly the single-page OCR snapshot."""
    markdown = _convert("pdf_scanned_meeting_minutes.pdf", svc)
    assert markdown == _PAGE_1_SCANNED
|
||||
|
||||
|
||||
def test_pdf_scanned_minimal(svc: MockOCRService) -> None:
    """The minimal scanned PDF converts to exactly the single-page OCR snapshot."""
    markdown = _convert("pdf_scanned_minimal.pdf", svc)
    assert markdown == _PAGE_1_SCANNED
|
||||
|
||||
|
||||
def test_pdf_scanned_sales_report(svc: MockOCRService) -> None:
    """The scanned sales report converts to exactly the single-page OCR snapshot."""
    markdown = _convert("pdf_scanned_sales_report.pdf", svc)
    assert markdown == _PAGE_1_SCANNED
|
||||
|
||||
|
||||
def test_pdf_scanned_report(svc: MockOCRService) -> None:
    """Full-output snapshot for the three-page scanned report."""
    per_page = [
        f"{_PAGE_1_SCANNED}\n\n\n\n",
        f"## Page 2\n\n\n\n\n{_OCR_BLOCK}\n\n\n\n",
        f"## Page 3\n\n\n\n\n{_OCR_BLOCK}",
    ]
    actual = _convert("pdf_scanned_report.pdf", svc)
    assert actual == "".join(per_page)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Scanned PDF fallback path (pdfplumber finds no text → full-page OCR)
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def test_pdf_scanned_fallback_format(svc: MockOCRService) -> None:
    """_ocr_full_pages emits *[Image OCR]...[End OCR]* for each page.

    pdfplumber.open is patched to hand back a mock PDF with one mock page
    (page_number 1), so the output is checked against a one-page snapshot.
    """
    path = TEST_DATA_DIR / "pdf_image_start.pdf"
    if not path.exists():
        pytest.skip(f"Test file not found: {path}")

    converter = PdfConverterWithOCR()

    fake_page = MagicMock(page_number=1)
    fake_pdf = MagicMock(pages=[fake_page])
    # The mock PDF is its own context-manager result.
    fake_pdf.__enter__.return_value = fake_pdf

    with patch("pdfplumber.open", return_value=fake_pdf):
        with open(path, "rb") as handle:
            pdf_stream = io.BytesIO(handle.read())
            md = converter._ocr_full_pages(pdf_stream, svc)

    expected = (
        "## Page 1\n\n\n"
        "*[Image OCR]\nMOCK_OCR_TEXT_12345\n[End OCR]*"
    )
    assert (
        md == expected
    ), f"_ocr_full_pages must produce:\n{expected!r}\nActual:\n{md!r}"
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# No OCR service — no OCR tags emitted
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def test_pdf_no_ocr_service_no_tags() -> None:
    """Converting without an ocr_service argument emits no OCR markers."""
    path = TEST_DATA_DIR / "pdf_image_middle.pdf"
    if not path.exists():
        pytest.skip(f"Test file not found: {path}")

    converter = PdfConverterWithOCR()
    with open(path, "rb") as handle:
        result = converter.convert(handle, StreamInfo(extension=".pdf"))
        md = result.text_content

    for marker in ("*[Image OCR]", "[End OCR]*"):
        assert marker not in md
|
||||
@@ -0,0 +1,148 @@
|
||||
"""
|
||||
Unit tests for PptxConverterWithOCR.
|
||||
|
||||
For each PPTX test file: convert with a mock OCR service then compare the
|
||||
full output string against the expected snapshot.
|
||||
|
||||
OCR block format used by the converter:
|
||||
*[Image OCR]
|
||||
MOCK_OCR_TEXT_12345
|
||||
[End OCR]*
|
||||
|
||||
Note: PPTX slide text uses literal backslash-n (\\n) sequences from the
|
||||
underlying PPTX converter template; OCR blocks use real newlines.
|
||||
"""
|
||||
|
||||
import sys
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
import pytest
|
||||
|
||||
sys.path.insert(0, str(Path(__file__).parent.parent / "src"))
|
||||
|
||||
from markitdown_ocr._ocr_service import OCRResult # noqa: E402
|
||||
from markitdown_ocr._pptx_converter_with_ocr import ( # noqa: E402
|
||||
PptxConverterWithOCR,
|
||||
)
|
||||
from markitdown import StreamInfo # noqa: E402
|
||||
|
||||
TEST_DATA_DIR = Path(__file__).parent / "ocr_test_data"
|
||||
|
||||
_MOCK_TEXT = "MOCK_OCR_TEXT_12345"
|
||||
_OCR_BLOCK = f"*[Image OCR]\n{_MOCK_TEXT}\n[End OCR]*"
|
||||
|
||||
|
||||
class MockOCRService:
    """Stand-in OCR backend that answers every request with the same text."""

    def extract_text(
        self,  # noqa: ANN101
        image_stream: Any,
        **kwargs: Any,
    ) -> OCRResult:
        """Return a fixed OCRResult; ``image_stream`` and ``kwargs`` are unused."""
        canned = OCRResult(text=_MOCK_TEXT, backend_used="mock")
        return canned
|
||||
|
||||
|
||||
@pytest.fixture(scope="module")
def svc() -> MockOCRService:
    """Module-scoped fixture supplying a MockOCRService instance."""
    service = MockOCRService()
    return service
|
||||
|
||||
|
||||
def _convert(filename: str, ocr_service: MockOCRService) -> str:
    """Convert a PPTX file from TEST_DATA_DIR and return the resulting text.

    The calling test is skipped when the file is not present on disk.
    """
    path = TEST_DATA_DIR / filename
    if not path.exists():
        pytest.skip(f"Test file not found: {path}")

    converter = PptxConverterWithOCR()
    with open(path, "rb") as handle:
        result = converter.convert(
            handle,
            StreamInfo(extension=".pptx"),
            ocr_service=ocr_service,
        )
        return result.text_content
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# pptx_image_start.pptx
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def test_pptx_image_start(svc: MockOCRService) -> None:
    """Full-output snapshot. Slide 1: title "Welcome" followed by an image."""
    snapshot = "".join(
        (
            "\\n\\n<!-- Slide number: 1 -->\\n# Welcome\\n\\n",
            "\n*[Image OCR]\nMOCK_OCR_TEXT_12345\n[End OCR]*",
        )
    )
    actual = _convert("pptx_image_start.pptx", svc)
    assert actual == snapshot
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# pptx_image_middle.pptx
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def test_pptx_image_middle(svc: MockOCRService) -> None:
    """Snapshot check for a deck whose image sits on the middle slide.

    Slide 1: Introduction | Slide 2: Architecture + image | Slide 3: Conclusion
    """
    # Slide markup keeps the escaped "\\n" sequences exactly as in the
    # snapshot; the OCR block uses real newlines.
    pieces = [
        "\\n\\n<!-- Slide number: 1 -->\\n# Introduction",
        "\\n\\n\\n\\n<!-- Slide number: 2 -->\\n# Architecture\\n\\n",
        "\n*[Image OCR]\nMOCK_OCR_TEXT_12345\n[End OCR]*",
        "\\n\\n<!-- Slide number: 3 -->\\n# Conclusion\\n\\n",
    ]
    expected = "".join(pieces)
    actual = _convert("pptx_image_middle.pptx", svc)
    assert actual == expected
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# pptx_image_end.pptx
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def test_pptx_image_end(svc: MockOCRService) -> None:
    """Snapshot check for a deck whose image sits on the last slide.

    Slide 1: Presentation | Slide 2: Thank You + image
    """
    # Slide markup keeps the escaped "\\n" sequences exactly as in the
    # snapshot; the OCR block uses real newlines.
    pieces = [
        "\\n\\n<!-- Slide number: 1 -->\\n# Presentation",
        "\\n\\n\\n\\n<!-- Slide number: 2 -->\\n# Thank You\\n\\n",
        "\n*[Image OCR]\nMOCK_OCR_TEXT_12345\n[End OCR]*",
    ]
    expected = "".join(pieces)
    actual = _convert("pptx_image_end.pptx", svc)
    assert actual == expected
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# pptx_multiple_images.pptx
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def test_pptx_multiple_images(svc: MockOCRService) -> None:
    """Snapshot check: slide 1 carries two images and no title text."""
    # Slide markup keeps the escaped "\\n" sequences exactly as in the
    # snapshot; the OCR blocks use real newlines.
    slide_markup = "\\n\\n<!-- Slide number: 1 -->\\n# \\n"
    first_image = "\n*[Image OCR]\nMOCK_OCR_TEXT_12345\n[End OCR]*"
    second_image = "\n\n*[Image OCR]\nMOCK_OCR_TEXT_12345\n[End OCR]*"
    expected = slide_markup + first_image + second_image
    actual = _convert("pptx_multiple_images.pptx", svc)
    assert actual == expected
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# pptx_complex_layout.pptx
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def test_pptx_complex_layout(svc: MockOCRService) -> None:
    """Snapshot check: a titled slide with body text followed by one OCR block."""
    # Slide markup keeps the escaped "\\n" sequences exactly as in the
    # snapshot; the OCR block uses real newlines.
    pieces = [
        "\\n\\n<!-- Slide number: 1 -->\\n# Product Comparison",
        "\\n\\nOur products lead the market\\n",
        "\n*[Image OCR]\nMOCK_OCR_TEXT_12345\n[End OCR]*",
    ]
    expected = "".join(pieces)
    actual = _convert("pptx_complex_layout.pptx", svc)
    assert actual == expected
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# No OCR service — no OCR tags emitted
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def test_pptx_no_ocr_service_no_tags() -> None:
    """Converting without an ``ocr_service`` must not emit OCR block markers."""
    fixture = TEST_DATA_DIR / "pptx_image_middle.pptx"
    if not fixture.exists():
        pytest.skip(f"Test file not found: {fixture}")
    converter = PptxConverterWithOCR()
    with open(fixture, "rb") as handle:
        result = converter.convert(handle, StreamInfo(extension=".pptx"))
        markdown = result.text_content
    # Neither the opening nor the closing marker may appear in the output.
    for marker in ("*[Image OCR]", "[End OCR]*"):
        assert marker not in markdown
|
||||
@@ -0,0 +1,249 @@
|
||||
"""
|
||||
Unit tests for XlsxConverterWithOCR.
|
||||
|
||||
For each XLSX test file: convert with a mock OCR service then compare the
|
||||
full output string against the expected snapshot.
|
||||
|
||||
OCR block format used by the converter:
|
||||
*[Image OCR]
|
||||
MOCK_OCR_TEXT_12345
|
||||
[End OCR]*
|
||||
|
||||
Images are grouped at the end of each sheet under:
|
||||
### Images in this sheet:
|
||||
"""
|
||||
|
||||
import sys
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
import pytest
|
||||
|
||||
sys.path.insert(0, str(Path(__file__).parent.parent / "src"))
|
||||
|
||||
from markitdown_ocr._ocr_service import OCRResult # noqa: E402
|
||||
from markitdown_ocr._xlsx_converter_with_ocr import ( # noqa: E402
|
||||
XlsxConverterWithOCR,
|
||||
)
|
||||
from markitdown import StreamInfo # noqa: E402
|
||||
|
||||
TEST_DATA_DIR = Path(__file__).parent / "ocr_test_data"
|
||||
|
||||
_MOCK_TEXT = "MOCK_OCR_TEXT_12345"
|
||||
_OCR_BLOCK = f"*[Image OCR]\n{_MOCK_TEXT}\n[End OCR]*"
|
||||
_IMG_SECTION = "### Images in this sheet:"
|
||||
|
||||
|
||||
class MockOCRService:
    """Stand-in OCR backend that answers every request with ``_MOCK_TEXT``."""

    def extract_text(self, image_stream: Any, **kwargs: Any) -> OCRResult:  # noqa: ANN101
        """Return a canned ``OCRResult``; ``image_stream`` and ``kwargs`` are ignored."""
        canned = OCRResult(text=_MOCK_TEXT, backend_used="mock")
        return canned
|
||||
|
||||
|
||||
@pytest.fixture(scope="module")
def svc() -> MockOCRService:
    """Provide a single mock OCR service shared by the tests in this module."""
    mock_service = MockOCRService()
    return mock_service
|
||||
|
||||
|
||||
def _convert(filename: str, ocr_service: MockOCRService) -> str:
    """Convert an XLSX fixture with ``ocr_service`` and return its text content.

    The fixture is looked up under ``TEST_DATA_DIR``; when it does not exist
    the calling test is skipped via ``pytest.skip``.
    """
    fixture = TEST_DATA_DIR / filename
    if not fixture.exists():
        pytest.skip(f"Test file not found: {fixture}")
    converter = XlsxConverterWithOCR()
    with open(fixture, "rb") as handle:
        result = converter.convert(
            handle, StreamInfo(extension=".xlsx"), ocr_service=ocr_service
        )
        return result.text_content
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# xlsx_image_start.xlsx
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def test_xlsx_image_start(svc: MockOCRService) -> None:
    """Snapshot check for ``xlsx_image_start.xlsx`` (two sheets, one image each)."""
    sales_sheet = (
        "## Sales Q1\n\n"
        "| Product | Sales |\n"
        "| --- | --- |\n"
        "| Widget A | 100 |\n"
        "| Widget B | 150 |\n\n"
        "### Images in this sheet:\n\n"
        "*[Image OCR]\nMOCK_OCR_TEXT_12345\n[End OCR]*\n\n"
    )
    # The final sheet has no trailing blank lines after its OCR block.
    forecast_sheet = (
        "## Forecast Q2\n\n"
        "| Projected Sales | Unnamed: 1 |\n"
        "| --- | --- |\n"
        "| Widget A | 120 |\n"
        "| Widget B | 180 |\n\n"
        "### Images in this sheet:\n\n"
        "*[Image OCR]\nMOCK_OCR_TEXT_12345\n[End OCR]*"
    )
    expected = sales_sheet + forecast_sheet
    actual = _convert("xlsx_image_start.xlsx", svc)
    assert actual == expected
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# xlsx_image_middle.xlsx
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def test_xlsx_image_middle(svc: MockOCRService) -> None:
    """Snapshot check for ``xlsx_image_middle.xlsx`` (two sheets, one image each)."""
    revenue_sheet = (
        "## Revenue\n\n"
        "| Q1 Report | Unnamed: 1 |\n"
        "| --- | --- |\n"
        "| NaN | NaN |\n"
        "| Revenue | $50,000 |\n"
        "| NaN | NaN |\n"
        "| NaN | NaN |\n"
        "| NaN | NaN |\n"
        "| NaN | NaN |\n"
        "| Profit Margin | 40% |\n\n"
        "### Images in this sheet:\n\n"
        "*[Image OCR]\nMOCK_OCR_TEXT_12345\n[End OCR]*\n\n"
    )
    # The final sheet has no trailing blank lines after its OCR block.
    expenses_sheet = (
        "## Expenses\n\n"
        "| Expense Breakdown | Unnamed: 1 |\n"
        "| --- | --- |\n"
        "| NaN | NaN |\n"
        "| Expenses | $30,000 |\n"
        "| NaN | NaN |\n"
        "| NaN | NaN |\n"
        "| NaN | NaN |\n"
        "| NaN | NaN |\n"
        "| Savings | $5,000 |\n\n"
        "### Images in this sheet:\n\n"
        "*[Image OCR]\nMOCK_OCR_TEXT_12345\n[End OCR]*"
    )
    expected = revenue_sheet + expenses_sheet
    actual = _convert("xlsx_image_middle.xlsx", svc)
    assert actual == expected
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# xlsx_image_end.xlsx
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def test_xlsx_image_end(svc: MockOCRService) -> None:
    """Snapshot check for ``xlsx_image_end.xlsx`` (two sheets, one image each)."""
    summary_sheet = (
        "## Sheet\n\n"
        "| Financial Summary | Unnamed: 1 |\n"
        "| --- | --- |\n"
        "| Total Revenue | $500,000 |\n"
        "| Total Expenses | $300,000 |\n"
        "| Net Profit | $200,000 |\n"
        "| NaN | NaN |\n"
        "| NaN | NaN |\n"
        "| NaN | NaN |\n"
        "| NaN | NaN |\n"
        "| NaN | NaN |\n"
        "| Signature: | NaN |\n\n"
        "### Images in this sheet:\n\n"
        "*[Image OCR]\nMOCK_OCR_TEXT_12345\n[End OCR]*\n\n"
    )
    # The final sheet has no trailing blank lines after its OCR block.
    budget_sheet = (
        "## Budget\n\n"
        "| Budget Allocation | Unnamed: 1 |\n"
        "| --- | --- |\n"
        "| Marketing | $100,000 |\n"
        "| R&D | $150,000 |\n"
        "| Operations | $50,000 |\n"
        "| NaN | NaN |\n"
        "| NaN | NaN |\n"
        "| NaN | NaN |\n"
        "| NaN | NaN |\n"
        "| NaN | NaN |\n"
        "| Approved: | NaN |\n\n"
        "### Images in this sheet:\n\n"
        "*[Image OCR]\nMOCK_OCR_TEXT_12345\n[End OCR]*"
    )
    expected = summary_sheet + budget_sheet
    actual = _convert("xlsx_image_end.xlsx", svc)
    assert actual == expected
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# xlsx_multiple_images.xlsx
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def test_xlsx_multiple_images(svc: MockOCRService) -> None:
    """Snapshot check for ``xlsx_multiple_images.xlsx`` (three sheets).

    The first sheet lists two OCR blocks; the other two list one each.
    """
    overview_sheet = (
        "## Overview\n\n"
        "| Dashboard |\n"
        "| --- |\n"
        "| Status: Active |\n"
        "| NaN |\n"
        "| NaN |\n"
        "| NaN |\n"
        "| NaN |\n"
        "| Performance Summary |\n\n"
        "### Images in this sheet:\n\n"
        "*[Image OCR]\nMOCK_OCR_TEXT_12345\n[End OCR]*\n\n"
        "*[Image OCR]\nMOCK_OCR_TEXT_12345\n[End OCR]*\n\n"
    )
    details_sheet = (
        "## Details\n\n"
        "| Detailed Metrics |\n"
        "| --- |\n"
        "| System Health |\n\n"
        "### Images in this sheet:\n\n"
        "*[Image OCR]\nMOCK_OCR_TEXT_12345\n[End OCR]*\n\n"
    )
    # The final sheet has no trailing blank lines after its OCR block.
    summary_sheet = (
        "## Summary\n\n"
        "| Quarter Summary |\n"
        "| --- |\n"
        "| Overall Performance |\n\n"
        "### Images in this sheet:\n\n"
        "*[Image OCR]\nMOCK_OCR_TEXT_12345\n[End OCR]*"
    )
    expected = overview_sheet + details_sheet + summary_sheet
    actual = _convert("xlsx_multiple_images.xlsx", svc)
    assert actual == expected
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# xlsx_complex_layout.xlsx
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def test_xlsx_complex_layout(svc: MockOCRService) -> None:
    """Snapshot check for ``xlsx_complex_layout.xlsx`` (three sheets).

    The first sheet lists two OCR blocks; the other two list one each.
    """
    report_sheet = (
        "## Complex Report\n\n"
        "| Annual Report 2024 | Unnamed: 1 |\n"
        "| --- | --- |\n"
        "| NaN | NaN |\n"
        "| Month | Sales |\n"
        "| Jan | 1000 |\n"
        "| Feb | 1200 |\n"
        "| NaN | NaN |\n"
        "| Total | 2200 |\n\n"
        "### Images in this sheet:\n\n"
        "*[Image OCR]\nMOCK_OCR_TEXT_12345\n[End OCR]*\n\n"
        "*[Image OCR]\nMOCK_OCR_TEXT_12345\n[End OCR]*\n\n"
    )
    customers_sheet = (
        "## Customers\n\n"
        "| Customer Metrics | Unnamed: 1 |\n"
        "| --- | --- |\n"
        "| NaN | NaN |\n"
        "| New Customers | 250 |\n"
        "| Retention Rate | 92% |\n\n"
        "### Images in this sheet:\n\n"
        "*[Image OCR]\nMOCK_OCR_TEXT_12345\n[End OCR]*\n\n"
    )
    # The final sheet has no trailing blank lines after its OCR block.
    regions_sheet = (
        "## Regions\n\n"
        "| Regional Breakdown | Unnamed: 1 |\n"
        "| --- | --- |\n"
        "| NaN | NaN |\n"
        "| Region | Revenue |\n"
        "| North | $800K |\n"
        "| South | $600K |\n\n"
        "### Images in this sheet:\n\n"
        "*[Image OCR]\nMOCK_OCR_TEXT_12345\n[End OCR]*"
    )
    expected = report_sheet + customers_sheet + regions_sheet
    actual = _convert("xlsx_complex_layout.xlsx", svc)
    assert actual == expected
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# No OCR service — no OCR tags emitted
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def test_xlsx_no_ocr_service_no_tags() -> None:
    """Converting without an ``ocr_service`` must not emit OCR block markers."""
    fixture = TEST_DATA_DIR / "xlsx_image_middle.xlsx"
    if not fixture.exists():
        pytest.skip(f"Test file not found: {fixture}")
    converter = XlsxConverterWithOCR()
    with open(fixture, "rb") as handle:
        result = converter.convert(handle, StreamInfo(extension=".xlsx"))
        markdown = result.text_content
    # Neither the opening nor the closing marker may appear in the output.
    for marker in ("*[Image OCR]", "[End OCR]*"):
        assert marker not in markdown
|
||||
@@ -30,7 +30,6 @@ dependencies = [
|
||||
"magika~=0.6.1",
|
||||
"charset-normalizer",
|
||||
"defusedxml",
|
||||
"onnxruntime<=1.20.1; sys_platform == 'win32'",
|
||||
]
|
||||
|
||||
[project.optional-dependencies]
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
# SPDX-FileCopyrightText: 2024-present Adam Fourney <adamfo@microsoft.com>
|
||||
#
|
||||
# SPDX-License-Identifier: MIT
|
||||
__version__ = "0.1.5b1"
|
||||
__version__ = "0.1.6b2"
|
||||
|
||||
@@ -107,6 +107,13 @@ class MarkItDown:
|
||||
requests_session = kwargs.get("requests_session")
|
||||
if requests_session is None:
|
||||
self._requests_session = requests.Session()
|
||||
# Signal that we prefer markdown over HTML, etc. if the server supports it.
|
||||
# e.g., https://blog.cloudflare.com/markdown-for-agents/
|
||||
self._requests_session.headers.update(
|
||||
{
|
||||
"Accept": "text/markdown, text/html;q=0.9, text/plain;q=0.8, */*;q=0.1"
|
||||
}
|
||||
)
|
||||
else:
|
||||
self._requests_session = requests_session
|
||||
|
||||
|
||||
@@ -198,15 +198,62 @@ def _extract_form_content_from_words(page: Any) -> str | None:
|
||||
if not all_table_x_positions:
|
||||
return None
|
||||
|
||||
# Compute global column boundaries
|
||||
# Compute adaptive column clustering tolerance based on gap analysis
|
||||
all_table_x_positions.sort()
|
||||
|
||||
# Calculate gaps between consecutive x-positions
|
||||
gaps = []
|
||||
for i in range(len(all_table_x_positions) - 1):
|
||||
gap = all_table_x_positions[i + 1] - all_table_x_positions[i]
|
||||
if gap > 5: # Only significant gaps
|
||||
gaps.append(gap)
|
||||
|
||||
# Determine optimal tolerance using statistical analysis
|
||||
if gaps and len(gaps) >= 3:
|
||||
# Use 70th percentile of gaps as threshold (balances precision/recall)
|
||||
sorted_gaps = sorted(gaps)
|
||||
percentile_70_idx = int(len(sorted_gaps) * 0.70)
|
||||
adaptive_tolerance = sorted_gaps[percentile_70_idx]
|
||||
|
||||
# Clamp tolerance to reasonable range [25, 50]
|
||||
adaptive_tolerance = max(25, min(50, adaptive_tolerance))
|
||||
else:
|
||||
# Fallback to conservative value
|
||||
adaptive_tolerance = 35
|
||||
|
||||
# Compute global column boundaries using adaptive tolerance
|
||||
global_columns: list[float] = []
|
||||
for x in all_table_x_positions:
|
||||
if not global_columns or x - global_columns[-1] > 30:
|
||||
if not global_columns or x - global_columns[-1] > adaptive_tolerance:
|
||||
global_columns.append(x)
|
||||
|
||||
# Too many columns suggests dense text, not a form
|
||||
if len(global_columns) > 8:
|
||||
# Adaptive max column check based on page characteristics
|
||||
# Calculate average column width
|
||||
if len(global_columns) > 1:
|
||||
content_width = global_columns[-1] - global_columns[0]
|
||||
avg_col_width = content_width / len(global_columns)
|
||||
|
||||
# Forms with very narrow columns (< 30px) are likely dense text
|
||||
if avg_col_width < 30:
|
||||
return None
|
||||
|
||||
# Compute adaptive max based on columns per inch
|
||||
# Typical forms have 3-8 columns per inch
|
||||
columns_per_inch = len(global_columns) / (content_width / 72)
|
||||
|
||||
# If density is too high (> 10 cols/inch), likely not a form
|
||||
if columns_per_inch > 10:
|
||||
return None
|
||||
|
||||
# Adaptive max: allow more columns for wider pages
|
||||
# Standard letter is 612pt wide, so scale accordingly
|
||||
adaptive_max_columns = int(20 * (page_width / 612))
|
||||
adaptive_max_columns = max(15, adaptive_max_columns) # At least 15
|
||||
|
||||
if len(global_columns) > adaptive_max_columns:
|
||||
return None
|
||||
else:
|
||||
# Single column, not a form
|
||||
return None
|
||||
|
||||
# Now classify each row as table row or not
|
||||
@@ -489,39 +536,41 @@ class PdfConverter(DocumentConverter):
|
||||
|
||||
assert isinstance(file_stream, io.IOBase)
|
||||
|
||||
markdown_chunks: list[str] = []
|
||||
|
||||
# Read file stream into BytesIO for compatibility with pdfplumber
|
||||
pdf_bytes = io.BytesIO(file_stream.read())
|
||||
|
||||
try:
|
||||
# Track how many pages are form-style vs plain text
|
||||
form_pages = 0
|
||||
plain_pages = 0
|
||||
# Single pass: check every page for form-style content.
|
||||
# Pages with tables/forms get rich extraction; plain-text
|
||||
# pages are collected separately. page.close() is called
|
||||
# after each page to free pdfplumber's cached objects and
|
||||
# keep memory usage constant regardless of page count.
|
||||
markdown_chunks: list[str] = []
|
||||
form_page_count = 0
|
||||
plain_page_indices: list[int] = []
|
||||
|
||||
with pdfplumber.open(pdf_bytes) as pdf:
|
||||
for page in pdf.pages:
|
||||
# Try form-style word position extraction
|
||||
for page_idx, page in enumerate(pdf.pages):
|
||||
page_content = _extract_form_content_from_words(page)
|
||||
|
||||
# If extraction returns None, this page is not form-style
|
||||
if page_content is None:
|
||||
plain_pages += 1
|
||||
# Extract text using pdfplumber's basic extraction for this page
|
||||
if page_content is not None:
|
||||
form_page_count += 1
|
||||
if page_content.strip():
|
||||
markdown_chunks.append(page_content)
|
||||
else:
|
||||
plain_page_indices.append(page_idx)
|
||||
text = page.extract_text()
|
||||
if text and text.strip():
|
||||
markdown_chunks.append(text.strip())
|
||||
else:
|
||||
form_pages += 1
|
||||
if page_content.strip():
|
||||
markdown_chunks.append(page_content)
|
||||
|
||||
# If most pages are plain text, use pdfminer for better text handling
|
||||
if plain_pages > form_pages and plain_pages > 0:
|
||||
page.close() # Free cached page data immediately
|
||||
|
||||
# If no pages had form-style content, use pdfminer for
|
||||
# the whole document (better text spacing for prose).
|
||||
if form_page_count == 0:
|
||||
pdf_bytes.seek(0)
|
||||
markdown = pdfminer.high_level.extract_text(pdf_bytes)
|
||||
else:
|
||||
# Build markdown from chunks
|
||||
markdown = "\n\n".join(markdown_chunks).strip()
|
||||
|
||||
except Exception:
|
||||
|
||||
Vendored
Vendored
+81
@@ -0,0 +1,81 @@
|
||||
TECHMART ELECTRONICS
|
||||
4567 Innovation Blvd
|
||||
San Francisco, CA 94103
|
||||
(415) 555-0199
|
||||
|
||||
===================================
|
||||
|
||||
Store #0342 - Downtown SF
|
||||
11/23/2024 14:32:18 PST
|
||||
TXN: TXN-98765-2024
|
||||
Cashier: Emily Rodriguez
|
||||
Register: POS-07
|
||||
|
||||
-----------------------------------
|
||||
|
||||
Wireless Noise-Cancelling
|
||||
Headphones - Premium Black
|
||||
AUDIO-5521 1 @ $349.99
|
||||
Member Discount $-50.00
|
||||
$299.99
|
||||
USB-C Hub 7-in-1 Adapter
|
||||
with HDMI & Ethernet
|
||||
ACC-8834 2 @ $79.99
|
||||
$159.98
|
||||
Portable SSD 2TB
|
||||
Thunderbolt 3 Compatible
|
||||
STOR-2241 1 @ $289.00
|
||||
Member Discount $-29.00
|
||||
$260.00
|
||||
Ergonomic Wireless Mouse
|
||||
Rechargeable Battery
|
||||
ACC-9012 1 @ $59.99
|
||||
$59.99
|
||||
Screen Cleaning Kit
|
||||
Professional Grade
|
||||
CARE-1156 3 @ $12.99
|
||||
$38.97
|
||||
HDMI 2.1 Cable 6ft
|
||||
8K Resolution Support
|
||||
CABLE-7789 2 @ $24.99
|
||||
Member Discount $-5.00
|
||||
$44.98
|
||||
-----------------------------------
|
||||
|
||||
SUBTOTAL $863.91
|
||||
Member Discount (15%)-$84.00
|
||||
Sales Tax (8.5%) $66.23
|
||||
Rewards Applied -$25.00
|
||||
===================================
|
||||
TOTAL $821.14
|
||||
===================================
|
||||
|
||||
PAYMENT METHOD
|
||||
Visa Card ending in 4782
|
||||
Auth: 847392
|
||||
Ref: REF-20241123-98765
|
||||
|
||||
-----------------------------------
|
||||
|
||||
REWARDS MEMBER
|
||||
Sarah Mitchell
|
||||
ID: TM-447821
|
||||
Points Earned: 821
|
||||
Total Points: 3,247
|
||||
Next Reward: $50 gift card
|
||||
at 5,000 pts (1,753 to go)
|
||||
|
||||
-----------------------------------
|
||||
|
||||
RETURN POLICY
|
||||
Returns within 30 days
|
||||
Receipt required
|
||||
Electronics must be unopened
|
||||
|
||||
*TXN98765202411231432*
|
||||
|
||||
Thank you for shopping!
|
||||
www.techmart.example.com
|
||||
|
||||
===================================
|
||||
|
||||
+76
@@ -0,0 +1,76 @@
|
||||
ZAVA AUTO REPAIR
|
||||
Certified Collision Repair
|
||||
123 Main Street, Redmond, WA 98052
|
||||
Phone: (425) 000-0000
|
||||
Preliminary Estimate (ID: EST-1008)
|
||||
| Customer Information | | | Vehicle Information | |
|
||||
| -------------------- | ------------------- | --- | ------------------- | ----------------- |
|
||||
| Insured name | Gabriel Diaz | | Year | 2022 |
|
||||
| Claim # | SF-1008 | | Make | Jeep |
|
||||
| Policy # | POL-2022-555 | | Model | Grand Cherokee |
|
||||
| Phone | (425) 111-1111 | | Trim | Limited |
|
||||
| Email | gabriel@contoso.com | | VIN | 1C4RJFBG2NC123456 |
|
||||
| | | | Color | White |
|
||||
| | | | Odometer | 9,800 |
|
||||
| Repair Order # | RO-20221108 | | Estimator | Ellis Turner |
|
||||
Estimate Totals
|
||||
| | | Hours | Rate | Cost |
|
||||
| ---------------- | --- | ----- | ---- | ----- |
|
||||
| Parts | | | | 2,100 |
|
||||
| Body Labor | | 2 | 150 | 300 |
|
||||
| Paint Labor | | 1.5 | 150 | 225 |
|
||||
| Mechanical Labor | | - | - | - |
|
||||
Supplies
|
||||
| | Paint Supplies | | | 60 |
|
||||
| ------------- | ------------------------ | --- | ------ | ------ |
|
||||
| | Body Supplies | | | 30 |
|
||||
| Other Charges | | | | 15 |
|
||||
| Subtotal | | | | 2,730 |
|
||||
| Sales Tax | | | 10.20% | 278.46 |
|
||||
| GRAND TOTAL | | | | 5,738 |
|
||||
| Note | Minor rear bumper repair | | | |
|
||||
This is a preliminary estimate for the visible damage of the vehicle. Additional damage / repairs / parts may be found
|
||||
after the vehicle has been disassembled and damaged parts have been removed. Suspension damages may be
|
||||
present, but can not be determined until an alignment on the vehicle has been done. Parts Prices may vary due to
|
||||
models and vehicle maker price updates. Please be advised if vehicle owner elects to have vehicle sent to service for
|
||||
any mechanical concerns, ALL service departments charge a vehicle diagnostic charge. If the mechanical concern is
|
||||
deemed not related to an insurance claim, vehicle owner will be reponsible for charges.
|
||||
|
||||
ZAVA AUTO REPAIR
|
||||
Certified Collision Repair
|
||||
123 Main Street, Redmond, WA 98052
|
||||
Phone: (425) 000-0000
|
||||
Preliminary Estimate (ID: EST-1008)
|
||||
Customer Information Vehicle Information
|
||||
| Insured name | Bruce Wayne | | Year | 2025 |
|
||||
| -------------- | -------------------------- | --- | --------- | ------------ |
|
||||
| Claim # | | 999 | Make | Batman |
|
||||
| Policy # | IM-BATMAN | | Model | Batmobile |
|
||||
| Phone | (416) 555-1234 | | Trim | Limited |
|
||||
| Email | batman@wayneindustries.com | | VIN | XXX |
|
||||
| | | | Color | Black |
|
||||
| | | | Odometer | 1 |
|
||||
| Repair Order # | RO-20221108 | | Estimator | Ellis Turner |
|
||||
Estimate Totals
|
||||
| | | Hours | Rate | Cost |
|
||||
| ---------------- | --- | ----- | ---- | ------ |
|
||||
| Parts | | | | 99,999 |
|
||||
| Body Labor | | 2 | 150 | 300 |
|
||||
| Paint Labor | | 1.5 | 150 | 225 |
|
||||
| Mechanical Labor | | - | - | - |
|
||||
Supplies
|
||||
| | Paint Supplies | | | 60 |
|
||||
| ------------- | ------------------------ | --- | ------ | --------- |
|
||||
| | Body Supplies | | | 30 |
|
||||
| Other Charges | | | | 15 |
|
||||
| Subtotal | | | | 100,629 |
|
||||
| Sales Tax | | | 10.20% | 10264.158 |
|
||||
| GRAND TOTAL | | | | 211,522 |
|
||||
| Note | Minor rear bumper repair | | | |
|
||||
|
||||
This is a preliminary estimate for the visible damage of the vehicle. Additional damage / repairs / parts may be found
|
||||
after the vehicle has been disassembled and damaged parts have been removed. Suspension damages may be
|
||||
present, but can not be determined until an alignment on the vehicle has been done. Parts Prices may vary due to
|
||||
models and vehicle maker price updates. Please be advised if vehicle owner elects to have vehicle sent to service for
|
||||
any mechanical concerns, ALL service departments charge a vehicle diagnostic charge. If the mechanical concern is
|
||||
deemed not related to an insurance claim, vehicle owner will be reponsible for charges.
|
||||
Vendored
+44
@@ -0,0 +1,44 @@
|
||||
INVENTORY RECONCILIATION REPORT
|
||||
Report ID: SPARSE-2024-INV-1234
|
||||
Warehouse: Distribution Center East
|
||||
Report Date: 2024-11-15
|
||||
Prepared By: Sarah Martinez
|
||||
| Product Code | Location | Expected | Actual | Variance | Status |
|
||||
| ------------ | -------- | -------- | ------ | -------- | -------- |
|
||||
| SKU-8847 | A-12 | 450 | | | |
|
||||
| | B-07 | | 289 | -23 | |
|
||||
| SKU-9201 | | 780 | 778 | | OK |
|
||||
| | C-15 | | | +15 | |
|
||||
| SKU-4563 | D-22 | | 156 | | CRITICAL |
|
||||
| | | 180 | | -24 | |
|
||||
| SKU-7728 | A-08 | 920 | | | |
|
||||
| | | | 935 | +15 | OK |
|
||||
Variance Analysis:
|
||||
Summary Statistics:
|
||||
Total Variance Cost: $4,287.50
|
||||
Critical Items: 1
|
||||
Overall Accuracy: 97.2%
|
||||
Detailed Analysis by Category:
|
||||
The inventory reconciliation reveals several key findings. The primary variance driver is SKU-4563,
|
||||
which shows a -24 unit discrepancy requiring immediate investigation. Location B-07 handling of
|
||||
SKU-8847 also demonstrates significant variance. Cross-location verification protocols should be
|
||||
|
||||
reviewed to prevent future discrepancies. The overall accuracy rate of 97.2% meets our target
|
||||
threshold, but critical items require expedited resolution to maintain operational efficiency.
|
||||
Extended Inventory Review:
|
||||
| Product Code | Category | Unit Cost | Total Value | Last Audit | Notes |
|
||||
| ------------ | ----------- | --------- | ----------- | ---------- | ---------- |
|
||||
| SKU-8847 | Electronics | $45.00 | $13,005.00 | 2024-10-15 | |
|
||||
| SKU-9201 | Hardware | $32.50 | $25,285.00 | 2024-10-22 | Verified |
|
||||
| SKU-4563 | Software | $120.00 | $18,720.00 | | Critical |
|
||||
| SKU-7728 | Accessories | $15.75 | $14,726.25 | 2024-11-01 | |
|
||||
| SKU-3345 | Electronics | $67.00 | $22,445.00 | 2024-10-18 | |
|
||||
| SKU-5512 | Hardware | $89.00 | $31,150.00 | | Pending |
|
||||
| SKU-6678 | Software | $200.00 | $42,000.00 | 2024-10-25 | High Value |
|
||||
| SKU-7789 | Accessories | $8.50 | $5,950.00 | 2024-11-05 | |
|
||||
| SKU-2234 | Electronics | $125.00 | $35,000.00 | | |
|
||||
| SKU-1123 | Hardware | $55.00 | $27,500.00 | 2024-10-30 | Verified |
|
||||
Recommendations:
|
||||
1. Immediate review of SKU-4563 handling procedures. 2. Implement additional verification for critical
|
||||
items. 3. Schedule follow-up audit for high-value products (SKU-6678, SKU-2234).
|
||||
Approval:
|
||||
+62
@@ -0,0 +1,62 @@
|
||||
BOOKING ORDER
|
||||
Print Date 12/15/2024 14:30:22
|
||||
Page 1 of 1
|
||||
STARLIGHT CINEMAS
|
||||
Orders
|
||||
| Order / Rev: | 2024-12-5678 | | | Cinema: | | Downtown Multiplex |
|
||||
| ------------ | -------------- | --- | --- | ---------------- | --- | ------------------ |
|
||||
| Alt Order #: | SC-WINTER-2024 | | | Primary Contact: | | Sarah Johnson |
|
||||
Product Desc: Holiday Movie Marathon Package Location: NYC-01
|
||||
| Estimate: | EST-456 | | | Region: | | NORTHEAST |
|
||||
| -------------------- | ----------------------- | --- | --- | ------- | --- | --------- |
|
||||
| Booking Dates: | 12/20/2024 - 12/31/2024 | | | | | |
|
||||
| Original Date / Rev: | 12/01/24 / 12/10/24 | | | | | |
|
||||
| Order Type: | Premium Package | | | | | |
|
||||
Booking Agency
|
||||
| Name: | Premier Entertainment Group | | | | | |
|
||||
| ---------------- | --------------------------- | --- | --- | -------------- | --- | --------- |
|
||||
| | | | | Billing Type: | | Net 30 |
|
||||
| Contact: | Michael Chen | | | | | |
|
||||
| | | | | Payment Terms: | | Corporate |
|
||||
| Billing Contact: | accounting@premierent.com | | | | | |
|
||||
| | | | | Commission: | | 10% |
|
||||
555 Broadway Suite 1200
|
||||
New York, NY 10012
|
||||
Customer
|
||||
| Name: | Universal Studios Distribution | | | | | |
|
||||
| -------------- | ------------------------------ | --- | --- | --- | --- | --- |
|
||||
| Category: | Film Distributor | | | | | |
|
||||
| Contact Email: | bookings@universalstudios.com | | | | | |
|
||||
| Customer ID: | CUST-98765 | | | | | |
|
||||
| Revenue Code: | FILM-PREMIUM | | | | | |
|
||||
Booking Summary
|
||||
| Start Date | End Date | # Shows | Gross Amount | Net Amount | | |
|
||||
| ---------- | -------- | ------- | ------------ | ---------- | --- | --- |
|
||||
| 12/20/24 | 12/31/24 | 48 | $12,500.00 | $11,250.00 | | |
|
||||
Totals
|
||||
| Month | # Shows | Gross Amount | | Net Amount | | Occupancy |
|
||||
| ------------- | ------- | ------------ | --- | ---------- | --- | --------- |
|
||||
| December 2024 | 48 | $12,500.00 | | $11,250.00 | | 85% |
|
||||
| Totals | 48 | $12,500.00 | | $11,250.00 | | 85% |
|
||||
Account Representatives
|
||||
Representative Territory Region Start Date / End Date Commission %
|
||||
| Sarah Johnson | NYC Metro | NORTHEAST | 12/20/24 - 12/31/24 | | 100% | |
|
||||
| ------------- | --------- | --------- | ------------------- | --- | ---- | --- |
|
||||
Show Schedule Details
|
||||
Ln Screen Start End Movie Title Format Showtime Days Shows Rate Type Total
|
||||
1 SCR-1 12/20/24 12/25/24 Holiday Spectacular IMAX 3D 7:00 PM Daily 12 $250 PM $3,000
|
||||
(Runtime: 142 min); Holiday Season Premium
|
||||
2 SCR-2 12/20/24 12/31/24 Winter Wonderland Standard 4:30 PM Daily 24 $150 MT $3,600
|
||||
(Runtime: 98 min); Matinee Special
|
||||
3 SCR-1 12/26/24 12/31/24 New Year Mystery 4DX 9:30 PM Daily 12 $300 PM $3,600
|
||||
(Runtime: 116 min); Premium Experience
|
||||
Show Details
|
||||
| Show Screen | Date Range | Title | Showtime | Days Type | Rate | Revenue |
|
||||
| ----------- | ---------- | ----- | -------- | --------- | ---- | ------- |
|
||||
1 SCR-1 12/20-12/25 Holiday Spectacular 7:00 PM Daily PM $250 $3,000
|
||||
This booking order is subject to cinema availability and standard terms.
|
||||
2 SCR-2 12/20-12/31 Winter Wonderland 4:30 PM Daily MT $150 $3,600
|
||||
All showtimes are approximate and subject to change.
|
||||
3 SCR-1 12/26-12/31 New Year Mystery 9:30 PM Daily PM $300 $3,600
|
||||
| Total Revenue: | | | | | | $12,500.00 |
|
||||
| -------------- | --- | --- | --- | --- | --- | ---------- |
|
||||
@@ -0,0 +1,65 @@
|
||||
1
|
||||
|
||||
Introduction
|
||||
|
||||
Large language models (LLMs) are becoming a crucial building block in developing powerful agents
|
||||
that utilize LLMs for reasoning, tool usage, and adapting to new observations (Yao et al., 2022; Xi
|
||||
et al., 2023; Wang et al., 2023b) in many real-world tasks. Given the expanding tasks that could
|
||||
benefit from LLMs and the growing task complexity, an intuitive approach to scale up the power of
|
||||
agents is to use multiple agents that cooperate. Prior work suggests that multiple agents can help
|
||||
encourage divergent thinking (Liang et al., 2023), improve factuality and reasoning (Du et al., 2023),
|
||||
and provide validation (Wu et al., 2023). In light of the intuition and early evidence of promise, it is
|
||||
intriguing to ask the following question: how can we facilitate the development of LLM applications
|
||||
that could span a broad spectrum of domains and complexities based on the multi-agent approach?
|
||||
|
||||
Our insight is to use multi-agent conversations to achieve it. There are at least three reasons con-
|
||||
firming its general feasibility and utility thanks to recent advances in LLMs: First, because chat-
|
||||
optimized LLMs (e.g., GPT-4) show the ability to incorporate feedback, LLM agents can cooperate
|
||||
through conversations with each other or human(s), e.g., a dialog where agents provide and seek rea-
|
||||
soning, observations, critiques, and validation. Second, because a single LLM can exhibit a broad
|
||||
range of capabilities (especially when configured with the correct prompt and inference settings),
|
||||
conversations between differently configured agents can help combine these broad LLM capabilities
|
||||
in a modular and complementary manner. Third, LLMs have demonstrated ability to solve complex
|
||||
tasks when the tasks are broken into simpler subtasks. Multi-agent conversations can enable this
|
||||
partitioning and integration in an intuitive manner. How can we leverage the above insights and
|
||||
support different applications with the common requirement of coordinating multiple agents, poten-
|
||||
tially backed by LLMs, humans, or tools exhibiting different capacities? We desire a multi-agent
|
||||
conversation framework with generic abstraction and effective implementation that has the flexibil-
|
||||
ity to satisfy different application needs. Achieving this requires addressing two critical questions:
|
||||
(1) How can we design individual agents that are capable, reusable, customizable, and effective in
|
||||
multi-agent collaboration? (2) How can we develop a straightforward, unified interface that can
|
||||
accommodate a wide range of agent conversation patterns? In practice, applications of varying
|
||||
complexities may need distinct sets of agents with specific capabilities, and may require different
|
||||
conversation patterns, such as single- or multi-turn dialogs, different human involvement modes, and
|
||||
static vs. dynamic conversation. Moreover, developers may prefer the flexibility to program agent
|
||||
interactions in natural language or code. Failing to adequately address these two questions would
|
||||
limit the framework’s scope of applicability and generality.
|
||||
While there is contemporaneous exploration of multi-agent approaches,³ we present AutoGen, a
|
||||
generalized multi-agent conversation framework (Figure 1), based on the following new concepts.
|
||||
1 Customizable and conversable agents. AutoGen uses a generic design of agents that can lever-
|
||||
age LLMs, human inputs, tools, or a combination of them. The result is that developers can
|
||||
easily and quickly create agents with different roles (e.g., agents to write code, execute code,
|
||||
wire in human feedback, validate outputs, etc.) by selecting and configuring a subset of built-in
|
||||
capabilities. The agent’s backend can also be readily extended to allow more custom behaviors.
|
||||
To make these agents suitable for multi-agent conversation, every agent is made conversable –
|
||||
they can receive, react, and respond to messages. When configured properly, an agent can hold
|
||||
multiple turns of conversations with other agents autonomously or solicit human inputs at cer-
|
||||
tain rounds, enabling human agency and automation. The conversable agent design leverages the
|
||||
strong capability of the most advanced LLMs in taking feedback and making progress via chat
|
||||
and also allows combining capabilities of LLMs in a modular fashion. (Section 2.1)
|
||||
|
||||
2 Conversation programming. A fundamental insight of AutoGen is to simplify and unify com-
|
||||
plex LLM application workflows as multi-agent conversations. So AutoGen adopts a program-
|
||||
ming paradigm centered around these inter-agent conversations. We refer to this paradigm as
|
||||
conversation programming, which streamlines the development of intricate applications via two
|
||||
primary steps: (1) defining a set of conversable agents with specific capabilities and roles (as
|
||||
described above); (2) programming the interaction behavior between agents via conversation-
|
||||
centric computation and control. Both steps can be achieved via a fusion of natural and pro-
|
||||
gramming languages to build applications with a wide range of conversation patterns and agent
|
||||
behaviors. AutoGen provides ready-to-use implementations and also allows easy extension and
|
||||
experimentation for both steps. (Section 2.2)
|
||||
|
||||
³ We refer to Appendix A for a detailed discussion.
|
||||
|
||||
2
|
||||
|
||||
@@ -0,0 +1,74 @@
|
||||
%PDF-1.3
|
||||
%“Œ‹ž ReportLab Generated PDF document (opensource)
|
||||
1 0 obj
|
||||
<<
|
||||
/F1 2 0 R /F2 3 0 R
|
||||
>>
|
||||
endobj
|
||||
2 0 obj
|
||||
<<
|
||||
/BaseFont /Helvetica /Encoding /WinAnsiEncoding /Name /F1 /Subtype /Type1 /Type /Font
|
||||
>>
|
||||
endobj
|
||||
3 0 obj
|
||||
<<
|
||||
/BaseFont /Helvetica-Bold /Encoding /WinAnsiEncoding /Name /F2 /Subtype /Type1 /Type /Font
|
||||
>>
|
||||
endobj
|
||||
4 0 obj
|
||||
<<
|
||||
/Contents 8 0 R /MediaBox [ 0 0 612 792 ] /Parent 7 0 R /Resources <<
|
||||
/Font 1 0 R /ProcSet [ /PDF /Text /ImageB /ImageC /ImageI ]
|
||||
>> /Rotate 0 /Trans <<
|
||||
|
||||
>>
|
||||
/Type /Page
|
||||
>>
|
||||
endobj
|
||||
5 0 obj
|
||||
<<
|
||||
/PageMode /UseNone /Pages 7 0 R /Type /Catalog
|
||||
>>
|
||||
endobj
|
||||
6 0 obj
|
||||
<<
|
||||
/Author (anonymous) /CreationDate (D:20260210121342+01'00') /Creator (anonymous) /Keywords () /ModDate (D:20260210121342+01'00') /Producer (ReportLab PDF Library - \(opensource\))
|
||||
/Subject (unspecified) /Title (untitled) /Trapped /False
|
||||
>>
|
||||
endobj
|
||||
7 0 obj
|
||||
<<
|
||||
/Count 1 /Kids [ 4 0 R ] /Type /Pages
|
||||
>>
|
||||
endobj
|
||||
8 0 obj
|
||||
<<
|
||||
/Filter [ /ASCII85Decode /FlateDecode ] /Length 2414
|
||||
>>
|
||||
stream
|
||||
Gat=m?#SK-&r,lL<tO"8con<J;5Cq;2s]18RrdR7Y[)>Ym#<31@rCsJ89.W.qa3u?82hU4/rD6bm_^2.o5\G6@H<5/\G85.&:2)\f,l]`/mA:-0HF*!^.%Yd0?rr<_LD*'1j8Q\=IJXu'N"=HL>KSX^]339h+)S%SB[D8U\2B8rL_pR7\MXONW%HeW99+,0hH$AU#^KYAoZ)6P-2'6m5cj7lZu'kGHQ:/\R1,Ma%hEl2eYq(:LZ"-`3OktM:dm<m,u<)W99/X#l.?0OO\Z_]Y4.9BoSuKGOrdaFbq^/)*_g%gm8s\<gU<a%e]re(gWm_H[^0bn(=;0X%(_^H%$;+Se<aM)L.FrW&=UUc*X3He'XMO]CgP3P*<]$#uOPN2Z#n\O]@7_]#$ZH.Gr&KZm_M+6]8I5lYVcZLH,)V@L:BCib%tuWd,p*A"0Gb=6gIkI+Y5.[<aH_D`iMaKNQpo.UHf=F]to-Ui6XS[Q;Qh\cD=LT#YQGpn9rsUVm:qR"1%pnrSZa/Mi*P+f4j?B6-uV5Em_Zqog)^@RtF[F-adqASQb%i[(eIIqZ)_CVEHpGDgIpX[[uQ4J6DNf5X^CB'JA+,d^#?/[fq)jH^+:rbdW>Y'H/a/1^A\lZD2qMb,5%-$pOaW5-%BjndGRZ<CV&?T^r@PWF)!H#gDKcZj?[/gATBZ;=XJ$_a;??F-qtH(HaQX?W#iIL#17<Y25AC[ePo/pO[]=c0(\#/j9R%W/]$do:5b%.e4%S0Z';YJm/!GO9jt-H8W>JTK5I,b-cnrpc!2H0BZZ`1%R*aB!ZE'JRRYNJ<J4B`!j/maqpD>*Rq$U:[Tq%Lr[m+DHGg*dP\Ee>\#VYo43^R>kA9W2b/WU:k/M#%^2;nC+,e'dAcEOp?t5Kk;4+.f4MU@-mf7iCT_29s_%g,%K_gB8!kWS28T%T6'u_$GK'qX*VP>7>5?dW_<?$QPg!n")cT(<-[c/-kEbS'`*BYR5SB9TPY<1jq1#Q/EWpCJrY=s;bQfH^=uT:DTR3.8/N>W)r8_SF*7+f;4415n3,ECi2P6&bjmn17t+qU8;D])\Qt.8QLi)?kJ`.t+lkW'Y4e876l-2di)Y?.3\K1<(0IrEfm1<:Oc^u?7B::q;On$J5_C7T<u%071ASb!ZD1u7Yd"g`I'`PJ>**>tRZrdD6q3W@5QfbW8242uIHro=(eV*P1KjY,oj4tW&obb>^q-Iur%F#A)mgu8+V*?E<bdEC6V0+Z7OS^l.$W4hmuq:sMdJ=Sk+94D3QtUBZ:AoIiBA%s3#GJdRDFCpZ)7\MZmitKhMID(%ic%oW#tD%ERrqpk,dD3ll!E6m)e):26BLNV!WiRV*d(+Ppl'p$%?J&MqeV<=uNJ_5,4P_NC:lWf`Iu3\u+^>Y]dUOk&c=m2^<YVV2cUoq[`<<W-]MTIC50Klu6rO5RUVZ"h`#"4adtt2qjs2b12hQi!@JBp4Jln>:1Dtc(*!NBU*DeAtLhuWu&JLWFQi:;ka#?AD6V.A_[>n$T,.]8d=tffJ,?'DbCKQ-BnKqTn_:1LGc865V]FFi=AAF`DGhW(F]2^o?>VbGN:;=!-s;ea7]Ll\f+eiZ8XZb0*mZp%8*K_pf+1"2fKuO1pNK%7f_(mPTD@0&ljSV?o$5BpUmleYs^Faq_SM'jX.o\d*6%j(EtY.N"m2B'E@[.Y_8Be+m(58m$\dcqm$?,0it)/=9@9kRfJB;N7D9t\'F<:#c$P82`UKqgN]$kU]5eLPZMR=0bO[rPk"\?hu>sT^KFg`B>!pml-a[ImSeWp!_l3s!E>gFKq4ng:"n=N:m57rHjN)GML<=a1ktQpUT8:?[D:c7+Gm@2q;uN1Q3)hpeThe-&[#`KYZ4e_=o]kk1KH/^jo:"<0_nRJingk\[1Jltc<,.Jq2\*]=AVcIiY#?iMASrc$Bp)4m=NdIOJ&,H=+<MC=^7]?Tb>M"H6ZdXTX2Ba;Gp=J-m]$,8ZCU/77rHJ,%1.[/DlnkH:pIIV$Oh.
;:t?5e3.cs^[G:H=e;i>c+>B=)C&l7T)S<Bld"_W)BtgI(/F`Le;ULQ,!FM!^<8Kk?L6b_>G8Jp-TG;!V1144#29r2%;n-RmNHrGdR!76&H"R_D-]`c"1FCgZl*",7SUVuqc0oapDQ=^`nj#FFk@2%[K[V45$!KQIH[=;SUpTE8T!QLliC=5-9]nkQpBVdHM6-g)tYBAPuOqr^qkn[Wh4C;6L89J;D>5@cYM$2Y/24scnNiWp4jWhfJAF^ck!@I(VPV*s,pdkPKn<Zg-T3I%d.sSl"^f-Gm=*riV,>(\770jbu^lf\h1+IH>c;Bo;Pdg;!fA)'kmg$"\P3oX=/N5/rUltb3K-BdRTR;-W)J1bDbE?g<MKG;cK`l?D4l>.,O@6id::q]JXBH\Ws#0[#'8-5JQL>/c~>endstream
|
||||
endobj
|
||||
xref
|
||||
0 9
|
||||
0000000000 65535 f
|
||||
0000000061 00000 n
|
||||
0000000102 00000 n
|
||||
0000000209 00000 n
|
||||
0000000321 00000 n
|
||||
0000000514 00000 n
|
||||
0000000582 00000 n
|
||||
0000000843 00000 n
|
||||
0000000902 00000 n
|
||||
trailer
|
||||
<<
|
||||
/ID
|
||||
[<e319d5c305edb8c0fb6be9e44c6178fa><e319d5c305edb8c0fb6be9e44c6178fa>]
|
||||
% ReportLab generated PDF document -- digest (opensource)
|
||||
|
||||
/Info 6 0 R
|
||||
/Root 5 0 R
|
||||
/Size 9
|
||||
>>
|
||||
startxref
|
||||
3407
|
||||
%%EOF
|
||||
@@ -0,0 +1,364 @@
|
||||
#!/usr/bin/env python3 -m pytest
|
||||
"""Tests for PDF converter memory optimization.
|
||||
|
||||
Verifies that:
|
||||
- page.close() is called after processing each page (frees cached data)
|
||||
- Plain-text PDFs fall back to pdfminer when no form pages are found
|
||||
- Mixed PDFs use form extraction only on form-style pages
|
||||
- Memory stays constant regardless of page count
|
||||
"""
|
||||
|
||||
import gc
|
||||
import io
|
||||
import os
|
||||
import tracemalloc
|
||||
|
||||
import pytest
|
||||
from unittest.mock import patch, MagicMock
|
||||
|
||||
from markitdown import MarkItDown
|
||||
|
||||
TEST_FILES_DIR = os.path.join(os.path.dirname(__file__), "test_files")
|
||||
|
||||
|
||||
def _has_fpdf2() -> bool:
|
||||
try:
|
||||
import fpdf # noqa: F401
|
||||
|
||||
return True
|
||||
except ImportError:
|
||||
return False
|
||||
|
||||
|
||||
def _make_form_page():
|
||||
"""Create a mock page with 3-column table-like word positions."""
|
||||
page = MagicMock()
|
||||
page.width = 612
|
||||
page.close = MagicMock()
|
||||
page.extract_words.return_value = [
|
||||
{"text": "Name", "x0": 50, "x1": 100, "top": 10, "bottom": 20},
|
||||
{"text": "Value", "x0": 250, "x1": 300, "top": 10, "bottom": 20},
|
||||
{"text": "Unit", "x0": 450, "x1": 500, "top": 10, "bottom": 20},
|
||||
{"text": "Alpha", "x0": 50, "x1": 100, "top": 30, "bottom": 40},
|
||||
{"text": "100", "x0": 250, "x1": 280, "top": 30, "bottom": 40},
|
||||
{"text": "kg", "x0": 450, "x1": 470, "top": 30, "bottom": 40},
|
||||
{"text": "Beta", "x0": 50, "x1": 100, "top": 50, "bottom": 60},
|
||||
{"text": "200", "x0": 250, "x1": 280, "top": 50, "bottom": 60},
|
||||
{"text": "lb", "x0": 450, "x1": 470, "top": 50, "bottom": 60},
|
||||
]
|
||||
return page
|
||||
|
||||
|
||||
def _make_plain_page():
|
||||
"""Create a mock page with single-line paragraph (no table structure)."""
|
||||
page = MagicMock()
|
||||
page.width = 612
|
||||
page.close = MagicMock()
|
||||
page.extract_words.return_value = [
|
||||
{
|
||||
"text": "This is a long paragraph of plain text.",
|
||||
"x0": 50,
|
||||
"x1": 550,
|
||||
"top": 10,
|
||||
"bottom": 20,
|
||||
},
|
||||
]
|
||||
page.extract_text.return_value = "This is a long paragraph of plain text."
|
||||
return page
|
||||
|
||||
|
||||
def _mock_pdfplumber_open(pages):
|
||||
"""Return a mock pdfplumber.open that yields the given pages."""
|
||||
|
||||
def mock_open(stream):
|
||||
mock_pdf = MagicMock()
|
||||
mock_pdf.pages = pages
|
||||
mock_pdf.__enter__ = MagicMock(return_value=mock_pdf)
|
||||
mock_pdf.__exit__ = MagicMock(return_value=False)
|
||||
return mock_pdf
|
||||
|
||||
return mock_open
|
||||
|
||||
|
||||
class TestPdfMemoryOptimization:
    """Test that PDF conversion cleans up per-page caches to limit memory.

    Apart from the integration test at the end, every test swaps the
    ``pdfplumber`` (and, where noted, ``pdfminer``) name inside
    ``markitdown.converters._pdf_converter`` for a mock, so the converter is
    fed the fake pages built by the module-level helpers.
    """

    # Patch targets: the names as they are looked up by the converter module.
    _PDFPLUMBER = "markitdown.converters._pdf_converter.pdfplumber"
    _PDFMINER = "markitdown.converters._pdf_converter.pdfminer"

    @staticmethod
    def _convert_fake_pdf():
        """Run a placeholder byte stream through ``MarkItDown`` as a PDF.

        Must be called while the relevant patches are active; the bytes are
        not a real PDF, so the result depends entirely on the mocked
        libraries.

        Returns:
            Whatever ``MarkItDown.convert_stream`` returns.
        """
        from markitdown import StreamInfo

        return MarkItDown().convert_stream(
            io.BytesIO(b"fake pdf content"),
            stream_info=StreamInfo(extension=".pdf", mimetype="application/pdf"),
        )

    def test_page_close_called_on_every_page(self):
        """Verify page.close() is called on every page during conversion.

        This ensures cached word/layout data is freed after each page,
        preventing O(n) memory growth with page count.
        """
        pages = [_make_form_page() for _ in range(20)]

        with patch(self._PDFPLUMBER) as mock_pdfplumber:
            mock_pdfplumber.open.side_effect = _mock_pdfplumber_open(pages)
            self._convert_fake_pdf()

            # page.close() must be called on ALL pages
            for i, page in enumerate(pages):
                assert page.close.called, (
                    f"page.close() was NOT called on page {i} — "
                    "this would cause memory to accumulate"
                )

    def test_plain_text_pdf_falls_back_to_pdfminer(self):
        """Verify all-plain-text PDFs fall back to pdfminer.

        When no page has form-style content, the converter should discard
        pdfplumber results and use pdfminer for the whole document (better
        text spacing for prose).
        """
        pages = [_make_plain_page() for _ in range(50)]

        with patch(self._PDFPLUMBER) as mock_pdfplumber, patch(
            self._PDFMINER
        ) as mock_pdfminer:
            mock_pdfplumber.open.side_effect = _mock_pdfplumber_open(pages)
            mock_pdfminer.high_level.extract_text.return_value = "Plain text content"

            result = self._convert_fake_pdf()

            # pdfminer should be used for the final text extraction
            assert mock_pdfminer.high_level.extract_text.called, (
                "pdfminer.high_level.extract_text was not called — "
                "plain-text PDFs should fall back to pdfminer"
            )
            assert result.text_content is not None

    def test_plain_text_pdf_still_closes_all_pages(self):
        """Even for plain-text PDFs, page.close() must be called on every page."""
        pages = [_make_plain_page() for _ in range(30)]

        with patch(self._PDFPLUMBER) as mock_pdfplumber, patch(
            self._PDFMINER
        ) as mock_pdfminer:
            mock_pdfplumber.open.side_effect = _mock_pdfplumber_open(pages)
            mock_pdfminer.high_level.extract_text.return_value = "text"

            self._convert_fake_pdf()

            for i, page in enumerate(pages):
                assert (
                    page.close.called
                ), f"page.close() was NOT called on plain-text page {i}"

    def test_mixed_pdf_uses_form_extraction_per_page(self):
        """In a mixed PDF, form pages get table extraction while plain pages don't.

        Ensures we don't miss form-style pages and don't waste work
        running form extraction on plain-text pages.
        """
        # Pages 0,2,4 are form-style; pages 1,3 are plain text
        pages = [
            _make_form_page(),  # 0 - form
            _make_plain_page(),  # 1 - plain
            _make_form_page(),  # 2 - form
            _make_plain_page(),  # 3 - plain
            _make_form_page(),  # 4 - form
        ]

        with patch(self._PDFPLUMBER) as mock_pdfplumber:
            mock_pdfplumber.open.side_effect = _mock_pdfplumber_open(pages)

            result = self._convert_fake_pdf()

            # All pages should have close() called
            for i, page in enumerate(pages):
                assert page.close.called, f"page.close() not called on page {i}"

            # Form pages (0,2,4) should have extract_words called
            for i in [0, 2, 4]:
                assert pages[
                    i
                ].extract_words.called, f"extract_words not called on form page {i}"

            # Result should contain table content from form pages
            assert result.text_content is not None
            assert (
                "|" in result.text_content
            ), "Expected markdown table pipes in output from form-style pages"

    def test_only_one_pdfplumber_open_call(self):
        """Verify pdfplumber.open is called exactly once (single pass)."""
        pages = [_make_form_page() for _ in range(10)]

        with patch(self._PDFPLUMBER) as mock_pdfplumber:
            mock_pdfplumber.open.side_effect = _mock_pdfplumber_open(pages)

            self._convert_fake_pdf()

            assert mock_pdfplumber.open.call_count == 1, (
                f"Expected 1 pdfplumber.open call (single pass), "
                f"got {mock_pdfplumber.open.call_count}"
            )

    @pytest.mark.skipif(
        not os.path.exists(os.path.join(TEST_FILES_DIR, "test.pdf")),
        reason="test.pdf not available",
    )
    def test_real_pdf_page_cleanup(self):
        """Integration test: verify page.close() is called with a real PDF."""
        import pdfplumber

        close_call_count = 0
        original_close = pdfplumber.page.Page.close

        def tracking_close(page):
            # Count the call, then defer to the real implementation so the
            # page is still cleaned up as usual.
            nonlocal close_call_count
            close_call_count += 1
            original_close(page)

        with patch.object(pdfplumber.page.Page, "close", tracking_close):
            md = MarkItDown()
            pdf_path = os.path.join(TEST_FILES_DIR, "test.pdf")
            md.convert(pdf_path)

        assert (
            close_call_count > 0
        ), "page.close() was never called during PDF conversion"
|
||||
|
||||
|
||||
def _generate_table_pdf(num_pages: int, rows_per_page: int = 20) -> bytes:
    """Generate a PDF with table-like content on every page.

    Each page gets a bordered header row ("Parameter", "Value", "Unit")
    followed by bordered data rows whose text depends on the page and row
    index.

    Args:
        num_pages: Number of pages to emit.
        rows_per_page: Data rows requested per page (default 20). Rows whose
            y-offset would exceed 270 are dropped.

    Returns:
        The serialized PDF document as immutable ``bytes``.

    Raises:
        ImportError: If the ``fpdf`` module is not installed.
    """
    from fpdf import FPDF

    pdf = FPDF()
    pdf.set_auto_page_break(auto=False)
    for page_num in range(num_pages):
        pdf.add_page()
        pdf.set_font("Helvetica", size=10)
        pdf.set_xy(10, 10)
        for heading in ("Parameter", "Value", "Unit"):
            pdf.cell(60, 8, heading, border=1)
        pdf.ln()
        for row in range(rows_per_page):
            y = 18 + row * 8
            # Presumably a bottom-of-page guard (auto page break is off);
            # with the default 20 rows the largest y is 170, so it only
            # triggers for larger rows_per_page values.
            if y > 270:
                break
            pdf.set_xy(10, y)
            pdf.cell(60, 8, f"Param_{page_num}_{row}", border=1)
            pdf.cell(60, 8, f"{(page_num * 100 + row) * 1.23:.2f}", border=1)
            pdf.cell(60, 8, "kg/m2", border=1)
    # fpdf2 documents output() as returning a bytearray; normalise so the
    # declared ``bytes`` return type actually holds.
    return bytes(pdf.output())
|
||||
|
||||
|
||||
@pytest.mark.skipif(
    not _has_fpdf2(),
    reason="fpdf2 not installed",
)
class TestPdfMemoryBenchmark:
    """Benchmark: verify memory stays constant with page.close() fix."""

    @staticmethod
    def _peak_traced_bytes(num_pages):
        """Convert a generated *num_pages*-page PDF and return peak traced bytes.

        The PDF is generated before tracing starts, so its construction is
        not included in the measurement.

        Args:
            num_pages: Page count passed to ``_generate_table_pdf``.

        Returns:
            The peak value reported by ``tracemalloc.get_traced_memory()``
            after the conversion.
        """
        from markitdown import StreamInfo

        pdf_bytes = _generate_table_pdf(num_pages)

        gc.collect()
        tracemalloc.start()
        try:
            md = MarkItDown()
            buf = io.BytesIO(pdf_bytes)
            md.convert_stream(buf, stream_info=StreamInfo(extension=".pdf"))
            _, peak = tracemalloc.get_traced_memory()
        finally:
            # Always switch tracing off, even if the conversion raises, so a
            # failure here does not leave tracemalloc running for later tests.
            tracemalloc.stop()
        return peak

    def test_memory_does_not_grow_linearly(self):
        """Peak memory for 200 pages should be far less than without the fix.

        Without page.close(), 200 pages uses ~225 MiB (linear growth).
        With the fix, peak memory should stay under 30 MiB.
        """
        num_pages = 200
        peak_mib = self._peak_traced_bytes(num_pages) / 1024 / 1024

        # Without the fix this would be ~225 MiB. With the fix it should
        # be well under 30 MiB. Use a generous threshold to avoid flaky
        # failures on different machines.
        assert peak_mib < 30, (
            f"Peak memory {peak_mib:.1f} MiB for {num_pages} pages is too high. "
            f"Expected < 30 MiB with page.close() fix."
        )

    def test_memory_constant_across_page_counts(self):
        """Peak memory should not scale linearly with page count.

        Converts 50-page and 200-page PDFs and asserts the peak memory
        ratio is much less than the 4x page count ratio.
        """
        results = {
            num_pages: self._peak_traced_bytes(num_pages) for num_pages in [50, 200]
        }

        ratio = results[200] / results[50]
        # With O(n) memory growth the ratio would be ~4x.
        # With the fix it should be close to 1x (well under 2x).
        assert ratio < 2.0, (
            f"Memory ratio 200p/50p = {ratio:.2f}x — "
            f"expected < 2.0x (constant memory). "
            f"50p={results[50] / 1024 / 1024:.1f} MiB, "
            f"200p={results[200] / 1024 / 1024:.1f} MiB"
        )
|
||||
@@ -1,5 +1,6 @@
|
||||
#!/usr/bin/env python3 -m pytest
|
||||
"""Tests for PDF table extraction functionality."""
|
||||
|
||||
import os
|
||||
import re
|
||||
import pytest
|
||||
@@ -650,6 +651,332 @@ class TestPdfTableExtraction:
|
||||
result.text_content.strip() == ""
|
||||
), f"Scanned PDF should have empty extraction, got: '{result.text_content[:100]}...'"
|
||||
|
||||
def test_movie_theater_booking_pdf_extraction(self, markitdown):
    """Check extraction of the movie theater booking PDF (complex tables).

    The converted text must contain pipe separators, and each group of
    strings below (booking header, agency, customer, summary totals, show
    schedule) is handed to ``validate_strings`` in turn.
    """
    pdf_path = os.path.join(TEST_FILES_DIR, "movie-theater-booking-2024.pdf")
    if not os.path.exists(pdf_path):
        pytest.skip(f"Test file not found: {pdf_path}")

    result = markitdown.convert(pdf_path)

    # Validate pipe-separated table format
    assert "|" in result.text_content, "Booking order should contain pipe separators"

    string_groups = (
        # Key booking information
        [
            "BOOKING ORDER",
            "2024-12-5678",  # Order number
            "Holiday Movie Marathon Package",  # Product description
            "12/20/2024 - 12/31/2024",  # Booking dates
            "SC-WINTER-2024",  # Alt order number
            "STARLIGHT CINEMAS",  # Cinema brand
        ],
        # Agency information
        [
            "Premier Entertainment Group",  # Agency name
            "Michael Chen",  # Contact
            "Sarah Johnson",  # Primary contact
            "Downtown Multiplex",  # Cinema name
        ],
        # Customer information
        [
            "Universal Studios Distribution",  # Customer name
            "Film Distributor",  # Category
            "CUST-98765",  # Customer ID
        ],
        # Booking summary totals
        [
            "$12,500.00",  # Gross amount
            "$11,250.00",  # Net amount
            "December 2024",  # Month
            "48",  # Number of shows
        ],
        # Show schedule details
        [
            "Holiday Spectacular",  # Movie title
            "Winter Wonderland",  # Movie title
            "New Year Mystery",  # Movie title
            "IMAX 3D",  # Format
            "$250",  # Rate
            "$300",  # Rate
            "$3,000",  # Revenue
            "$3,600",  # Revenue
        ],
    )
    for group in string_groups:
        validate_strings(result, group)
|
||||
|
||||
|
||||
class TestPdfFullOutputComparison:
    """Test that PDF extraction produces expected complete outputs.

    Each test converts one PDF from ``TEST_FILES_DIR`` and compares it
    against the matching file under ``expected_outputs``. A test is skipped
    when either file is missing.
    """

    @pytest.fixture
    def markitdown(self):
        """Create MarkItDown instance."""
        return MarkItDown()

    @staticmethod
    def _convert_and_load_expected(markitdown, pdf_name, expected_name):
        """Convert *pdf_name* and read its stored expected output.

        Args:
            markitdown: Converter instance exposing ``convert(path)``.
            pdf_name: File name of the PDF inside ``TEST_FILES_DIR``.
            expected_name: File name inside ``TEST_FILES_DIR/expected_outputs``.

        Returns:
            A ``(actual_output, expected_output)`` pair of strings.

        Skips the calling test (via ``pytest.skip``) if either file is absent.
        """
        pdf_path = os.path.join(TEST_FILES_DIR, pdf_name)
        expected_path = os.path.join(TEST_FILES_DIR, "expected_outputs", expected_name)

        if not os.path.exists(pdf_path):
            pytest.skip(f"Test file not found: {pdf_path}")

        if not os.path.exists(expected_path):
            pytest.skip(f"Expected output not found: {expected_path}")

        result = markitdown.convert(pdf_path)

        with open(expected_path, "r", encoding="utf-8") as f:
            expected_output = f.read()

        return result.text_content, expected_output

    @staticmethod
    def _assert_line_count_close(actual_output, expected_output):
        """Assert both texts have the same number of lines, give or take two.

        Returns:
            The right-stripped lines of *actual_output*, for further checks.
        """
        actual_lines = [line.rstrip() for line in actual_output.split("\n")]
        expected_lines = [line.rstrip() for line in expected_output.split("\n")]

        assert abs(len(actual_lines) - len(expected_lines)) <= 2, (
            f"Line count mismatch: actual={len(actual_lines)}, "
            f"expected={len(expected_lines)}"
        )
        return actual_lines

    @staticmethod
    def _assert_sections_present(actual_output, sections):
        """Assert every string in *sections* occurs in *actual_output*."""
        for section in sections:
            assert section in actual_output, f"Missing section: {section}"

    def test_movie_theater_full_output(self, markitdown):
        """Test complete output for movie theater booking PDF."""
        actual_output, expected_output = self._convert_and_load_expected(
            markitdown,
            "movie-theater-booking-2024.pdf",
            "movie-theater-booking-2024.md",
        )

        # Check line count
        actual_lines = self._assert_line_count_close(actual_output, expected_output)

        # Check structural elements
        assert actual_output.count("|") > 80, "Should have many pipe separators"
        assert actual_output.count("---") > 8, "Should have table separators"

        # Validate critical sections
        self._assert_sections_present(
            actual_output,
            [
                "BOOKING ORDER",
                "STARLIGHT CINEMAS",
                "2024-12-5678",
                "Holiday Spectacular",
                "$12,500.00",
            ],
        )

        # Check table structure
        table_rows = [line for line in actual_lines if line.startswith("|")]
        assert (
            len(table_rows) > 15
        ), f"Should have >15 table rows, got {len(table_rows)}"

    def test_sparse_borderless_table_full_output(self, markitdown):
        """Test complete output for SPARSE borderless table PDF."""
        actual_output, expected_output = self._convert_and_load_expected(
            markitdown,
            "SPARSE-2024-INV-1234_borderless_table.pdf",
            "SPARSE-2024-INV-1234_borderless_table.md",
        )

        # Check line count is close
        self._assert_line_count_close(actual_output, expected_output)

        # Check structural elements
        assert actual_output.count("|") > 50, "Should have many pipe separators"

        # Validate critical sections
        self._assert_sections_present(
            actual_output,
            [
                "INVENTORY RECONCILIATION REPORT",
                "SPARSE-2024-INV-1234",
                "SKU-8847",
                "SKU-9201",
                "Variance Analysis",
            ],
        )

    def test_repair_multipage_full_output(self, markitdown):
        """Test complete output for REPAIR multipage invoice PDF."""
        actual_output, expected_output = self._convert_and_load_expected(
            markitdown,
            "REPAIR-2022-INV-001_multipage.pdf",
            "REPAIR-2022-INV-001_multipage.md",
        )

        # Check line count is close
        self._assert_line_count_close(actual_output, expected_output)

        # Check structural elements
        assert actual_output.count("|") > 40, "Should have many pipe separators"

        # Validate critical sections
        self._assert_sections_present(
            actual_output,
            [
                "ZAVA AUTO REPAIR",
                "Gabriel Diaz",
                "Jeep",
                "Grand Cherokee",
                "GRAND TOTAL",
            ],
        )

    def test_receipt_full_output(self, markitdown):
        """Test complete output for RECEIPT retail purchase PDF."""
        actual_output, expected_output = self._convert_and_load_expected(
            markitdown,
            "RECEIPT-2024-TXN-98765_retail_purchase.pdf",
            "RECEIPT-2024-TXN-98765_retail_purchase.md",
        )

        # Check line count is close
        self._assert_line_count_close(actual_output, expected_output)

        # Validate critical sections
        self._assert_sections_present(
            actual_output,
            [
                "TECHMART ELECTRONICS",
                "TXN-98765-2024",
                "Sarah Mitchell",
                "$821.14",
                "RETURN POLICY",
            ],
        )

    def test_academic_paper_full_output(self, markitdown):
        """Test complete output for academic paper PDF."""
        actual_output, expected_output = self._convert_and_load_expected(
            markitdown, "test.pdf", "test.md"
        )

        # Check line count is close
        self._assert_line_count_close(actual_output, expected_output)

        # Academic paper should not have pipe separators
        assert (
            actual_output.count("|") == 0
        ), "Academic paper should not have pipe separators"

        # Validate critical sections
        self._assert_sections_present(
            actual_output,
            [
                "Introduction",
                "Large language models",
                "agents",
                "multi-agent",
            ],
        )

    def test_medical_scan_full_output(self, markitdown):
        """Test complete output for medical report scan PDF (empty, no text layer)."""
        actual_output, expected_output = self._convert_and_load_expected(
            markitdown,
            "MEDRPT-2024-PAT-3847_medical_report_scan.pdf",
            "MEDRPT-2024-PAT-3847_medical_report_scan.md",
        )

        # Both should be empty (scanned PDF with no text layer)
        assert actual_output.strip() == "", "Scanned PDF should produce empty output"
        assert (
            expected_output.strip() == ""
        ), "Expected output should be empty for scanned PDF"
|
||||
|
||||
|
||||
class TestPdfTableMarkdownFormat:
|
||||
"""Test that extracted tables have proper markdown formatting."""
|
||||
|
||||
Reference in New Issue
Block a user