Compare commits
104 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| 7b040a4445 | |||
| 63cbbd9de6 | |||
| a6c8ac46a6 | |||
| c6308dc822 | |||
| 4a5340f93b | |||
| 6b0fd15e60 | |||
| 2b6ec9f315 | |||
| c83de14a9c | |||
| 7fdaefb724 | |||
| 251dddcf0c | |||
| dde250a456 | |||
| 3d4fe3cdcc | |||
| 447c047731 | |||
| 8a9d8f1593 | |||
| 17365654c9 | |||
| 59eb60f8cb | |||
| 459d462f29 | |||
| c3f6cb356c | |||
| 0c4d3945a0 | |||
| f8b60b5403 | |||
| 16ca285d30 | |||
| b81a387616 | |||
| ea1a3dfb60 | |||
| b6e5da8874 | |||
| fb1ad24833 | |||
| 1178c2e211 | |||
| 9278119bb3 | |||
| da7bcea527 | |||
| 3bfb821c09 | |||
| 62b72284fe | |||
| 1dd3c83339 | |||
| 9dc982a3b1 | |||
| effde4767b | |||
| 04bf831209 | |||
| 9fd680c366 | |||
| 38261fd31c | |||
| 131f0c7739 | |||
| 56f7579ce2 | |||
| cb421cf9ea | |||
| 39e7252940 | |||
| bbcf876b18 | |||
| 041be54471 | |||
| ebe2684b3d | |||
| 8576f1d915 | |||
| 3fcd48cdfc | |||
| 9e067c42b6 | |||
| 9a951055f0 | |||
| 73b9d57312 | |||
| 3ca57986ef | |||
| c1f9a323ee | |||
| e928b43afb | |||
| 2ffe6ea591 | |||
| efc55b260d | |||
| 52432bd228 | |||
| c0a511ecff | |||
| cd6aa41361 | |||
| 716f74dcb9 | |||
| a93e0567e6 | |||
| c5f70b904f | |||
| 53834fdd24 | |||
| 5c565b7d79 | |||
| a78857bd43 | |||
| 09df7fe8df | |||
| 6a9f09b153 | |||
| 0b815fb916 | |||
| 12620f1545 | |||
| 5f75e16d20 | |||
| 75140a90e2 | |||
| af1be36e0c | |||
| 2a2ccc86aa | |||
| 2e51ba22e7 | |||
| 8f8e58c9bb | |||
| 8e73a325c6 | |||
| 2405f201af | |||
| 99d8e562db | |||
| 515fa854bf | |||
| 0229ff6cb7 | |||
| 82d84e3edd | |||
| 36c4bc9ec3 | |||
| 80baa5db18 | |||
| 00a65e8f8b | |||
| 6bedf6d950 | |||
| 9380112892 | |||
| 784c293579 | |||
| 70e9f8c3c0 | |||
| e921497f79 | |||
| 1d2f231146 | |||
| c5cd659f63 | |||
| f01c6c5277 | |||
| 43bd79adc9 | |||
| 9182923375 | |||
| 9a19fdd134 | |||
| e82e0c1372 | |||
| a394cc7c27 | |||
| a87fbf01ee | |||
| d0ed74fdf4 | |||
| e4b419ba40 | |||
| dbdf2c0c10 | |||
| 97eeed5f32 | |||
| 935da9976c | |||
| 5ce85c236c | |||
| 3a5ca22a8d | |||
| 4b62506451 | |||
| c73afcffea |
+3
-1
@@ -1 +1,3 @@
|
||||
*
|
||||
*
|
||||
!packages/
|
||||
!app.py
|
||||
|
||||
+5
-1
@@ -1 +1,5 @@
|
||||
tests/test_files/** linguist-vendored
|
||||
packages/markitdown/tests/test_files/** linguist-vendored
|
||||
packages/markitdown-sample-plugin/tests/test_files/** linguist-vendored
|
||||
|
||||
# Treat PDF files as binary to prevent line ending conversion
|
||||
*.pdf binary
|
||||
|
||||
@@ -5,7 +5,7 @@ jobs:
|
||||
pre-commit:
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
- uses: actions/checkout@v5
|
||||
- name: Set up Python
|
||||
uses: actions/setup-python@v5
|
||||
with:
|
||||
|
||||
@@ -5,21 +5,14 @@ jobs:
|
||||
tests:
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
- uses: actions/checkout@v5
|
||||
- uses: actions/setup-python@v5
|
||||
with:
|
||||
python-version: |
|
||||
3.10
|
||||
3.11
|
||||
3.12
|
||||
- name: Set up pip cache
|
||||
if: runner.os == 'Linux'
|
||||
uses: actions/cache@v4
|
||||
with:
|
||||
path: ~/.cache/pip
|
||||
key: ${{ runner.os }}-pip-${{ hashFiles('pyproject.toml') }}
|
||||
restore-keys: ${{ runner.os }}-pip-
|
||||
- name: Install Hatch
|
||||
run: pipx install hatch
|
||||
- name: Run tests
|
||||
run: hatch test
|
||||
run: cd packages/markitdown; hatch test
|
||||
|
||||
@@ -52,6 +52,7 @@ coverage.xml
|
||||
.hypothesis/
|
||||
.pytest_cache/
|
||||
cover/
|
||||
.test-logs/
|
||||
|
||||
# Translations
|
||||
*.mo
|
||||
@@ -164,3 +165,4 @@ cython_debug/
|
||||
#.idea/
|
||||
src/.DS_Store
|
||||
.DS_Store
|
||||
.cursorrules
|
||||
|
||||
@@ -0,0 +1,14 @@
|
||||
# markitdown
|
||||
|
||||
이 파일은 Claude Code가 어느 경로에서 실행되든 자동으로 로드합니다.
|
||||
|
||||
## 프로젝트 개요
|
||||
- md 파일로 변환 간소화
|
||||
|
||||
## 저장소
|
||||
- Git 서버: Gitea (자체 NAS 운영)
|
||||
- Gitea URL: https://gitea.gru.farm/
|
||||
- 계정: airkjw
|
||||
- 저장소: markitdown
|
||||
- Remote: https://gitea.gru.farm/airkjw/markitdown
|
||||
- 토큰: <REDACTED> (액세스 토큰은 저장소에 커밋하지 말고 환경 변수나 시크릿 저장소로 관리하세요. 이미 노출된 토큰은 폐기 후 재발급해야 합니다.)
|
||||
+20
-10
@@ -1,22 +1,32 @@
|
||||
FROM python:3.13-slim-bullseye
|
||||
|
||||
USER root
|
||||
|
||||
ARG INSTALL_GIT=false
|
||||
RUN if [ "$INSTALL_GIT" = "true" ]; then \
|
||||
apt-get update && apt-get install -y git && rm -rf /var/lib/apt/lists/*; \
|
||||
fi
|
||||
ENV DEBIAN_FRONTEND=noninteractive
|
||||
ENV EXIFTOOL_PATH=/usr/bin/exiftool
|
||||
ENV FFMPEG_PATH=/usr/bin/ffmpeg
|
||||
|
||||
# Runtime dependency
|
||||
RUN apt-get update && apt-get install -y --no-install-recommends \
|
||||
ffmpeg \
|
||||
&& rm -rf /var/lib/apt/lists/*
|
||||
exiftool
|
||||
|
||||
RUN pip install markitdown
|
||||
ARG INSTALL_GIT=false
|
||||
RUN if [ "$INSTALL_GIT" = "true" ]; then \
|
||||
apt-get install -y --no-install-recommends \
|
||||
git; \
|
||||
fi
|
||||
|
||||
# Cleanup
|
||||
RUN rm -rf /var/lib/apt/lists/*
|
||||
|
||||
WORKDIR /app
|
||||
COPY . /app
|
||||
RUN pip --no-cache-dir install \
|
||||
/app/packages/markitdown[all] \
|
||||
/app/packages/markitdown-sample-plugin
|
||||
|
||||
# Default USERID and GROUPID
|
||||
ARG USERID=10000
|
||||
ARG GROUPID=10000
|
||||
ARG USERID=nobody
|
||||
ARG GROUPID=nogroup
|
||||
|
||||
USER $USERID:$GROUPID
|
||||
|
||||
|
||||
@@ -0,0 +1,34 @@
|
||||
FROM python:3.13-slim-bullseye
|
||||
|
||||
ENV DEBIAN_FRONTEND=noninteractive
|
||||
ENV EXIFTOOL_PATH=/usr/local/bin/exiftool
|
||||
ENV FFMPEG_PATH=/usr/bin/ffmpeg
|
||||
|
||||
RUN apt-get update && apt-get install -y --no-install-recommends \
|
||||
ffmpeg \
|
||||
curl \
|
||||
perl \
|
||||
make \
|
||||
&& rm -rf /var/lib/apt/lists/* \
|
||||
&& curl -fsSL https://exiftool.org/Image-ExifTool-13.55.tar.gz -o /tmp/exiftool.tar.gz \
|
||||
&& tar -xzf /tmp/exiftool.tar.gz -C /tmp \
|
||||
&& cd /tmp/Image-ExifTool-13.55 \
|
||||
&& perl Makefile.PL && make install \
|
||||
&& rm -rf /tmp/exiftool.tar.gz /tmp/Image-ExifTool-13.55
|
||||
|
||||
WORKDIR /app
|
||||
COPY packages/ /app/packages/
|
||||
COPY app.py /app/app.py
|
||||
|
||||
RUN pip --no-cache-dir install \
|
||||
/app/packages/markitdown[all] \
|
||||
streamlit
|
||||
|
||||
EXPOSE 8501
|
||||
|
||||
HEALTHCHECK CMD curl -f http://localhost:8501/_stcore/health || exit 1
|
||||
|
||||
ENTRYPOINT ["streamlit", "run", "app.py", \
|
||||
"--server.port=8501", \
|
||||
"--server.address=0.0.0.0", \
|
||||
"--server.headless=true"]
|
||||
@@ -4,9 +4,19 @@
|
||||

|
||||
[Built by AutoGen Team](https://github.com/microsoft/autogen)
|
||||
|
||||
> [!TIP]
|
||||
> MarkItDown now offers an MCP (Model Context Protocol) server for integration with LLM applications like Claude Desktop. See [markitdown-mcp](https://github.com/microsoft/markitdown/tree/main/packages/markitdown-mcp) for more information.
|
||||
|
||||
> [!IMPORTANT]
|
||||
> Breaking changes between 0.0.1 and 0.1.0:
|
||||
> * Dependencies are now organized into optional feature-groups (further details below). Use `pip install 'markitdown[all]'` to have backward-compatible behavior.
|
||||
> * convert\_stream() now requires a binary file-like object (e.g., a file opened in binary mode, or an io.BytesIO object). This is a breaking change from the previous version, which also accepted text file-like objects, such as io.StringIO.
|
||||
> * The DocumentConverter class interface has changed to read from file-like streams rather than file paths. *No temporary files are created anymore*. If you are the maintainer of a plugin, or custom DocumentConverter, you likely need to update your code. Otherwise, if only using the MarkItDown class or CLI (as in these examples), you should not need to change anything.
|
||||
|
||||
MarkItDown is a lightweight Python utility for converting various files to Markdown for use with LLMs and related text analysis pipelines. To this end, it is most comparable to [textract](https://github.com/deanmalmgren/textract), but with a focus on preserving important document structure and content as Markdown (including: headings, lists, tables, links, etc.). While the output is often reasonably presentable and human-friendly, it is meant to be consumed by text analysis tools -- and may not be the best option for high-fidelity document conversions for human consumption.
|
||||
|
||||
MarkItDown currently supports the conversion from:
|
||||
|
||||
MarkItDown is a utility for converting various files to Markdown (e.g., for indexing, text analysis, etc).
|
||||
It supports:
|
||||
- PDF
|
||||
- PowerPoint
|
||||
- Word
|
||||
@@ -16,8 +26,53 @@ It supports:
|
||||
- HTML
|
||||
- Text-based formats (CSV, JSON, XML)
|
||||
- ZIP files (iterates over contents)
|
||||
- YouTube URLs
|
||||
- EPubs
|
||||
- ... and more!
|
||||
|
||||
To install MarkItDown, use pip: `pip install markitdown`. Alternatively, you can install it from the source: `pip install -e .`
|
||||
## Why Markdown?
|
||||
|
||||
Markdown is extremely close to plain text, with minimal markup or formatting, but still
|
||||
provides a way to represent important document structure. Mainstream LLMs, such as
|
||||
OpenAI's GPT-4o, natively "_speak_" Markdown, and often incorporate Markdown into their
|
||||
responses unprompted. This suggests that they have been trained on vast amounts of
|
||||
Markdown-formatted text, and understand it well. As a side benefit, Markdown conventions
|
||||
are also highly token-efficient.
|
||||
|
||||
## Prerequisites
|
||||
MarkItDown requires Python 3.10 or higher. It is recommended to use a virtual environment to avoid dependency conflicts.
|
||||
|
||||
With the standard Python installation, you can create and activate a virtual environment using the following commands:
|
||||
|
||||
```bash
|
||||
python -m venv .venv
|
||||
source .venv/bin/activate
|
||||
```
|
||||
|
||||
If using `uv`, you can create a virtual environment with:
|
||||
|
||||
```bash
|
||||
uv venv --python=3.12 .venv
|
||||
source .venv/bin/activate
|
||||
# NOTE: Be sure to use 'uv pip install' rather than just 'pip install' to install packages in this virtual environment
|
||||
```
|
||||
|
||||
If you are using Anaconda, you can create a virtual environment with:
|
||||
|
||||
```bash
|
||||
conda create -n markitdown python=3.12
|
||||
conda activate markitdown
|
||||
```
|
||||
|
||||
## Installation
|
||||
|
||||
To install MarkItDown, use pip: `pip install 'markitdown[all]'`. Alternatively, you can install it from the source:
|
||||
|
||||
```bash
|
||||
git clone git@github.com:microsoft/markitdown.git
|
||||
cd markitdown
|
||||
pip install -e 'packages/markitdown[all]'
|
||||
```
|
||||
|
||||
## Usage
|
||||
|
||||
@@ -33,18 +88,90 @@ Or use `-o` to specify the output file:
|
||||
markitdown path-to-file.pdf -o document.md
|
||||
```
|
||||
|
||||
To use Document Intelligence conversion:
|
||||
|
||||
```bash
|
||||
markitdown path-to-file.pdf -o document.md -d -e "<document_intelligence_endpoint>"
|
||||
```
|
||||
|
||||
You can also pipe content:
|
||||
|
||||
```bash
|
||||
cat path-to-file.pdf | markitdown
|
||||
```
|
||||
|
||||
### Optional Dependencies
|
||||
MarkItDown has optional dependencies for activating various file formats. Earlier in this document, we installed all optional dependencies with the `[all]` option. However, you can also install them individually for more control. For example:
|
||||
|
||||
```bash
|
||||
pip install 'markitdown[pdf, docx, pptx]'
|
||||
```
|
||||
|
||||
will install only the dependencies for PDF, DOCX, and PPTX files.
|
||||
|
||||
At the moment, the following optional dependencies are available:
|
||||
|
||||
* `[all]` Installs all optional dependencies
|
||||
* `[pptx]` Installs dependencies for PowerPoint files
|
||||
* `[docx]` Installs dependencies for Word files
|
||||
* `[xlsx]` Installs dependencies for Excel files
|
||||
* `[xls]` Installs dependencies for older Excel files
|
||||
* `[pdf]` Installs dependencies for PDF files
|
||||
* `[outlook]` Installs dependencies for Outlook messages
|
||||
* `[az-doc-intel]` Installs dependencies for Azure Document Intelligence
|
||||
* `[audio-transcription]` Installs dependencies for audio transcription of wav and mp3 files
|
||||
* `[youtube-transcription]` Installs dependencies for fetching YouTube video transcription
|
||||
|
||||
### Plugins
|
||||
|
||||
MarkItDown also supports 3rd-party plugins. Plugins are disabled by default. To list installed plugins:
|
||||
|
||||
```bash
|
||||
markitdown --list-plugins
|
||||
```
|
||||
|
||||
To enable plugins use:
|
||||
|
||||
```bash
|
||||
markitdown --use-plugins path-to-file.pdf
|
||||
```
|
||||
|
||||
To find available plugins, search GitHub for the hashtag `#markitdown-plugin`. To develop a plugin, see `packages/markitdown-sample-plugin`.
|
||||
|
||||
#### markitdown-ocr Plugin
|
||||
|
||||
The `markitdown-ocr` plugin adds OCR support to PDF, DOCX, PPTX, and XLSX converters, extracting text from embedded images using LLM Vision — the same `llm_client` / `llm_model` pattern that MarkItDown already uses for image descriptions. No new ML libraries or binary dependencies required.
|
||||
|
||||
**Installation:**
|
||||
|
||||
```bash
|
||||
pip install markitdown-ocr
|
||||
pip install openai # or any OpenAI-compatible client
|
||||
```
|
||||
|
||||
**Usage:**
|
||||
|
||||
Pass the same `llm_client` and `llm_model` you would use for image descriptions:
|
||||
|
||||
```python
|
||||
from markitdown import MarkItDown
|
||||
from openai import OpenAI
|
||||
|
||||
md = MarkItDown(
|
||||
enable_plugins=True,
|
||||
llm_client=OpenAI(),
|
||||
llm_model="gpt-4o",
|
||||
)
|
||||
result = md.convert("document_with_images.pdf")
|
||||
print(result.text_content)
|
||||
```
|
||||
|
||||
If no `llm_client` is provided the plugin still loads, but OCR is silently skipped and the standard built-in converter is used instead.
|
||||
|
||||
See [`packages/markitdown-ocr/README.md`](packages/markitdown-ocr/README.md) for detailed documentation.
|
||||
|
||||
### Azure Document Intelligence
|
||||
|
||||
To use Microsoft Document Intelligence for conversion:
|
||||
|
||||
```bash
|
||||
markitdown path-to-file.pdf -o document.md -d -e "<document_intelligence_endpoint>"
|
||||
```
|
||||
|
||||
More information about how to set up an Azure Document Intelligence Resource can be found [here](https://learn.microsoft.com/en-us/azure/ai-services/document-intelligence/how-to-guides/create-document-intelligence-resource?view=doc-intel-4.0.0).
|
||||
|
||||
### Python API
|
||||
@@ -54,7 +181,7 @@ Basic usage in Python:
|
||||
```python
|
||||
from markitdown import MarkItDown
|
||||
|
||||
md = MarkItDown()
|
||||
md = MarkItDown(enable_plugins=False) # Set to True to enable plugins
|
||||
result = md.convert("test.xlsx")
|
||||
print(result.text_content)
|
||||
```
|
||||
@@ -69,14 +196,14 @@ result = md.convert("test.pdf")
|
||||
print(result.text_content)
|
||||
```
|
||||
|
||||
To use Large Language Models for image descriptions, provide `llm_client` and `llm_model`:
|
||||
To use Large Language Models for image descriptions (currently only for pptx and image files), provide `llm_client` and `llm_model`:
|
||||
|
||||
```python
|
||||
from markitdown import MarkItDown
|
||||
from openai import OpenAI
|
||||
|
||||
client = OpenAI()
|
||||
md = MarkItDown(llm_client=client, llm_model="gpt-4o")
|
||||
md = MarkItDown(llm_client=client, llm_model="gpt-4o", llm_prompt="optional custom prompt")
|
||||
result = md.convert("example.jpg")
|
||||
print(result.text_content)
|
||||
```
|
||||
@@ -87,46 +214,10 @@ print(result.text_content)
|
||||
docker build -t markitdown:latest .
|
||||
docker run --rm -i markitdown:latest < ~/your-file.pdf > output.md
|
||||
```
|
||||
<details>
|
||||
|
||||
<summary>Batch Processing Multiple Files</summary>
|
||||
|
||||
This example shows how to convert multiple files to markdown format in a single run. The script processes all supported files in a directory and creates corresponding markdown files.
|
||||
|
||||
|
||||
```python convert.py
|
||||
from markitdown import MarkItDown
|
||||
from openai import OpenAI
|
||||
import os
|
||||
client = OpenAI(api_key="your-api-key-here")
|
||||
md = MarkItDown(llm_client=client, llm_model="gpt-4o-2024-11-20")
|
||||
supported_extensions = ('.pptx', '.docx', '.pdf', '.jpg', '.jpeg', '.png')
|
||||
files_to_convert = [f for f in os.listdir('.') if f.lower().endswith(supported_extensions)]
|
||||
for file in files_to_convert:
|
||||
print(f"\nConverting {file}...")
|
||||
try:
|
||||
md_file = os.path.splitext(file)[0] + '.md'
|
||||
result = md.convert(file)
|
||||
with open(md_file, 'w') as f:
|
||||
f.write(result.text_content)
|
||||
|
||||
print(f"Successfully converted {file} to {md_file}")
|
||||
except Exception as e:
|
||||
print(f"Error converting {file}: {str(e)}")
|
||||
|
||||
print("\nAll conversions completed!")
|
||||
```
|
||||
2. Place the script in the same directory as your files
|
||||
3. Install required packages: like openai
|
||||
4. Run script ```bash python convert.py ```
|
||||
|
||||
Note that original files will remain unchanged and new markdown files are created with the same base name.
|
||||
|
||||
</details>
|
||||
|
||||
## Contributing
|
||||
|
||||
This project welcomes contributions and suggestions. Most contributions require you to agree to a
|
||||
This project welcomes contributions and suggestions. Most contributions require you to agree to a
|
||||
Contributor License Agreement (CLA) declaring that you have the right to, and actually do, grant us
|
||||
the rights to use your contribution. For details, visit https://cla.opensource.microsoft.com.
|
||||
|
||||
@@ -140,35 +231,46 @@ contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with any additio
|
||||
|
||||
### How to Contribute
|
||||
|
||||
You can help by looking at issues or helping review PRs. Any issue or PR is welcome, but we have also marked some as 'open for contribution' and 'open for reviewing' to help facilitate community contributions. These are ofcourse just suggestions and you are welcome to contribute in any way you like.
|
||||
|
||||
You can help by looking at issues or helping review PRs. Any issue or PR is welcome, but we have also marked some as 'open for contribution' and 'open for reviewing' to help facilitate community contributions. These are of course just suggestions and you are welcome to contribute in any way you like.
|
||||
|
||||
<div align="center">
|
||||
|
||||
| | All | Especially Needs Help from Community |
|
||||
|-----------------------|------------------------------------------|------------------------------------------------------------------------------------------|
|
||||
| **Issues** | [All Issues](https://github.com/microsoft/markitdown/issues) | [Issues open for contribution](https://github.com/microsoft/markitdown/issues?q=is%3Aissue+is%3Aopen+label%3A%22open+for+contribution%22) |
|
||||
| **PRs** | [All PRs](https://github.com/microsoft/markitdown/pulls) | [PRs open for reviewing](https://github.com/microsoft/markitdown/pulls?q=is%3Apr+is%3Aopen+label%3A%22open+for+reviewing%22) |
|
||||
| | All | Especially Needs Help from Community |
|
||||
| ---------- | ------------------------------------------------------------ | ----------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| **Issues** | [All Issues](https://github.com/microsoft/markitdown/issues) | [Issues open for contribution](https://github.com/microsoft/markitdown/issues?q=is%3Aissue+is%3Aopen+label%3A%22open+for+contribution%22) |
|
||||
| **PRs** | [All PRs](https://github.com/microsoft/markitdown/pulls) | [PRs open for reviewing](https://github.com/microsoft/markitdown/pulls?q=is%3Apr+is%3Aopen+label%3A%22open+for+reviewing%22) |
|
||||
|
||||
</div>
|
||||
|
||||
### Running Tests and Checks
|
||||
|
||||
- Navigate to the MarkItDown package:
|
||||
|
||||
```sh
|
||||
cd packages/markitdown
|
||||
```
|
||||
|
||||
- Install `hatch` in your environment and run tests:
|
||||
```sh
|
||||
pip install hatch # Other ways of installing hatch: https://hatch.pypa.io/dev/install/
|
||||
hatch shell
|
||||
hatch test
|
||||
```
|
||||
|
||||
```sh
|
||||
pip install hatch # Other ways of installing hatch: https://hatch.pypa.io/dev/install/
|
||||
hatch shell
|
||||
hatch test
|
||||
```
|
||||
|
||||
(Alternative) Use the Devcontainer which has all the dependencies installed:
|
||||
```sh
|
||||
# Reopen the project in Devcontainer and run:
|
||||
hatch test
|
||||
```
|
||||
|
||||
```sh
|
||||
# Reopen the project in Devcontainer and run:
|
||||
hatch test
|
||||
```
|
||||
|
||||
- Run pre-commit checks before submitting a PR: `pre-commit run --all-files`
|
||||
|
||||
### Contributing 3rd-party Plugins
|
||||
|
||||
You can also contribute by creating and sharing 3rd-party plugins. See `packages/markitdown-sample-plugin` for more details.
|
||||
|
||||
## Trademarks
|
||||
|
||||
This project may contain trademarks or logos for projects, products, or services. Authorized use of Microsoft
|
||||
|
||||
@@ -0,0 +1,117 @@
|
||||
import io
|
||||
import tempfile
|
||||
import os
|
||||
import streamlit as st
|
||||
from markitdown import MarkItDown
|
||||
|
||||
st.set_page_config(
|
||||
page_title="MarkItDown",
|
||||
page_icon="📄",
|
||||
layout="wide",
|
||||
)
|
||||
|
||||
st.title("📄 MarkItDown")
|
||||
st.caption("파일을 Markdown으로 변환합니다")
|
||||
|
||||
SUPPORTED_EXTENSIONS = [
|
||||
"pdf", "docx", "pptx", "xlsx", "xls",
|
||||
"jpg", "jpeg", "png",
|
||||
"mp3", "wav",
|
||||
"html", "htm",
|
||||
"csv", "json", "xml",
|
||||
"ipynb", "epub", "zip", "msg",
|
||||
]
|
||||
|
||||
# Sidebar
|
||||
with st.sidebar:
|
||||
st.header("설정")
|
||||
show_preview = st.toggle("Markdown 렌더링 미리보기", value=True)
|
||||
st.divider()
|
||||
st.markdown("**지원 포맷**")
|
||||
st.markdown(
|
||||
"PDF · DOCX · PPTX · XLSX · XLS\n\n"
|
||||
"JPG · PNG · MP3 · WAV\n\n"
|
||||
"HTML · CSV · JSON · XML\n\n"
|
||||
"IPYNB · EPUB · ZIP · MSG"
|
||||
)
|
||||
|
||||
# URL 변환
|
||||
url_tab, file_tab = st.tabs(["URL 변환", "파일 업로드"])
|
||||
|
||||
md = MarkItDown()
|
||||
|
||||
with url_tab:
|
||||
url = st.text_input("URL 입력", placeholder="https://example.com 또는 YouTube URL")
|
||||
if st.button("변환", key="url_btn", disabled=not url):
|
||||
with st.spinner("변환 중..."):
|
||||
try:
|
||||
result = md.convert(url)
|
||||
st.session_state["url_result"] = result.text_content
|
||||
st.session_state["url_filename"] = "output.md"
|
||||
except Exception as e:
|
||||
st.error(f"변환 실패: {e}")
|
||||
|
||||
if "url_result" in st.session_state:
|
||||
_content = st.session_state["url_result"]
|
||||
col1, col2 = st.columns([1, 1]) if show_preview else (st.container(), None)
|
||||
|
||||
with col1:
|
||||
st.subheader("Markdown 원문")
|
||||
st.code(_content, language="markdown")
|
||||
|
||||
if show_preview and col2:
|
||||
with col2:
|
||||
st.subheader("미리보기")
|
||||
st.markdown(_content)
|
||||
|
||||
st.download_button(
|
||||
"⬇️ .md 파일 다운로드",
|
||||
data=_content,
|
||||
file_name=st.session_state["url_filename"],
|
||||
mime="text/markdown",
|
||||
)
|
||||
|
||||
with file_tab:
|
||||
uploaded = st.file_uploader(
|
||||
"파일을 끌어다 놓거나 클릭해서 선택하세요",
|
||||
type=SUPPORTED_EXTENSIONS,
|
||||
)
|
||||
|
||||
if uploaded is not None:
|
||||
if st.button("변환", key="file_btn"):
|
||||
with st.spinner("변환 중..."):
|
||||
try:
|
||||
suffix = os.path.splitext(uploaded.name)[1]
|
||||
with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp:
|
||||
tmp.write(uploaded.getvalue())
|
||||
tmp_path = tmp.name
|
||||
|
||||
result = md.convert(tmp_path)
|
||||
os.unlink(tmp_path)
|
||||
|
||||
st.session_state["file_result"] = result.text_content
|
||||
st.session_state["file_filename"] = os.path.splitext(uploaded.name)[0] + ".md"
|
||||
except Exception as e:
|
||||
st.error(f"변환 실패: {e}")
|
||||
|
||||
if "file_result" in st.session_state:
|
||||
_content = st.session_state["file_result"]
|
||||
|
||||
if show_preview:
|
||||
col1, col2 = st.columns([1, 1])
|
||||
with col1:
|
||||
st.subheader("Markdown 원문")
|
||||
st.code(_content, language="markdown")
|
||||
with col2:
|
||||
st.subheader("미리보기")
|
||||
st.markdown(_content)
|
||||
else:
|
||||
st.subheader("Markdown 원문")
|
||||
st.code(_content, language="markdown")
|
||||
|
||||
st.download_button(
|
||||
"⬇️ .md 파일 다운로드",
|
||||
data=_content,
|
||||
file_name=st.session_state["file_filename"],
|
||||
mime="text/markdown",
|
||||
)
|
||||
@@ -0,0 +1,28 @@
|
||||
FROM python:3.13-slim-bullseye
|
||||
|
||||
ENV DEBIAN_FRONTEND=noninteractive
|
||||
ENV EXIFTOOL_PATH=/usr/bin/exiftool
|
||||
ENV FFMPEG_PATH=/usr/bin/ffmpeg
|
||||
ENV MARKITDOWN_ENABLE_PLUGINS=True
|
||||
|
||||
# Runtime dependency
|
||||
# NOTE: Add any additional MarkItDown plugins here
|
||||
RUN apt-get update && apt-get install -y --no-install-recommends \
|
||||
ffmpeg \
|
||||
exiftool
|
||||
|
||||
# Cleanup
|
||||
RUN rm -rf /var/lib/apt/lists/*
|
||||
|
||||
COPY . /app
|
||||
RUN pip --no-cache-dir install /app
|
||||
|
||||
WORKDIR /workdir
|
||||
|
||||
# Default USERID and GROUPID
|
||||
ARG USERID=nobody
|
||||
ARG GROUPID=nogroup
|
||||
|
||||
USER $USERID:$GROUPID
|
||||
|
||||
ENTRYPOINT [ "markitdown-mcp" ]
|
||||
@@ -0,0 +1,142 @@
|
||||
# MarkItDown-MCP
|
||||
|
||||
> [!IMPORTANT]
|
||||
> The MarkItDown-MCP package is meant for **local use**, with local trusted agents. In particular, when running the MCP server with Streamable HTTP or SSE, it binds to `localhost` by default, and is not exposed to other machines on the network or Internet. In this configuration, it is meant to be a direct alternative to the STDIO transport, which may be more convenient in some cases. DO NOT bind the server to other interfaces unless you understand the [security implications](#security-considerations) of doing so.
|
||||
|
||||
|
||||
[PyPI](https://pypi.org/project/markitdown-mcp/)
|
||||

|
||||
[Built by AutoGen Team](https://github.com/microsoft/autogen)
|
||||
|
||||
The `markitdown-mcp` package provides a lightweight STDIO, Streamable HTTP, and SSE MCP server for calling MarkItDown.
|
||||
|
||||
It exposes one tool: `convert_to_markdown(uri)`, where uri can be any `http:`, `https:`, `file:`, or `data:` URI.
|
||||
|
||||
## Installation
|
||||
|
||||
To install the package, use pip:
|
||||
|
||||
```bash
|
||||
pip install markitdown-mcp
|
||||
```
|
||||
|
||||
## Usage
|
||||
|
||||
To run the MCP server, using STDIO (default), use the following command:
|
||||
|
||||
|
||||
```bash
|
||||
markitdown-mcp
|
||||
```
|
||||
|
||||
To run the MCP server, using Streamable HTTP and SSE, use the following command:
|
||||
|
||||
```bash
|
||||
markitdown-mcp --http --host 127.0.0.1 --port 3001
|
||||
```
|
||||
|
||||
## Running in Docker
|
||||
|
||||
To run `markitdown-mcp` in Docker, build the Docker image using the provided Dockerfile:
|
||||
```bash
|
||||
docker build -t markitdown-mcp:latest .
|
||||
```
|
||||
|
||||
And run it using:
|
||||
```bash
|
||||
docker run -it --rm markitdown-mcp:latest
|
||||
```
|
||||
This will be sufficient for remote URIs. To access local files, you need to mount the local directory into the container. For example, if you want to access files in `/home/user/data`, you can run:
|
||||
|
||||
```bash
|
||||
docker run -it --rm -v /home/user/data:/workdir markitdown-mcp:latest
|
||||
```
|
||||
|
||||
Once mounted, all files under data will be accessible under `/workdir` in the container. For example, if you have a file `example.txt` in `/home/user/data`, it will be accessible in the container at `/workdir/example.txt`.
|
||||
|
||||
## Accessing from Claude Desktop
|
||||
|
||||
It is recommended to use the Docker image when running the MCP server for Claude Desktop.
|
||||
|
||||
Follow [these instructions](https://modelcontextprotocol.io/quickstart/user#for-claude-desktop-users) to access Claude's `claude_desktop_config.json` file.
|
||||
|
||||
Edit it to include the following JSON entry:
|
||||
|
||||
```json
|
||||
{
|
||||
"mcpServers": {
|
||||
"markitdown": {
|
||||
"command": "docker",
|
||||
"args": [
|
||||
"run",
|
||||
"--rm",
|
||||
"-i",
|
||||
"markitdown-mcp:latest"
|
||||
]
|
||||
}
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
If you want to mount a directory, adjust it accordingly:
|
||||
|
||||
```json
|
||||
{
|
||||
"mcpServers": {
|
||||
"markitdown": {
|
||||
"command": "docker",
|
||||
"args": [
|
||||
"run",
|
||||
"--rm",
|
||||
"-i",
|
||||
"-v",
|
||||
"/home/user/data:/workdir",
|
||||
"markitdown-mcp:latest"
|
||||
]
|
||||
}
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
## Debugging
|
||||
|
||||
To debug the MCP server you can use the `MCP Inspector` tool.
|
||||
|
||||
```bash
|
||||
npx @modelcontextprotocol/inspector
|
||||
```
|
||||
|
||||
You can then connect to the inspector through the specified host and port (e.g., `http://localhost:5173/`).
|
||||
|
||||
If using STDIO:
|
||||
* select `STDIO` as the transport type,
|
||||
* input `markitdown-mcp` as the command, and
|
||||
* click `Connect`
|
||||
|
||||
If using Streamable HTTP:
|
||||
* select `Streamable HTTP` as the transport type,
|
||||
* input `http://127.0.0.1:3001/mcp` as the URL, and
|
||||
* click `Connect`
|
||||
|
||||
If using SSE:
|
||||
* select `SSE` as the transport type,
|
||||
* input `http://127.0.0.1:3001/sse` as the URL, and
|
||||
* click `Connect`
|
||||
|
||||
Finally:
|
||||
* click the `Tools` tab,
|
||||
* click `List Tools`,
|
||||
* click `convert_to_markdown`, and
|
||||
* run the tool on any valid URI.
|
||||
|
||||
## Security Considerations
|
||||
|
||||
The server does not support authentication, and runs with the privileges of the user running it. For this reason, when running in SSE or Streamable HTTP mode, the server binds by default to `localhost`. Even so, it is important to recognize that the server can be accessed by any process or user on the same local machine, and that the `convert_to_markdown` tool can be used to read any file that the server's user has access to, or any data from the network. If you require additional security, consider running the server in a sandboxed environment, such as a virtual machine or container, and ensure that the user permissions are properly configured to limit access to sensitive files and network segments. Above all, DO NOT bind the server to other interfaces (non-localhost) unless you understand the security implications of doing so.
|
||||
|
||||
## Trademarks
|
||||
|
||||
This project may contain trademarks or logos for projects, products, or services. Authorized use of Microsoft
|
||||
trademarks or logos is subject to and must follow
|
||||
[Microsoft's Trademark & Brand Guidelines](https://www.microsoft.com/en-us/legal/intellectualproperty/trademarks/usage/general).
|
||||
Use of Microsoft trademarks or logos in modified versions of this project must not cause confusion or imply Microsoft sponsorship.
|
||||
Any use of third-party trademarks or logos are subject to those third-party's policies.
|
||||
@@ -0,0 +1,69 @@
|
||||
[build-system]
|
||||
requires = ["hatchling"]
|
||||
build-backend = "hatchling.build"
|
||||
|
||||
[project]
|
||||
name = "markitdown-mcp"
|
||||
dynamic = ["version"]
|
||||
description = 'An MCP server for the "markitdown" library.'
|
||||
readme = "README.md"
|
||||
requires-python = ">=3.10"
|
||||
license = "MIT"
|
||||
keywords = []
|
||||
authors = [
|
||||
{ name = "Adam Fourney", email = "adamfo@microsoft.com" },
|
||||
]
|
||||
classifiers = [
|
||||
"Development Status :: 4 - Beta",
|
||||
"Programming Language :: Python",
|
||||
"Programming Language :: Python :: 3.10",
|
||||
"Programming Language :: Python :: 3.11",
|
||||
"Programming Language :: Python :: 3.12",
|
||||
"Programming Language :: Python :: 3.13",
|
||||
"Programming Language :: Python :: Implementation :: CPython",
|
||||
"Programming Language :: Python :: Implementation :: PyPy",
|
||||
]
|
||||
dependencies = [
|
||||
"mcp~=1.8.0",
|
||||
"markitdown[all]>=0.1.1,<0.2.0",
|
||||
]
|
||||
|
||||
[project.urls]
|
||||
Documentation = "https://github.com/microsoft/markitdown#readme"
|
||||
Issues = "https://github.com/microsoft/markitdown/issues"
|
||||
Source = "https://github.com/microsoft/markitdown"
|
||||
|
||||
[tool.hatch.version]
|
||||
path = "src/markitdown_mcp/__about__.py"
|
||||
|
||||
[project.scripts]
|
||||
markitdown-mcp = "markitdown_mcp.__main__:main"
|
||||
|
||||
[tool.hatch.envs.types]
|
||||
extra-dependencies = [
|
||||
"mypy>=1.0.0",
|
||||
]
|
||||
[tool.hatch.envs.types.scripts]
|
||||
check = "mypy --install-types --non-interactive {args:src/markitdown_mcp tests}"
|
||||
|
||||
[tool.coverage.run]
|
||||
source_pkgs = ["markitdown-mcp", "tests"]
|
||||
branch = true
|
||||
parallel = true
|
||||
omit = [
|
||||
"src/markitdown_mcp/__about__.py",
|
||||
]
|
||||
|
||||
[tool.coverage.paths]
|
||||
markitdown-mcp = ["src/markitdown_mcp", "*/markitdown-mcp/src/markitdown_mcp"]
|
||||
tests = ["tests", "*/markitdown-mcp/tests"]
|
||||
|
||||
[tool.coverage.report]
|
||||
exclude_lines = [
|
||||
"no cov",
|
||||
"if __name__ == .__main__.:",
|
||||
"if TYPE_CHECKING:",
|
||||
]
|
||||
|
||||
[tool.hatch.build.targets.sdist]
|
||||
only-include = ["src/markitdown_mcp"]
|
||||
@@ -0,0 +1,9 @@
|
||||
# SPDX-FileCopyrightText: 2024-present Adam Fourney <adamfo@microsoft.com>
|
||||
#
|
||||
# SPDX-License-Identifier: MIT
|
||||
|
||||
from .__about__ import __version__
|
||||
|
||||
__all__ = [
|
||||
"__version__",
|
||||
]
|
||||
@@ -0,0 +1,140 @@
|
||||
import contextlib
|
||||
import sys
|
||||
import os
|
||||
from collections.abc import AsyncIterator
|
||||
from mcp.server.fastmcp import FastMCP
|
||||
from starlette.applications import Starlette
|
||||
from mcp.server.sse import SseServerTransport
|
||||
from starlette.requests import Request
|
||||
from starlette.routing import Mount, Route
|
||||
from starlette.types import Receive, Scope, Send
|
||||
from mcp.server import Server
|
||||
from mcp.server.streamable_http_manager import StreamableHTTPSessionManager
|
||||
from markitdown import MarkItDown
|
||||
import uvicorn
|
||||
|
||||
# Initialize the FastMCP server for MarkItDown (served over STDIO, SSE, or Streamable HTTP by main())
|
||||
mcp = FastMCP("markitdown")
|
||||
|
||||
|
||||
@mcp.tool()
async def convert_to_markdown(uri: str) -> str:
    """Convert a resource described by an http:, https:, file: or data: URI to markdown"""
    # A fresh converter is built on every call so that the plugin toggle is
    # re-read from the environment each time the tool runs.
    converter = MarkItDown(enable_plugins=check_plugins_enabled())
    conversion = converter.convert_uri(uri)
    return conversion.markdown
|
||||
|
||||
|
||||
def check_plugins_enabled() -> bool:
    """Report whether MarkItDown plugins are enabled through the environment.

    Reads the MARKITDOWN_ENABLE_PLUGINS environment variable (treated as
    "false" when unset), ignoring surrounding whitespace and letter case.

    Returns:
        True when the value is one of "true", "1" or "yes"; False otherwise.
    """
    raw_setting = os.getenv("MARKITDOWN_ENABLE_PLUGINS", "false")
    normalized = raw_setting.strip().lower()
    return normalized in {"true", "1", "yes"}
|
||||
|
||||
|
||||
def create_starlette_app(mcp_server: Server, *, debug: bool = False) -> Starlette:
    """Build the Starlette application that exposes the MCP server over HTTP.

    Three routes are registered:

    * ``/sse``       - opens an SSE session and runs the MCP server on it,
    * ``/messages/`` - receives the client's POSTed messages for SSE sessions,
    * ``/mcp``       - Streamable HTTP endpoint backed by a stateless,
      JSON-response session manager.

    Args:
        mcp_server: The low-level MCP server to expose.
        debug: Forwarded to ``Starlette(debug=...)``.

    Returns:
        The configured Starlette application. Its lifespan runs the
        Streamable HTTP session manager for as long as the app is up.
    """
    # Local import: keeps this block self-contained; starlette is already a
    # dependency of this module.
    from starlette.responses import Response

    sse = SseServerTransport("/messages/")
    session_manager = StreamableHTTPSessionManager(
        app=mcp_server,
        event_store=None,
        json_response=True,
        stateless=True,
    )

    async def handle_sse(request: Request) -> Response:
        # request._send is Starlette's private ASGI send callable; the SSE
        # transport needs it to write the event stream itself.
        async with sse.connect_sse(
            request.scope,
            request.receive,
            request._send,
        ) as (read_stream, write_stream):
            await mcp_server.run(
                read_stream,
                write_stream,
                mcp_server.create_initialization_options(),
            )
        # Starlette's Route calls whatever the endpoint returns as an ASGI
        # response. Returning None raised "'NoneType' object is not callable"
        # once the SSE session ended, so hand back an empty response instead.
        return Response()

    async def handle_streamable_http(
        scope: Scope, receive: Receive, send: Send
    ) -> None:
        await session_manager.handle_request(scope, receive, send)

    @contextlib.asynccontextmanager
    async def lifespan(app: Starlette) -> AsyncIterator[None]:
        """Context manager for session manager."""
        async with session_manager.run():
            print("Application started with StreamableHTTP session manager!")
            try:
                yield
            finally:
                print("Application shutting down...")

    return Starlette(
        debug=debug,
        routes=[
            Route("/sse", endpoint=handle_sse),
            Mount("/mcp", app=handle_streamable_http),
            Mount("/messages/", app=sse.handle_post_message),
        ],
        lifespan=lifespan,
    )
|
||||
|
||||
|
||||
# Main entry point
|
||||
def main():
    """Parse command-line arguments and run the MarkItDown MCP server.

    Without ``--http`` (or its deprecated alias ``--sse``) the server runs via
    ``mcp.run()``. With it, a Starlette app serving both the Streamable HTTP
    and SSE transports is started under uvicorn on ``--host`` / ``--port``
    (defaults: 127.0.0.1 and 3001).

    ``--host`` and ``--port`` are rejected unless an HTTP transport is
    selected. Binding to anything other than 127.0.0.1 / localhost prints a
    security warning to stderr before the server starts.
    """
    import argparse

    mcp_server = mcp._mcp_server

    parser = argparse.ArgumentParser(description="Run a MarkItDown MCP server")

    parser.add_argument(
        "--http",
        action="store_true",
        help="Run the server with Streamable HTTP and SSE transport rather than STDIO (default: False)",
    )
    parser.add_argument(
        "--sse",
        action="store_true",
        help="(Deprecated) An alias for --http (default: False)",
    )
    parser.add_argument(
        "--host", default=None, help="Host to bind to (default: 127.0.0.1)"
    )
    parser.add_argument(
        "--port", type=int, default=None, help="Port to listen on (default: 3001)"
    )
    args = parser.parse_args()

    use_http = args.http or args.sse

    # Compare the port against None rather than relying on truthiness, so an
    # explicit `--port 0` is still recognised as "a port was given".
    if not use_http and (args.host or args.port is not None):
        # parser.error() prints usage and exits with status 2; it never
        # returns, so no explicit sys.exit() is needed after it.
        parser.error(
            "Host and port arguments are only valid when using streamable HTTP or SSE transport (see: --http)."
        )

    if use_http:
        host = args.host if args.host else "127.0.0.1"
        port = args.port if args.port is not None else 3001
        if args.host and args.host not in ("127.0.0.1", "localhost"):
            print(
                "\n"
                "WARNING: The server is being bound to a non-localhost interface "
                f"({host}).\n"
                "This exposes the server to other machines on the network or Internet.\n"
                "The server has NO authentication and runs with your user's privileges.\n"
                "Any process or user that can reach this interface can read files and\n"
                "fetch network resources accessible to this user.\n"
                "Only proceed if you understand the security implications.\n",
                file=sys.stderr,
            )
        # NOTE(review): debug=True is hard-coded here; Starlette's debug mode
        # presumably exposes tracebacks in error responses - confirm this is
        # intended when binding to a non-localhost interface.
        starlette_app = create_starlette_app(mcp_server, debug=True)
        uvicorn.run(
            starlette_app,
            host=host,
            port=port,
        )
    else:
        mcp.run()
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
@@ -0,0 +1,21 @@
|
||||
MIT License
|
||||
|
||||
Copyright (c) Microsoft Corporation.
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in all
|
||||
copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
SOFTWARE.
|
||||
@@ -0,0 +1,200 @@
|
||||
# MarkItDown OCR Plugin
|
||||
|
||||
LLM Vision plugin for MarkItDown that extracts text from images embedded in PDF, DOCX, PPTX, and XLSX files.
|
||||
|
||||
Uses the same `llm_client` / `llm_model` pattern that MarkItDown already supports for image descriptions — no new ML libraries or binary dependencies required.
|
||||
|
||||
## Features
|
||||
|
||||
- **Enhanced PDF Converter**: Extracts text from images within PDFs, with full-page OCR fallback for scanned documents
|
||||
- **Enhanced DOCX Converter**: OCR for images in Word documents
|
||||
- **Enhanced PPTX Converter**: OCR for images in PowerPoint presentations
|
||||
- **Enhanced XLSX Converter**: OCR for images in Excel spreadsheets
|
||||
- **Context Preservation**: Maintains document structure and flow when inserting extracted text
|
||||
|
||||
## Installation
|
||||
|
||||
```bash
|
||||
pip install markitdown-ocr
|
||||
```
|
||||
|
||||
The plugin uses whatever OpenAI-compatible client you already have. Install one if you don't have it yet:
|
||||
|
||||
```bash
|
||||
pip install openai
|
||||
```
|
||||
|
||||
## Usage
|
||||
|
||||
### Command Line
|
||||
|
||||
```bash
|
||||
markitdown document.pdf --use-plugins --llm-client openai --llm-model gpt-4o
|
||||
```
|
||||
|
||||
### Python API
|
||||
|
||||
Pass `llm_client` and `llm_model` to `MarkItDown()` exactly as you would for image descriptions:
|
||||
|
||||
```python
|
||||
from markitdown import MarkItDown
|
||||
from openai import OpenAI
|
||||
|
||||
md = MarkItDown(
|
||||
enable_plugins=True,
|
||||
llm_client=OpenAI(),
|
||||
llm_model="gpt-4o",
|
||||
)
|
||||
|
||||
result = md.convert("document_with_images.pdf")
|
||||
print(result.text_content)
|
||||
```
|
||||
|
||||
If no `llm_client` is provided the plugin still loads, but OCR is silently skipped — falling back to the standard built-in converter.
|
||||
|
||||
### Custom Prompt
|
||||
|
||||
Override the default extraction prompt for specialized documents:
|
||||
|
||||
```python
|
||||
md = MarkItDown(
|
||||
enable_plugins=True,
|
||||
llm_client=OpenAI(),
|
||||
llm_model="gpt-4o",
|
||||
llm_prompt="Extract all text from this image, preserving table structure.",
|
||||
)
|
||||
```
|
||||
|
||||
### Any OpenAI-Compatible Client
|
||||
|
||||
Works with any client that follows the OpenAI API:
|
||||
|
||||
```python
|
||||
from openai import AzureOpenAI
|
||||
|
||||
md = MarkItDown(
|
||||
enable_plugins=True,
|
||||
llm_client=AzureOpenAI(
|
||||
api_key="...",
|
||||
azure_endpoint="https://your-resource.openai.azure.com/",
|
||||
api_version="2024-02-01",
|
||||
),
|
||||
llm_model="gpt-4o",
|
||||
)
|
||||
```
|
||||
|
||||
## How It Works
|
||||
|
||||
When `MarkItDown(enable_plugins=True, llm_client=..., llm_model=...)` is called:
|
||||
|
||||
1. MarkItDown discovers the plugin via the `markitdown.plugin` entry point group
|
||||
2. It calls `register_converters()`, forwarding all kwargs including `llm_client` and `llm_model`
|
||||
3. The plugin creates an `LLMVisionOCRService` from those kwargs
|
||||
4. Four OCR-enhanced converters are registered at **priority -1.0** — before the built-in converters at priority 0.0
|
||||
|
||||
When a file is converted:
|
||||
|
||||
1. The OCR converter accepts the file
|
||||
2. It extracts embedded images from the document
|
||||
3. Each image is sent to the LLM with an extraction prompt
|
||||
4. The returned text is inserted inline, preserving document structure
|
||||
5. If the LLM call fails, conversion continues without that image's text
|
||||
|
||||
## Supported File Formats
|
||||
|
||||
### PDF
|
||||
|
||||
- Embedded images are extracted by position (via `page.images` / page XObjects) and OCR'd inline, interleaved with the surrounding text in vertical reading order.
|
||||
- **Scanned PDFs** (pages with no extractable text) are detected automatically: each page is rendered at 300 DPI and sent to the LLM as a full-page image.
|
||||
- **Malformed PDFs** that pdfplumber/pdfminer cannot open (e.g. truncated EOF) are retried with PyMuPDF page rendering, so content is still recovered.
|
||||
|
||||
### DOCX
|
||||
|
||||
- Images are extracted via document part relationships (`doc.part.rels`).
|
||||
- OCR is run before the DOCX→HTML→Markdown pipeline executes: placeholder tokens are injected into the HTML so that the markdown converter does not escape the OCR markers, and the final placeholders are replaced with the formatted `*[Image OCR]...[End OCR]*` blocks after conversion.
|
||||
- Document flow (headings, paragraphs, tables) is fully preserved around the OCR blocks.
|
||||
|
||||
### PPTX
|
||||
|
||||
- Picture shapes, placeholder shapes with images, and images inside groups are all supported.
|
||||
- Shapes are processed in top-to-bottom, then left-to-right reading order per slide.
|
||||
- If an `llm_client` is configured, the LLM is asked for a description first; OCR is used as the fallback when no description is returned.
|
||||
|
||||
### XLSX
|
||||
|
||||
- Images embedded in worksheets (`sheet._images`) are extracted per sheet.
|
||||
- Cell position is calculated from the image anchor coordinates (column/row → Excel letter notation).
|
||||
- Images are listed under a `### Images in this sheet:` section after the sheet's data table — they are not interleaved into the table rows.
|
||||
|
||||
### Output format
|
||||
|
||||
Every extracted OCR block is wrapped as:
|
||||
|
||||
```text
|
||||
*[Image OCR]
|
||||
<extracted text>
|
||||
[End OCR]*
|
||||
```
|
||||
|
||||
## Troubleshooting
|
||||
|
||||
### OCR text missing from output
|
||||
|
||||
The most likely cause is a missing `llm_client` or `llm_model`. Verify:
|
||||
|
||||
```python
|
||||
from openai import OpenAI
|
||||
from markitdown import MarkItDown
|
||||
|
||||
md = MarkItDown(
|
||||
enable_plugins=True,
|
||||
llm_client=OpenAI(), # required
|
||||
llm_model="gpt-4o", # required
|
||||
)
|
||||
```
|
||||
|
||||
### Plugin not loading
|
||||
|
||||
Confirm the plugin is installed and discovered:
|
||||
|
||||
```bash
|
||||
markitdown --list-plugins # should show: ocr
|
||||
```
|
||||
|
||||
### API errors
|
||||
|
||||
The plugin propagates LLM API errors as warnings and continues conversion. Check your API key, quota, and that the chosen model supports vision inputs.
|
||||
|
||||
## Development
|
||||
|
||||
### Running Tests
|
||||
|
||||
```bash
|
||||
cd packages/markitdown-ocr
|
||||
pytest tests/ -v
|
||||
```
|
||||
|
||||
### Building from Source
|
||||
|
||||
```bash
|
||||
git clone https://github.com/microsoft/markitdown.git
|
||||
cd markitdown/packages/markitdown-ocr
|
||||
pip install -e .
|
||||
```
|
||||
|
||||
## Contributing
|
||||
|
||||
Contributions are welcome! See the [MarkItDown repository](https://github.com/microsoft/markitdown) for guidelines.
|
||||
|
||||
## License
|
||||
|
||||
MIT — see [LICENSE](LICENSE).
|
||||
|
||||
## Changelog
|
||||
|
||||
### 0.1.0 (Initial Release)
|
||||
|
||||
- LLM Vision OCR for PDF, DOCX, PPTX, XLSX
|
||||
- Full-page OCR fallback for scanned PDFs
|
||||
- Context-aware inline text insertion
|
||||
- Priority-based converter replacement (no code changes required)
|
||||
@@ -0,0 +1,57 @@
|
||||
[build-system]
|
||||
requires = ["hatchling"]
|
||||
build-backend = "hatchling.build"
|
||||
|
||||
[project]
|
||||
name = "markitdown-ocr"
|
||||
dynamic = ["version"]
|
||||
description = 'OCR plugin for MarkItDown - Extracts text from images in PDF, DOCX, PPTX, and XLSX via LLM Vision'
|
||||
readme = "README.md"
|
||||
requires-python = ">=3.10"
|
||||
license = "MIT"
|
||||
keywords = ["markitdown", "ocr", "pdf", "docx", "xlsx", "pptx", "llm", "vision"]
|
||||
authors = [
|
||||
{ name = "Contributors", email = "noreply@github.com" },
|
||||
]
|
||||
classifiers = [
|
||||
"Development Status :: 4 - Beta",
|
||||
"Programming Language :: Python",
|
||||
"Programming Language :: Python :: 3.10",
|
||||
"Programming Language :: Python :: 3.11",
|
||||
"Programming Language :: Python :: 3.12",
|
||||
"Programming Language :: Python :: 3.13",
|
||||
"Programming Language :: Python :: Implementation :: CPython",
|
||||
]
|
||||
|
||||
# Core dependencies — matches the file-format libraries markitdown already uses
|
||||
dependencies = [
|
||||
"markitdown>=0.1.0",
|
||||
"pdfminer.six>=20251230",
|
||||
"pdfplumber>=0.11.9",
|
||||
"PyMuPDF>=1.24.0",
|
||||
"mammoth~=1.11.0",
|
||||
"python-docx",
|
||||
"python-pptx",
|
||||
"pandas",
|
||||
"openpyxl",
|
||||
"Pillow>=9.0.0",
|
||||
]
|
||||
|
||||
# llm_client is passed in by the user (same as for markitdown image descriptions);
|
||||
# install openai or any OpenAI-compatible SDK separately.
|
||||
[project.optional-dependencies]
|
||||
llm = [
|
||||
"openai>=1.0.0",
|
||||
]
|
||||
|
||||
[project.urls]
|
||||
Documentation = "https://github.com/microsoft/markitdown#readme"
|
||||
Issues = "https://github.com/microsoft/markitdown/issues"
|
||||
Source = "https://github.com/microsoft/markitdown"
|
||||
|
||||
[tool.hatch.version]
|
||||
path = "src/markitdown_ocr/__about__.py"
|
||||
|
||||
# CRITICAL: Plugin entry point - MarkItDown will discover this plugin through this entry point
|
||||
[project.entry-points."markitdown.plugin"]
|
||||
ocr = "markitdown_ocr"
|
||||
@@ -0,0 +1,4 @@
|
||||
# SPDX-FileCopyrightText: 2025-present Contributors
|
||||
# SPDX-License-Identifier: MIT
|
||||
|
||||
__version__ = "0.1.0"
|
||||
@@ -0,0 +1,31 @@
|
||||
# SPDX-FileCopyrightText: 2025-present Contributors
|
||||
# SPDX-License-Identifier: MIT
|
||||
|
||||
"""
|
||||
markitdown-ocr: OCR plugin for MarkItDown
|
||||
|
||||
Adds LLM Vision-based text extraction from images embedded in PDF, DOCX, PPTX, and XLSX files.
|
||||
"""
|
||||
|
||||
from ._plugin import __plugin_interface_version__, register_converters
|
||||
from .__about__ import __version__
|
||||
from ._ocr_service import (
|
||||
OCRResult,
|
||||
LLMVisionOCRService,
|
||||
)
|
||||
from ._pdf_converter_with_ocr import PdfConverterWithOCR
|
||||
from ._docx_converter_with_ocr import DocxConverterWithOCR
|
||||
from ._pptx_converter_with_ocr import PptxConverterWithOCR
|
||||
from ._xlsx_converter_with_ocr import XlsxConverterWithOCR
|
||||
|
||||
__all__ = [
|
||||
"__version__",
|
||||
"__plugin_interface_version__",
|
||||
"register_converters",
|
||||
"OCRResult",
|
||||
"LLMVisionOCRService",
|
||||
"PdfConverterWithOCR",
|
||||
"DocxConverterWithOCR",
|
||||
"PptxConverterWithOCR",
|
||||
"XlsxConverterWithOCR",
|
||||
]
|
||||
@@ -0,0 +1,189 @@
|
||||
"""
|
||||
Enhanced DOCX Converter with OCR support for embedded images.
|
||||
Extracts images from Word documents and performs OCR while maintaining context.
|
||||
"""
|
||||
|
||||
import io
|
||||
import re
|
||||
import sys
|
||||
from typing import Any, BinaryIO, Optional
|
||||
|
||||
from markitdown.converters import HtmlConverter
|
||||
from markitdown.converter_utils.docx.pre_process import pre_process_docx
|
||||
from markitdown import DocumentConverterResult, StreamInfo
|
||||
from markitdown._exceptions import (
|
||||
MissingDependencyException,
|
||||
MISSING_DEPENDENCY_MESSAGE,
|
||||
)
|
||||
from ._ocr_service import LLMVisionOCRService
|
||||
|
||||
# Try loading dependencies
|
||||
_dependency_exc_info = None
|
||||
try:
|
||||
import mammoth
|
||||
from docx import Document
|
||||
except ImportError:
|
||||
_dependency_exc_info = sys.exc_info()
|
||||
|
||||
# Placeholder injected into the mammoth-generated HTML so that the
# HTML→markdown converter never sees (and escapes) the OCR markers.
# Must be a single token with no special markdown characters.
|
||||
_PLACEHOLDER = "MARKITDOWNOCRBLOCK{}"
|
||||
|
||||
|
||||
class DocxConverterWithOCR(HtmlConverter):
    """
    Enhanced DOCX Converter with OCR support for embedded images.
    Maintains document flow while extracting text from images inline.

    When an OCR service is available, every embedded image is OCR'd first,
    the DOCX is converted to HTML with mammoth, each <img> tag is replaced by
    a plain placeholder token, the HTML is converted to markdown, and finally
    the placeholders are swapped for ``*[Image OCR]...[End OCR]*`` blocks.
    Without an OCR service the standard DOCX -> HTML -> markdown path is used.
    """

    def __init__(self, ocr_service: Optional[LLMVisionOCRService] = None):
        """
        Args:
            ocr_service: Default OCR service; a per-call ``ocr_service``
                keyword argument to ``convert`` takes precedence over it.
        """
        super().__init__()
        self._html_converter = HtmlConverter()
        self.ocr_service = ocr_service

    def accepts(
        self,
        file_stream: BinaryIO,
        stream_info: StreamInfo,
        **kwargs: Any,
    ) -> bool:
        """Accept streams with a .docx extension or a wordprocessingml mimetype."""
        mimetype = (stream_info.mimetype or "").lower()
        extension = (stream_info.extension or "").lower()

        if extension == ".docx":
            return True

        if mimetype.startswith(
            "application/vnd.openxmlformats-officedocument.wordprocessingml"
        ):
            return True

        return False

    def convert(
        self,
        file_stream: BinaryIO,
        stream_info: StreamInfo,
        **kwargs: Any,
    ) -> DocumentConverterResult:
        """
        Convert a DOCX stream to markdown, inlining OCR text for images.

        Args:
            file_stream: Seekable binary stream holding the DOCX file.
            stream_info: Stream metadata (not consulted during conversion).
            **kwargs: ``ocr_service`` overrides the instance's service and
                ``style_map`` is forwarded to mammoth; all kwargs are also
                forwarded to the HTML converter.

        Raises:
            MissingDependencyException: If mammoth or python-docx could not
                be imported.
        """
        if _dependency_exc_info is not None:
            raise MissingDependencyException(
                MISSING_DEPENDENCY_MESSAGE.format(
                    converter=type(self).__name__,
                    extension=".docx",
                    feature="docx",
                )
            ) from _dependency_exc_info[1].with_traceback(
                _dependency_exc_info[2]
            )  # type: ignore[union-attr]

        # Get OCR service if available (from kwargs or instance)
        ocr_service: Optional[LLMVisionOCRService] = (
            kwargs.get("ocr_service") or self.ocr_service
        )

        if ocr_service:
            # 1. Extract and OCR images — returns raw text per image
            file_stream.seek(0)
            image_ocr_map = self._extract_and_ocr_images(file_stream, ocr_service)

            # 2. Convert DOCX → HTML via mammoth
            file_stream.seek(0)
            pre_process_stream = pre_process_docx(file_stream)
            html_result = mammoth.convert_to_html(
                pre_process_stream, style_map=kwargs.get("style_map")
            ).value

            # 3. Replace <img> tags with plain placeholder tokens so that
            #    the HTML→markdown step never escapes our OCR markers.
            html_with_placeholders, ocr_texts = self._inject_placeholders(
                html_result, image_ocr_map
            )

            # 4. Convert HTML → markdown
            md_result = self._html_converter.convert_string(
                html_with_placeholders, **kwargs
            )

            # 5. Swap placeholders for the actual OCR blocks (post-conversion
            #    so * and _ are never escaped by the markdown converter).
            md = self._restore_ocr_blocks(md_result.markdown, ocr_texts)

            return DocumentConverterResult(markdown=md)
        else:
            # Standard conversion without OCR
            style_map = kwargs.get("style_map", None)
            pre_process_stream = pre_process_docx(file_stream)
            return self._html_converter.convert_string(
                mammoth.convert_to_html(pre_process_stream, style_map=style_map).value,
                **kwargs,
            )

    @staticmethod
    def _restore_ocr_blocks(markdown: str, ocr_texts: list[str]) -> str:
        """
        Replace every numbered placeholder token with its formatted OCR block.

        A single regex pass reads the whole index after the placeholder
        prefix. Replacing tokens one by one with ``str.replace`` would let
        the token for index 1 also match the prefix of the tokens for
        10..19, corrupting documents that contain more than ten OCR'd images.
        """
        if not ocr_texts:
            return markdown

        token_pattern = re.compile(re.escape(_PLACEHOLDER.format("")) + r"(\d+)")

        def expand(match: re.Match) -> str:  # type: ignore[type-arg]
            index = int(match.group(1))
            if index >= len(ocr_texts):
                # Not one of our tokens — leave the text untouched.
                return match.group(0)
            return f"*[Image OCR]\n{ocr_texts[index]}\n[End OCR]*"

        return token_pattern.sub(expand, markdown)

    def _extract_and_ocr_images(
        self, file_stream: BinaryIO, ocr_service: LLMVisionOCRService
    ) -> dict[str, str]:
        """
        Extract images from DOCX and OCR them.

        Extraction is best-effort: an image whose OCR raises is skipped, and
        a failure to open the document yields whatever was collected so far.

        Returns:
            Dict mapping image relationship IDs to raw OCR text (no markers).
            Images whose OCR produced no text are omitted.
        """
        ocr_map: dict[str, str] = {}

        try:
            file_stream.seek(0)
            doc = Document(file_stream)

            for rel in doc.part.rels.values():
                if "image" in rel.target_ref.lower():
                    try:
                        image_bytes = rel.target_part.blob
                        image_stream = io.BytesIO(image_bytes)
                        ocr_result = ocr_service.extract_text(image_stream)

                        if ocr_result.text.strip():
                            # Store raw text only — markers added later
                            ocr_map[rel.rId] = ocr_result.text.strip()

                    except Exception:
                        continue

        except Exception:
            pass

        return ocr_map

    def _inject_placeholders(
        self, html: str, ocr_map: dict[str, str]
    ) -> tuple[str, list[str]]:
        """
        Replace <img> tags with numbered placeholder tokens.

        OCR texts are assigned to <img> tags purely by position: the n-th
        tag receives the n-th text of ``ocr_map``. Tags beyond the number of
        available texts are removed, and texts without a tag are appended at
        the end of the HTML.

        NOTE(review): because images without OCR text are absent from
        ``ocr_map``, positional matching can attach a text to the wrong
        image when some images yield nothing — confirm this is acceptable.

        Returns:
            (html_with_placeholders, ordered list of raw OCR texts)
        """
        if not ocr_map:
            return html, []

        ocr_texts = list(ocr_map.values())
        total = len(ocr_texts)
        next_index = 0  # index of the next OCR text not yet placed

        def replace_img(match: re.Match) -> str:  # type: ignore[type-arg]
            nonlocal next_index
            if next_index >= total:
                return ""  # remove image if all OCR texts already used
            token = f"<p>{_PLACEHOLDER.format(next_index)}</p>"
            next_index += 1
            return token

        result = re.sub(r"<img[^>]*>", replace_img, html)

        # Any OCR texts that had no matching <img> tag go at the end
        result += "".join(
            f"<p>{_PLACEHOLDER.format(i)}</p>" for i in range(next_index, total)
        )

        return result, ocr_texts
|
||||
@@ -0,0 +1,110 @@
|
||||
"""
|
||||
OCR Service Layer for MarkItDown
|
||||
Provides LLM Vision-based image text extraction.
|
||||
"""
|
||||
|
||||
import base64
|
||||
from typing import Any, BinaryIO
|
||||
from dataclasses import dataclass
|
||||
|
||||
from markitdown import StreamInfo
|
||||
|
||||
|
||||
@dataclass
class OCRResult:
    """Result from OCR extraction."""

    # Extracted text; LLMVisionOCRService sets it to "" when extraction fails
    # or the model returns no content.
    text: str
    # Optional confidence score; the LLM vision service in this module never
    # sets it, so it stays None there.
    confidence: float | None = None
    # Identifier of the backend that produced the result
    # (LLMVisionOCRService reports "llm_vision").
    backend_used: str | None = None
    # Human-readable error message when extraction failed; None on success.
    error: str | None = None
|
||||
|
||||
|
||||
class LLMVisionOCRService:
    """OCR backend that asks an OpenAI-compatible vision model to read an image."""

    def __init__(
        self,
        client: Any,
        model: str,
        default_prompt: str | None = None,
    ) -> None:
        """
        Store the client, model name and fallback prompt.

        Args:
            client: OpenAI-compatible client
            model: Model name (e.g., 'gpt-4o', 'gemini-2.0-flash')
            default_prompt: Prompt used when ``extract_text`` is called
                without one; a built-in extraction prompt is used if empty.
        """
        self.client = client
        self.model = model
        if default_prompt:
            self.default_prompt = default_prompt
        else:
            self.default_prompt = (
                "Extract all text from this image. "
                "Return ONLY the extracted text, maintaining the original "
                "layout and order. Do not add any commentary or description."
            )

    @staticmethod
    def _sniff_content_type(image_stream: BinaryIO) -> str:
        """Guess the image mimetype with Pillow, defaulting to image/png."""
        try:
            from PIL import Image

            image_stream.seek(0)
            detected = Image.open(image_stream).format
            return f"image/{detected.lower()}" if detected else "image/png"
        except Exception:
            return "image/png"

    def extract_text(
        self,
        image_stream: BinaryIO,
        prompt: str | None = None,
        stream_info: StreamInfo | None = None,
        **kwargs: Any,
    ) -> OCRResult:
        """
        Send the image to the vision model and return the text it reports.

        Args:
            image_stream: Seekable binary stream with the image data; it is
                rewound to position 0 before returning.
            prompt: Prompt for this call; falls back to the default prompt.
            stream_info: Optional metadata whose mimetype, when present, is
                used instead of sniffing the image format.
            **kwargs: Accepted but not used.

        Returns:
            An OCRResult with the stripped model output. On any failure the
            text is empty and ``error`` carries the exception message.
        """
        if self.client is None:
            return OCRResult(
                text="",
                backend_used="llm_vision",
                error="LLM client not configured",
            )

        try:
            image_stream.seek(0)

            # Prefer the caller-supplied mimetype; otherwise inspect the image.
            mime: str | None = stream_info.mimetype if stream_info else None
            if not mime:
                mime = self._sniff_content_type(image_stream)

            image_stream.seek(0)
            encoded = base64.b64encode(image_stream.read()).decode("utf-8")
            image_part = {
                "type": "image_url",
                "image_url": {"url": f"data:{mime};base64,{encoded}"},
            }
            text_part = {"type": "text", "text": prompt or self.default_prompt}

            response = self.client.chat.completions.create(
                model=self.model,
                messages=[{"role": "user", "content": [text_part, image_part]}],
            )

            raw_text = response.choices[0].message.content
            extracted = raw_text.strip() if raw_text else ""
            return OCRResult(text=extracted, backend_used="llm_vision")
        except Exception as exc:
            return OCRResult(text="", backend_used="llm_vision", error=str(exc))
        finally:
            image_stream.seek(0)
|
||||
@@ -0,0 +1,422 @@
|
||||
"""
|
||||
Enhanced PDF Converter with OCR support for embedded images.
|
||||
Extracts images from PDFs and performs OCR while maintaining document context.
|
||||
"""
|
||||
|
||||
import io
|
||||
import sys
|
||||
from typing import Any, BinaryIO, Optional
|
||||
|
||||
from markitdown import DocumentConverter, DocumentConverterResult, StreamInfo
|
||||
from markitdown._exceptions import (
|
||||
MissingDependencyException,
|
||||
MISSING_DEPENDENCY_MESSAGE,
|
||||
)
|
||||
from ._ocr_service import LLMVisionOCRService
|
||||
|
||||
# Import dependencies
|
||||
_dependency_exc_info = None
|
||||
try:
|
||||
import pdfminer
|
||||
import pdfminer.high_level
|
||||
import pdfplumber
|
||||
from PIL import Image
|
||||
except ImportError:
|
||||
_dependency_exc_info = sys.exc_info()
|
||||
|
||||
|
||||
def _extract_images_from_page(page: Any) -> list[dict]:
|
||||
"""
|
||||
Extract images from a PDF page by rendering page regions.
|
||||
|
||||
Returns:
|
||||
List of dicts with 'stream', 'bbox', 'name', 'y_pos' keys
|
||||
"""
|
||||
images_info = []
|
||||
|
||||
try:
|
||||
# Try multiple methods to detect images
|
||||
images = []
|
||||
|
||||
# Method 1: Use page.images (standard approach)
|
||||
if hasattr(page, "images") and page.images:
|
||||
images = page.images
|
||||
|
||||
# Method 2: If no images found, try underlying PDF objects
|
||||
if not images and hasattr(page, "objects") and "image" in page.objects:
|
||||
images = page.objects.get("image", [])
|
||||
|
||||
# Method 3: Try filtering all objects for image types
|
||||
if not images and hasattr(page, "objects"):
|
||||
all_objs = page.objects
|
||||
for obj_type in all_objs.keys():
|
||||
if "image" in obj_type.lower() or "xobject" in obj_type.lower():
|
||||
potential_imgs = all_objs.get(obj_type, [])
|
||||
if potential_imgs:
|
||||
images = potential_imgs
|
||||
break
|
||||
|
||||
for i, img_dict in enumerate(images):
|
||||
try:
|
||||
# Try to get the actual image stream from the PDF
|
||||
img_stream = None
|
||||
y_pos = 0
|
||||
|
||||
# Method A: If img_dict has 'stream' key, use it directly
|
||||
if "stream" in img_dict and hasattr(img_dict["stream"], "get_data"):
|
||||
try:
|
||||
img_bytes = img_dict["stream"].get_data()
|
||||
|
||||
# Try to open as PIL Image to validate/decode
|
||||
pil_img = Image.open(io.BytesIO(img_bytes))
|
||||
|
||||
# Convert to RGB if needed (handle CMYK, etc.)
|
||||
if pil_img.mode not in ("RGB", "L"):
|
||||
pil_img = pil_img.convert("RGB")
|
||||
|
||||
# Save to stream as PNG
|
||||
img_stream = io.BytesIO()
|
||||
pil_img.save(img_stream, format="PNG")
|
||||
img_stream.seek(0)
|
||||
|
||||
y_pos = img_dict.get("top", 0)
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
# Method B: Fallback to rendering page region
|
||||
if img_stream is None:
|
||||
x0 = img_dict.get("x0", 0)
|
||||
y0 = img_dict.get("top", 0)
|
||||
x1 = img_dict.get("x1", 0)
|
||||
y1 = img_dict.get("bottom", 0)
|
||||
y_pos = y0
|
||||
|
||||
# Check if dimensions are valid
|
||||
if x1 <= x0 or y1 <= y0:
|
||||
continue
|
||||
|
||||
# Use pdfplumber's within_bbox to crop, then render
|
||||
# This preserves coordinate system correctly
|
||||
bbox = (x0, y0, x1, y1)
|
||||
cropped_page = page.within_bbox(bbox)
|
||||
|
||||
# Render at 150 DPI (balance between quality and size)
|
||||
page_img = cropped_page.to_image(resolution=150)
|
||||
|
||||
# Save to stream
|
||||
img_stream = io.BytesIO()
|
||||
page_img.original.save(img_stream, format="PNG")
|
||||
img_stream.seek(0)
|
||||
|
||||
if img_stream:
|
||||
images_info.append(
|
||||
{
|
||||
"stream": img_stream,
|
||||
"name": f"page_{page.page_number}_img_{i}",
|
||||
"y_pos": y_pos,
|
||||
}
|
||||
)
|
||||
|
||||
except Exception:
|
||||
continue
|
||||
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
return images_info
|
||||
|
||||
|
||||
class PdfConverterWithOCR(DocumentConverter):
|
||||
"""
|
||||
Enhanced PDF Converter with OCR support for embedded images.
|
||||
Maintains document structure while extracting text from images inline.
|
||||
"""
|
||||
|
||||
def __init__(self, ocr_service: Optional[LLMVisionOCRService] = None):
    """Create the converter.

    Args:
        ocr_service: OCR backend used when ``convert`` receives no
            ``ocr_service`` keyword of its own. ``None`` means pages are
            converted with plain text extraction only.
    """
    super().__init__()
    self.ocr_service = ocr_service
|
||||
|
||||
def accepts(
    self,
    file_stream: BinaryIO,
    stream_info: StreamInfo,
    **kwargs: Any,
) -> bool:
    """Report whether the stream looks like a PDF.

    The decision is made from ``stream_info`` only: a ``.pdf`` extension or
    a mimetype starting with ``application/pdf`` / ``application/x-pdf``
    (both compared case-insensitively). ``file_stream`` is not read.
    """
    suffix = (stream_info.extension or "").lower()
    content_type = (stream_info.mimetype or "").lower()
    pdf_mimetypes = ("application/pdf", "application/x-pdf")
    return suffix == ".pdf" or content_type.startswith(pdf_mimetypes)
|
||||
|
||||
def convert(
    self,
    file_stream: BinaryIO,
    stream_info: StreamInfo,
    **kwargs: Any,
) -> DocumentConverterResult:
    """Convert a PDF to markdown, OCR-ing embedded images when possible.

    Every page is emitted under a ``## Page N`` heading. With an OCR service
    and at least one image on the page, text lines and OCR blocks are
    interleaved by vertical position; otherwise the page's plain text is
    used. If no page yields any content, the document is retried with
    pdfminer and then, when OCR is available, by OCR-ing rendered pages.
    Should every fallback come back blank, the bare page headings are
    returned so the result is never worse than the per-page pass.

    Args:
        file_stream: Binary stream holding the PDF; rewound and read fully.
        stream_info: Stream metadata (not consulted here).
        **kwargs: ``ocr_service`` overrides the instance's OCR service.

    Returns:
        A ``DocumentConverterResult`` wrapping the generated markdown.

    Raises:
        MissingDependencyException: If the PDF dependencies failed to import.
    """
    if _dependency_exc_info is not None:
        raise MissingDependencyException(
            MISSING_DEPENDENCY_MESSAGE.format(
                converter=type(self).__name__,
                extension=".pdf",
                feature="pdf",
            )
        ) from _dependency_exc_info[1].with_traceback(
            _dependency_exc_info[2]
        )  # type: ignore[union-attr]

    # A per-call ``ocr_service`` keyword wins over the instance default.
    ocr_service: LLMVisionOCRService | None = (
        kwargs.get("ocr_service") or self.ocr_service
    )

    # Buffer the whole PDF so the fallbacks below can re-read it.
    file_stream.seek(0)
    pdf_bytes = io.BytesIO(file_stream.read())

    markdown_content: list[str] = []
    page_skeleton = ""  # headings-only rendering, kept as a last resort
    found_content = False

    try:
        with pdfplumber.open(pdf_bytes) as pdf:
            for page_num, page in enumerate(pdf.pages, 1):
                markdown_content.append(f"\n## Page {page_num}\n")
                page_parts = self._page_body_parts(
                    page, page_num, pdf_bytes, ocr_service
                )
                if page_parts:
                    found_content = True
                    markdown_content.extend(page_parts)

        page_skeleton = "\n\n".join(markdown_content).strip()
        # The headings alone make the joined text non-empty, so emptiness
        # has to be judged on the extracted content, not on the string.
        markdown = page_skeleton if found_content else ""

        if not markdown:
            pdf_bytes.seek(0)
            markdown = pdfminer.high_level.extract_text(pdf_bytes)

    except Exception:
        # pdfplumber failed somewhere: discard partial output, try pdfminer.
        try:
            pdf_bytes.seek(0)
            markdown = pdfminer.high_level.extract_text(pdf_bytes)
        except Exception:
            markdown = ""

    # Still blank and OCR is available: treat as a scanned PDF.
    if ocr_service and (not markdown or not markdown.strip()):
        pdf_bytes.seek(0)
        markdown = self._ocr_full_pages(pdf_bytes, ocr_service)

    # Nothing recovered anything: fall back to the page headings.
    if not (markdown and markdown.strip()) and page_skeleton:
        markdown = page_skeleton

    return DocumentConverterResult(markdown=markdown)

def _page_body_parts(
    self,
    page: Any,
    page_num: int,
    pdf_bytes: io.BytesIO,
    ocr_service: LLMVisionOCRService | None,
) -> list[str]:
    """Return the markdown fragments for one page (empty if nothing found)."""
    if ocr_service:
        images_on_page = self._extract_page_images(pdf_bytes, page_num)
        if images_on_page:
            return self._interleave_text_and_ocr(page, images_on_page, ocr_service)

    # No OCR service, or no images on this page: plain text only.
    text_content = (page.extract_text() or "").strip()
    return [text_content] if text_content else []

def _interleave_text_and_ocr(
    self,
    page: Any,
    images_on_page: list[dict],
    ocr_service: LLMVisionOCRService,
) -> list[str]:
    """Merge a page's text lines and image OCR blocks in top-to-bottom order."""
    chars = page.chars
    if chars:
        lines_with_y = self._group_chars_into_lines(chars)
    else:
        # No character data: synthesise increasing Y values per text line.
        text_content = page.extract_text() or ""
        lines_with_y = [
            {"y": i * 10, "text": line}
            for i, line in enumerate(text_content.split("\n"))
        ]

    content_items = [
        {"y_pos": item["y"], "text": item["text"], "type": "text"}
        for item in lines_with_y
        if item["text"]
    ]

    for img_info in images_on_page:
        ocr_result = ocr_service.extract_text(img_info["stream"])
        if ocr_result.text.strip():
            content_items.append(
                {
                    "y_pos": img_info["y_pos"],
                    "name": img_info["name"],
                    "ocr_text": ocr_result.text,
                    "backend": ocr_result.backend_used,
                    "type": "image",
                }
            )

    # Stable sort: at equal Y, text stays ahead of image blocks.
    content_items.sort(key=lambda item: item["y_pos"])

    parts: list[str] = []
    for item in content_items:
        if item["type"] == "text":
            parts.append(item["text"])
        else:
            ocr_text = item["ocr_text"]
            parts.append(f"\n\n*[Image OCR]\n{ocr_text}\n[End OCR]*\n")
    return parts

@staticmethod
def _group_chars_into_lines(chars: list[dict], y_tolerance: float = 2) -> list[dict]:
    """Group character dicts into ``{"y", "text"}`` lines by their ``top``.

    A new line starts when a character's ``top`` differs from the first
    character of the current line by more than ``y_tolerance``.
    """

    def line_text(line_chars: list[dict]) -> str:
        # Characters reach us ordered by (top, x0); glyphs on one visual line
        # can have slightly different tops, so restore left-to-right order.
        ordered = sorted(line_chars, key=lambda c: c["x0"])
        return "".join(c["text"] for c in ordered).strip()

    lines: list[dict] = []
    current_line: list[dict] = []
    current_y = None

    for char in sorted(chars, key=lambda c: (c["top"], c["x0"])):
        y = char["top"]
        if current_y is None:
            current_y = y
        elif abs(y - current_y) > y_tolerance:
            if current_line:
                lines.append({"y": current_y, "text": line_text(current_line)})
            current_line = []
            current_y = y
        current_line.append(char)

    if current_line:
        lines.append({"y": current_y, "text": line_text(current_line)})

    return lines
|
||||
|
||||
def _extract_page_images(self, pdf_bytes: io.BytesIO, page_num: int) -> list[dict]:
    """
    Extract images from a PDF page using pdfplumber.

    The PDF is reopened from ``pdf_bytes`` (rewound first) and the page is
    handed to ``_extract_images_from_page``. Any failure is swallowed and
    results in an empty list, so callers can treat this as best effort.

    Args:
        pdf_bytes: PDF file as BytesIO
        page_num: Page number (1-indexed); out-of-range values yield ``[]``

    Returns:
        List of image info dicts with 'stream', 'name', 'y_pos', sorted by
        'y_pos' (top to bottom)
    """
    images: list[dict] = []

    try:
        pdf_bytes.seek(0)
        with pdfplumber.open(pdf_bytes) as pdf:
            # The explicit lower bound matters: page_num <= 0 would otherwise
            # silently select a page counted from the end of the document.
            if 1 <= page_num <= len(pdf.pages):
                images = _extract_images_from_page(pdf.pages[page_num - 1])
    except Exception:
        # Best effort: an unreadable page simply contributes no images.
        pass

    images.sort(key=lambda info: info["y_pos"])
    return images
|
||||
|
||||
def _ocr_full_pages(
|
||||
self, pdf_bytes: io.BytesIO, ocr_service: LLMVisionOCRService
|
||||
) -> str:
|
||||
"""
|
||||
Fallback for scanned PDFs: Convert entire pages to images and OCR them.
|
||||
Used when text extraction returns empty/whitespace results.
|
||||
|
||||
Args:
|
||||
pdf_bytes: PDF file as BytesIO
|
||||
ocr_service: OCR service to use
|
||||
|
||||
Returns:
|
||||
Markdown text extracted from OCR of full pages
|
||||
"""
|
||||
markdown_parts = []
|
||||
|
||||
try:
|
||||
pdf_bytes.seek(0)
|
||||
with pdfplumber.open(pdf_bytes) as pdf:
|
||||
for page_num, page in enumerate(pdf.pages, 1):
|
||||
try:
|
||||
markdown_parts.append(f"\n## Page {page_num}\n")
|
||||
|
||||
# Render page to image
|
||||
page_img = page.to_image(resolution=300)
|
||||
img_stream = io.BytesIO()
|
||||
page_img.original.save(img_stream, format="PNG")
|
||||
img_stream.seek(0)
|
||||
|
||||
# Run OCR
|
||||
ocr_result = ocr_service.extract_text(img_stream)
|
||||
|
||||
if ocr_result.text.strip():
|
||||
text = ocr_result.text.strip()
|
||||
markdown_parts.append(f"*[Image OCR]\n{text}\n[End OCR]*")
|
||||
else:
|
||||
markdown_parts.append(
|
||||
"*[No text could be extracted from this page]*"
|
||||
)
|
||||
|
||||
except Exception as e:
|
||||
markdown_parts.append(
|
||||
f"*[Error processing page {page_num}: {str(e)}]*"
|
||||
)
|
||||
continue
|
||||
|
||||
except Exception:
|
||||
# pdfplumber failed (e.g. malformed EOF) — try PyMuPDF for rendering
|
||||
markdown_parts = []
|
||||
try:
|
||||
import fitz # PyMuPDF
|
||||
|
||||
pdf_bytes.seek(0)
|
||||
doc = fitz.open(stream=pdf_bytes.read(), filetype="pdf")
|
||||
for page_num in range(1, doc.page_count + 1):
|
||||
try:
|
||||
markdown_parts.append(f"\n## Page {page_num}\n")
|
||||
page = doc[page_num - 1]
|
||||
mat = fitz.Matrix(300 / 72, 300 / 72) # 300 DPI
|
||||
pix = page.get_pixmap(matrix=mat)
|
||||
img_stream = io.BytesIO(pix.tobytes("png"))
|
||||
img_stream.seek(0)
|
||||
|
||||
ocr_result = ocr_service.extract_text(img_stream)
|
||||
|
||||
if ocr_result.text.strip():
|
||||
text = ocr_result.text.strip()
|
||||
markdown_parts.append(f"*[Image OCR]\n{text}\n[End OCR]*")
|
||||
else:
|
||||
markdown_parts.append(
|
||||
"*[No text could be extracted from this page]*"
|
||||
)
|
||||
|
||||
except Exception as e:
|
||||
markdown_parts.append(
|
||||
f"*[Error processing page {page_num}: {str(e)}]*"
|
||||
)
|
||||
continue
|
||||
doc.close()
|
||||
except Exception:
|
||||
return "*[Error: Could not process scanned PDF]*"
|
||||
|
||||
return "\n\n".join(markdown_parts).strip()
|
||||
@@ -0,0 +1,68 @@
|
||||
"""
|
||||
Plugin registration for markitdown-ocr.
|
||||
Registers OCR-enhanced converters with priority-based replacement strategy.
|
||||
"""
|
||||
|
||||
from typing import Any
|
||||
from markitdown import MarkItDown
|
||||
|
||||
from ._ocr_service import LLMVisionOCRService
|
||||
from ._pdf_converter_with_ocr import PdfConverterWithOCR
|
||||
from ._docx_converter_with_ocr import DocxConverterWithOCR
|
||||
from ._pptx_converter_with_ocr import PptxConverterWithOCR
|
||||
from ._xlsx_converter_with_ocr import XlsxConverterWithOCR
|
||||
|
||||
|
||||
__plugin_interface_version__ = 1
|
||||
|
||||
|
||||
def register_converters(markitdown: MarkItDown, **kwargs: Any) -> None:
    """
    Register the OCR-enhanced PDF, DOCX, PPTX and XLSX converters.

    All four converters are registered with priority -1.0 so that they are
    tried before the built-in converters (priority 0.0) and effectively
    replace them while the plugin is enabled.

    Args:
        markitdown: MarkItDown instance to register converters with
        **kwargs: Additional keyword arguments that may include:
            - llm_client: OpenAI-compatible client for LLM-based OCR
              (required for OCR to work)
            - llm_model: Model name (e.g., 'gpt-4o')
            - llm_prompt: Custom prompt for text extraction
    """
    # Same llm_client/llm_model kwargs that MarkItDown already accepts for
    # image descriptions; OCR stays disabled unless both are supplied.
    client = kwargs.get("llm_client")
    model = kwargs.get("llm_model")
    prompt = kwargs.get("llm_prompt")

    ocr_service: LLMVisionOCRService | None = (
        LLMVisionOCRService(client=client, model=model, default_prompt=prompt)
        if client and model
        else None
    )

    # Lower value = earlier in the lookup order than the built-ins at 0.0.
    PRIORITY_OCR_ENHANCED = -1.0

    converter_classes = (
        PdfConverterWithOCR,
        DocxConverterWithOCR,
        PptxConverterWithOCR,
        XlsxConverterWithOCR,
    )
    for converter_cls in converter_classes:
        markitdown.register_converter(
            converter_cls(ocr_service=ocr_service), priority=PRIORITY_OCR_ENHANCED
        )
|
||||
@@ -0,0 +1,249 @@
|
||||
"""
|
||||
Enhanced PPTX Converter with improved OCR support.
|
||||
Already has LLM-based image description, this enhances it with traditional OCR fallback.
|
||||
"""
|
||||
|
||||
import io
|
||||
import sys
|
||||
from typing import Any, BinaryIO, Optional
|
||||
|
||||
from typing import BinaryIO, Any, Optional
|
||||
|
||||
from markitdown.converters import HtmlConverter
|
||||
from markitdown import DocumentConverter, DocumentConverterResult, StreamInfo
|
||||
from markitdown._exceptions import (
|
||||
MissingDependencyException,
|
||||
MISSING_DEPENDENCY_MESSAGE,
|
||||
)
|
||||
from ._ocr_service import LLMVisionOCRService
|
||||
|
||||
_dependency_exc_info = None
|
||||
try:
|
||||
import pptx
|
||||
except ImportError:
|
||||
_dependency_exc_info = sys.exc_info()
|
||||
|
||||
|
||||
class PptxConverterWithOCR(DocumentConverter):
|
||||
"""Enhanced PPTX Converter with OCR fallback."""
|
||||
|
||||
def __init__(self, ocr_service: Optional[LLMVisionOCRService] = None):
    """Create the converter.

    Args:
        ocr_service: OCR backend used when ``convert`` receives no
            ``ocr_service`` keyword of its own; ``None`` disables the OCR
            step for pictures.
    """
    super().__init__()
    self.ocr_service = ocr_service
    # Tables are rendered to HTML first and then converted to markdown.
    self._html_converter = HtmlConverter()
|
||||
|
||||
def accepts(
    self,
    file_stream: BinaryIO,
    stream_info: StreamInfo,
    **kwargs: Any,
) -> bool:
    """Report whether the stream looks like a PPTX presentation.

    Only ``stream_info`` is inspected: a ``.pptx`` extension or a mimetype in
    the ``presentationml`` family (case-insensitive) is accepted.
    """
    suffix = (stream_info.extension or "").lower()
    content_type = (stream_info.mimetype or "").lower()
    return suffix == ".pptx" or content_type.startswith(
        "application/vnd.openxmlformats-officedocument.presentationml"
    )
|
||||
|
||||
def convert(
    self,
    file_stream: BinaryIO,
    stream_info: StreamInfo,
    **kwargs: Any,
) -> DocumentConverterResult:
    """Convert a PPTX presentation to markdown.

    Slides are emitted in order, each introduced by a
    ``<!-- Slide number: N -->`` comment. Shapes are visited top-to-bottom,
    left-to-right; pictures become ``*[Image OCR] ... [End OCR]*`` blocks
    (LLM caption first, OCR as fallback), tables and charts become markdown
    tables, text frames become text (the slide title as a ``#`` heading),
    and group shapes are expanded recursively. Speaker notes follow each
    slide under ``### Notes:``.

    Args:
        file_stream: Binary stream holding the presentation.
        stream_info: Stream metadata (not consulted here).
        **kwargs: May include ``ocr_service``, ``llm_client``, ``llm_model``
            and ``llm_prompt``; all kwargs are also forwarded to the table
            conversion.

    Returns:
        A ``DocumentConverterResult`` wrapping the generated markdown.

    Raises:
        MissingDependencyException: If ``pptx`` could not be imported.
    """
    if _dependency_exc_info is not None:
        raise MissingDependencyException(
            MISSING_DEPENDENCY_MESSAGE.format(
                converter=type(self).__name__,
                extension=".pptx",
                feature="pptx",
            )
        ) from _dependency_exc_info[1].with_traceback(
            _dependency_exc_info[2]
        )  # type: ignore[union-attr]

    # A per-call ``ocr_service`` keyword wins over the instance default.
    ocr_service: Optional[LLMVisionOCRService] = (
        kwargs.get("ocr_service") or self.ocr_service
    )
    llm_client = kwargs.get("llm_client")

    presentation = pptx.Presentation(file_stream)
    md_content = ""

    def reading_order(shape):
        # Shapes with a missing (or zero) offset sort ahead of the rest.
        return (
            float("-inf") if not shape.top else shape.top,
            float("-inf") if not shape.left else shape.left,
        )

    for slide_num, slide in enumerate(presentation.slides, 1):
        # Real newlines here: an escaped "\\n" would put a literal
        # backslash-n into the generated markdown.
        md_content += f"\n\n<!-- Slide number: {slide_num} -->\n"

        title = slide.shapes.title

        def get_shape_content(shape, **kwargs):
            nonlocal md_content

            # Pictures
            if self._is_picture(shape):
                content = self._describe_picture(
                    shape,
                    ocr_service,
                    llm_client,
                    kwargs.get("llm_model"),
                    kwargs.get("llm_prompt"),
                )
                if content:
                    md_content += f"\n*[Image OCR]\n{content}\n[End OCR]*\n"

            # Tables
            if self._is_table(shape):
                md_content += self._convert_table_to_markdown(shape.table, **kwargs)

            # Charts
            if shape.has_chart:
                # ``or ""`` keeps a missing chart rendering from breaking
                # the string concatenation.
                md_content += self._convert_chart_to_markdown(shape.chart) or ""

            # Text areas
            elif shape.has_text_frame:
                if shape == title:
                    md_content += "# " + shape.text.lstrip() + "\n"
                else:
                    md_content += shape.text + "\n"

            # Group shapes: recurse into the members in reading order
            if shape.shape_type == pptx.enum.shapes.MSO_SHAPE_TYPE.GROUP:
                for subshape in sorted(shape.shapes, key=reading_order):
                    get_shape_content(subshape, **kwargs)

        for shape in sorted(slide.shapes, key=reading_order):
            get_shape_content(shape, **kwargs)

        md_content = md_content.strip()

        if slide.has_notes_slide:
            md_content += "\n\n### Notes:\n"
            notes_frame = slide.notes_slide.notes_text_frame
            if notes_frame is not None:
                md_content += notes_frame.text
            md_content = md_content.strip()

    return DocumentConverterResult(markdown=md_content.strip())

def _describe_picture(
    self,
    shape: Any,
    ocr_service: Optional[LLMVisionOCRService],
    llm_client: Any,
    llm_model: Any,
    llm_prompt: Any,
) -> str:
    """Return text for a picture: LLM caption if available, else OCR, else ''."""
    image_stream = io.BytesIO(shape.image.blob)

    # Try an LLM description first when a client and a model are configured.
    llm_description = ""
    if llm_client and llm_model:
        try:
            # NOTE(review): this resolves relative to the current package; if
            # the module is missing, the ImportError is swallowed below and
            # captioning is silently skipped — confirm it exists.
            from ._llm_caption import llm_caption

            image_filename = shape.image.filename
            image_extension = None
            if image_filename:
                import os

                image_extension = os.path.splitext(image_filename)[1]

            image_stream_info = StreamInfo(
                mimetype=shape.image.content_type,
                extension=image_extension,
                filename=image_filename,
            )

            llm_description = llm_caption(
                image_stream,
                image_stream_info,
                client=llm_client,
                model=llm_model,
                prompt=llm_prompt,
            )
        except Exception:
            # Captioning is best effort; fall through to OCR.
            pass

    # Fall back to OCR when the LLM produced nothing.
    ocr_text = ""
    if not llm_description and ocr_service:
        try:
            image_stream.seek(0)
            ocr_result = ocr_service.extract_text(image_stream)
            if ocr_result.text.strip():
                ocr_text = ocr_result.text.strip()
        except Exception:
            # OCR is best effort as well; the picture is then omitted.
            pass

    return (llm_description or ocr_text or "").strip()
|
||||
|
||||
def _is_picture(self, shape):
    """Return True for picture shapes and for placeholders exposing ``image``."""
    shape_types = pptx.enum.shapes.MSO_SHAPE_TYPE
    if shape.shape_type == shape_types.PICTURE:
        return True
    # A placeholder only counts when it has an ``image`` attribute.
    return bool(
        shape.shape_type == shape_types.PLACEHOLDER and hasattr(shape, "image")
    )
|
||||
|
||||
def _is_table(self, shape):
    """Return True when the shape's type is ``MSO_SHAPE_TYPE.TABLE``."""
    return bool(shape.shape_type == pptx.enum.shapes.MSO_SHAPE_TYPE.TABLE)
|
||||
|
||||
def _convert_table_to_markdown(self, table, **kwargs):
|
||||
import html
|
||||
|
||||
html_table = "<html><body><table>"
|
||||
first_row = True
|
||||
for row in table.rows:
|
||||
html_table += "<tr>"
|
||||
for cell in row.cells:
|
||||
if first_row:
|
||||
html_table += "<th>" + html.escape(cell.text) + "</th>"
|
||||
else:
|
||||
html_table += "<td>" + html.escape(cell.text) + "</td>"
|
||||
html_table += "</tr>"
|
||||
first_row = False
|
||||
html_table += "</table></body></html>"
|
||||
|
||||
return (
|
||||
self._html_converter.convert_string(html_table, **kwargs).markdown.strip()
|
||||
+ "\\n"
|
||||
)
|
||||
|
||||
def _convert_chart_to_markdown(self, chart):
|
||||
try:
|
||||
md = "\\n\\n### Chart"
|
||||
if chart.has_title:
|
||||
md += f": {chart.chart_title.text_frame.text}"
|
||||
md += "\\n\\n"
|
||||
data = []
|
||||
category_names = [c.label for c in chart.plots[0].categories]
|
||||
series_names = [s.name for s in chart.series]
|
||||
data.append(["Category"] + series_names)
|
||||
|
||||
for idx, category in enumerate(category_names):
|
||||
row = [category]
|
||||
for series in chart.series:
|
||||
row.append(series.values[idx])
|
||||
data.append(row)
|
||||
|
||||
markdown_table = []
|
||||
for row in data:
|
||||
markdown_table.append("| " + " | ".join(map(str, row)) + " |")
|
||||
header = markdown_table[0]
|
||||
separator = "|" + "|".join(["---"] * len(data[0])) + "|"
|
||||
return md + "\\n".join([header, separator] + markdown_table[1:])
|
||||
except ValueError as e:
|
||||
if "unsupported plot type" in str(e):
|
||||
return "\\n\\n[unsupported chart]\\n\\n"
|
||||
except Exception:
|
||||
return "\\n\\n[unsupported chart]\\n\\n"
|
||||
@@ -0,0 +1,225 @@
|
||||
"""
|
||||
Enhanced XLSX Converter with OCR support for embedded images.
|
||||
Extracts images from Excel spreadsheets and performs OCR while maintaining cell context.
|
||||
"""
|
||||
|
||||
import io
|
||||
import sys
|
||||
from typing import Any, BinaryIO, Optional
|
||||
|
||||
from markitdown.converters import HtmlConverter
|
||||
from markitdown import DocumentConverter, DocumentConverterResult, StreamInfo
|
||||
from markitdown._exceptions import (
|
||||
MissingDependencyException,
|
||||
MISSING_DEPENDENCY_MESSAGE,
|
||||
)
|
||||
from ._ocr_service import LLMVisionOCRService
|
||||
|
||||
# Try loading dependencies
|
||||
_xlsx_dependency_exc_info = None
|
||||
try:
|
||||
import pandas as pd
|
||||
from openpyxl import load_workbook
|
||||
except ImportError:
|
||||
_xlsx_dependency_exc_info = sys.exc_info()
|
||||
|
||||
|
||||
class XlsxConverterWithOCR(DocumentConverter):
|
||||
"""
|
||||
Enhanced XLSX Converter with OCR support for embedded images.
|
||||
Extracts images with their cell positions and performs OCR.
|
||||
"""
|
||||
|
||||
def __init__(self, ocr_service: Optional[LLMVisionOCRService] = None):
    """Create the converter.

    Args:
        ocr_service: OCR backend used when ``convert`` receives no
            ``ocr_service`` keyword of its own; without one, sheets are
            converted without image OCR.
    """
    super().__init__()
    self.ocr_service = ocr_service
    # Sheets are rendered to HTML first and then converted to markdown.
    self._html_converter = HtmlConverter()
|
||||
|
||||
def accepts(
    self,
    file_stream: BinaryIO,
    stream_info: StreamInfo,
    **kwargs: Any,
) -> bool:
    """Report whether the stream looks like an XLSX workbook.

    Only ``stream_info`` is inspected: a ``.xlsx`` extension or a mimetype in
    the ``spreadsheetml`` family (case-insensitive) is accepted.
    """
    suffix = (stream_info.extension or "").lower()
    content_type = (stream_info.mimetype or "").lower()
    return suffix == ".xlsx" or content_type.startswith(
        "application/vnd.openxmlformats-officedocument.spreadsheetml"
    )
|
||||
|
||||
def convert(
    self,
    file_stream: BinaryIO,
    stream_info: StreamInfo,
    **kwargs: Any,
) -> DocumentConverterResult:
    """Convert an XLSX workbook to markdown, with image OCR when available.

    Args:
        file_stream: Binary stream holding the workbook.
        stream_info: Stream metadata (not consulted here).
        **kwargs: ``ocr_service`` overrides the instance's OCR service; the
            remaining kwargs are forwarded to the conversion helpers.

    Returns:
        A ``DocumentConverterResult`` wrapping the generated markdown.

    Raises:
        MissingDependencyException: If pandas/openpyxl could not be imported.
    """
    if _xlsx_dependency_exc_info is not None:
        raise MissingDependencyException(
            MISSING_DEPENDENCY_MESSAGE.format(
                converter=type(self).__name__,
                extension=".xlsx",
                feature="xlsx",
            )
        ) from _xlsx_dependency_exc_info[1].with_traceback(
            _xlsx_dependency_exc_info[2]
        )  # type: ignore[union-attr]

    # A per-call ``ocr_service`` keyword wins over the instance default.
    active_ocr: Optional[LLMVisionOCRService] = (
        kwargs.get("ocr_service") or self.ocr_service
    )

    if not active_ocr:
        return self._convert_standard(file_stream, **kwargs)

    # ocr_service is passed positionally below, so it must not also travel
    # inside **kwargs (that would be a duplicate-argument error).
    forwarded = {key: value for key, value in kwargs.items() if key != "ocr_service"}
    return self._convert_with_ocr(file_stream, active_ocr, **forwarded)
|
||||
|
||||
def _convert_standard(
    self, file_stream: BinaryIO, **kwargs: Any
) -> DocumentConverterResult:
    """Convert every sheet to a markdown table, without any image OCR.

    Each sheet becomes a ``## <sheet name>`` heading followed by the sheet
    rendered through HTML into markdown. ``kwargs`` are forwarded to the
    HTML converter.
    """
    file_stream.seek(0)
    frames_by_sheet = pd.read_excel(file_stream, sheet_name=None, engine="openpyxl")

    sections = []
    for title, frame in frames_by_sheet.items():
        table_md = self._html_converter.convert_string(
            frame.to_html(index=False), **kwargs
        ).markdown.strip()
        sections.append(f"## {title}\n{table_md}\n\n")

    return DocumentConverterResult(markdown="".join(sections).strip())
|
||||
|
||||
def _convert_with_ocr(
    self, file_stream: BinaryIO, ocr_service: LLMVisionOCRService, **kwargs: Any
) -> DocumentConverterResult:
    """Convert XLSX with image OCR.

    For each sheet this emits a ``## <sheet name>`` heading, the sheet data
    as a markdown table (skipped if pandas cannot read it) and, when OCR
    found text in embedded images, an ``### Images in this sheet:`` section
    with one ``*[Image OCR] ... [End OCR]*`` block per image.

    Args:
        file_stream: Binary stream holding the workbook.
        ocr_service: OCR service applied to the embedded images.
        **kwargs: Forwarded to the HTML converter's ``convert_string``.

    Returns:
        A ``DocumentConverterResult`` wrapping the generated markdown.
    """
    file_stream.seek(0)
    wb = load_workbook(file_stream)

    # Open the workbook once for pandas instead of re-parsing the whole file
    # for every sheet; each sheet is then parsed from this single handle.
    file_stream.seek(0)
    try:
        excel_file = pd.ExcelFile(file_stream, engine="openpyxl")
    except Exception:
        # pandas cannot read the file: tables are skipped, images still run.
        excel_file = None

    sections: list[str] = []
    try:
        for sheet_name in wb.sheetnames:
            sheet = wb[sheet_name]
            sections.append(f"## {sheet_name}\n\n")

            # Convert sheet data to a markdown table
            if excel_file is not None:
                try:
                    df = excel_file.parse(sheet_name=sheet_name)
                    html_content = df.to_html(index=False)
                    sections.append(
                        self._html_converter.convert_string(
                            html_content, **kwargs
                        ).markdown.strip()
                        + "\n\n"
                    )
                except Exception:
                    # If pandas fails on this sheet, just skip its table
                    pass

            # Extract and OCR images in this sheet
            images_with_ocr = self._extract_and_ocr_sheet_images(sheet, ocr_service)
            if images_with_ocr:
                sections.append("### Images in this sheet:\n\n")
                for img_info in images_with_ocr:
                    ocr_text = img_info["ocr_text"]
                    sections.append(f"*[Image OCR]\n{ocr_text}\n[End OCR]*\n\n")
    finally:
        if excel_file is not None:
            excel_file.close()

    return DocumentConverterResult(markdown="".join(sections).strip())
|
||||
|
||||
def _extract_and_ocr_sheet_images(
|
||||
self, sheet: Any, ocr_service: LLMVisionOCRService
|
||||
) -> list[dict]:
|
||||
"""
|
||||
Extract and OCR images from an Excel sheet.
|
||||
|
||||
Args:
|
||||
sheet: openpyxl worksheet
|
||||
ocr_service: OCR service
|
||||
|
||||
Returns:
|
||||
List of dicts with 'cell_ref' and 'ocr_text'
|
||||
"""
|
||||
results = []
|
||||
|
||||
try:
|
||||
# Check if sheet has images
|
||||
if hasattr(sheet, "_images"):
|
||||
for img in sheet._images:
|
||||
try:
|
||||
# Get image data
|
||||
if hasattr(img, "_data"):
|
||||
image_data = img._data()
|
||||
elif hasattr(img, "image"):
|
||||
# Some versions store it differently
|
||||
image_data = img.image
|
||||
else:
|
||||
continue
|
||||
|
||||
# Create image stream
|
||||
image_stream = io.BytesIO(image_data)
|
||||
|
||||
# Get cell reference
|
||||
cell_ref = "unknown"
|
||||
if hasattr(img, "anchor"):
|
||||
anchor = img.anchor
|
||||
if hasattr(anchor, "_from"):
|
||||
from_cell = anchor._from
|
||||
if hasattr(from_cell, "col") and hasattr(
|
||||
from_cell, "row"
|
||||
):
|
||||
# Convert column number to letter
|
||||
col_letter = self._column_number_to_letter(
|
||||
from_cell.col
|
||||
)
|
||||
cell_ref = f"{col_letter}{from_cell.row + 1}"
|
||||
|
||||
# Perform OCR
|
||||
ocr_result = ocr_service.extract_text(image_stream)
|
||||
|
||||
if ocr_result.text.strip():
|
||||
results.append(
|
||||
{
|
||||
"cell_ref": cell_ref,
|
||||
"ocr_text": ocr_result.text.strip(),
|
||||
"backend": ocr_result.backend_used,
|
||||
}
|
||||
)
|
||||
|
||||
except Exception:
|
||||
continue
|
||||
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
return results
|
||||
|
||||
@staticmethod
|
||||
def _column_number_to_letter(n: int) -> str:
|
||||
"""Convert column number to Excel column letter (0-indexed)."""
|
||||
result = ""
|
||||
n = n + 1 # Make 1-indexed
|
||||
while n > 0:
|
||||
n -= 1
|
||||
result = chr(65 + (n % 26)) + result
|
||||
n //= 26
|
||||
return result
|
||||
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
@@ -0,0 +1,79 @@
|
||||
%PDF-1.3
|
||||
%“Œ‹ž ReportLab Generated PDF document http://www.reportlab.com
|
||||
1 0 obj
|
||||
<<
|
||||
/F1 2 0 R
|
||||
>>
|
||||
endobj
|
||||
2 0 obj
|
||||
<<
|
||||
/BaseFont /Helvetica /Encoding /WinAnsiEncoding /Name /F1 /Subtype /Type1 /Type /Font
|
||||
>>
|
||||
endobj
|
||||
3 0 obj
|
||||
<<
|
||||
/BitsPerComponent 8 /ColorSpace /DeviceRGB /Filter [ /ASCII85Decode /FlateDecode ] /Height 80 /Length 4282 /Subtype /Image
|
||||
/Type /XObject /Width 400
|
||||
>>
|
||||
stream
|
||||
Gb"/k$+*^]+31jd1_Sc48j,Pi+@:`R01h=9+]FPXQDmE0%*Lb4@[Wi36jU!;cssJbQ5,g%R?K'+$#.h<qu?Z`Dn#2Gqj`$$\bE9$XS)%of4Vd>cT_6mF8#7^^Y_6P]N!%L#U+j463n`f&4-XGKFgHU+bUCn#U+j463n`f&4-XGKFgHU+bUCn#U+j463n`f&4-XGKFgHU+bUCn#U+j463n`f&4-XGKFgHU+bUCn#U+j463n`f&4-XGKFgHU+bUCn#U+j4Z3bU/Gm9s<T86G'Ht,"?C)(`3j[rI\Y+2%=-fXJAVq>eJc&9=)%mQH&Yh<W#b:QHSf&hc5bR6(7?1RO+2U+,2j=JTJHV/n1*m;JAGbDu[IX.pg30l0S'.0*($<'u;[b/GRDJ[J=c-W0HaX>?jIi$uR^]%u?lJ6Z*VV,Z28T=.3[G"!N]2!6iqW[_CVOQZ9Um#Qd)&t%d!r@Y0=g5[M*c.,qcc"UaVkc?<W;kud8W>KHVDN2.L1\=-s4#kKB4PQPI/e#*[DZR^Y$^Xi6K`(0^so>.#p<K8pN:ein%PZ#Y7*R@MH861DpLo<$rl6(H_8?WBC8!u2*l:OF52PthFZhN<gX0$3^m^nt74tEo^W)lR[oP32CBgdV=rlCn/?MttP@YH\'hL=3DKjK>7hgY.jk*u=UQX(s.K*Fd*WL[eV0^25,*fS$V)Q-Z/Ii`SFMMDe;iM2HMUsg37cCLf8L+b)KY6rDWN#jR5YQ.1R1gba'M2,4kCR4578^=b\Bn#r]R"h8?'u=7,fh/#GD*$m:^@g'YaL,g&OD*(\9V3qM@J4qIp#mRhYee0oG3^KQSpS`k(a)%\\KrFo]NtW+D/curY.W3&31Xd58H(q0_cISK<s@\A@tq[3b*;)prprplo?NNWP"U[H"mT)":p2cCY.e)nGK>lIaB)]QNP9.mWE2l3Yi$/1lIFGq?b"J/%A'=e_4!7DM'qA.H+Eb6+$Wn7]h3.+R^:;FLJDKh6Z0V`KM>R1?7!q\hg`Vs6PqO%XsU&A'e9Y:\qjB8:9X.p&5omkN]TihU"VSM0Gdu%IekLW7Z.T+=gZ8?G+)N`8D1/:))EVV'V%>@o^?^e2`FI#RXkRcVk5<aX)Gb<Anp($Eo(tDRA['co9=J!r?\(k+3obVpgT(rh)[&!*=m"?fb<WboEW]&a*9n`H`'s&IkQkBo"rK"ncnu1$k!hAk*UR=.2DO6/^buFp=jJKa-\QsgioA9CY[S7mOaK!FH$moNAmYB\)0A"En%&3d-/qKIR!Etjt(g9!ZEncRkEWkI'W[)AEhgn9/$+_o'-r&1h\!<m^U0S31\a,_&Rd"r.'Np%gT9f8?\3>1O$"8"QQu<lL!5$/k#`#:KEulTq4-7/YE)\g=Sb$OCcUO)BHRhQu09oJWIZkkosFc_B+&7'c_j!")cW%]@9&7fl>'od45V^,W"[b:mfhGls_c]o[:8?WXO3sS%ABFnD/;VaJp_j5H4#BX4qPO#9Rd2UQp3!11,Hp.<#+W;VjHWUp(CD>tmalGRY\Uq<)U7[bb1;!ICRCfbRQSW`R7B!G^;uZXfo5`5U7D;,E%89G,#:1)%4QDA%S)!5IL>R;C=R4rV;kBDYiJMaSpYGjR@-K1X!l^X,hul.*@fk/SRgZtX.(?1#F1Z,3(>l3p>i<Pf+sbdtG`]h4ZQR\8ke79"3MknReA"?c^RDe@Fjk1.cu5MjEDpdNhJ+7mGf2KIY:S*Y\2Cs77pae4B]4nt\0_9hN"cX*)6Y95CMGu-b.h<l/f)o087<L+.ZYQN\"^nJd(2$BIfm6^Z>hKjO-5DXhZKVbK,R$;#1+h5rhZ?WW+cIfDMR5M:U;WJk^U1M2V=1pp;4^.2,/RU"b7N@$b8R4LOO?H"DR.Lf`L[*m,BTYmDZ_t`L-M$_)#8#p!)[O706GPi_l#Yq>cO^MHRc(Jp:hO`,H*Y]jp")!6$Iu21q$\8nLN&Ju<?TEli:_c^Fu;7mar?jW@Fi5=&+@XX3Du$Vp!Z
:kp'-MBe4(Gq5273Z*<l$oQj,ndL:>:,=6H/*LPHo45Js7W8j$_!Qm0FH1P&^"`>@W4%?`Nma<X,sJlXF*,/9?-'cJp]Gl[CD(*jN88AiD,rcf:jl=)$?G1A+QH`L1Y,qGh381N!)?4VfakRqR\de*W_P5=i_rQ88,Nf"08ju'!L3:gtBn9tR`<1O'UehuL-ao(I9mcdD[iu:\EjK;,iTiXhVd0(hgkW_rte\s*ID1Wu(.MjQ`-_-:KRW+1tA<S?3r8>E^)_qfq#N;4tr+%k&Ep8k#92@_4?NnV=N@"8F,!hg:if"abZSI*B&dFMB&j8pk=5i_MJAeY/_a-bBH!b7VKr\Kt#C"Ke<_A>`"`=AC>VJ=jpNj/XAJ.8N&11/:hfIr$D^^R2#qRLKK:(9GU8"CB@_;$5Fq-q:K0TBPN]^2`GM'aEs1Y+T=D'>N2JXWoc8.%IYO^gsm'1RJSeGm+YDRQhLku5aKi&&h'k:Ae':8oK<la[fL\k0;fH3(LIfJts]t<l4*,ri:knWWe!M_E[M,&V9JH2`"=)ml_1[8!OOU7V,rHd]X#^@U_hK>1_Fu*NH]a>^r>**\J#14;Ei@8Dd[B!VZ.j64i(icM@UQ_>]1i+QL[q8@sXNl,qq<0pH2r<c]E5`R>K@bgt+3u4X[=5N,XXpe$Pa+h/i2Ns+!9@kBH_P,uQG__S.W7M^frRPr4EZHW;p0Je?#:'3`%IWs^jMgsS>TFs]-96.iKS'H_`---RRk+q]Jr]FS(In4Pq-F!6Cm%,U[%@0.OI2<<)q%YS\L]"SQrA8jisi-Yc]j.NcUR5eZO4@bV<6:Q<7Y8Tbc.:)0RB[f;uae0#hXi-F,V+Y7!Mj#7a2'<d>UX7up@?R.l5hdJ`J2qIRW9l3nLb6mCBmOi<W\odW.t='rA%`7sRbXB5/RD_LA/<@gLr;i'i3jlV::Z3F&:]ir"sAd&Y6P"h>gnWA-O?D51eitk>F^2j&Iq6CcN2Ju0jXH_V;7Z"7$/f.cVY>Mu"+'&]*\$$EFH_au5?=QCNV/dCcC.k5.]`boT#$n8q"$7k7cbB=_S?6!sI(ERNS%rY/q#(V&?"M=dPp_pD^a<mS>iJ84-qUUOnpsEBD(@=c8&j(fD<_iW8Y:1]3'Z*lk814$BMEn>20Z3q9`%2[odf^kVG8_KfBHJTq!iP-bZf!WUjfi-Z(mjNh$1Mk%I4bUXT_KbmDgHtQ7Z[%/U=`ol;d(+7MLe^9J.%pG(>?Z7,R7p2_!_Qbsrj-nZ^jp1P_<pXaK9!2C;b6ck"=Cj_ThjdJFfoo]T[$FN[^)H53%_>QETt1O4#d3il&h>-]FM?E7.3BltXHM-bbl_r^C;;uMGdf.Kh%L(0?a^%V$SMIKn-g/OBA,Ng_8qOt.G4*;07b-d&^'[LU$f5ngd%r-XNimO'c=1SVor0:Eg?<1-k=*lR5.^@!L"%EH/XBn&hq=*'_o;%t#(A>I6JN':Wh5=&pRCU'1C"15l6HQiH<#l)E>c9A33g31NEH\$h]o'o:W53E#msr(FBMb0g*jP1nCIbQ^<-?M19Kr3mq8.j:>;q*:p4Rb"@"DU#`i.DU&`=Vn-ANGOK'T46_'jF^$R0`j>ib(E*\_<8o*cItM:B3D-9Z>Of29HcT0]Z'G'co.PNW`2:qpYXp0-36TIRP-&3V+PPe>^kkuHt*7[/f`Z?74q^`DXV.TS7@]I@7J7#?[&(&hPL%\629`r50o^;oKq?P9#!l9@Fff9p3njK2nUHBg!&A`c[uXD61%4M,a"/_P#gZUo)#L[uI,Q>:BQkk3P?Scmo]DXk])TLK"NX2u"><@[CElgT\uF2.fcn<iiPL)@TrV2\AYDo>2%@`(OZ<M6L#'7K_ZStJZ)]&Fp39s]tR`?'J?rE-I11YEH*I?3FE.#D8]B:lU#l-Q&"X6RDb@GL2>K[lYeY=buQU?HWK8[#]q-;`G(]:<Ao2d9eEHWd<E81SK4JM$5dbM`T<KnN,YjlTi4>kV>d%&i?1&P=
:i,4>V2MnI*kV+_s8='X"H,gcL;Uo:%-"-M]-mmX/gFJ;bSiNq;:Y3_r5g<a"7!Y]Bk,;T:p3c2CBn/b6lYENkm?LZ[fW1tg10cT`#9kR&4-XGKFgHU+bUCn#U+j463n`f&4-XGKFgHU+bUCn#U+j463n`f&4-XGKFgHU+bUCn#U+j463n`f&4-XGKFgHU+bUCn#U+j46GWU$GHRA>~>endstream
|
||||
endobj
|
||||
4 0 obj
|
||||
<<
|
||||
/Contents 8 0 R /MediaBox [ 0 0 612 792 ] /Parent 7 0 R /Resources <<
|
||||
/Font 1 0 R /ProcSet [ /PDF /Text /ImageB /ImageC /ImageI ] /XObject <<
|
||||
/FormXob.0315aed9f6006a101b3226a3b7404028 3 0 R
|
||||
>>
|
||||
>> /Rotate 0 /Trans <<
|
||||
|
||||
>>
|
||||
/Type /Page
|
||||
>>
|
||||
endobj
|
||||
5 0 obj
|
||||
<<
|
||||
/PageMode /UseNone /Pages 7 0 R /Type /Catalog
|
||||
>>
|
||||
endobj
|
||||
6 0 obj
|
||||
<<
|
||||
/Author (anonymous) /CreationDate (D:20260126172022+01'00') /Creator (ReportLab PDF Library - www.reportlab.com) /Keywords () /ModDate (D:20260126172022+01'00') /Producer (ReportLab PDF Library - www.reportlab.com)
|
||||
/Subject (unspecified) /Title (untitled) /Trapped /False
|
||||
>>
|
||||
endobj
|
||||
7 0 obj
|
||||
<<
|
||||
/Count 1 /Kids [ 4 0 R ] /Type /Pages
|
||||
>>
|
||||
endobj
|
||||
8 0 obj
|
||||
<<
|
||||
/Filter [ /ASCII85Decode /FlateDecode ] /Length 260
|
||||
>>
|
||||
stream
|
||||
Gas3/9kseb&-h'>I`6Z84fgHmCc;"L7g6_e&889#h,kA$Zt,m0Hdcho6>O[sLZ+YF+:QDRLY`5CAhdUI=MeslW_fp84Bms2r(UspMdQW.jtWA9rW?q[M1*5b[XIYc1kOQ$55sEf7La^q2$a/'T.)S#<V#*e,['$SVK^(f9:,Nq;AW\a?Zt7p:RM+pHF)-4F;E;l5ui'$5;T>HA_.,@?H2a/)Ol=NY+4r->>:n6'/ubPg6GC78<Gb)GJls9>QKuE<U0~>endstream
|
||||
endobj
|
||||
xref
|
||||
0 9
|
||||
0000000000 65535 f
|
||||
0000000073 00000 n
|
||||
0000000104 00000 n
|
||||
0000000211 00000 n
|
||||
0000004683 00000 n
|
||||
0000004939 00000 n
|
||||
0000005007 00000 n
|
||||
0000005303 00000 n
|
||||
0000005362 00000 n
|
||||
trailer
|
||||
<<
|
||||
/ID
|
||||
[<5d5eceaa0d906ef66e559ebcd616f18d><5d5eceaa0d906ef66e559ebcd616f18d>]
|
||||
% ReportLab generated PDF document -- digest (http://www.reportlab.com)
|
||||
|
||||
/Info 6 0 R
|
||||
/Root 5 0 R
|
||||
/Size 9
|
||||
>>
|
||||
startxref
|
||||
5712
|
||||
%%EOF
|
||||
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
@@ -0,0 +1,79 @@
|
||||
%PDF-1.3
|
||||
%“Œ‹ž ReportLab Generated PDF document http://www.reportlab.com
|
||||
1 0 obj
|
||||
<<
|
||||
/F1 2 0 R
|
||||
>>
|
||||
endobj
|
||||
2 0 obj
|
||||
<<
|
||||
/BaseFont /Helvetica /Encoding /WinAnsiEncoding /Name /F1 /Subtype /Type1 /Type /Font
|
||||
>>
|
||||
endobj
|
||||
3 0 obj
|
||||
<<
|
||||
/BitsPerComponent 8 /ColorSpace /DeviceRGB /Filter [ /ASCII85Decode /FlateDecode ] /Height 100 /Length 4720 /Subtype /Image
|
||||
/Type /XObject /Width 500
|
||||
>>
|
||||
stream
|
||||
Gb"0VH#OJ:qoA4A'Hn[Z$4u82K`j4ZR%PRX#F(qaK&V?K8<b&(;#dar'M8Oni!CfV<*>q$(4m``.P7J2`EVk#7#VtC6:(*s$)76DC7esMC3FfEP21e=J%f9>Up;d>4l&7WcZJp*DIk*ozzzzzzzzzzzzzzzzzzzzzzz!!#9dqY/lsQRl8pCtPt8mFiS/o[0Y;WL90Bj2R)5Y[N/Gfn0f!(^fES:Hsio97D?(Xn?5jq!"]KU?6X;&P"ZofW]f$p(q"VdAg3IP#\)UWg`=MO$9TD0>@3js(id(lnKeb'nVdJ7e;beRl:iu3cs8nIEn"+F\0s:]mE81*hAmoIah4b[;OfHn`%MZXAic<H:=+SS^nBEXOMIOI2D3Y<1k'jGjr"Mb7m#8djq#sB[J"W0CSVhD_EOgna,8=^]'+;Q/-Q29o7K`^M3`IrI+P7M:JgD-;8@g?QuaS(L!S'NU#&pVmQm&;+ooep7=EoT&nD(?bbsoCn2i[<[UJujmhj_M<u#m'ah(clITBmFV`P'C`bB@K_BHIO[n\%kN(^*?b\d]&LhJ.U*@&CM]/WZY.jbti9?:_k*Y>(J)7jH@XF(Q5('jYY[u"DM\[nu[VacM!sePdg%40X+-%@'f%P<0baG`E4oP$%Q/JgWmY\Um466eV$?Y)Q9)nh\c]Mq3f`',ShaFWth-oBcOd=q3cTY"+4B:C5D=,9ZUiS/hg5kVA4*K+[1fUQ5bbN_se_nC2++F"$]j*.j.s0:>;7?2OB:nigYB[?_a,Ulb<cmfe?*!8BYQ*mgYI_'G:#:9hc!%'nIu9\#K.OFt>Aq1i\Q1k(X_QRsW=DbSc%F'bgtZ/2>e+n:Kbn'oJ*sr;^;r/1Z+Vo?T!W4\7L>qdS!IH-WeB#r>ch5>%TRL&'caJA=^o?na4Y*tXgMf5H"N01jlPU;HhZ+Fp?gW#Ip5H[Y;u'ao8X`t6%]C0Gr*LD?+V"4C6Y<]^1GKRW1+$NV&M@2Zk6GD=d]J*qS.+7cN!h6O!e)cfIf(11iD*Y7*8G.!Fu"f5Q5o`Fk=$<gK"E?j,ZG(Q<S6H:\dHiQ8`a=>Zb+\X]m_AEjKB&FH%hV\FAFmKDpQr8j8Fc:%F6j4nD>GFYUI\FW`_flD4$L7>hqmCTh$Uf"^_*P&AuHA>O*GCsJP2O$EX=EQ9*O\8c#4&JZa4ARj7@85!.BB?cmQFmIWVr;9Tt>%neNS8ud$:Htthg-nk9OnPg[;$TI`>0jlL=-.a+_hJV!J^kl(gQd$+PUS\<me#jih7@a_WGZZ(.4K"nV+[.iR&jScFk0\%2K,Zhh0o%R)Nq/WF?l+\*Dd3krNhA#g[-\oC]W"'hnHd4_hLeQjHEBn&n644.4m-ZIetY!]Maa6"3/b.DR^j3BPL<$4Z,)s+'5XPm7A'P[OZn]2^N_0O[lEQSs)o18I6(3r!Qc+D!gf8aN/&O]Qq2:ob7jW;;79<m7q;t-1bA`T7?icP9s!jiRtHa:-1$`1XkWli8@t0Uu_-o6OtUq>"[U''if?a#-Wq5c7%ag\H8!"ALF'oU?-RQD7B?-GJ]">g6^'>CFf@5s8D[^<m$#K27ioe<`YMIH[l(oGML?\W`P:J[(9Uijd"PR!k=->Y$F-4CB"/,6\Z#s5Dlg2HM"F7VJkA+mbDo1>N-;k32'EW?6)(KY`B.a^\mY\PAK*gH+(7uU^F-po`(RMK06EPHHdD0;Bn\le,c[Y^V3lFa4(IC]mFtt!K@dP<o88m]iqJ+@)2E/k&hb!@XF*Gief89[^t6$$47L+p[?u]I+odK?/mT/=%`2+)fO@AogV9q*r!U1m1g?N]&Ti<e"o\RXmOcGj);^2<k\(R>\obnltl>ED4q/%Sh>o`U)Q4>YWf)aUl].\e=Ep??@2&sT>Dj(+.kQ]\905L.Bs_jQMg@#5Ad*SY8q<&o<LoLF#$U&]1eeYfg_3L=pCsBn2Zo8@8IZ+Xg>-8CKR8V]"=q/;I!IC.2@XQ-aimNpYWG+0>@4U4t=ghn%S,KY6
Ikm?-DX7<>iL_CI@3\nVAbo+bpOJCAW-_HQp]RX&=4gH(-^/Z@s5UCp8LSn\c))=o$*Q*<M^D^#4JMJtt=95Q%_uUo1-F7q-h)d`H"f/Jt%*3B9)\WKo2Erql0!qemE![bE'dH=+t)O0/c#?9IC9XEPfCe/A/_qsP1I:Gi93mAc2E?t738#p#IUW/S7Mm!5=<`aIfEM<Ys2>e&.Y0ZhJX-aq'tbOsIoYE.k=J%d;VB:aB<bI_rblEfBj^p1R=K*Hi'nV;J/+I,YT[]<3iSq0j"_fD5.GHQ9sRi?Hm3c3S-p!79pR,Q`;q!mC0XK\qU5$G/?oAYN4XKK9![O9M9;(JK$g@P<;orgiE)Wd/_e6&iR=?W_Xldsm><h,SP+L,3L,ntesb+_2WpLI=+=Q*1F"S(>qmjXiPmbHLOYe%@qT_T#OK#H+:rVJ+)kHmJLjHDqC"3ei1+I0_7b&fC4SN?G-:Hn<P`F4_m)X;X7>C^ac96r3OHOcmBeK9_BW&gqhjl7$/j4:&TqtBm]_@&#AP,Y']*Eo)'ONPAD?/7inL-[;Y?u405b]Ka3.k@s]eE(j,^\#rI[BQU.aF>#4B$F4/opmSM*F5.or8NVf4NVD[_hmc;1iLl9739e\*dBrn2.Z2*I#rOpSJRG>K>mOaX&`@A]5&)7CQehdOsNbC>B_D5cTCSXB*di92jW_WX!=EhTKNR%fVt.%Q<$j[iSM!0d6/k`CY(3)+XeI\r:.i,Kg1O$IGhnlT&gm[L0VDFcUFbqACJhEm'4S\ChfI]q:mQ"ZL[OBmJ_6*\'6h<$$-W"[^F9VSm/"@Ys!-Y/kBOeN9p]O$ui,lR;]Y'fWi?-gh&>)baIM5;iRQYFQq5Me#,uCc8P?h`0K;LQ;G)X')<<ZlIDq&LLPU^bo=&gcErEUfq:W`I+ft!GF'4,DQL*1IX_:-FmGd!pPJ9q(G?8P`t=nl4**0bl/9C1$Pk;?WMl,4lD^\UVMQ6bn$qBfs80*^[?ET$I!^-aH!XgKQFCSW7VR7m>c6G0oT/C6@E@rs_sQ]md5bQ1:uFPQQE5J6oF@\//u>D@rl6i0Db`9"CffLREne*h9ea#&lL)T6cQ+8d[]`uKp7-3LZ,`(u?(+fr>(mI*G/\k-YJ:uXP.0=t4*2mZ-eQ't.M\@&WZ\RXWp+0AS>cW33cqTe`:fXp?:r\D9j`52V-"$nNukEh^\mZGUTX8#HKq+^Sc;g;39(Dp^!D$\>%6A^eKD`,3r`Ehh.Y<QB$Hd6Dn\4V,K$O2eQ#]H'IHuY<'.PWh7M8sFIp`W<?e^(pf-sk`:cWX(0Rr5S=GEL-Ru1K?[mL]^3qom@^04`'ab3DaOa@KMi0rX@XE^O>K:6c7M&12fk$N'7q-hiZ)75?UbZH"N)8kd3WGbMX"P/.K^RR%.rqaT@h#u_AEs1)jILMOBks:._,[m*eQ/HMh/2T8\^k5p-Gbk4:UO]EUnspPj)`O0k>MR,M8scu:M"<+[OYCfCtV].VbWfJfu8q0hWVoO!s]=g_kY<KG3'BXc*o(K]QH6CDr&"TSejAI&r>p4`r`?FMo`BP7H7X"Km<=Xfhj^&%sjRIEf&?W*]uFIg@FfT]->7T*G\=GA%PJ<HdnOVSmLf.+KHmSZ$jZQ*BecC<(3<9grri,I2*)b1:SDnGU+d]Xg6MY8$T(:.4?Uk`u[BiGe0Oe2f<H[Ue1=Kh4riO*S$)8!@o+s?;W"![]<f%`Y5^Zc;'ok\LTFOfW\3I*DUf6[:**:Q92N&d_']\[d1Hqldmd(IV"Q.V:G^=4P&F/.]W6G(>cB1O#e,SV5<H8f\>>$gU<+7PBoDYDti\Up=RQ$_FGu!./Yu`]tGE[]1Xflr3@X<>TH,QPDG[k\(f5EN#4:dBj9Dtm'hE@kFId$O4XJRs#<fi\gX(P(\HEsYBB9>>%h8.(c#WXs*h$)D\#t'aEg:?XOs\SA`#9^5CU9:p<JsU>L#D+>hdhTPki]s+6c*j=3>f=V>+D"=D5gHfUbY*f!
X/5kZq(aU0s.TSSbiDpX_$Rm57L'='`/+(t;Rbo#i[rD<hl-MMd9XiU7<R]U8H\\)5nGm!GTqIWJoT^k&3K2m'gnqWfVra4/mgQ`:tY5PX.=H\ipm,paod-T=!9ie-6^rsOM%b,=gWO8LhMekGg^s513Ue4#ZT>@pYrm#1Im](Qt<AVg4pp1hYAJ<c+q=&_b]DnkJ,HRr<'>$A+9]t/CS)@C[llo,Jtei=]'$rSMO^!toPHeWb/:-J8LrU&LWJ)\^W(Lh_BC(Sq=]a:sW(9CiU>+L>jbY2:SMO\P;Zl(Q*_#4$"rP+Wa'D/kYlP9isqjdB"Z4^4CMs\(rfP=ldGLb]=a4-[40"Pb'F3QT9K-?3m2,`JlHL%\1Ij*8m=o$^)IJ``GG>8o)=:i+tB%*VOA&aJ4rTY`4<mp,AAS#lUS&fu(ONL&D/#q[E-aS3rEpZinTItFX7`LZA;mpPt<aK*M4`L.JdGj.pL%3[B<0^9=Js@ifcC6aG'JXr;^#lG4Z!G16L%!Kgc\r_t,%&S@[KH;Sbh`b-,X:&f!<?*P^juT/EcNB$W&AtP0_oZ%$360Lace*-_EY$)IJ\1l;I3ZnIJS%;Cu2i#mbPJcDt*f-eZa,X:4"Ls6%]AE=]p#qGtjbd%>DQu\n93U_d,;'5W.rd^OPtDft"Z(lDUVVUibkLA^[AGcHdpAzzzzzzzzzzzzzzzzzz!.Z!\J#>u+ci~>endstream
|
||||
endobj
|
||||
4 0 obj
|
||||
<<
|
||||
/Contents 8 0 R /MediaBox [ 0 0 612 792 ] /Parent 7 0 R /Resources <<
|
||||
/Font 1 0 R /ProcSet [ /PDF /Text /ImageB /ImageC /ImageI ] /XObject <<
|
||||
/FormXob.41b05a9cf8679f0fe6e7c30c9462b767 3 0 R
|
||||
>>
|
||||
>> /Rotate 0 /Trans <<
|
||||
|
||||
>>
|
||||
/Type /Page
|
||||
>>
|
||||
endobj
|
||||
5 0 obj
|
||||
<<
|
||||
/PageMode /UseNone /Pages 7 0 R /Type /Catalog
|
||||
>>
|
||||
endobj
|
||||
6 0 obj
|
||||
<<
|
||||
/Author (anonymous) /CreationDate (D:20260126172022+01'00') /Creator (ReportLab PDF Library - www.reportlab.com) /Keywords () /ModDate (D:20260126172022+01'00') /Producer (ReportLab PDF Library - www.reportlab.com)
|
||||
/Subject (unspecified) /Title (untitled) /Trapped /False
|
||||
>>
|
||||
endobj
|
||||
7 0 obj
|
||||
<<
|
||||
/Count 1 /Kids [ 4 0 R ] /Type /Pages
|
||||
>>
|
||||
endobj
|
||||
8 0 obj
|
||||
<<
|
||||
/Filter [ /ASCII85Decode /FlateDecode ] /Length 250
|
||||
>>
|
||||
stream
|
||||
Gas2BZ&Z[T&4Ckp`KUTrY_02PMb#<CFN=Wfj',kM@19sp55uUe"pptDD)Los"F*-#r%7t"K39EA8f/'^$OO.*D:jQe'n<f:3Cq8'p9Rm8qll,u+[sQj[W6hrFQL%\7G?"sX/%4LXYeUkIBuT`A)Y3?=ouE3GIShId3E("2qqVte.E2,r_bJ%q1G(F,@9C<XiC-L`O1W5it(MP9X]^nj..r=,_#ecrj!ceT&ATWd4)p.7/d!C@/gP%;p#~>endstream
|
||||
endobj
|
||||
xref
|
||||
0 9
|
||||
0000000000 65535 f
|
||||
0000000073 00000 n
|
||||
0000000104 00000 n
|
||||
0000000211 00000 n
|
||||
0000005122 00000 n
|
||||
0000005378 00000 n
|
||||
0000005446 00000 n
|
||||
0000005742 00000 n
|
||||
0000005801 00000 n
|
||||
trailer
|
||||
<<
|
||||
/ID
|
||||
[<38bd217c814ddf937f148e537dce51f8><38bd217c814ddf937f148e537dce51f8>]
|
||||
% ReportLab generated PDF document -- digest (http://www.reportlab.com)
|
||||
|
||||
/Info 6 0 R
|
||||
/Root 5 0 R
|
||||
/Size 9
|
||||
>>
|
||||
startxref
|
||||
6141
|
||||
%%EOF
|
||||
@@ -0,0 +1,139 @@
|
||||
%PDF-1.3
|
||||
%“Œ‹ž ReportLab Generated PDF document http://www.reportlab.com
|
||||
1 0 obj
|
||||
<<
|
||||
/F1 2 0 R
|
||||
>>
|
||||
endobj
|
||||
2 0 obj
|
||||
<<
|
||||
/BaseFont /Helvetica /Encoding /WinAnsiEncoding /Name /F1 /Subtype /Type1 /Type /Font
|
||||
>>
|
||||
endobj
|
||||
3 0 obj
|
||||
<<
|
||||
/BitsPerComponent 8 /ColorSpace /DeviceRGB /Filter [ /ASCII85Decode /FlateDecode ] /Height 80 /Length 3374 /Subtype /Image
|
||||
/Type /XObject /Width 400
|
||||
>>
|
||||
stream
|
||||
Gb"/kH&U9Q'ZR>43%?O'&jT85=3qe+N1f0n5S02hEQ*VDL]ZS)-n8)hB\WWtW#6!n.\\/*',>!9/rH$h#V%%<D1W$*'jLaXkBH(D['+neI4_Y@WVLF\J+T;ghR7Y(mQ%b#VGJrM63n`f&4-XGKFgHU+bUCn#U+j463n`f&4-XGKFgHU+bUCn#U+j463n`f&4-XGKFgHU+bUCn#U+j463n`f&4-XGKFgHU+bUCn#U+j463n`f&4-XGKFgHU+bUCn#U+j463n`f&/S]&BhHiT>FDK@e$Z6UXYgOsk+@,I0-;pc?Ln*mjmTTlQ+?K]e$#D.eB)O5NG7.u<*,RJ_p,8ck3p(&R=G"JnR/3XR:fF1m&!LSDdTjBc:RULFd1\@RVR9Y?IMP'Ajfu#dnfOdRMIRU5?(oCkRY2lY[PsI[bY!Lo7-qefWg9>U_8tF1IHqd?[hMs%mDBpEmdN`p:`8WC.(r[:;%"2F3\c4Mk792G,A7iab,>+^X:Q1B)Ct4JU?`lHM9=Q*\(H$'0?&(T*=du53O+iUIN?O=m=In-UqEE?ghWKl;bQuY*`RG[FO81(L.M1]2c_&%<`*Rq.JU[hs+4A7O4D[l.+Bm#OHs>Bg2RPN#ZQX55r(P>4!naFjcC+c`URpH'c],PFQUH`f2c]IHB5B)pF^[%Q[/+G3L2gM0I)]FA`$/.TW]/!sdN]/(^i(\!C)"CY$!K4SDm&fpLsK;:U@[oi*BN6O,DaRecA5ZZ2bqqSNigQne<3\(m/F9af2d;H'i,rO4%j7$89^HKB/%EH:d:UE-7IC?3kP08QNWFSkIn&tXU0hVtf\g_heWn%H2@\EYR$R+7l,it!qkZIsA%.8%:<b4Q+4o7Re<@/=uc<D?1CkG#t*Sj+#k=3?C8<eoR]gMmuDKs]#U,%B6^\'Q>l.UaQb$n85YGMOQX1#>:k7>o*u_[[jr:Hg5Je^Y:RKsDj_5eqt.H?@qPba^,b_QC<DQE1;H6P%jjc90Rgp0)'S2.M@\$lIp4C<@5NV3A+Sq=@L,V9GDV`O9L@%f5,%,!P;IfdpJOYbPun*G`61QOD3'0Y6GmF^2XsRCVfROl*/ge#f)W1s!?*VEG*0."1`MfU_Qj-_HcIc]o.$p?:mMIQ<MK^AHukRr/l=E7?<+:@V_=m:@ob4Q**VPHYWj[Zo=CRr=V!*BT#C(LJ_:?.qW/l#KCp@t8d\[H2o7Bt=PmC=",eKi0L-.*!_c/%pOEK43<A[3H$o[#=noHdqYr^oBKc5fjO>LsH:W;>mQ8`hXnJRbG389AnU"B(jou524Uh#I;&<U<No:XESS[)bnZpSA"f;hd8gI+P]H$/\(H<O[nan=8V^RLa-H@:Xf+/INI%?Zd4q"#jj>q*.4u=YF[nrO\>7O#of";9>"U0op9fI&cNL!f<:OL7XG#UCV^-W3n&X!D@elVYI"hK-(KVsoa8aWL<4F`IGb_t#tRF<8C]_m^L^GO6G3G5S09nd$$CBb>0uZ(bhmeXG1t'(r;5s6L_nY]J6S``aIm.70L=toNMorm#gcR5B07$ZWs&l!IuphJhN(6@caN->pY971W`M`H*G1Tioii3W&eZ2TZB^Na&P9Djoa9RTg9i]Nf@JZ^+U;<o?jn>70)4?,Z*\6\f"L%[`BJ6Kj9)%X!(RqAJ!s1(OmA61d_0J*HI^@ba8Pi</l@aVqWUPab%]CO5^)(hH3sGpX-^oTZd5(_lb]&C\k)B3IsfnOdI_nbfq2>Pl-9KMgjahN*A`OY%3@&b90[&bRjRh=*NU*X?K'!LVo@=gF%20@?O=gnO^q+Jr=$XI<l#nIRPZI$e8J>159G:[l)_5.-%,a+qd/.CDY\U4-ibD!])@[SqAD!$G2,qi7HmfIX"M?gq4^a:eU]L7T>7]\Ni+_8Hg7U"jUaRtKJ]bt'P<e&53@l:okMAKlUSY_?MI_!>3`l9op5MT]g<,Eau4CBfS9qgs8'hVO^q,7ISl%l9Re9VphQHBQpHf7;ouN+#41(!]Bpq.
a"s<1RM\\qcK'V\`Q!N1@Y46f"8:opL%80Oi]-T\^Ju'R*bNtSnP$N;[5T*[5i*NZi1b"eWK@pn&8g.BWL$qSS5iR0*5>0K<j*&mC2tplBiAcJVg9(]ip]PZ1oSVKVeSV_/S3Pg9=ab"pNZ5V'2SCk-Vb@KNuhlT6mp3R?8XX`:Q.L9H,UNhF3YacG3W(VX0*uZ8>4e?>K_G;Rn$CUR?oeU93At-Arf=,=bsA0p$p(CN!F<.%bX@`pKfj\]c&XOS1!:`do;;tZ6cVZD;+'sUf$Bt,Q7PD^1rYmq+$QG]u!IsY+87o.os_h#/VHUYNfDXl;b!f:?gYC^>D*J>\&T\cBJ-sI_$N&=^rPk>LSVt;>P.46N:frEgm*GGj4=<qT,Y(1Zc+Z]h7%8,[8^^eLRgoB$=^7=$"Xl3[>=b41/JMmG;,"j2T%QRo@!%UIU4G6f=MZjM:Z2<iT63Xu]!qai:DQ:RWOH]Hn]!\#0inkU)(>L9M]+g80^DM^g"!X,S'0pD;8jH/WctXls@'Zr*MP]h7%8,[8^^e]B13k1X#5g%F[m.h'jn0tqYq>l-H\%mM%:Cju%L]qAkq0pc,X(bDM0Q0YGKF?F$>3]@tZ>A#n)hKGBreCD[e6OjG&;>-_QZ2iafV]GHA*H^f/E.J;%Om]_HNhJJ%MZ3l1UbMX*]CbunmMSLel=3ef=.@Tn,XYJoeZ)Vm1L<aCJD&:+Ijur)RA66H:%K_'"UbD-=0FNC4t9b>F/i:Y!Za@[WkasO461hJ%d@!MX3Q&p.'mXeE"=VfNAkB83#nEb-?5c%',NbTmp@Ic`7),9GbJp)qSmsX%1!Y73W6=/j,Gsc`f7.5:+Yeeq`Ai+!fL?NOM]56n(<,D$J18;P:'SXB'[uZ2^6A$<-j66f6Jk.&&c3W_A9(cRi[QK19NKCB\1b$_[lLN+.MEm1P"`E)t++5a[#+RZP^-(?@hY,mDDVfkSlm8e\\@>q0l2NF1QP^9\%[*ac\o<7!Y^`e%EF9dL)U==0$Z?$N0G;m+M)j+f_J:*uda;lh'4kU0Eu=[1gfDqKnase*WU=mVh!(:]"OUWP//]CqZjE&P4=Fd]4EP,j2"jQH?"R,$k%R$h6Q3^%g%3\k2'Q2t#0eqVHl3N]f0YdOSV61-pC:UfT.\lB:IuJp7hZ'6FlP!LEm!e%`Z.s*j]KJu),bq<-MI+2hQBC-aRH=2j-B+b#)kL'"'pn0^RRn5.!9II/5DmNHR"$lgsSKY-\*\*MgP]XVFu.o`0C'fI6ZKFgHU+bUCn#U+j463n`f&4-XGKFgHU+bUCn#U+j463n`f&4-XGKFgHU+bUCn#U+j463n`f&4-XGKFgHU+bUCn#U+j463n`f&<`?/!KNiek5~>endstream
|
||||
endobj
|
||||
4 0 obj
|
||||
<<
|
||||
/Contents 12 0 R /MediaBox [ 0 0 612 792 ] /Parent 11 0 R /Resources <<
|
||||
/Font 1 0 R /ProcSet [ /PDF /Text /ImageB /ImageC /ImageI ] /XObject <<
|
||||
/FormXob.b6d21e33426b982eedc18c3d4e93428b 3 0 R
|
||||
>>
|
||||
>> /Rotate 0 /Trans <<
|
||||
|
||||
>>
|
||||
/Type /Page
|
||||
>>
|
||||
endobj
|
||||
5 0 obj
|
||||
<<
|
||||
/BitsPerComponent 8 /ColorSpace /DeviceRGB /Filter [ /ASCII85Decode /FlateDecode ] /Height 80 /Length 3746 /Subtype /Image
|
||||
/Type /XObject /Width 400
|
||||
>>
|
||||
stream
|
||||
Gb"/kGE<NX(WT`Jj\uYAo)R):e01]`";Q!n&d*n+.):?M$;N-L92SQib[Ni],#Y)4BR+![6n$6QZkR#<71h=R.UYRO+N",lDr8pJ4*g7;J)9%2]0:`2oQ9iO]^,RBHegiOad<J[KFgHU+bUCn#U+j463n`f&4-XGKFgHU+bUCn#U+j463n`f&4-XGKFgHU+bUCn#U+j463n`f&4-XGKFgHU+bUCn#U+j463n`f&4-XGKFgHU+bUCn#U+j463n`f&4-XGKFgJmN6a^Irj:'BVG/#9rV!+sehf4Np$4uGT6?Z%jn76cYI/bg\b0"PYFheo17N)h[bT;Qm:q@c230t>rr(GQpkKpmDTn_bcA]O+Jd#cU@+2Zm>]f;6bs;T&C"&dtb&Y8uEeu<LE-(N92GKeb>4LdJj]\*:qP]'A(VpnpR/2-8pYO1I<E>P5N\HYDRFS?rdSVTS(Rm^Cb\t8pp[.ji7rj(S`LM@bl-r9u\(u5id79+1Zkc$%??stuVZtblhmB,pZt^mu0`%Ls1i]8CHul4%*Hj.6mVDOM9@Q>X0"[KH505<^W'N@NK#fUn2VXT_I7/`G*I"#V]/HO(ZsFbo9PDDePMK]!H;tTT$fdk/bb^Xe<o%RJ;c=p@KgDW9>;u04LW/P]CXtHuJmX$+n(Vc<?4@i#F^)<je+N!;?@B4`(2I>\^&%Tk]_oPE2P5G57Z;<354V60?+`jnL(BV81&C9p'qn^>huY?a*aVo\^AQF(LMnkjY1\5I3=GilDM^Mf#@3^HP(^=%G)"Pl2nSAIgiK?0>KOPWqO$PVHF<:_o#LfLahWdh*9(3^m/EKhl,$Q3cLgQYNON\9m^^C9op:#?rd<2$Vk!&)d<th.i@gbDG"YOud]6@72asongem@R4YF`QZs^cCb&Z^>EY^Bos&AIDEpB'*`7%OW#]/JaVk$ICr,?V+Pq1smN.e[Kbq-r/47c-[@E9!u4pDp'F`gCN0YN(T*.>3lQo[*tf$^DS35K&9pYYmC(C!8KWI=YO-Yh<iYq"0n-PcX/R22T"lAiUQ?;[;g"V[q<\)6WG:J^rp+%ZAH>K@cWZ,bpLf<4]8ob:WF?31lf84UU8ba9QV_YEY=:-f*?M'pH:5PUm1J)N_lEV.S5l4J>"ICf>9o#Q>b\(rC/e:S+@s,o'A+RfKf[#p[C^,r^GPUR4gNZ=J]6kHiV:DYI45Dd,\<-li[]NSt[l+63)OsT7L1W3eYFAoO;cK:jZ>hAN$F,_R#nIAbhgQ+SUFQjta@9[u&(LAMenD)m[`R55a*Y3l"))/k=oMU)6$+i]jVa*618S=TZ_GFf=XBs_%K:K'Fo]DciT&f4e(<2n?Vtb+e^g%Nu,1VYfeOc,e:Kg5KNLk9Rcmq(6W8>+.5UZlp$,%&K@JAXf9t/kp;B>ou]%HuU9<jA3FHc!7?g+9s6HK-!P:8IcUIVq-FJPOLMNgE:%ADX":FE$rF][b6ElT3'f8;d%Jo:3eXU\iUJe7VBlWsY`p!Z_)<M"V>Q7>RO1)%6mY+XGj^Cfi\llJ`iigb(cAMjb)$@^lQ9+"%O3RN/\G-)Fg+T0@+7sBYYOlju6E^l%O_dec#[W(oiP)i[<g)DQB?>1+.GAD<*#ee)n]ZlQc:X6"mf=0EeR3-\Rc-pb@okO8@.<a?P.;n:M[q+PD?$2E+e8+3J=j@AkSTd+T3ms/eoJ)7>\^lM5KERoAErtC0R56,oZN&WoDC3T.FfcQ):>]e:aVd1keQ_tITI3"f#+HG?-@C\E:ct,l:h<Cp?GYt&rHEP%f@DuqiK3-OdO5;eandB'^*u(E>'Y7/kYTAclDW&K5RRRs"3=c::U8&qE3Bk7N4522.XFX-a_8A&BTV-MqW1^SOa5rC:q\>me%lnUEUg#`JHMa+Iu3uP#:.&O#=ggsUgq2ho1`OG(%W"^SARTTWNR+&lC)M$Q`-sKP).<Rn<-H*Y^_.@YouJ.NulS]BgMA@p*ha_HBliRAPSEe%/q,5$rS?H&(.ebb:^u]
K\M$!o#]`(^ABPX>'>!(C!`PX,)AUtl)+6md<^LC%`&<0kH\Z:!VH;uD<4`a?Bq\X!.ko\0k6B4?aIal%rsfT"YPIS4"n?"LH<iq4aDoZS1+2c#!%I<oORlE.6243F,5XrA7$Vp<=5I%YtpMPkuDakPrW:M755EjC1M+ASW1"l%6ssh".hAcrM+K$.)r`aK*P&Hs.u$/ckS5US2A?-\M(ZV;>Fn=!Y?$@qsJM8hgJQ9F[&m!?Bq\X'Z@O/_1jZPJh0WIlPaMENKSCZH_qf=)`E/ta?<^&&/;eKN[POQ>c>>*@<Qcq<skZL!t1i)YtpMPkuDakPrZ,@m=)4N13gIaPe1(Bgc3F?fe]L"FH<.+3eU/;OU:9l)jAj1eZ7B0jUb:P*^U'nk0B7LJU1=RVTVD0$LIB(o)AMa*Y*&F=l%,@f3RrO7lp:h;as9gS_OV%PBlb1<mC&X1YMQ7Z;Q.M?N'DLPOF@X4;:2e@\4k)e#VPa.Wa&'e[fnk(@eW9s8Hp3(LD&9UR,/A3p9VHP#n-p/dS?.Tbjb2E/1mX<eeWghoetsFg_!8\hVa==/BRk&$!s#r7l.`*`fG.c&hEW(G1egFu0Agn(Uo=c'TZhS"`tF_h6I>QRnu,W>Ap+0*lZ6>P/?C=#Kk>pfEI%X5o!bF40@(o?U'<]Eiua1#T-.-6qJtTfI(.G1oN.lKY+4/`*/<nC_FrBr@t'B!tXgNRb)RL+TCgW3<iX5O9#P?a!)IFFI8oG32\=#jga2H_&2^^0Kf,.OsLu_#eO08A/mmJDDtTcmr4t6O1`Dr,Q`30k4J%!o:K3@7,[V(bXXTZYYa2Cd1qoBXD(l2cQ3/<j,7X5ml5p#+n>6f$6'lUmhZt>sFRC2D);h+q6STAmNibWJLoa!_K+fl3/2UYVS=VJ+Mu+BpgT4#ns,rG4")XqHRFhp?e]lW)6<M/i!6i]r"JcHp"^;CaL.d(qc=(QVd0f"!@6_597/@.grp/FPoFQ-+&Hk[Kh<ZWObTpod[MGb+)FWKf>msB7&pCcn]iARI''Q3&WZn4`cfmXRYX$=L#_*oT6]L\$1K[YD^6ieQ7a0Sj]d/M^g5G<=fIGJCn-7*ka$`dtNA96I:UC5Sul1`\4p@]Ls&$eZG>,T=j\`^pYF(5psRDI[ZIJUoPB8(M_b:GbMQoKNEpNmMNdcINqeCi3'cEM:qF.X06_nLu%gkBg5VlBSp+B1fTm,9!@mC$E:N.o^gaK:4o/&_:c/cq+m2[#j^;N_DJlWf4=p-!+I-6h@omP!WV_Y"`IN?:CP*<`.u@#A9nDKO*4\G2pT\?kZ'Dt?,HQ7f!)_^L]dn6@h4u8d7pi9\@8;-o?'k"lKf\AD;4odYK88Q^$"H$>rOH)/`G1DGh5_0eMt&PpsLK>qs%#:d;7WAZfKR;rBm5'b+%b!VCn!a[@b*Y1e"S\)QM"QV-!NdrV>Ws'[ojRr<li1<loMopq>5.dVU]'X/_ssN#`kAmA/umL!VA/'h#6Ik/uc^1EO5Ek,(eJ=3@$n19'-D]1fkFPi7&q%$9QZs+c](m.qMB*W<LO;^VmoE[;onAu'qSYrV;=B=>iUUHSFKA3ttm5!=55Q\.qs8;OBp(ii!qKaVIY^A]oF]ER6$H=l(;gJ?HbR]':Z$q1FFKFgHU+bUCn#U+j463n`f&4-XGKFgHU+bUCn#U+j463n`f&4-XGKFgHU+bUCn#U+j463n`f&4-XGKFgHU+bUCn#U+j463n`fq"Z!rjOlg~>endstream
|
||||
endobj
|
||||
6 0 obj
|
||||
<<
|
||||
/Contents 13 0 R /MediaBox [ 0 0 612 792 ] /Parent 11 0 R /Resources <<
|
||||
/Font 1 0 R /ProcSet [ /PDF /Text /ImageB /ImageC /ImageI ] /XObject <<
|
||||
/FormXob.67f2b803142796cfcc78829acfff7782 5 0 R
|
||||
>>
|
||||
>> /Rotate 0 /Trans <<
|
||||
|
||||
>>
|
||||
/Type /Page
|
||||
>>
|
||||
endobj
|
||||
7 0 obj
|
||||
<<
|
||||
/BitsPerComponent 8 /ColorSpace /DeviceRGB /Filter [ /ASCII85Decode /FlateDecode ] /Height 80 /Length 3671 /Subtype /Image
|
||||
/Type /XObject /Width 400
|
||||
>>
|
||||
stream
|
||||
Gb"/kH&NHV(WW/Z*isSg&d2e95Yqb:,+b_Ylbt"c&J%p\W#GlYi-cjh1$F4VW);.c'TXPH7"cs*'CeNo0aOLC"P*Zn'GO!upO2q-m[9Y1pX_cEoNV.hZ!I=.X/ihg[pN.eCb@(q63n`f&4-XGKFgHU+bUCn#U+j463n`f&4-XGKFgHU+bUCn#U+j463n`f&4-XGKFgHU+bUCn#U+j463n`f&4-XGKFgHU+bUCn#U+j463n`f&4-XGKFgHU+bUCn#U+j463n`f<&gLu-H0\W/S)K\UiU/d1e=((AE1\ViZgoP7G`;;^96"eA^VhA0L.[BPc_E\[V_jF2]4eaSpjj$D"&Bno1d#['rMp*ip0m[:kfFCp?cGGD5F+!`f-%F.hd'N;+G=82r*?QTUB\d3]4;&O$@@]/UdP:"a?Mun%X(hIdfY%c!Gar_>X+@1HlIml`F@TKbp&ZMDsE$keh8Gd3hgr.uuh/jX)6'!qj^&c9!\hln?+ERl7S6Q>-NjMlpa9'WJ6Y0"C)9EqnU6a<@k<:/9+6qo^@Z'\I'[FS"YZTL.@L2sJd]G1f=ahJl&2l`Ks-M:S_+:'iN)f]\_,l;^8p>qsj0Lbs?q^q550MlTodIT_d4`uhTtM2WG=S:0CRJ?g#U89K(O`^f6E_&QM!\8c8?YcYWG^A,Rga7Gib>7NblcZ\TL@>T?RFh4gP,RLuW%NY0c='mP/s6[2ZP"RWEO$.%PqO$8NHF;:(g19^%:Vd53pXdDSgjf-D?!)SSY:tXbbWl,ljiaKo6&,Tk^%ZEq<rWC2djpdFNmk>T*#!;VcpRKUM_Ah@JTSpQ_,km?"fI5ldt/$XrDiRJ>7IaG`llTEkqIRJqXuLseKJ0E@^B[c'G&YC.*Td\lQ;0M&l<>n.Lhop@hJHBr`p<Goua16/n;ob.>5F)Zdo(A@eK#h]C[X$;ni/uM_oq\m:G*7H0QjW]3@5ik9%GVKE;f&Uf!n]DI`Nb%2C3boPu^,\d9$\T7(8@A:HcW%-c/0@u<e?e^USp7r<*.WB9Qj*&d0_/#)>2Bsh8UF<RaQg/[=u(e[]i3HG8U$[j'W<'t#[TmqC\O=RK\k4gLjcBXSgP'657?9`D%/7'<t=0\%EfUaok*dBq#D;/+Kc<ku7gob>(_H9.A]-D0Nm^Ygqir3#\O:*\f:`5QV2)9W.6-PkGX:u*s.q82:[bLF*!YSl>fWgl`93^mI>>?W-=Q[qRY)bl6lGcHZFNr),2$%FM_ALH%]g?+ZMc<[[9]ZgI@@Xj1?$uZ`fQ@E=T_=^Y)Iq>Z]n3T+,ESq+[<(gAZ#oB@"U39sQJhH7qWUVC,rjBb5Bpf01/"POnP#E1HLYF]r-FX(;KH&I":;elcp?*VMmI:t9XJ+lKojSD4*?HTLsA.bG26.GQ$>[hlK'QDo^(hmQ-cTH%506+oa4EtR1#lVq>$DdUYh/>J)/5O7A3XUoj?\?S"2`7HXiK0'jc'n-K65=5Xbo07YG+,DnJ#k)B0'A/6f!>6\]:+"l=`SG$XA)CADn0K'_K4f/f=5]6+1l9?W_X6Ot?rDnk[F$>,)cO;]%-S-9:BXpp/7pgGNTNfPU?P,hXj.lFe))E5rE0u.QQXBgC'[=5fD-+Dd7D_BiCOsStaKInr&6L*#iQ7hibR@-51:cbp\1q]mqe13@abo2#F%iXN!ou3+QMh*ChlU('RV`>Se52@/A>k8Q@LYVs5!&,>nq4C`OI,m?KN"dj,lR]4^*`J4t3RN0'e>.R)(f4&I7-<086hRGl]?\CGX4ZLuPqD?mGbY492PJuG5NhO5Rl'E?jVGU6ID)(X#%WK)7j1.B[t1a9Ju#GK#qImB754Z:n*t7Ur]Z,U]e<VdfHLtQ-nqGDhosqd,=eUe.n.A!MBr':nf?;3P9TgoKmg#m"o/ECONol,Ita.<KBmQk4*.;]m50gE0nc>k*9ufGe;5djX]Ln4+n3J_qX-GkTR1n9@0\q1VH9&8FY9fC.nhd]SpA>*.D5WHRi4b(7#_j-
Wf>90+Nlk8XG?8Zml*$eFnI4mV<54R5pU/km!].$fd-Iknlq@5,Xe$Tq95^0d<lAV<+_t?GZbWe?PK(;2'+J=hKZqF"CV9=i:=Sc3pXBNm0iUoS9^uD((5S3VosJdC=XqK(Y2"k`E5Uq'nDYoh05K4psDTXB`"b1or?HP0("#t;7\'`7fWI:CgYu@0,G<mOUj\+!#:VG'\hb3LlHB]mfcAP9ZI;M,2&bnaX]6X/Y7e('"4F+\QPhZp1:%8f78Z'U.)Ue6KD?ha_fc%12pV.ZP#..XGC/#0BU7nKDib`:Hn"\?[&('o]Qm.c(BLBHqjocLXHf@LM`\>@eFKu9Kg=%YequpeAKtGp$Y/ZWq<GeX&l?&`O&_;(00M@OlMMsp8pnZ(i0jKoBoC=e\>Nh>cB:jR9h2CeD)rjW#<;*rq4m'&E21"B#_8-[n2C1%.R]OKZKFC,\A?;GZg/0Y7QnAmMs\'7ippJ^\Xso)'/EhB"`5cHhZ>EZWOn-3.t*;K+PqCj$o$J&L5tQQ=@Q((N`qd]u'*NP.O+%`e+d_QG%XgkgA[eF6Dhl!:9<aH2$USNm2LWq28%<VYR)jaXbV4YCJ4u\jDXW7Cc+bW^I:L/(3_5/$Gmk<L%t/D89:Y9U<>n!ZO%2OGm.G+*H6L3F-L($4-&[X?+S`,)XG+'c8f#F"j,#4M.6<MY4!T\h9(FHlkcV,>FdOF]l)Y>rspUrd+UDb:`D!)fGlVc8C*chooP,GPs!_V7E]#$=\U/qWTG4Pfm%09%<@9,->21E?O4?&pP2@H$`4$?gM>J.^gGA7I8>OOjhu9o]#%SpYD:!d-.[JU5C>G.uT">4k:L+m`uN(or>=//s',t<F).:*dk3lKC#=$qW5D':K;=$Vh$e>G-.OlmLurLLn0%0^[#WL$M5f>V7B:m$9_l<#pr?m_rNDl<j,-Cn?O7'?6LH#Ilm/to:\)&a/]7'o(b@./-_E+cQ^)a4*],55ON>nbM;@Ec#YLh)mC]GG;L?q"@ifc)?NL)=1E(%%]V"'[:(Jp]+fX=<H2:\81X=InR?-T/csEXCV7l>pXRL(KCof)C>7a(8CDp=I/^[HE.V#Z]?\'*R<,cj#3Qd'QlFa+LO?d-;J@`k]u&gL%D;=rZV;%XY.6PuMmCm6;Dc%f8>TCO-`\Q1J;CGJ(.F>s_rU",SE[+39$;uD4Q!l$]cFl9np^it-Z]/KiBJ2.rd<i1"5=SRESH6hk"I1b&.Z[f2i1jldA*7J@TMK#qXgf3]<7t,gRFY%(IWDRW[lrsp-a6$p3oYdYf(Bd^OH#r>$B]'[tKFhF0C=aRdt[f,Y&iJ]18[YX+V2TDbj8FlguYXiND`1&gV9j[X(r2L6iXSoZBYBjd4#Tfh\E%5A]:QeC^]5U+TaDlRI4g@n3(]?$0/`*ag3C]s<bYFJo\=W[`.=4S7639@Ps.ou\;sq=0D>YKFND8ubs#ku->#@_ZT/BZ#nhJEtc$fKAnu*.>2c7>0%$]DfZY`<r1%=hp-6L:goFqV(ALl`"4(F?bVarqV"-(gH8)Y?-O\1!d^"N#`kAb+5=sRHml8%4?f?63n`f&4-XGKFgHU+bUCn#U+j463n`f&4-XGKFgHU+bUCn#U+j463n`f&4-XGKFgHU+bUCn#U+j463n`f&4-XGKFgHU+bUEtn)Jn'&1>[~>endstream
|
||||
endobj
|
||||
8 0 obj
|
||||
<<
|
||||
/Contents 14 0 R /MediaBox [ 0 0 612 792 ] /Parent 11 0 R /Resources <<
|
||||
/Font 1 0 R /ProcSet [ /PDF /Text /ImageB /ImageC /ImageI ] /XObject <<
|
||||
/FormXob.7ce3f428fed09445afad362830e52447 7 0 R
|
||||
>>
|
||||
>> /Rotate 0 /Trans <<
|
||||
|
||||
>>
|
||||
/Type /Page
|
||||
>>
|
||||
endobj
|
||||
9 0 obj
|
||||
<<
|
||||
/PageMode /UseNone /Pages 11 0 R /Type /Catalog
|
||||
>>
|
||||
endobj
|
||||
10 0 obj
|
||||
<<
|
||||
/Author (anonymous) /CreationDate (D:20260126185515+01'00') /Creator (ReportLab PDF Library - www.reportlab.com) /Keywords () /ModDate (D:20260126185515+01'00') /Producer (ReportLab PDF Library - www.reportlab.com)
|
||||
/Subject (unspecified) /Title (untitled) /Trapped /False
|
||||
>>
|
||||
endobj
|
||||
11 0 obj
|
||||
<<
|
||||
/Count 3 /Kids [ 4 0 R 6 0 R 8 0 R ] /Type /Pages
|
||||
>>
|
||||
endobj
|
||||
12 0 obj
|
||||
<<
|
||||
/Filter [ /ASCII85Decode /FlateDecode ] /Length 302
|
||||
>>
|
||||
stream
|
||||
Gas2D0i,\@'SL]1MAmuWBq2]41U`B=+JA;@)ETUJW/a5S$%>'U)FQlACq\foqfM9dic+ZLN<Q`tbE@K*:^spj"Oo"1`W"9\N8S]7/.WgR/fq$*$ITZl?0A3Yd+#RVYd`S"!VHM:q3ue\ZE&.5ico>/#%%PKVtVn!b+n6KWeM,?U:f@u6(=k$)>9=A;GQ#t3m&eV#g&$:bL-jnalu?/Fi#S%7?Zn?-:G9#d\O:D4D7XQ`j*RVq8@Qm.FMjt9rX$+<uAFWrR=.*pU4ORU>6iZ0lp3O3um&1LmEd6.tN*K;n6j'~>endstream
|
||||
endobj
|
||||
13 0 obj
|
||||
<<
|
||||
/Filter [ /ASCII85Decode /FlateDecode ] /Length 281
|
||||
>>
|
||||
stream
|
||||
Gas2DYti1j&;GBn`BOD:C$`F9ZVnjI&kF'G*Fh#tN?'!3]KRM,ciKk&h=0aE:V*="Q_Ne&,OhA1lR406EhL)):sXAaA0[ug"g)PsmBSG*k#J$))")C&+kr+KmIL<Brl.":L6#Q;:T1n?*25E!Zk"i,4uuBV3G4oRN56iFD+G.*U'<hlkt*7N8pVC@\#B7T'\f?qTfO:fq24F=Moh9cYOO9_Ug3_JW1$`&3Et?9G$Rf%HgIe&37c9!:H9)*A"58?9%Ib;S.e4E4@\m25^i]720%7~>endstream
|
||||
endobj
|
||||
14 0 obj
|
||||
<<
|
||||
/Filter [ /ASCII85Decode /FlateDecode ] /Length 249
|
||||
>>
|
||||
stream
|
||||
Gas2B8INBh&;BTK(%8)Q2+a"?*aMq\4K.?![FW8"e$p+akF8n$UkfH'`dI4a"80L!>ZbC60Zk4LLGE6]&5Z#qMYu/6Ns)3ldF]OCoN(cR,K(-$>Bb@Hb$Fm@B;e+Uh$?f>L6HTg25p.\@EBp=GIr"0+>.bL"Ab!5e$0H>2u,XrGS3n+\I^LXNi]kl12d&'Y,la0?'!jr\BDiS++DQrec,bZT6(6I/"hnM&*R'u?RM762ns?o2j@QC[f~>endstream
|
||||
endobj
|
||||
xref
|
||||
0 15
|
||||
0000000000 65535 f
|
||||
0000000073 00000 n
|
||||
0000000104 00000 n
|
||||
0000000211 00000 n
|
||||
0000003775 00000 n
|
||||
0000004033 00000 n
|
||||
0000007969 00000 n
|
||||
0000008227 00000 n
|
||||
0000012088 00000 n
|
||||
0000012346 00000 n
|
||||
0000012415 00000 n
|
||||
0000012712 00000 n
|
||||
0000012784 00000 n
|
||||
0000013177 00000 n
|
||||
0000013549 00000 n
|
||||
trailer
|
||||
<<
|
||||
/ID
|
||||
[<8efaabb9b9953607755769fba673a5bf><8efaabb9b9953607755769fba673a5bf>]
|
||||
% ReportLab generated PDF document -- digest (http://www.reportlab.com)
|
||||
|
||||
/Info 10 0 R
|
||||
/Root 9 0 R
|
||||
/Size 15
|
||||
>>
|
||||
startxref
|
||||
13889
|
||||
%%EOF
|
||||
@@ -0,0 +1,88 @@
|
||||
%PDF-1.3
|
||||
%東京 ReportLab Generated PDF document http://www.reportlab.com
|
||||
1 0 obj
|
||||
<<
|
||||
/F1 2 0 R
|
||||
>>
|
||||
endobj
|
||||
2 0 obj
|
||||
<<
|
||||
/BaseFont /Helvetica /Encoding /WinAnsiEncoding /Name /F1 /Subtype /Type1 /Type /Font
|
||||
>>
|
||||
endobj
|
||||
3 0 obj
|
||||
<<
|
||||
/BitsPerComponent 8 /ColorSpace /DeviceRGB /Filter [ /ASCII85Decode /FlateDecode ] /Height 80 /Length 4030 /Subtype /Image
|
||||
/Type /XObject /Width 400
|
||||
>>
|
||||
stream
|
||||
Gb"/jGApR4(<3O%]`e\8(`3O5&gh'9UL4Y/2^+lN@RPf5J1sk+NQ;)EK[=iEKil-Qc;6BQ-:LGDM9+$J',f.nAHF$>+;)0QS%B_Sc9&LGT7Z-dn+T=!BA[iT=_mDCmsXW7DDr_l&4-XGKFgHU+bUCn#U+j463n`f&4-XGKFgHU+bUCn#U+j463n`f&4-XGKFgHU+bUCn#U+j463n`f&4-XGKFgHU+bUCn#U+j463n`f&4-XGKFgHU+bUCn#U+j4;C$M\fk4SCf9@^_*csp?0E:tA:NU]#jiWj*eoUTRh)0!!`5]fOL5)!H>oFG9DVR2p+lUH`IrqKY)BXD"&`V6FB2;HMc(7';GJ3Ri.fjIH*^/fZ0Or*2Il>Q?21r^]?[Q:^FZY!?_$=YY:S0iN>)NU7,O;iAF)l:J:7R.&b*4>RZXupbn&1%rQH_7Vb5!5MER63=/j;H?_mC;n]Y$@HQVU)1)MO]*WmC]s?Bm'Eo$F't5&H1H?C?\PZOT*=k"OUBF^Z7('_KU*cW%)S*WMHX>B\o<I2:$`SBCZ%`^?q2(GB+]eubDeR*F:Vn)#4P":#10VP[s<B,;6r>eYSG,9p^:L_3O7EcSHah6r%e]`O=YOgf8dp9lDfH=\S3c8r1HgU==+3,mg#Rl>?_pYUI1'87(cTWVY:DUqL#0^"?4&$]I&kNC0`5JLC0C)C?hEoh,WmelnPO$)t=.f&emDnSAh"(gjeWnZaC>tjK_q=<Y;2^p2tgSS*;Q.a5>kWnKp@"uqSN>jhKjj-*a*6R/cmlaT]DPqQi#i_sfOu^V];l<C(qWb+)+X,K]55F9'T7-DN5.u%#%NNom8J@WaU$Ys4EcZ<p'k6@=H1U1Nf\"4dJ%Sa[;MZc\a,_<liPGacdte(N4#Ld&J7G4#qWW.gemV!7e*Ykse!lmlI6&MpTjGEYW""=AdA+bUmFtWqp@60F94#=0o#op<o8V#IIK09?Vu_>9H0E/'"u"N,<U8_fPGUCUB[J#'4)`ugV+[/l@hgImB]$Q&++O4II3)@-X=:iG5aNlrihrDt1>!G@S%i^qIJ7$3^\6AsEbQnB1qh&Ubj<rb+7oR9`51Wc:HuhG`mCl"YPN?sfuR?9TKbo,*Xsp8_GH7iUV'<j2Q"^R:?R!:`1LALo[6Bt.TJfM[:mr31c/1%Y[kk=hS"9r+/DM.<0XIVdF$A<$KL.*`4/*c#08_`F9A=5:/6g][Wq=O\\1b/3Y3fE$[VL2A^Dsg+d,P8$hWM:-_?FRhKnLi6AL;36.CAZjVR[)*:k&[VG3Q>UVt*h^pT^rHWE2?D;2KcRo]!jbUdu=J*Y^iO7NMpK&+/M?E#p8P[:&KcCI&WT>lj0hn!r'Ddu;@XC[FI)\EY_V0_d]7q%$Vah7c\%+%Y5*O#<]LtTjQE1fFkLQENDq6=GM6pd`rMIpbfSH&W.T3_Ol$<b!Gm_&PqlR5%C@JK*Ol!h2Imp7Ot.+d%:%3%3]MgkH[#Hbj+HhPP+H1'Iu;KCj>&_r2rYl%'!QJ.^n(hm(#X5AF,*LSQi,0+l+X_QCd-sXH3[JDTElP16rE1kDGP]cKR_8u#V]Y"6aMOg*%"icN@-gSBDCX=S#a-tPZm-JQ<M>nqsR%UpnUK?#%8%U]B4C%d;CZj!6'e<<Q+hZci=8cPWZ5+GD%id30L5h:gr7\PodKJi:2tP_K@\Qq+EV)a?Ul]h5_1DjI[kCso9J35<SVlOcqg71^,=fU%5!E:*Yls*-m+ARh)lulg3U$-Nba:,pm+SkJTsi93ruC,0)`CY;VB`@`"Ms*Z\d)j&;boQ1M6h3^7dju]MOg*%KdG1:BYpEDMN0Qp=1H1W:%UjRoYunt=j%eq(Yulu6#ZJULEF)i>=uH5kuE5#MQ?sdq?&m6=[kl8Tc=t$9jhO62tP_K@\QrVip+eZoCIBVVI.)e.%EMOIc+6PiK4ajcW;'k,f-)WZWXVHl1GC1G[.CW]@LAEm#oop\)2X5*2\@n4)j*X1-$m:bbYF=S8$HL
kmmrTSX5auF,e"0nU1VTA)3IC$D8-]"Dr>:d49"#,PSWbhql]_\g6^db0"bZp1dtL,AY,Hrd`/-m-ruOhB-14LQ;od4K*.0TNLGYVbWfTAdFC7kkt8JqJt7[c^Ql>:b?jjnDUs$l_[IMNbHS?"ibH+Q6-&jp=Nm3cV`0>dPSYcp:A>VG$`7Io@6oL.1Yru`9tj;1MbRC8OuD!T$h`FdRA1Y^%4"c]QM>g?3P;LgT"R'VH'Wq6-8?<USYnh?<PGk\JN:FmgdODqo^Y-[-q!`!^tV."8sD%$a=W62-nVR5dA`f_`,bBeeu3ao@Bu0gUBEIr:BIZ;o%Z0&e^p5.nho"?_Kdin'6=Th05;oSNV>N<>b"^YjQ;nmbG@*@ga"'Fml$&HKSjO@C'C@dp'!_Ffa>t?@e@l=1ULa\_XlA]C"jJ[EOb[EU<Ad0TKc?#SM"3X(Lm^XP:Ai-*%LAh7FIFeZ)VBqd3-*?CmNmaQd@AW)n[b?#",SQm'MSet=LGn*8H(EmU"ap$8frb$7:h(cmkPT'j1f=')P0OfDf-J!fq>LM]CT:sc(6S,=-4)`A+!]_LKEDDRhbe129S\o$XGLlIB_@GSM;0aiBoeY#3\$rnT",rr#-L8WThrVH3Wd>f5/m!I81VBY=an%\pL-#\@;=L#`iU,5`YFD7-fMIo%:V-`E0oiVNt"U>:mo'NpD2RN%p)fKE=Wh?#X9URXg?^k0Ij3g**Ork^G>.)NP0^Zo`HhZs,@E=NRrX<D_QiVi,Ql*A5m(A3^.6?$s:Tscqo1s(3kg6"-]oontmG$5h'tiM,?M3U6bKrXpDQZ+8OY)5\YPQ1RA1]df+(N?OL"VhJ@gqIkI.E-;o/3Qt1BZ.-^fcFk]\"'SkiU-Zp$1)VkDu]4'.6Q)Rpf>e7Rl\9C@L/t\;Z<&1+XN&%j.rRWDZ,P`5RWN'o-KfFuX0F4HJ=HdaGcm]mfpk]J#M4P%(H_.XIrZ=?Cg4eurF6+-eE^<^5E+044/<]H!b,^3/b-4Pk'SYL'H2BJX;H*1(;>.@2s+l4^Ld[GX<"m,,S8jnVW1p25U-)leT"(Rd*85eRMpFgl;HQ5B?dNZ>%sGi@`*P8u]+OF+6o8`@u[s,9A[)598-5To.NR<lP-If-_B`3<WT]66n!$kEk=@IN'deV@j'G*jECo*=n-CGGPSQS2^c0JU>Hgri>q[;4C6)9l.D<V/o>Yr;9tI4q7F2Vk[EZD9m8%k8qS#MUgZFAT/+X&c>ZakEt-!tIC@gq7p=,@:&"fuR?9TKhl$^"^,@C[&CBoS#=R9q$`uOH:#JSJ9<W:p15N\sY?eMAc-TfVUQCf[/aU7Q<+W&cX!Gg5QIU/.fucFm:(bne,@%k0<G*A&jUu)&=dF=S^2O(/T;7N!1"hV*7RChA=/aUheSb/jG#CKc/_f;X(jRo.*8Mg=Ih\Oh@5uQu:VmJml*(fb1#VW`5t>P:&Gm=.MEs`m+]EUC$a"&A7V[4&1(O+/U5tc%5l8bfgduMA7WcZLS/+L8KGT=PZX]or?B?!uj1:N/iq<5oqK2W)9>[j2YeFBE.YV?a;6JT9Q4NVaHp2:S]C*Y[u"D`JYPMk(OUXco6M[L(>@Y58HVo85]"55<n&T0HJOk!N-,QmPlFjWD]R'acb;QGO![lac[s)?XtR.?N'ae(!#%[/$O0^<q#:-:!!5#^Yc+q1Gi1@C=S]=(`^A8mFp['?G60sRthIolIN'VAu5FhcfZf/QG"1R`Q25(?i\KC4#^p(-nO-j%L*Xa(C,)gChDno6`*pRMWEi/GSmFTTK>IG+o_]R.!G(9muq-BFa8ETq]Ipg#U)AK2f>//o4/S?>UdM"B);/a.)]9Kd\TSI[X3Z=U2f//"o0>`-h5:!aHeD^"pG1@4>4BtrUnbQ\p&f=niu3sje\cKZtRhg)ldr?b(YV+-RL1_Dd!GjKFgHU+bUCn#U+j463n`f&4-XGKFgHU+bUCn#U+j463n`f&4-XGKFgHU+bUCn#U+j463n`f
&4-XGKFgHU+bUCn#U/qprr=Mu.$a~>endstream
|
||||
endobj
|
||||
4 0 obj
|
||||
<<
|
||||
/BitsPerComponent 8 /ColorSpace /DeviceRGB /Filter [ /ASCII85Decode /FlateDecode ] /Height 80 /Length 4649 /Subtype /Image
|
||||
/Type /XObject /Width 400
|
||||
>>
|
||||
stream
|
||||
Gb"/jGB=Qg)obtD(hlkN"f0(5:+-aT<2CgqGS',YK-J92</AC'U_i9g+@S\\U&raT$(ug!$4%UW-.5X!OX),AnUAfpn!Ei.ZhN>CmL:6LTAL_NG:0`'1[i!JbWE/;*Y0EI&4-XGKFgHU+bUCn#U+j463n`f&4-XGKFgHU+bUCn#U+j463n`f&4-XGKFgHU+bUCn#U+j463n`f&4-XGKFgHU+bUCn#U+j463n`f&4-XGKFgHU+bUCn#U+j4Eeer2aX7P8Gl-m;mrV!9$er./Dr&!ITgFGgA]g5nB?j\gC<`5,n"5+/EDb62SNDgA(+`SG0AE<rQgcQJG2U/e+R8qjL(>AddU%7a,tE0=*^(Ec[;Xqd5T\EOW]FtK0RnAjQS4C.$PtF;o^lOY2JjA(q!>?5rnj<G\:)&,dhUp&iW]f.:o[KoXBDknr:%Vf^;G^:].Qgi,t!Cq7>hqh;qnpMFKAOW-;:r<^AH9g>e+lD6ps06kbI-F(^N'<gi-Gcfoqm`D<`e/XBDl/[X1QKi:_Nlme*"2?`R7BfZQ0Yn][CW7>_eACt5OcbEjk(IV6iiD9J4s/kRp`OHAta@uhcDp$1sUc^m;JVdmc-<LpXGp$)c(Hk;,Z7Z;:iTQi5(W(^OWj5YQ"X&GpV4SR^\.hEaCrqG<">P%b#odXg*ftJs\9L;@@2JoU%\UmIb7]_:X"BnAg8PVo7"#l=q;QoL`\om<c?*C)#T0=:[KaS]?>+g+\Q7Q1-1hhO`F6CiVjuIe^m/?\904&j@l.#kH4Fk1D;,Pn,s$FCkgKq>QM?oqqM4!SS5Q<QQX'N=qdO.h^m&2`sM\[n]^V[k)d\U9,\%pa.@q.TDm$L"eIRL!nb*BjG)?7Bqo-V?%d\TU3:EfTg^\mXEJ,E_-&<#c6bEkHfgiJN=njqoeR5#87\R3+#(G8t>*fUYEeuW#*!X5lBcX+"oeKmkS@.(h*0mT1nrUS,bYIsE5U03`ScpJ<e8mf>^IB)[\rqY`>H2da;>5Fp[LW%H??G5X>9I<PAY[@K\1i1gkRchBYhS[+bJ,aqhT)A3+5BlPN)'4Kg>8jeZbo1@@Rl5+ugph@\]QnSZaRRJ1c_/-=odWtJ379>AFMnp+G4!`KBR8dD=K*Pa.$qbpOj^:tQl,H?<^#prXH0"q[r1$MSsu`#-;Bq^d`.=is1np^^aDK96L*.(Hg9*0T*>V9Q`\n^`L]5>i\sQ)AOEM\?DBt!8#7Y0SiiDsB23^-W`?+JZ!P-=ienX,kdX6M.]EUBN#=ETZtP"4E0#kk.nX&MZXupQJK6dnON]"CP^nh:H5s_#io8rs2G=?r4"L]CJn3h!L8UnMn3&`J&s32P.9WsPPW!4%+F!3AJ[djYeuWV-Z>A4"8En[*=5Y$_-RU/bi:`*I1]ICNmogd^&>KHo<_nI&aUW0Z4F*r.YHH(N0/])H@AVt,Qj$t>c%"]+(Grh2@hqR\Kp>)Z"qC's<2ica_Zur<m^u*Y/Q8MTPR<N]nt9$(DsPuVc&uYY%\d#Y-N4c2<Xd/X\=C<^SfiC5&Xk4B%BU5u_1MuTN[G%fFL=0<a#bps%YP9MDr1EZ\)4&m]`M"L$&W*k$l5Y3iR$k4ldeYQkic^(a@KBro!2iME/?\=G3i$/_H*tt(cQ?&U`e+$N@567Stob?D:u4k4BLb^V?V:WLl&4sV)/U+,U1BRd9b&3,)e"jWF"OBU.>-Q2/AL<_pP5LNT<5uc'*A?hCXJ.kFHh"?b\4MT7?jNW:Z<';^CI[++A`7OE8^;3KaGt`tU2'.D<$$(8lJ$qXeKD*.IYNhqsqO(qjtQ7I&aVcqsCYCu`NpZ:0D]`fV90Y<_!ZI:XrsMu4G<QsOrnk)*8Si:<@U^<s54-76lfcp@@u'Ae03>pR/S`Z+]D^@_i>hXTXH<?g3fN.t`mHFrIk+[^u,Nj?A=n'Rm8Z?>Qg<A$#ni\E8Ed[UQk3^PU.?M3aB)jcO&2:>*\=T.d1+*ZF@"?A5D+<XGJ21+oBV+^@U
l)1.3B,E[O-k`eb[<f-3'8ScY12"q)NH?`MY$JYo98TG?o]]l2K>E8\PZb2+R`274i=_&+Z-TjqgJjbPoZE^@ah=VW647kCR58IohRMC(*CR(BT4n*g4pe*Q*FX(Ze.='Up?^1I6=]+CR-!_%#,)!P&(,gr)C9gt'd@U<[_Mh<bGWalRRZ:i#nmA)7@_#-gU80l>7[.BXW9QFj@HU`+`>?^il-h`Cm[n-7Qu"^R/N<pp-\T5XtaG+fY:'fp1.8-b7N1^\)2X5)8a8-&-1WqRO?!W,`XY#S-!.OM%I+5h25bRQ9Y.]eP[9P9!<'"`J%WLB$Hd$-E,<4N*a'd,.Y1#h7D;b:aKfuJfOZ2&A>q)/q=rVY'\h65$\b8K9Z?3pKR74:;W&Vrb0'RUnf8X,*n!Vo@(0TeZW?;S5&\%q=Edol##.]0ta!D>-S?B@9uVpR#`p(A3D=o2%[df2_8e$4'f1)NRB<l#/epT=HLX<p$1'cRu'Req!dUQ^M`WY%C7F7G4"#B&i77,rq+Z8\EqkQUW6mqn`1?2:<99X*Ib9nLECu$5p=q2dRJff._W-+(3b(Yrlpuq[pdum$V%>TH'-m?jdWZp&qR5i[E?3(7'GO%!UQIuh95N^kDD#a!lS/(+69W4/mZ%*VQC%uHIj\7D6iFIm5:M9YL]ma?a!d!FZRM2L8hJQ&\WdViAY@$CLtG[U/u!RSi'E`c5,!URlB=%P(1u[;9l7Q6M'9A^A>u+KnAe0>frXsk/mMom5)EOm'BeFBCOfX;l@XR`#.>7PVnNg$Ar0C2iBc2!hXl2M:?(bVG3Z?oZE^@ah:&r%'`jC6A5c$G<'9m%\d#1i<%XtiOYApFZ#ngUWnJtG/]%:$X.K=GW,cd(,X]VC#UfU)`BPA]i2*s(/L,')$%Fg"HB/6o9V+;QA(poeWD(H%'PK'i*'^M$-GEjj5ZsajEKE@7)e`B&V9IbT7,k5*08(&Z-#ERJlX!nJ6-,qOtU0+)>0FG+$Y3Z0!U5:(0dnE2>ldh:HuO;nY0Pm-Nb)*J,HR6XB5,?i``NJa]m"YMA3m7nYoRq4LCiWU8$(:YI&.rTieR/pt(60)sl<&(qm64bGjc,Wid`T\j#uS,;!3t#ZWX@Qb]H6m+0<mq"pT?/gh,$$MM]52_Qf@HL!0M.BgGYRaS8&f<<*5L9LAQhWCm'9YO$4=E`:QN3t-8WYjUQ_:E*b:=22WPC.B]jq;$$D\t?-7XMIQbD+4-gUCs0@Q;HpGa*a;+>:1;qsHNtRn/+a^TqR>@.Xea;s?o];:@&[UK4L#Bgp,F,RsE=He%6J^-n8]*f2^ig*%<Ho%<Bl^t<YGaN-n_khWk[Q9Js,*5hZBeUD3LcIOkYKViFO>WUU9]mE:;]ttcMZNm[=\K]o`.A)aeHbb.4k%p-knF1D'??PP_$'uAW<n&=urVQ?PaH<5kR52PVqQ%ASi>.YfGi'1*4F,A`4oAd^jGb*;,,KJMg7lSZNi\i-]QnS9fD@keR"@(a"19:*>e>/RJdS>U2U)kn??q^kNZ$^Bf?AOeYG1M#F69N)YKHC81t4$<='G]b*BVjAL@1)g&>WXcmq%"$FN(@d[u(<&)jh7^9q42j;/'(@*qZ9#1_hV7kg;bg1;[eiWMc>NH^cj+,)L?e-tC8Ul?"$BW,("fP"k2kTgOS\'#P]UR$afb6UO5'fV1fm!:>pa-mFu0fN>Vk8V,EUiAp`*k=7lnd6j#G?@gXj^]4:[lhRS7^\h!<=Ohc'UIUA;HcM*b-SH\Up<+TaZX2<A99=J]8a]hLl'35R1/-l(io8r/DFn:Ul4p7&\[%C"A]pCUpdeZ(I(:I`"K>JrHeBK!>nJAaY?kLL/gnU,#h^q""dFhRG&6H/a5UeX7Z<FFkZgN[Qi[]b[qUYmnY9pRZKapTW57tpDaub.C],WPGf!!siWZ[$Gc?(sK1T!bnhJ6m\oc&$UX6PlA0M$=gp2k0U'g^N_FXLaSPN%91<W='3St=UOZ_!6o;:`G7>p6g
FeM-UOA/J@Q7cIs89qr*N`gtc.gV7W@Pgu35H&1[il-gG6ps9s12"o1k*p:dX^3kuci>4)9#`+:[3-;KGd.!54*Cm-Y<5R+fnq"UN/<B'K;7<YI,ljoR\iis3%@XTHKF\iIdi7K^8P2@-It.qj05blIf9,65(+?-)OUrW];+]CXb/IHrbtO&cAE>ejI8lFT$382/`"$_QOh#2/DLjqoXT1J5gUR^:Q73Z/%*Q8G$9Be]Pl[k/.(FEb(9d)@N&6Z]ZhSo`8H7)_IDWLQ('dTVL5SII6X+!=b>6UY]Ahta^E[MKH*pf9IX>_4DG/tD:u3@Q=+'LrO%cbH8T["^qG*h2K%:eK3(85o6G^0<BC>e=,qT0_iZI$F6Ci^qWb,K`6fP]$4[.MF'Y6B(&,:Gh,TCP2$t[aqq^Lo&44Hf_5)pDh0RQRF/\'r*qi?.M@`+%+QqN0=0@MG=&Q81PV9AI^:8FXigm1m+bV6r>dtn(*3_sE%hF_WLlbEq6:+#iY$HCPCI\XR.7d!#(c,btV+R!a:h@tE4Z#"&=0Gs$*@i:d&4-XGKFgHU+bUCn#U+j463n`f&4-XGKFgHU+bUCn#U+j463n`f&4-XGKFgHU+bUCn#U+j463n`f&4-XGKFgHU+lr@e*ukCHmf~>endstream
|
||||
endobj
|
||||
5 0 obj
|
||||
<<
|
||||
/Contents 9 0 R /MediaBox [ 0 0 612 792 ] /Parent 8 0 R /Resources <<
|
||||
/Font 1 0 R /ProcSet [ /PDF /Text /ImageB /ImageC /ImageI ] /XObject <<
|
||||
/FormXob.41b05a9cf8679f0fe6e7c30c9462b767 3 0 R /FormXob.94284ebb61fac7951963d5746d1b193a 4 0 R
|
||||
>>
|
||||
>> /Rotate 0 /Trans <<
|
||||
|
||||
>>
|
||||
/Type /Page
|
||||
>>
|
||||
endobj
|
||||
6 0 obj
|
||||
<<
|
||||
/PageMode /UseNone /Pages 8 0 R /Type /Catalog
|
||||
>>
|
||||
endobj
|
||||
7 0 obj
|
||||
<<
|
||||
/Author (anonymous) /CreationDate (D:20260126172022+01'00') /Creator (ReportLab PDF Library - www.reportlab.com) /Keywords () /ModDate (D:20260126172022+01'00') /Producer (ReportLab PDF Library - www.reportlab.com)
|
||||
/Subject (unspecified) /Title (untitled) /Trapped /False
|
||||
>>
|
||||
endobj
|
||||
8 0 obj
|
||||
<<
|
||||
/Count 1 /Kids [ 5 0 R ] /Type /Pages
|
||||
>>
|
||||
endobj
|
||||
9 0 obj
|
||||
<<
|
||||
/Filter [ /ASCII85Decode /FlateDecode ] /Length 300
|
||||
>>
|
||||
stream
|
||||
Gas3-b=]`-&-h'@TAj4XMk6`hV:j"r+Qu/5_JPH2[*jl@3?0-u[(9'GR:-/*/ft>A_=nj;i7d2;EpsDoJr<OBhVlHiq4E/El7+06*H?(h_eGnqiS:>Dgn0>N^CGqOd65m'$2XdN[8"CN<R^<p;O;.QTL>"4'o-s=`lHc!JpSi8$*d@]6l&@V%Q+V`W6/nPEL_rB?OF1iZbk.;Ju<];RLo@-9lO$dQ,9&`I`%EM@\dBr0Lf$$+R^&+/ncK?;0=7o:`];ceF"uKA7ETdrT"0YNT=QC"`>/@%I83@M@]K&@Nk~>endstream
|
||||
endobj
|
||||
xref
|
||||
0 10
|
||||
0000000000 65535 f
|
||||
0000000073 00000 n
|
||||
0000000104 00000 n
|
||||
0000000211 00000 n
|
||||
0000004431 00000 n
|
||||
0000009270 00000 n
|
||||
0000009574 00000 n
|
||||
0000009642 00000 n
|
||||
0000009938 00000 n
|
||||
0000009997 00000 n
|
||||
trailer
|
||||
<<
|
||||
/ID
|
||||
[<60f7c7338a7d1cfd54f86e6a06e41602><60f7c7338a7d1cfd54f86e6a06e41602>]
|
||||
% ReportLab generated PDF document -- digest (http://www.reportlab.com)
|
||||
|
||||
/Info 7 0 R
|
||||
/Root 6 0 R
|
||||
/Size 10
|
||||
>>
|
||||
startxref
|
||||
10387
|
||||
%%EOF
|
||||
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
@@ -0,0 +1,223 @@
|
||||
"""
|
||||
Unit tests for DocxConverterWithOCR.
|
||||
|
||||
For each DOCX test file: convert with a mock OCR service then compare the
|
||||
full output string against the expected snapshot.
|
||||
|
||||
OCR block format used by the converter:
|
||||
*[Image OCR]
|
||||
MOCK_OCR_TEXT_12345
|
||||
[End OCR]*
|
||||
"""
|
||||
|
||||
import sys
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
import pytest
|
||||
|
||||
sys.path.insert(0, str(Path(__file__).parent.parent / "src"))
|
||||
|
||||
from markitdown_ocr._ocr_service import OCRResult # noqa: E402
|
||||
from markitdown_ocr._docx_converter_with_ocr import ( # noqa: E402
|
||||
DocxConverterWithOCR,
|
||||
)
|
||||
from markitdown import StreamInfo # noqa: E402
|
||||
|
||||
TEST_DATA_DIR = Path(__file__).parent / "ocr_test_data"
|
||||
|
||||
_MOCK_TEXT = "MOCK_OCR_TEXT_12345"
|
||||
|
||||
|
||||
class MockOCRService:
    """Stand-in OCR backend that answers every request with ``_MOCK_TEXT``."""

    def extract_text(
        self,  # noqa: ANN101
        image_stream: Any,
        **kwargs: Any,
    ) -> OCRResult:
        """Return the canned result; ``image_stream`` and ``kwargs`` are unused."""
        canned = OCRResult(text=_MOCK_TEXT, backend_used="mock")
        return canned
|
||||
|
||||
|
||||
@pytest.fixture(scope="module")
def svc() -> MockOCRService:
    """Provide a module-scoped ``MockOCRService`` instance to the tests."""
    service = MockOCRService()
    return service
|
||||
|
||||
|
||||
def _convert(filename: str, ocr_service: MockOCRService) -> str:
    """Convert a DOCX fixture from ``TEST_DATA_DIR`` and return its text content.

    Args:
        filename: Name of the fixture file inside ``TEST_DATA_DIR``.
        ocr_service: OCR service forwarded to the converter.

    The calling test is skipped through ``pytest.skip`` when the fixture
    file does not exist.
    """
    path = TEST_DATA_DIR / filename
    if not path.exists():
        pytest.skip(f"Test file not found: {path}")
    converter = DocxConverterWithOCR()
    with open(path, "rb") as stream:
        result = converter.convert(
            stream,
            StreamInfo(extension=".docx"),
            ocr_service=ocr_service,
        )
        return result.text_content
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# docx_image_start.docx
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def test_docx_image_start(svc: MockOCRService) -> None:
    """Compare the full output for ``docx_image_start.docx`` with its snapshot."""
    actual = _convert("docx_image_start.docx", svc)
    snapshot = (
        "Document with Image at Start\n\n"
        "*[Image OCR]\nMOCK_OCR_TEXT_12345\n[End OCR]*\n\n"
        "This is the main content after the header image.\n\n"
        "More text content here."
    )
    assert actual == snapshot
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# docx_image_middle.docx
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def test_docx_image_middle(svc: MockOCRService) -> None:
    """Compare the full output for ``docx_image_middle.docx`` with its snapshot."""
    actual = _convert("docx_image_middle.docx", svc)
    snapshot = (
        "# Introduction\n\n"
        "This is the introduction section.\n\n"
        "We will see an image below.\n\n"
        "*[Image OCR]\nMOCK_OCR_TEXT_12345\n[End OCR]*\n\n"
        "# Analysis\n\n"
        "This section comes after the image."
    )
    assert actual == snapshot
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# docx_image_end.docx
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def test_docx_image_end(svc: MockOCRService) -> None:
    """Compare the full output for ``docx_image_end.docx`` with its snapshot."""
    actual = _convert("docx_image_end.docx", svc)
    snapshot = (
        "Report\n\n"
        "Main findings of the report.\n\n"
        "Details and analysis.\n\n"
        "Recommendations.\n\n"
        "*[Image OCR]\nMOCK_OCR_TEXT_12345\n[End OCR]*"
    )
    assert actual == snapshot
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# docx_multiple_images.docx
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def test_docx_multiple_images(svc: MockOCRService) -> None:
    """Compare the output for ``docx_multiple_images.docx`` (two OCR blocks)."""
    actual = _convert("docx_multiple_images.docx", svc)
    snapshot = (
        "Multi-Image Document\n\n"
        "First section\n\n"
        "*[Image OCR]\nMOCK_OCR_TEXT_12345\n[End OCR]*\n\n"
        "Second section with another image\n\n"
        "*[Image OCR]\nMOCK_OCR_TEXT_12345\n[End OCR]*\n\n"
        "Conclusion"
    )
    assert actual == snapshot
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# docx_multipage.docx
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def test_docx_multipage(svc: MockOCRService) -> None:
    """Compare the output for ``docx_multipage.docx`` (three headed sections)."""
    actual = _convert("docx_multipage.docx", svc)
    snapshot = (
        # Page 1: image between two text runs.
        "# Page 1 - Mixed Content\n\n"
        "This is the first paragraph on page 1.\n\n"
        "BEFORE IMAGE: Important content appears here.\n\n"
        "*[Image OCR]\nMOCK_OCR_TEXT_12345\n[End OCR]*\n\n"
        "AFTER IMAGE: This content follows the image.\n\n"
        "More text on page 1.\n\n"
        # Page 2: image after the text.
        "# Page 2 - Image at End\n\n"
        "Content on page 2.\n\n"
        "Multiple paragraphs of text.\n\n"
        "Building up to the image...\n\n"
        "Final paragraph before image.\n\n"
        "*[Image OCR]\nMOCK_OCR_TEXT_12345\n[End OCR]*\n\n"
        # Page 3: image directly under the heading.
        "# Page 3 - Image at Start\n\n"
        "*[Image OCR]\nMOCK_OCR_TEXT_12345\n[End OCR]*\n\n"
        "Content that follows the header image.\n\n"
        "AFTER IMAGE: This text is after the image."
    )
    assert actual == snapshot
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# docx_complex_layout.docx
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def test_docx_complex_layout(svc: MockOCRService) -> None:
    """Compare the output for ``docx_complex_layout.docx`` (table plus image)."""
    actual = _convert("docx_complex_layout.docx", svc)
    snapshot = (
        "Complex Document\n\n"
        "| | |\n"
        "| --- | --- |\n"
        "| Feature | Status |\n"
        "| Authentication | Active |\n"
        "| Encryption | Enabled |\n\n"
        "Security notice:\n\n"
        "*[Image OCR]\nMOCK_OCR_TEXT_12345\n[End OCR]*"
    )
    assert actual == snapshot
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# _inject_placeholders — internal unit tests (no file I/O)
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def test_inject_placeholders_single_image() -> None:
    """One ``<img>`` tag is swapped for the ``MARKITDOWNOCRBLOCK0`` token."""
    source_html = "<p>Before</p><img src='x.png'/><p>After</p>"
    ocr_by_rid = {"rId1": "TEXT"}
    result_html, texts = DocxConverterWithOCR()._inject_placeholders(
        source_html, ocr_by_rid
    )
    assert "<img" not in result_html
    assert "MARKITDOWNOCRBLOCK0" in result_html
    assert texts == ["TEXT"]
|
||||
|
||||
|
||||
def test_inject_placeholders_two_images_sequential_tokens() -> None:
    """Two images yield tokens 0 and 1, appearing in that order in the HTML."""
    source_html = "<img src='a.png'/><p>Mid</p><img src='b.png'/>"
    ocr_by_rid = {"rId1": "FIRST", "rId2": "SECOND"}
    result_html, texts = DocxConverterWithOCR()._inject_placeholders(
        source_html, ocr_by_rid
    )
    assert "MARKITDOWNOCRBLOCK0" in result_html
    assert "MARKITDOWNOCRBLOCK1" in result_html
    first_pos = result_html.index("MARKITDOWNOCRBLOCK0")
    second_pos = result_html.index("MARKITDOWNOCRBLOCK1")
    assert first_pos < second_pos
    assert len(texts) == 2
|
||||
|
||||
|
||||
def test_inject_placeholders_no_img_tag_appends_at_end() -> None:
    """OCR text with no matching ``<img>`` tag still gets a placeholder token."""
    source_html = "<p>No images</p>"
    result_html, texts = DocxConverterWithOCR()._inject_placeholders(
        source_html, {"rId1": "ORPHAN"}
    )
    # NOTE(review): only presence is checked here, not the token's position;
    # confirm whether "at end" should be asserted as well.
    assert "MARKITDOWNOCRBLOCK0" in result_html
    assert texts == ["ORPHAN"]
|
||||
|
||||
|
||||
def test_inject_placeholders_empty_map_leaves_html_unchanged() -> None:
    """An empty OCR map returns the HTML untouched and an empty text list."""
    source_html = "<p>Content</p><img src='pic.jpg'/>"
    result_html, texts = DocxConverterWithOCR()._inject_placeholders(
        source_html, {}
    )
    assert result_html == source_html
    assert texts == []
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# No OCR service — no OCR tags emitted
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def test_docx_no_ocr_service_no_tags() -> None:
    """Converting without an OCR service emits neither OCR delimiter."""
    path = TEST_DATA_DIR / "docx_image_middle.docx"
    if not path.exists():
        pytest.skip(f"Test file not found: {path}")
    converter = DocxConverterWithOCR()
    with open(path, "rb") as stream:
        result = converter.convert(stream, StreamInfo(extension=".docx"))
        markdown = result.text_content
    assert "*[Image OCR]" not in markdown
    assert "[End OCR]*" not in markdown
|
||||
@@ -0,0 +1,234 @@
|
||||
"""
|
||||
Unit tests for PdfConverterWithOCR.
|
||||
|
||||
For each PDF test file: convert with a mock OCR service then compare the
|
||||
full output string against the expected snapshot.
|
||||
|
||||
OCR block format used by the converter:
|
||||
*[Image OCR]
|
||||
MOCK_OCR_TEXT_12345
|
||||
[End OCR]*
|
||||
"""
|
||||
|
||||
import io
|
||||
import sys
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
from unittest.mock import MagicMock, patch
|
||||
|
||||
import pytest
|
||||
|
||||
sys.path.insert(0, str(Path(__file__).parent.parent / "src"))
|
||||
|
||||
from markitdown_ocr._ocr_service import OCRResult # noqa: E402
|
||||
from markitdown_ocr._pdf_converter_with_ocr import ( # noqa: E402
|
||||
PdfConverterWithOCR,
|
||||
)
|
||||
from markitdown import StreamInfo # noqa: E402
|
||||
|
||||
TEST_DATA_DIR = Path(__file__).parent / "ocr_test_data"
|
||||
|
||||
_MOCK_TEXT = "MOCK_OCR_TEXT_12345"
|
||||
_OCR_BLOCK = f"*[Image OCR]\n{_MOCK_TEXT}\n[End OCR]*"
|
||||
_PAGE_1_SCANNED = f"## Page 1\n\n\n\n\n{_OCR_BLOCK}"
|
||||
|
||||
|
||||
class MockOCRService:
    """Stand-in OCR backend that answers every request with ``_MOCK_TEXT``."""

    def extract_text(
        self,  # noqa: ANN101
        image_stream: Any,
        **kwargs: Any,
    ) -> OCRResult:
        """Return the canned result; ``image_stream`` and ``kwargs`` are unused."""
        canned = OCRResult(text=_MOCK_TEXT, backend_used="mock")
        return canned
|
||||
|
||||
|
||||
@pytest.fixture(scope="module")
def svc() -> MockOCRService:
    """Provide a module-scoped ``MockOCRService`` instance to the tests."""
    service = MockOCRService()
    return service
|
||||
|
||||
|
||||
def _convert(filename: str, ocr_service: MockOCRService) -> str:
    """Convert a PDF fixture from ``TEST_DATA_DIR`` and return its text content.

    Args:
        filename: Name of the fixture file inside ``TEST_DATA_DIR``.
        ocr_service: OCR service forwarded to the converter.

    The calling test is skipped through ``pytest.skip`` when the fixture
    file does not exist.
    """
    path = TEST_DATA_DIR / filename
    if not path.exists():
        pytest.skip(f"Test file not found: {path}")
    converter = PdfConverterWithOCR()
    with open(path, "rb") as stream:
        result = converter.convert(
            stream,
            StreamInfo(extension=".pdf"),
            ocr_service=ocr_service,
        )
        return result.text_content
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# pdf_image_start.pdf
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def test_pdf_image_start(svc: MockOCRService) -> None:
    """Compare the full output for ``pdf_image_start.pdf`` with its snapshot."""
    actual = _convert("pdf_image_start.pdf", svc)
    snapshot = (
        "## Page 1\n\n\n\n\n"
        "*[Image OCR]\nMOCK_OCR_TEXT_12345\n[End OCR]*\n\n\n"
        "This is text BEFORE the image.\n\n"
        "The image should appear above this text.\n\n"
        "This is more content after the image."
    )
    assert actual == snapshot
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# pdf_image_middle.pdf
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def test_pdf_image_middle(svc: MockOCRService) -> None:
    """Compare the full output for ``pdf_image_middle.pdf`` with its snapshot."""
    actual = _convert("pdf_image_middle.pdf", svc)
    snapshot = (
        "## Page 1\n\n\n"
        "Section 1: Introduction\n\n"
        "This document contains an image in the middle.\n\n"
        "Here is some introductory text.\n\n\n\n"
        "*[Image OCR]\nMOCK_OCR_TEXT_12345\n[End OCR]*\n\n\n"
        "Section 2: Details\n\n"
        "This text appears AFTER the image."
    )
    assert actual == snapshot
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# pdf_image_end.pdf
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def test_pdf_image_end(svc: MockOCRService) -> None:
    """Compare the full output for ``pdf_image_end.pdf`` with its snapshot."""
    actual = _convert("pdf_image_end.pdf", svc)
    snapshot = (
        "## Page 1\n\n\n"
        "Main Content\n\n"
        "This is the main text content.\n\n"
        "The image will appear at the end.\n\n"
        "Keep reading...\n\n\n\n"
        "*[Image OCR]\nMOCK_OCR_TEXT_12345\n[End OCR]*"
    )
    assert actual == snapshot
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# pdf_multiple_images.pdf
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def test_pdf_multiple_images(svc: MockOCRService) -> None:
    """Compare the output for ``pdf_multiple_images.pdf`` (two OCR blocks)."""
    actual = _convert("pdf_multiple_images.pdf", svc)
    snapshot = (
        "## Page 1\n\n\n"
        "Document with Multiple Images\n\n\n\n"
        "*[Image OCR]\nMOCK_OCR_TEXT_12345\n[End OCR]*\n\n\n"
        "Text between first and second image.\n\n\n\n"
        "*[Image OCR]\nMOCK_OCR_TEXT_12345\n[End OCR]*\n\n\n"
        "Final text after all images."
    )
    assert actual == snapshot
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# pdf_complex_layout.pdf
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def test_pdf_complex_layout(svc: MockOCRService) -> None:
    """Compare the full output for ``pdf_complex_layout.pdf`` with its snapshot."""
    actual = _convert("pdf_complex_layout.pdf", svc)
    snapshot = (
        "## Page 1\n\n\n"
        "Complex Layout Document\n\n"
        "Table:\n\n"
        "ItemQuantity\n\n\n\n"
        "*[Image OCR]\nMOCK_OCR_TEXT_12345\n[End OCR]*\n\n\n"
        "Widget A5"
    )
    assert actual == snapshot
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# pdf_multipage.pdf — pdfplumber/pdfminer fail (EOF); PyMuPDF fallback used
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def test_pdf_multipage(svc: MockOCRService) -> None:
    """Compare the output for ``pdf_multipage.pdf``: one OCR block per page.

    pdfplumber cannot open this file (Unexpected EOF), so ``_ocr_full_pages``
    falls back to PyMuPDF for page rendering and each page becomes one OCR
    block.
    """
    actual = _convert("pdf_multipage.pdf", svc)
    snapshot = (
        f"## Page 1\n\n\n{_OCR_BLOCK}\n\n\n"
        f"## Page 2\n\n\n{_OCR_BLOCK}\n\n\n"
        f"## Page 3\n\n\n{_OCR_BLOCK}"
    )
    assert actual == snapshot
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# pdf_scanned_*.pdf — raster-only pages → full-page OCR
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def test_pdf_scanned_invoice(svc: MockOCRService) -> None:
    """``pdf_scanned_invoice.pdf`` converts to the single scanned-page snapshot."""
    actual = _convert("pdf_scanned_invoice.pdf", svc)
    assert actual == _PAGE_1_SCANNED
|
||||
|
||||
|
||||
def test_pdf_scanned_meeting_minutes(svc: MockOCRService) -> None:
    """``pdf_scanned_meeting_minutes.pdf`` converts to the scanned-page snapshot."""
    actual = _convert("pdf_scanned_meeting_minutes.pdf", svc)
    assert actual == _PAGE_1_SCANNED
|
||||
|
||||
|
||||
def test_pdf_scanned_minimal(svc: MockOCRService) -> None:
    """``pdf_scanned_minimal.pdf`` converts to the single scanned-page snapshot."""
    actual = _convert("pdf_scanned_minimal.pdf", svc)
    assert actual == _PAGE_1_SCANNED
|
||||
|
||||
|
||||
def test_pdf_scanned_sales_report(svc: MockOCRService) -> None:
    """``pdf_scanned_sales_report.pdf`` converts to the scanned-page snapshot."""
    actual = _convert("pdf_scanned_sales_report.pdf", svc)
    assert actual == _PAGE_1_SCANNED
|
||||
|
||||
|
||||
def test_pdf_scanned_report(svc: MockOCRService) -> None:
    """Compare the output for ``pdf_scanned_report.pdf`` (three scanned pages)."""
    actual = _convert("pdf_scanned_report.pdf", svc)
    snapshot = (
        f"{_PAGE_1_SCANNED}\n\n\n\n"
        f"## Page 2\n\n\n\n\n{_OCR_BLOCK}\n\n\n\n"
        f"## Page 3\n\n\n\n\n{_OCR_BLOCK}"
    )
    assert actual == snapshot
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Scanned PDF fallback path (pdfplumber finds no text → full-page OCR)
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def test_pdf_scanned_fallback_format(svc: MockOCRService) -> None:
    """_ocr_full_pages emits *[Image OCR]...[End OCR]* for each page.

    ``pdfplumber.open`` is patched to return a mock document exposing one
    mock page with ``page_number`` 1; the bytes of ``pdf_image_start.pdf``
    are passed to ``_ocr_full_pages`` as the input stream.
    """
    path = TEST_DATA_DIR / "pdf_image_start.pdf"
    if not path.exists():
        pytest.skip(f"Test file not found: {path}")

    # Fake pdfplumber document: works as a context manager and has one page.
    mock_page = MagicMock()
    mock_page.page_number = 1
    mock_pdf = MagicMock()
    mock_pdf.pages = [mock_page]
    mock_pdf.__enter__.return_value = mock_pdf

    converter = PdfConverterWithOCR()
    with patch("pdfplumber.open", return_value=mock_pdf):
        with open(path, "rb") as f:
            md = converter._ocr_full_pages(io.BytesIO(f.read()), svc)

    expected = "## Page 1\n\n\n" "*[Image OCR]\nMOCK_OCR_TEXT_12345\n[End OCR]*"
    assert (
        md == expected
    ), f"_ocr_full_pages must produce:\n{expected!r}\nActual:\n{md!r}"
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# No OCR service — no OCR tags emitted
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def test_pdf_no_ocr_service_no_tags() -> None:
    """Converting without an OCR service emits neither OCR delimiter."""
    path = TEST_DATA_DIR / "pdf_image_middle.pdf"
    if not path.exists():
        pytest.skip(f"Test file not found: {path}")
    converter = PdfConverterWithOCR()
    with open(path, "rb") as stream:
        result = converter.convert(stream, StreamInfo(extension=".pdf"))
        markdown = result.text_content
    assert "*[Image OCR]" not in markdown
    assert "[End OCR]*" not in markdown
|
||||
@@ -0,0 +1,148 @@
|
||||
"""
|
||||
Unit tests for PptxConverterWithOCR.
|
||||
|
||||
For each PPTX test file: convert with a mock OCR service then compare the
|
||||
full output string against the expected snapshot.
|
||||
|
||||
OCR block format used by the converter:
|
||||
*[Image OCR]
|
||||
MOCK_OCR_TEXT_12345
|
||||
[End OCR]*
|
||||
|
||||
Note: PPTX slide text uses literal backslash-n (\\n) sequences from the
|
||||
underlying PPTX converter template; OCR blocks use real newlines.
|
||||
"""
|
||||
|
||||
import sys
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
import pytest
|
||||
|
||||
sys.path.insert(0, str(Path(__file__).parent.parent / "src"))
|
||||
|
||||
from markitdown_ocr._ocr_service import OCRResult # noqa: E402
|
||||
from markitdown_ocr._pptx_converter_with_ocr import ( # noqa: E402
|
||||
PptxConverterWithOCR,
|
||||
)
|
||||
from markitdown import StreamInfo # noqa: E402
|
||||
|
||||
TEST_DATA_DIR = Path(__file__).parent / "ocr_test_data"
|
||||
|
||||
_MOCK_TEXT = "MOCK_OCR_TEXT_12345"
|
||||
_OCR_BLOCK = f"*[Image OCR]\n{_MOCK_TEXT}\n[End OCR]*"
|
||||
|
||||
|
||||
class MockOCRService:
    """Stand-in OCR backend that answers every request with ``_MOCK_TEXT``."""

    def extract_text(
        self,  # noqa: ANN101
        image_stream: Any,
        **kwargs: Any,
    ) -> OCRResult:
        """Return the canned result; ``image_stream`` and ``kwargs`` are unused."""
        canned = OCRResult(text=_MOCK_TEXT, backend_used="mock")
        return canned
|
||||
|
||||
|
||||
@pytest.fixture(scope="module")
def svc() -> MockOCRService:
    """Provide a module-scoped ``MockOCRService`` instance to the tests."""
    service = MockOCRService()
    return service
|
||||
|
||||
|
||||
def _convert(filename: str, ocr_service: MockOCRService) -> str:
    """Convert a PPTX fixture from ``TEST_DATA_DIR`` and return its text content.

    Args:
        filename: Name of the fixture file inside ``TEST_DATA_DIR``.
        ocr_service: OCR service forwarded to the converter.

    The calling test is skipped through ``pytest.skip`` when the fixture
    file does not exist.
    """
    path = TEST_DATA_DIR / filename
    if not path.exists():
        pytest.skip(f"Test file not found: {path}")
    converter = PptxConverterWithOCR()
    with open(path, "rb") as stream:
        result = converter.convert(
            stream,
            StreamInfo(extension=".pptx"),
            ocr_service=ocr_service,
        )
        return result.text_content
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# pptx_image_start.pptx
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def test_pptx_image_start(svc: MockOCRService) -> None:
    """Snapshot for ``pptx_image_start.pptx``.

    Slide 1: title "Welcome" followed by an image.
    """
    actual = _convert("pptx_image_start.pptx", svc)
    snapshot = (
        "\\n\\n<!-- Slide number: 1 -->\\n# Welcome\\n\\n"
        "\n*[Image OCR]\nMOCK_OCR_TEXT_12345\n[End OCR]*"
    )
    assert actual == snapshot
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# pptx_image_middle.pptx
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def test_pptx_image_middle(svc: MockOCRService) -> None:
    """Compare the full output for pptx_image_middle.pptx with its snapshot."""
    # Slide 1: Introduction | Slide 2: Architecture + image | Slide 3: Conclusion  # noqa: E501
    # NOTE(review): the "\\n" sequences below are literal backslash-n text,
    # not newlines -- confirm this matches what the converter really emits.
    snapshot = (
        "\\n\\n<!-- Slide number: 1 -->\\n# Introduction"
        "\\n\\n\\n\\n<!-- Slide number: 2 -->\\n# Architecture\\n\\n"
        "\n*[Image OCR]\nMOCK_OCR_TEXT_12345\n[End OCR]*"
        "\\n\\n<!-- Slide number: 3 -->\\n# Conclusion\\n\\n"
    )
    actual = _convert("pptx_image_middle.pptx", svc)
    assert actual == snapshot
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# pptx_image_end.pptx
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def test_pptx_image_end(svc: MockOCRService) -> None:
    """Compare the full output for pptx_image_end.pptx with its snapshot."""
    # Slide 1: Presentation | Slide 2: Thank You + image
    # NOTE(review): the "\\n" sequences below are literal backslash-n text,
    # not newlines -- confirm this matches what the converter really emits.
    snapshot = (
        "\\n\\n<!-- Slide number: 1 -->\\n# Presentation"
        "\\n\\n\\n\\n<!-- Slide number: 2 -->\\n# Thank You\\n\\n"
        "\n*[Image OCR]\nMOCK_OCR_TEXT_12345\n[End OCR]*"
    )
    actual = _convert("pptx_image_end.pptx", svc)
    assert actual == snapshot
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# pptx_multiple_images.pptx
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def test_pptx_multiple_images(svc: MockOCRService) -> None:
    """Compare the full output for pptx_multiple_images.pptx with its snapshot."""
    # Slide 1: two images, no title text
    # NOTE(review): the "\\n" sequences below are literal backslash-n text,
    # not newlines -- confirm this matches what the converter really emits.
    snapshot = (
        "\\n\\n<!-- Slide number: 1 -->\\n# \\n"
        "\n*[Image OCR]\nMOCK_OCR_TEXT_12345\n[End OCR]*"
        "\n\n*[Image OCR]\nMOCK_OCR_TEXT_12345\n[End OCR]*"
    )
    actual = _convert("pptx_multiple_images.pptx", svc)
    assert actual == snapshot
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# pptx_complex_layout.pptx
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def test_pptx_complex_layout(svc: MockOCRService) -> None:
    """Compare the full output for pptx_complex_layout.pptx with its snapshot."""
    # Snapshot: one slide with a title, a line of body text, then one OCR block.
    # NOTE(review): the "\\n" sequences below are literal backslash-n text,
    # not newlines -- confirm this matches what the converter really emits.
    snapshot = (
        "\\n\\n<!-- Slide number: 1 -->\\n# Product Comparison"
        "\\n\\nOur products lead the market\\n"
        "\n*[Image OCR]\nMOCK_OCR_TEXT_12345\n[End OCR]*"
    )
    actual = _convert("pptx_complex_layout.pptx", svc)
    assert actual == snapshot
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# No OCR service — no OCR tags emitted
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def test_pptx_no_ocr_service_no_tags() -> None:
    """Without an OCR service, the output must contain neither OCR marker."""
    sample = TEST_DATA_DIR / "pptx_image_middle.pptx"
    if not sample.exists():
        pytest.skip(f"Test file not found: {sample}")

    pptx_converter = PptxConverterWithOCR()
    with open(sample, "rb") as handle:
        # No ocr_service keyword is passed here, unlike the _convert helper.
        outcome = pptx_converter.convert(handle, StreamInfo(extension=".pptx"))
        md = outcome.text_content

    assert "*[Image OCR]" not in md
    assert "[End OCR]*" not in md
|
||||
@@ -0,0 +1,249 @@
|
||||
"""
|
||||
Unit tests for XlsxConverterWithOCR.
|
||||
|
||||
For each XLSX test file: convert with a mock OCR service then compare the
|
||||
full output string against the expected snapshot.
|
||||
|
||||
OCR block format used by the converter:
|
||||
*[Image OCR]
|
||||
MOCK_OCR_TEXT_12345
|
||||
[End OCR]*
|
||||
|
||||
Images are grouped at the end of each sheet under:
|
||||
### Images in this sheet:
|
||||
"""
|
||||
|
||||
import sys
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
import pytest
|
||||
|
||||
sys.path.insert(0, str(Path(__file__).parent.parent / "src"))
|
||||
|
||||
from markitdown_ocr._ocr_service import OCRResult # noqa: E402
|
||||
from markitdown_ocr._xlsx_converter_with_ocr import ( # noqa: E402
|
||||
XlsxConverterWithOCR,
|
||||
)
|
||||
from markitdown import StreamInfo # noqa: E402
|
||||
|
||||
# Sample documents live in an "ocr_test_data" folder next to this test module.
TEST_DATA_DIR = Path(__file__).parent.joinpath("ocr_test_data")

# Canned text handed back by the mock OCR service for every image.
_MOCK_TEXT = "MOCK_OCR_TEXT_12345"

# The canned text wrapped in the OCR start/end markers.
_OCR_BLOCK = f"*[Image OCR]\n{_MOCK_TEXT}\n[End OCR]*"

# Heading that introduces the per-sheet image section in the expected output.
_IMG_SECTION = "### Images in this sheet:"
|
||||
|
||||
|
||||
class MockOCRService:
    """Stand-in OCR backend that answers every request with the same text."""

    def extract_text(
        self,  # noqa: ANN101
        image_stream: Any,
        **kwargs: Any,
    ) -> OCRResult:
        """Ignore the image and any options and return the canned result.

        Args:
            image_stream: The image to "recognise"; never read.
            **kwargs: Accepted and discarded.

        Returns:
            An ``OCRResult`` whose text is ``_MOCK_TEXT`` and whose backend
            is reported as ``"mock"``.
        """
        canned = OCRResult(text=_MOCK_TEXT, backend_used="mock")
        return canned
|
||||
|
||||
|
||||
@pytest.fixture(scope="module")
def svc() -> MockOCRService:
    """Provide one mock OCR service shared by every test in this module."""
    service = MockOCRService()
    return service
|
||||
|
||||
|
||||
def _convert(filename: str, ocr_service: MockOCRService) -> str:
    """Convert a sample XLSX with the given OCR service and return its text.

    Args:
        filename: Name of a file inside ``TEST_DATA_DIR``.
        ocr_service: OCR service forwarded to the converter.

    Returns:
        The ``text_content`` of the conversion result.

    The calling test is skipped when the sample file is not present.
    """
    sample = TEST_DATA_DIR / filename
    if not sample.exists():
        pytest.skip(f"Test file not found: {sample}")

    xlsx_converter = XlsxConverterWithOCR()
    with open(sample, "rb") as handle:
        outcome = xlsx_converter.convert(
            handle,
            StreamInfo(extension=".xlsx"),
            ocr_service=ocr_service,
        )
        return outcome.text_content
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# xlsx_image_start.xlsx
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def test_xlsx_image_start(svc: MockOCRService) -> None:
    """Compare the full output for xlsx_image_start.xlsx with its snapshot."""
    # First sheet: table, then its image section (trailing blank line included).
    sales_sheet = (
        "## Sales Q1\n\n"
        "| Product | Sales |\n"
        "| --- | --- |\n"
        "| Widget A | 100 |\n"
        "| Widget B | 150 |\n\n"
        "### Images in this sheet:\n\n"
        "*[Image OCR]\nMOCK_OCR_TEXT_12345\n[End OCR]*\n\n"
    )
    # Last sheet: no trailing blank line after the final OCR block.
    forecast_sheet = (
        "## Forecast Q2\n\n"
        "| Projected Sales | Unnamed: 1 |\n"
        "| --- | --- |\n"
        "| Widget A | 120 |\n"
        "| Widget B | 180 |\n\n"
        "### Images in this sheet:\n\n"
        "*[Image OCR]\nMOCK_OCR_TEXT_12345\n[End OCR]*"
    )
    snapshot = sales_sheet + forecast_sheet
    actual = _convert("xlsx_image_start.xlsx", svc)
    assert actual == snapshot
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# xlsx_image_middle.xlsx
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def test_xlsx_image_middle(svc: MockOCRService) -> None:
    """Compare the full output for xlsx_image_middle.xlsx with its snapshot."""
    # First sheet: table, then its image section (trailing blank line included).
    revenue_sheet = (
        "## Revenue\n\n"
        "| Q1 Report | Unnamed: 1 |\n"
        "| --- | --- |\n"
        "| NaN | NaN |\n"
        "| Revenue | $50,000 |\n"
        "| NaN | NaN |\n"
        "| NaN | NaN |\n"
        "| NaN | NaN |\n"
        "| NaN | NaN |\n"
        "| Profit Margin | 40% |\n\n"
        "### Images in this sheet:\n\n"
        "*[Image OCR]\nMOCK_OCR_TEXT_12345\n[End OCR]*\n\n"
    )
    # Last sheet: no trailing blank line after the final OCR block.
    expenses_sheet = (
        "## Expenses\n\n"
        "| Expense Breakdown | Unnamed: 1 |\n"
        "| --- | --- |\n"
        "| NaN | NaN |\n"
        "| Expenses | $30,000 |\n"
        "| NaN | NaN |\n"
        "| NaN | NaN |\n"
        "| NaN | NaN |\n"
        "| NaN | NaN |\n"
        "| Savings | $5,000 |\n\n"
        "### Images in this sheet:\n\n"
        "*[Image OCR]\nMOCK_OCR_TEXT_12345\n[End OCR]*"
    )
    snapshot = revenue_sheet + expenses_sheet
    actual = _convert("xlsx_image_middle.xlsx", svc)
    assert actual == snapshot
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# xlsx_image_end.xlsx
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def test_xlsx_image_end(svc: MockOCRService) -> None:
    """Compare the full output for xlsx_image_end.xlsx with its snapshot."""
    # First sheet: table, then its image section (trailing blank line included).
    summary_sheet = (
        "## Sheet\n\n"
        "| Financial Summary | Unnamed: 1 |\n"
        "| --- | --- |\n"
        "| Total Revenue | $500,000 |\n"
        "| Total Expenses | $300,000 |\n"
        "| Net Profit | $200,000 |\n"
        "| NaN | NaN |\n"
        "| NaN | NaN |\n"
        "| NaN | NaN |\n"
        "| NaN | NaN |\n"
        "| NaN | NaN |\n"
        "| Signature: | NaN |\n\n"
        "### Images in this sheet:\n\n"
        "*[Image OCR]\nMOCK_OCR_TEXT_12345\n[End OCR]*\n\n"
    )
    # Last sheet: no trailing blank line after the final OCR block.
    budget_sheet = (
        "## Budget\n\n"
        "| Budget Allocation | Unnamed: 1 |\n"
        "| --- | --- |\n"
        "| Marketing | $100,000 |\n"
        "| R&D | $150,000 |\n"
        "| Operations | $50,000 |\n"
        "| NaN | NaN |\n"
        "| NaN | NaN |\n"
        "| NaN | NaN |\n"
        "| NaN | NaN |\n"
        "| NaN | NaN |\n"
        "| Approved: | NaN |\n\n"
        "### Images in this sheet:\n\n"
        "*[Image OCR]\nMOCK_OCR_TEXT_12345\n[End OCR]*"
    )
    snapshot = summary_sheet + budget_sheet
    actual = _convert("xlsx_image_end.xlsx", svc)
    assert actual == snapshot
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# xlsx_multiple_images.xlsx
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def test_xlsx_multiple_images(svc: MockOCRService) -> None:
    """Compare the full output for xlsx_multiple_images.xlsx with its snapshot."""
    # "Overview" carries two OCR blocks; the other two sheets carry one each.
    overview_sheet = (
        "## Overview\n\n"
        "| Dashboard |\n"
        "| --- |\n"
        "| Status: Active |\n"
        "| NaN |\n"
        "| NaN |\n"
        "| NaN |\n"
        "| NaN |\n"
        "| Performance Summary |\n\n"
        "### Images in this sheet:\n\n"
        "*[Image OCR]\nMOCK_OCR_TEXT_12345\n[End OCR]*\n\n"
        "*[Image OCR]\nMOCK_OCR_TEXT_12345\n[End OCR]*\n\n"
    )
    details_sheet = (
        "## Details\n\n"
        "| Detailed Metrics |\n"
        "| --- |\n"
        "| System Health |\n\n"
        "### Images in this sheet:\n\n"
        "*[Image OCR]\nMOCK_OCR_TEXT_12345\n[End OCR]*\n\n"
    )
    # Last sheet: no trailing blank line after the final OCR block.
    summary_sheet = (
        "## Summary\n\n"
        "| Quarter Summary |\n"
        "| --- |\n"
        "| Overall Performance |\n\n"
        "### Images in this sheet:\n\n"
        "*[Image OCR]\nMOCK_OCR_TEXT_12345\n[End OCR]*"
    )
    snapshot = overview_sheet + details_sheet + summary_sheet
    actual = _convert("xlsx_multiple_images.xlsx", svc)
    assert actual == snapshot
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# xlsx_complex_layout.xlsx
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def test_xlsx_complex_layout(svc: MockOCRService) -> None:
    """Compare the full output for xlsx_complex_layout.xlsx with its snapshot."""
    # "Complex Report" carries two OCR blocks; the other two sheets carry one.
    report_sheet = (
        "## Complex Report\n\n"
        "| Annual Report 2024 | Unnamed: 1 |\n"
        "| --- | --- |\n"
        "| NaN | NaN |\n"
        "| Month | Sales |\n"
        "| Jan | 1000 |\n"
        "| Feb | 1200 |\n"
        "| NaN | NaN |\n"
        "| Total | 2200 |\n\n"
        "### Images in this sheet:\n\n"
        "*[Image OCR]\nMOCK_OCR_TEXT_12345\n[End OCR]*\n\n"
        "*[Image OCR]\nMOCK_OCR_TEXT_12345\n[End OCR]*\n\n"
    )
    customers_sheet = (
        "## Customers\n\n"
        "| Customer Metrics | Unnamed: 1 |\n"
        "| --- | --- |\n"
        "| NaN | NaN |\n"
        "| New Customers | 250 |\n"
        "| Retention Rate | 92% |\n\n"
        "### Images in this sheet:\n\n"
        "*[Image OCR]\nMOCK_OCR_TEXT_12345\n[End OCR]*\n\n"
    )
    # Last sheet: no trailing blank line after the final OCR block.
    regions_sheet = (
        "## Regions\n\n"
        "| Regional Breakdown | Unnamed: 1 |\n"
        "| --- | --- |\n"
        "| NaN | NaN |\n"
        "| Region | Revenue |\n"
        "| North | $800K |\n"
        "| South | $600K |\n\n"
        "### Images in this sheet:\n\n"
        "*[Image OCR]\nMOCK_OCR_TEXT_12345\n[End OCR]*"
    )
    snapshot = report_sheet + customers_sheet + regions_sheet
    actual = _convert("xlsx_complex_layout.xlsx", svc)
    assert actual == snapshot
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# No OCR service — no OCR tags emitted
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def test_xlsx_no_ocr_service_no_tags() -> None:
    """Without an OCR service, the output must contain neither OCR marker."""
    sample = TEST_DATA_DIR / "xlsx_image_middle.xlsx"
    if not sample.exists():
        pytest.skip(f"Test file not found: {sample}")

    xlsx_converter = XlsxConverterWithOCR()
    with open(sample, "rb") as handle:
        # No ocr_service keyword is passed here, unlike the _convert helper.
        outcome = xlsx_converter.convert(handle, StreamInfo(extension=".xlsx"))
        md = outcome.text_content

    assert "*[Image OCR]" not in md
    assert "[End OCR]*" not in md
|
||||
@@ -0,0 +1,111 @@
|
||||
# MarkItDown Sample Plugin
|
||||
|
||||
[PyPI package](https://pypi.org/project/markitdown-sample-plugin/)
|
||||

|
||||
[AutoGen](https://github.com/microsoft/autogen)
|
||||
|
||||
|
||||
This project shows how to create a sample plugin for MarkItDown. The most important parts are as follows:
|
||||
|
||||
First, implement your custom DocumentConverter:
|
||||
|
||||
```python
|
||||
from typing import BinaryIO, Any
|
||||
from markitdown import MarkItDown, DocumentConverter, DocumentConverterResult, StreamInfo
|
||||
|
||||
class RtfConverter(DocumentConverter):
|
||||
|
||||
def __init__(
|
||||
self, priority: float = DocumentConverter.PRIORITY_SPECIFIC_FILE_FORMAT
|
||||
):
|
||||
super().__init__(priority=priority)
|
||||
|
||||
def accepts(
|
||||
self,
|
||||
file_stream: BinaryIO,
|
||||
stream_info: StreamInfo,
|
||||
**kwargs: Any,
|
||||
) -> bool:
|
||||
|
||||
# Implement logic to check if the file stream is an RTF file
|
||||
# ...
|
||||
raise NotImplementedError()
|
||||
|
||||
|
||||
def convert(
|
||||
self,
|
||||
file_stream: BinaryIO,
|
||||
stream_info: StreamInfo,
|
||||
**kwargs: Any,
|
||||
) -> DocumentConverterResult:
|
||||
|
||||
# Implement logic to convert the file stream to Markdown
|
||||
# ...
|
||||
raise NotImplementedError()
|
||||
```
|
||||
|
||||
Next, make sure your package implements and exports the following:
|
||||
|
||||
```python
|
||||
# The version of the plugin interface that this plugin uses.
|
||||
# The only supported version is 1 for now.
|
||||
__plugin_interface_version__ = 1
|
||||
|
||||
# The main entrypoint for the plugin. This is called each time MarkItDown instances are created.
|
||||
def register_converters(markitdown: MarkItDown, **kwargs):
|
||||
"""
|
||||
Called during construction of MarkItDown instances to register converters provided by plugins.
|
||||
"""
|
||||
|
||||
# Simply create and attach an RtfConverter instance
|
||||
markitdown.register_converter(RtfConverter())
|
||||
```
|
||||
|
||||
|
||||
Finally, create an entrypoint in the `pyproject.toml` file:
|
||||
|
||||
```toml
|
||||
[project.entry-points."markitdown.plugin"]
|
||||
sample_plugin = "markitdown_sample_plugin"
|
||||
```
|
||||
|
||||
Here, the key `sample_plugin` can be any name, but should ideally be the name of the plugin. The value is the fully qualified name of the package implementing the plugin.
|
||||
|
||||
|
||||
## Installation
|
||||
|
||||
To use the plugin with MarkItDown, it must be installed. To install the plugin from the current directory use:
|
||||
|
||||
```bash
|
||||
pip install -e .
|
||||
```
|
||||
|
||||
Once the plugin package is installed, verify that it is available to MarkItDown by running:
|
||||
|
||||
```bash
|
||||
markitdown --list-plugins
|
||||
```
|
||||
|
||||
To use the plugin for a conversion use the `--use-plugins` flag. For example, to convert an RTF file:
|
||||
|
||||
```bash
|
||||
markitdown --use-plugins path-to-file.rtf
|
||||
```
|
||||
|
||||
In Python, plugins can be enabled as follows:
|
||||
|
||||
```python
|
||||
from markitdown import MarkItDown
|
||||
|
||||
md = MarkItDown(enable_plugins=True)
|
||||
result = md.convert("path-to-file.rtf")
|
||||
print(result.text_content)
|
||||
```
|
||||
|
||||
## Trademarks
|
||||
|
||||
This project may contain trademarks or logos for projects, products, or services. Authorized use of Microsoft
|
||||
trademarks or logos is subject to and must follow
|
||||
[Microsoft's Trademark & Brand Guidelines](https://www.microsoft.com/en-us/legal/intellectualproperty/trademarks/usage/general).
|
||||
Use of Microsoft trademarks or logos in modified versions of this project must not cause confusion or imply Microsoft sponsorship.
|
||||
Any use of third-party trademarks or logos are subject to those third-party's policies.
|
||||
@@ -0,0 +1,70 @@
|
||||
[build-system]
|
||||
requires = ["hatchling"]
|
||||
build-backend = "hatchling.build"
|
||||
|
||||
[project]
|
||||
name = "markitdown-sample-plugin"
|
||||
dynamic = ["version"]
|
||||
description = 'A sample plugin for the "markitdown" library.'
|
||||
readme = "README.md"
|
||||
requires-python = ">=3.10"
|
||||
license = "MIT"
|
||||
keywords = []
|
||||
authors = [
|
||||
{ name = "Adam Fourney", email = "adamfo@microsoft.com" },
|
||||
]
|
||||
classifiers = [
|
||||
"Development Status :: 4 - Beta",
|
||||
"Programming Language :: Python",
|
||||
"Programming Language :: Python :: 3.10",
|
||||
"Programming Language :: Python :: 3.11",
|
||||
"Programming Language :: Python :: 3.12",
|
||||
"Programming Language :: Python :: 3.13",
|
||||
"Programming Language :: Python :: Implementation :: CPython",
|
||||
"Programming Language :: Python :: Implementation :: PyPy",
|
||||
]
|
||||
dependencies = [
|
||||
"markitdown>=0.1.0a1",
|
||||
"striprtf",
|
||||
]
|
||||
|
||||
[project.urls]
|
||||
Documentation = "https://github.com/microsoft/markitdown#readme"
|
||||
Issues = "https://github.com/microsoft/markitdown/issues"
|
||||
Source = "https://github.com/microsoft/markitdown"
|
||||
|
||||
[tool.hatch.version]
|
||||
path = "src/markitdown_sample_plugin/__about__.py"
|
||||
|
||||
# IMPORTANT: MarkItDown will look for this entry point to find the plugin.
|
||||
[project.entry-points."markitdown.plugin"]
|
||||
sample_plugin = "markitdown_sample_plugin"
|
||||
|
||||
[tool.hatch.envs.types]
|
||||
extra-dependencies = [
|
||||
"mypy>=1.0.0",
|
||||
]
|
||||
[tool.hatch.envs.types.scripts]
|
||||
check = "mypy --install-types --non-interactive {args:src/markitdown_sample_plugin tests}"
|
||||
|
||||
[tool.coverage.run]
# source_pkgs takes importable package names, so the underscore form of the
# package is required here (the hyphenated distribution name is not importable).
source_pkgs = ["markitdown_sample_plugin", "tests"]
branch = true
parallel = true
omit = [
  "src/markitdown_sample_plugin/__about__.py",
]
|
||||
|
||||
[tool.coverage.paths]
|
||||
markitdown-sample-plugin = ["src/markitdown_sample_plugin", "*/markitdown-sample-plugin/src/markitdown_sample_plugin"]
|
||||
tests = ["tests", "*/markitdown-sample-plugin/tests"]
|
||||
|
||||
[tool.coverage.report]
|
||||
exclude_lines = [
|
||||
"no cov",
|
||||
"if __name__ == .__main__.:",
|
||||
"if TYPE_CHECKING:",
|
||||
]
|
||||
|
||||
[tool.hatch.build.targets.sdist]
|
||||
only-include = ["src/markitdown_sample_plugin"]
|
||||
@@ -0,0 +1,4 @@
|
||||
# SPDX-FileCopyrightText: 2024-present Adam Fourney <adamfo@microsoft.com>
|
||||
#
|
||||
# SPDX-License-Identifier: MIT
|
||||
__version__ = "0.1.0a1"
|
||||
@@ -0,0 +1,13 @@
|
||||
# SPDX-FileCopyrightText: 2024-present Adam Fourney <adamfo@microsoft.com>
|
||||
#
|
||||
# SPDX-License-Identifier: MIT
|
||||
|
||||
from ._plugin import __plugin_interface_version__, register_converters, RtfConverter
|
||||
from .__about__ import __version__
|
||||
|
||||
__all__ = [
|
||||
"__version__",
|
||||
"__plugin_interface_version__",
|
||||
"register_converters",
|
||||
"RtfConverter",
|
||||
]
|
||||
@@ -0,0 +1,71 @@
|
||||
import locale
|
||||
from typing import BinaryIO, Any
|
||||
from striprtf.striprtf import rtf_to_text
|
||||
|
||||
from markitdown import (
|
||||
MarkItDown,
|
||||
DocumentConverter,
|
||||
DocumentConverterResult,
|
||||
StreamInfo,
|
||||
)
|
||||
|
||||
|
||||
# The version of the plugin interface that this plugin uses
__plugin_interface_version__ = 1

# A stream is accepted when its MIME type starts with one of these prefixes.
ACCEPTED_MIME_TYPE_PREFIXES = ["text/rtf", "application/rtf"]

# A stream is also accepted when its file extension is listed here.
ACCEPTED_FILE_EXTENSIONS = [
    ".rtf",
]
|
||||
|
||||
|
||||
def register_converters(markitdown: MarkItDown, **kwargs):
    """
    Plugin entry point: register this plugin's converters on a MarkItDown instance.

    Args:
        markitdown: The instance that receives the converter.
        **kwargs: Accepted for interface compatibility; not used here.
    """
    # This plugin contributes a single converter.
    rtf_converter = RtfConverter()
    markitdown.register_converter(rtf_converter)
|
||||
|
||||
|
||||
class RtfConverter(DocumentConverter):
    """
    Converts an RTF file to text in the simplest possible way.
    """

    def accepts(
        self,
        file_stream: BinaryIO,
        stream_info: StreamInfo,
        **kwargs: Any,
    ) -> bool:
        """Return True when the stream's extension or MIME type marks it as RTF.

        Only ``stream_info`` is inspected; ``file_stream`` is not read.
        """
        # Normalise both hints: missing values become "", and case is ignored.
        mime = (stream_info.mimetype or "").lower()
        ext = (stream_info.extension or "").lower()

        if ext in ACCEPTED_FILE_EXTENSIONS:
            return True

        return any(mime.startswith(known) for known in ACCEPTED_MIME_TYPE_PREFIXES)

    def convert(
        self,
        file_stream: BinaryIO,
        stream_info: StreamInfo,
        **kwargs: Any,
    ) -> DocumentConverterResult:
        """Decode the whole stream and return its RTF-stripped text.

        The result has no title; its markdown is the output of ``rtf_to_text``.
        """
        # Decode with the charset supplied in stream_info, falling back to the
        # system's preferred encoding when none is given.
        charset = stream_info.charset or locale.getpreferredencoding()
        raw_bytes = file_stream.read()
        rtf_source = raw_bytes.decode(charset)

        plain_text = rtf_to_text(rtf_source)
        return DocumentConverterResult(title=None, markdown=plain_text)
|
||||
@@ -0,0 +1,3 @@
|
||||
# SPDX-FileCopyrightText: 2024-present Adam Fourney <adamfo@microsoft.com>
|
||||
#
|
||||
# SPDX-License-Identifier: MIT
|
||||
+251
@@ -0,0 +1,251 @@
|
||||
{\rtf1\adeflang1025\ansi\ansicpg1252\uc1\adeff31507\deff0\stshfdbch31506\stshfloch31506\stshfhich31506\stshfbi31507\deflang1033\deflangfe1033\themelang1033\themelangfe0\themelangcs0{\fonttbl{\f0\fbidi \froman\fcharset0\fprq2{\*\panose 02020603050405020304}Times New Roman;}{\f34\fbidi \froman\fcharset0\fprq2{\*\panose 02040503050406030204}Cambria Math;}
|
||||
{\f42\fbidi \fswiss\fcharset0\fprq2 Aptos Display;}{\f43\fbidi \fswiss\fcharset0\fprq2 Aptos;}{\flomajor\f31500\fbidi \froman\fcharset0\fprq2{\*\panose 02020603050405020304}Times New Roman;}
|
||||
{\fdbmajor\f31501\fbidi \froman\fcharset0\fprq2{\*\panose 02020603050405020304}Times New Roman;}{\fhimajor\f31502\fbidi \fswiss\fcharset0\fprq2 Aptos Display;}{\fbimajor\f31503\fbidi \froman\fcharset0\fprq2{\*\panose 02020603050405020304}Times New Roman;}
|
||||
{\flominor\f31504\fbidi \froman\fcharset0\fprq2{\*\panose 02020603050405020304}Times New Roman;}{\fdbminor\f31505\fbidi \froman\fcharset0\fprq2{\*\panose 02020603050405020304}Times New Roman;}{\fhiminor\f31506\fbidi \fswiss\fcharset0\fprq2 Aptos;}
|
||||
{\fbiminor\f31507\fbidi \froman\fcharset0\fprq2{\*\panose 02020603050405020304}Times New Roman;}{\f51\fbidi \froman\fcharset238\fprq2 Times New Roman CE;}{\f52\fbidi \froman\fcharset204\fprq2 Times New Roman Cyr;}
|
||||
{\f54\fbidi \froman\fcharset161\fprq2 Times New Roman Greek;}{\f55\fbidi \froman\fcharset162\fprq2 Times New Roman Tur;}{\f56\fbidi \froman\fcharset177\fprq2 Times New Roman (Hebrew);}{\f57\fbidi \froman\fcharset178\fprq2 Times New Roman (Arabic);}
|
||||
{\f58\fbidi \froman\fcharset186\fprq2 Times New Roman Baltic;}{\f59\fbidi \froman\fcharset163\fprq2 Times New Roman (Vietnamese);}{\f391\fbidi \froman\fcharset238\fprq2 Cambria Math CE;}{\f392\fbidi \froman\fcharset204\fprq2 Cambria Math Cyr;}
|
||||
{\f394\fbidi \froman\fcharset161\fprq2 Cambria Math Greek;}{\f395\fbidi \froman\fcharset162\fprq2 Cambria Math Tur;}{\f398\fbidi \froman\fcharset186\fprq2 Cambria Math Baltic;}{\f399\fbidi \froman\fcharset163\fprq2 Cambria Math (Vietnamese);}
|
||||
{\f471\fbidi \fswiss\fcharset238\fprq2 Aptos Display CE;}{\f472\fbidi \fswiss\fcharset204\fprq2 Aptos Display Cyr;}{\f474\fbidi \fswiss\fcharset161\fprq2 Aptos Display Greek;}{\f475\fbidi \fswiss\fcharset162\fprq2 Aptos Display Tur;}
|
||||
{\f478\fbidi \fswiss\fcharset186\fprq2 Aptos Display Baltic;}{\f479\fbidi \fswiss\fcharset163\fprq2 Aptos Display (Vietnamese);}{\f481\fbidi \fswiss\fcharset238\fprq2 Aptos CE;}{\f482\fbidi \fswiss\fcharset204\fprq2 Aptos Cyr;}
|
||||
{\f484\fbidi \fswiss\fcharset161\fprq2 Aptos Greek;}{\f485\fbidi \fswiss\fcharset162\fprq2 Aptos Tur;}{\f488\fbidi \fswiss\fcharset186\fprq2 Aptos Baltic;}{\f489\fbidi \fswiss\fcharset163\fprq2 Aptos (Vietnamese);}
|
||||
{\flomajor\f31508\fbidi \froman\fcharset238\fprq2 Times New Roman CE;}{\flomajor\f31509\fbidi \froman\fcharset204\fprq2 Times New Roman Cyr;}{\flomajor\f31511\fbidi \froman\fcharset161\fprq2 Times New Roman Greek;}
|
||||
{\flomajor\f31512\fbidi \froman\fcharset162\fprq2 Times New Roman Tur;}{\flomajor\f31513\fbidi \froman\fcharset177\fprq2 Times New Roman (Hebrew);}{\flomajor\f31514\fbidi \froman\fcharset178\fprq2 Times New Roman (Arabic);}
|
||||
{\flomajor\f31515\fbidi \froman\fcharset186\fprq2 Times New Roman Baltic;}{\flomajor\f31516\fbidi \froman\fcharset163\fprq2 Times New Roman (Vietnamese);}{\fdbmajor\f31518\fbidi \froman\fcharset238\fprq2 Times New Roman CE;}
|
||||
{\fdbmajor\f31519\fbidi \froman\fcharset204\fprq2 Times New Roman Cyr;}{\fdbmajor\f31521\fbidi \froman\fcharset161\fprq2 Times New Roman Greek;}{\fdbmajor\f31522\fbidi \froman\fcharset162\fprq2 Times New Roman Tur;}
|
||||
{\fdbmajor\f31523\fbidi \froman\fcharset177\fprq2 Times New Roman (Hebrew);}{\fdbmajor\f31524\fbidi \froman\fcharset178\fprq2 Times New Roman (Arabic);}{\fdbmajor\f31525\fbidi \froman\fcharset186\fprq2 Times New Roman Baltic;}
|
||||
{\fdbmajor\f31526\fbidi \froman\fcharset163\fprq2 Times New Roman (Vietnamese);}{\fhimajor\f31528\fbidi \fswiss\fcharset238\fprq2 Aptos Display CE;}{\fhimajor\f31529\fbidi \fswiss\fcharset204\fprq2 Aptos Display Cyr;}
|
||||
{\fhimajor\f31531\fbidi \fswiss\fcharset161\fprq2 Aptos Display Greek;}{\fhimajor\f31532\fbidi \fswiss\fcharset162\fprq2 Aptos Display Tur;}{\fhimajor\f31535\fbidi \fswiss\fcharset186\fprq2 Aptos Display Baltic;}
|
||||
{\fhimajor\f31536\fbidi \fswiss\fcharset163\fprq2 Aptos Display (Vietnamese);}{\fbimajor\f31538\fbidi \froman\fcharset238\fprq2 Times New Roman CE;}{\fbimajor\f31539\fbidi \froman\fcharset204\fprq2 Times New Roman Cyr;}
|
||||
{\fbimajor\f31541\fbidi \froman\fcharset161\fprq2 Times New Roman Greek;}{\fbimajor\f31542\fbidi \froman\fcharset162\fprq2 Times New Roman Tur;}{\fbimajor\f31543\fbidi \froman\fcharset177\fprq2 Times New Roman (Hebrew);}
|
||||
{\fbimajor\f31544\fbidi \froman\fcharset178\fprq2 Times New Roman (Arabic);}{\fbimajor\f31545\fbidi \froman\fcharset186\fprq2 Times New Roman Baltic;}{\fbimajor\f31546\fbidi \froman\fcharset163\fprq2 Times New Roman (Vietnamese);}
|
||||
{\flominor\f31548\fbidi \froman\fcharset238\fprq2 Times New Roman CE;}{\flominor\f31549\fbidi \froman\fcharset204\fprq2 Times New Roman Cyr;}{\flominor\f31551\fbidi \froman\fcharset161\fprq2 Times New Roman Greek;}
|
||||
{\flominor\f31552\fbidi \froman\fcharset162\fprq2 Times New Roman Tur;}{\flominor\f31553\fbidi \froman\fcharset177\fprq2 Times New Roman (Hebrew);}{\flominor\f31554\fbidi \froman\fcharset178\fprq2 Times New Roman (Arabic);}
|
||||
{\flominor\f31555\fbidi \froman\fcharset186\fprq2 Times New Roman Baltic;}{\flominor\f31556\fbidi \froman\fcharset163\fprq2 Times New Roman (Vietnamese);}{\fdbminor\f31558\fbidi \froman\fcharset238\fprq2 Times New Roman CE;}
|
||||
{\fdbminor\f31559\fbidi \froman\fcharset204\fprq2 Times New Roman Cyr;}{\fdbminor\f31561\fbidi \froman\fcharset161\fprq2 Times New Roman Greek;}{\fdbminor\f31562\fbidi \froman\fcharset162\fprq2 Times New Roman Tur;}
|
||||
{\fdbminor\f31563\fbidi \froman\fcharset177\fprq2 Times New Roman (Hebrew);}{\fdbminor\f31564\fbidi \froman\fcharset178\fprq2 Times New Roman (Arabic);}{\fdbminor\f31565\fbidi \froman\fcharset186\fprq2 Times New Roman Baltic;}
|
||||
{\fdbminor\f31566\fbidi \froman\fcharset163\fprq2 Times New Roman (Vietnamese);}{\fhiminor\f31568\fbidi \fswiss\fcharset238\fprq2 Aptos CE;}{\fhiminor\f31569\fbidi \fswiss\fcharset204\fprq2 Aptos Cyr;}
|
||||
{\fhiminor\f31571\fbidi \fswiss\fcharset161\fprq2 Aptos Greek;}{\fhiminor\f31572\fbidi \fswiss\fcharset162\fprq2 Aptos Tur;}{\fhiminor\f31575\fbidi \fswiss\fcharset186\fprq2 Aptos Baltic;}
|
||||
{\fhiminor\f31576\fbidi \fswiss\fcharset163\fprq2 Aptos (Vietnamese);}{\fbiminor\f31578\fbidi \froman\fcharset238\fprq2 Times New Roman CE;}{\fbiminor\f31579\fbidi \froman\fcharset204\fprq2 Times New Roman Cyr;}
|
||||
{\fbiminor\f31581\fbidi \froman\fcharset161\fprq2 Times New Roman Greek;}{\fbiminor\f31582\fbidi \froman\fcharset162\fprq2 Times New Roman Tur;}{\fbiminor\f31583\fbidi \froman\fcharset177\fprq2 Times New Roman (Hebrew);}
|
||||
{\fbiminor\f31584\fbidi \froman\fcharset178\fprq2 Times New Roman (Arabic);}{\fbiminor\f31585\fbidi \froman\fcharset186\fprq2 Times New Roman Baltic;}{\fbiminor\f31586\fbidi \froman\fcharset163\fprq2 Times New Roman (Vietnamese);}}
|
||||
{\colortbl;\red0\green0\blue0;\red0\green0\blue255;\red0\green255\blue255;\red0\green255\blue0;\red255\green0\blue255;\red255\green0\blue0;\red255\green255\blue0;\red255\green255\blue255;\red0\green0\blue128;\red0\green128\blue128;\red0\green128\blue0;
|
||||
\red128\green0\blue128;\red128\green0\blue0;\red128\green128\blue0;\red128\green128\blue128;\red192\green192\blue192;\red0\green0\blue0;\red0\green0\blue0;\caccentone\ctint255\cshade191\red15\green71\blue97;
|
||||
\ctextone\ctint166\cshade255\red89\green89\blue89;\ctextone\ctint216\cshade255\red39\green39\blue39;\ctextone\ctint191\cshade255\red64\green64\blue64;}{\*\defchp \f31506\fs24\kerning2 }{\*\defpap \ql \li0\ri0\sa160\sl278\slmult1
|
||||
\widctlpar\wrapdefault\aspalpha\aspnum\faauto\adjustright\rin0\lin0\itap0 }\noqfpromote {\stylesheet{\ql \li0\ri0\sa160\sl278\slmult1\widctlpar\wrapdefault\aspalpha\aspnum\faauto\adjustright\rin0\lin0\itap0 \rtlch\fcs1 \af31507\afs24\alang1025
|
||||
\ltrch\fcs0 \f31506\fs24\lang1033\langfe1033\kerning2\cgrid\langnp1033\langfenp1033 \snext0 \sqformat \spriority0 Normal;}{\s1\ql \li0\ri0\sb360\sa80\sl278\slmult1
|
||||
\keep\keepn\widctlpar\wrapdefault\aspalpha\aspnum\faauto\outlinelevel0\adjustright\rin0\lin0\itap0 \rtlch\fcs1 \af31503\afs40\alang1025 \ltrch\fcs0
|
||||
\fs40\cf19\lang1033\langfe1033\kerning2\loch\f31502\hich\af31502\dbch\af31501\cgrid\langnp1033\langfenp1033 \sbasedon0 \snext0 \slink15 \sqformat \spriority9 \styrsid15678446 heading 1;}{\s2\ql \li0\ri0\sb160\sa80\sl278\slmult1
|
||||
\keep\keepn\widctlpar\wrapdefault\aspalpha\aspnum\faauto\outlinelevel1\adjustright\rin0\lin0\itap0 \rtlch\fcs1 \af31503\afs32\alang1025 \ltrch\fcs0
|
||||
\fs32\cf19\lang1033\langfe1033\kerning2\loch\f31502\hich\af31502\dbch\af31501\cgrid\langnp1033\langfenp1033 \sbasedon0 \snext0 \slink16 \ssemihidden \sunhideused \sqformat \spriority9 \styrsid15678446 heading 2;}{\s3\ql \li0\ri0\sb160\sa80\sl278\slmult1
|
||||
\keep\keepn\widctlpar\wrapdefault\aspalpha\aspnum\faauto\outlinelevel2\adjustright\rin0\lin0\itap0 \rtlch\fcs1 \af31503\afs28\alang1025 \ltrch\fcs0
|
||||
\fs28\cf19\lang1033\langfe1033\kerning2\loch\f31506\hich\af31506\dbch\af31501\cgrid\langnp1033\langfenp1033 \sbasedon0 \snext0 \slink17 \ssemihidden \sunhideused \sqformat \spriority9 \styrsid15678446 heading 3;}{\s4\ql \li0\ri0\sb80\sa40\sl278\slmult1
|
||||
\keep\keepn\widctlpar\wrapdefault\aspalpha\aspnum\faauto\outlinelevel3\adjustright\rin0\lin0\itap0 \rtlch\fcs1 \ai\af31503\afs24\alang1025 \ltrch\fcs0
|
||||
\i\fs24\cf19\lang1033\langfe1033\kerning2\loch\f31506\hich\af31506\dbch\af31501\cgrid\langnp1033\langfenp1033 \sbasedon0 \snext0 \slink18 \ssemihidden \sunhideused \sqformat \spriority9 \styrsid15678446 heading 4;}{\s5\ql \li0\ri0\sb80\sa40\sl278\slmult1
|
||||
\keep\keepn\widctlpar\wrapdefault\aspalpha\aspnum\faauto\outlinelevel4\adjustright\rin0\lin0\itap0 \rtlch\fcs1 \af31503\afs24\alang1025 \ltrch\fcs0
|
||||
\fs24\cf19\lang1033\langfe1033\kerning2\loch\f31506\hich\af31506\dbch\af31501\cgrid\langnp1033\langfenp1033 \sbasedon0 \snext0 \slink19 \ssemihidden \sunhideused \sqformat \spriority9 \styrsid15678446 heading 5;}{\s6\ql \li0\ri0\sb40\sl278\slmult1
|
||||
\keep\keepn\widctlpar\wrapdefault\aspalpha\aspnum\faauto\outlinelevel5\adjustright\rin0\lin0\itap0 \rtlch\fcs1 \ai\af31503\afs24\alang1025 \ltrch\fcs0
|
||||
\i\fs24\cf20\lang1033\langfe1033\kerning2\loch\f31506\hich\af31506\dbch\af31501\cgrid\langnp1033\langfenp1033 \sbasedon0 \snext0 \slink20 \ssemihidden \sunhideused \sqformat \spriority9 \styrsid15678446 heading 6;}{\s7\ql \li0\ri0\sb40\sl278\slmult1
|
||||
\keep\keepn\widctlpar\wrapdefault\aspalpha\aspnum\faauto\outlinelevel6\adjustright\rin0\lin0\itap0 \rtlch\fcs1 \af31503\afs24\alang1025 \ltrch\fcs0
|
||||
\fs24\cf20\lang1033\langfe1033\kerning2\loch\f31506\hich\af31506\dbch\af31501\cgrid\langnp1033\langfenp1033 \sbasedon0 \snext0 \slink21 \ssemihidden \sunhideused \sqformat \spriority9 \styrsid15678446 heading 7;}{\s8\ql \li0\ri0\sl278\slmult1
|
||||
\keep\keepn\widctlpar\wrapdefault\aspalpha\aspnum\faauto\outlinelevel7\adjustright\rin0\lin0\itap0 \rtlch\fcs1 \ai\af31503\afs24\alang1025 \ltrch\fcs0
|
||||
\i\fs24\cf21\lang1033\langfe1033\kerning2\loch\f31506\hich\af31506\dbch\af31501\cgrid\langnp1033\langfenp1033 \sbasedon0 \snext0 \slink22 \ssemihidden \sunhideused \sqformat \spriority9 \styrsid15678446 heading 8;}{\s9\ql \li0\ri0\sl278\slmult1
|
||||
\keep\keepn\widctlpar\wrapdefault\aspalpha\aspnum\faauto\outlinelevel8\adjustright\rin0\lin0\itap0 \rtlch\fcs1 \af31503\afs24\alang1025 \ltrch\fcs0
|
||||
\fs24\cf21\lang1033\langfe1033\kerning2\loch\f31506\hich\af31506\dbch\af31501\cgrid\langnp1033\langfenp1033 \sbasedon0 \snext0 \slink23 \ssemihidden \sunhideused \sqformat \spriority9 \styrsid15678446 heading 9;}{\*\cs10 \additive
|
||||
\ssemihidden \sunhideused \spriority1 Default Paragraph Font;}{\*
|
||||
\ts11\tsrowd\trftsWidthB3\trpaddl108\trpaddr108\trpaddfl3\trpaddft3\trpaddfb3\trpaddfr3\trcbpat1\trcfpat1\tblind0\tblindtype3\tsvertalt\tsbrdrt\tsbrdrl\tsbrdrb\tsbrdrr\tsbrdrdgl\tsbrdrdgr\tsbrdrh\tsbrdrv \ql \li0\ri0\sa160\sl278\slmult1
|
||||
\widctlpar\wrapdefault\aspalpha\aspnum\faauto\adjustright\rin0\lin0\itap0 \rtlch\fcs1 \af31507\afs24\alang1025 \ltrch\fcs0 \f31506\fs24\lang1033\langfe1033\kerning2\cgrid\langnp1033\langfenp1033 \snext11 \ssemihidden \sunhideused Normal Table;}{\*\cs15
|
||||
\additive \rtlch\fcs1 \af31503\afs40 \ltrch\fcs0 \fs40\cf19\loch\f31502\hich\af31502\dbch\af31501 \sbasedon10 \slink1 \spriority9 \styrsid15678446 Heading 1 Char;}{\*\cs16 \additive \rtlch\fcs1 \af31503\afs32 \ltrch\fcs0
|
||||
\fs32\cf19\loch\f31502\hich\af31502\dbch\af31501 \sbasedon10 \slink2 \ssemihidden \spriority9 \styrsid15678446 Heading 2 Char;}{\*\cs17 \additive \rtlch\fcs1 \af31503\afs28 \ltrch\fcs0 \fs28\cf19\dbch\af31501
|
||||
\sbasedon10 \slink3 \ssemihidden \spriority9 \styrsid15678446 Heading 3 Char;}{\*\cs18 \additive \rtlch\fcs1 \ai\af31503 \ltrch\fcs0 \i\cf19\dbch\af31501 \sbasedon10 \slink4 \ssemihidden \spriority9 \styrsid15678446 Heading 4 Char;}{\*\cs19 \additive
|
||||
\rtlch\fcs1 \af31503 \ltrch\fcs0 \cf19\dbch\af31501 \sbasedon10 \slink5 \ssemihidden \spriority9 \styrsid15678446 Heading 5 Char;}{\*\cs20 \additive \rtlch\fcs1 \ai\af31503 \ltrch\fcs0 \i\cf20\dbch\af31501
|
||||
\sbasedon10 \slink6 \ssemihidden \spriority9 \styrsid15678446 Heading 6 Char;}{\*\cs21 \additive \rtlch\fcs1 \af31503 \ltrch\fcs0 \cf20\dbch\af31501 \sbasedon10 \slink7 \ssemihidden \spriority9 \styrsid15678446 Heading 7 Char;}{\*\cs22 \additive
|
||||
\rtlch\fcs1 \ai\af31503 \ltrch\fcs0 \i\cf21\dbch\af31501 \sbasedon10 \slink8 \ssemihidden \spriority9 \styrsid15678446 Heading 8 Char;}{\*\cs23 \additive \rtlch\fcs1 \af31503 \ltrch\fcs0 \cf21\dbch\af31501
|
||||
\sbasedon10 \slink9 \ssemihidden \spriority9 \styrsid15678446 Heading 9 Char;}{\s24\ql \li0\ri0\sa80\widctlpar\wrapdefault\aspalpha\aspnum\faauto\adjustright\rin0\lin0\itap0\contextualspace \rtlch\fcs1 \af31503\afs56\alang1025 \ltrch\fcs0
|
||||
\fs56\expnd-2\expndtw-10\lang1033\langfe1033\kerning28\loch\f31502\hich\af31502\dbch\af31501\cgrid\langnp1033\langfenp1033 \sbasedon0 \snext0 \slink25 \sqformat \spriority10 \styrsid15678446 Title;}{\*\cs25 \additive \rtlch\fcs1 \af31503\afs56
|
||||
\ltrch\fcs0 \fs56\expnd-2\expndtw-10\kerning28\loch\f31502\hich\af31502\dbch\af31501 \sbasedon10 \slink24 \spriority10 \styrsid15678446 Title Char;}{\s26\ql \li0\ri0\sa160\sl278\slmult1
|
||||
\widctlpar\wrapdefault\aspalpha\aspnum\faauto\ilvl1\adjustright\rin0\lin0\itap0 \rtlch\fcs1 \af31503\afs28\alang1025 \ltrch\fcs0 \fs28\expnd3\expndtw15\cf20\lang1033\langfe1033\kerning2\loch\f31506\hich\af31506\dbch\af31501\cgrid\langnp1033\langfenp1033
|
||||
\sbasedon0 \snext0 \slink27 \sqformat \spriority11 \styrsid15678446 Subtitle;}{\*\cs27 \additive \rtlch\fcs1 \af31503\afs28 \ltrch\fcs0 \fs28\expnd3\expndtw15\cf20\dbch\af31501 \sbasedon10 \slink26 \spriority11 \styrsid15678446 Subtitle Char;}{
|
||||
\s28\qc \li0\ri0\sb160\sa160\sl278\slmult1\widctlpar\wrapdefault\aspalpha\aspnum\faauto\adjustright\rin0\lin0\itap0 \rtlch\fcs1 \ai\af31507\afs24\alang1025 \ltrch\fcs0 \i\f31506\fs24\cf22\lang1033\langfe1033\kerning2\cgrid\langnp1033\langfenp1033
|
||||
\sbasedon0 \snext0 \slink29 \sqformat \spriority29 \styrsid15678446 Quote;}{\*\cs29 \additive \rtlch\fcs1 \ai\af0 \ltrch\fcs0 \i\cf22 \sbasedon10 \slink28 \spriority29 \styrsid15678446 Quote Char;}{\s30\ql \li720\ri0\sa160\sl278\slmult1
|
||||
\widctlpar\wrapdefault\aspalpha\aspnum\faauto\adjustright\rin0\lin720\itap0\contextualspace \rtlch\fcs1 \af31507\afs24\alang1025 \ltrch\fcs0 \f31506\fs24\lang1033\langfe1033\kerning2\cgrid\langnp1033\langfenp1033
|
||||
\sbasedon0 \snext30 \sqformat \spriority34 \styrsid15678446 List Paragraph;}{\*\cs31 \additive \rtlch\fcs1 \ai\af0 \ltrch\fcs0 \i\cf19 \sbasedon10 \sqformat \spriority21 \styrsid15678446 Intense Emphasis;}{\s32\qc \li864\ri864\sb360\sa360\sl278\slmult1
|
||||
\widctlpar\brdrt\brdrs\brdrw10\brsp200\brdrcf19 \brdrb\brdrs\brdrw10\brsp200\brdrcf19 \wrapdefault\aspalpha\aspnum\faauto\adjustright\rin864\lin864\itap0 \rtlch\fcs1 \ai\af31507\afs24\alang1025 \ltrch\fcs0
|
||||
\i\f31506\fs24\cf19\lang1033\langfe1033\kerning2\cgrid\langnp1033\langfenp1033 \sbasedon0 \snext0 \slink33 \sqformat \spriority30 \styrsid15678446 Intense Quote;}{\*\cs33 \additive \rtlch\fcs1 \ai\af0 \ltrch\fcs0 \i\cf19
|
||||
\sbasedon10 \slink32 \spriority30 \styrsid15678446 Intense Quote Char;}{\*\cs34 \additive \rtlch\fcs1 \ab\af0 \ltrch\fcs0 \b\scaps\expnd1\expndtw5\cf19 \sbasedon10 \sqformat \spriority32 \styrsid15678446 Intense Reference;}}{\*\rsidtbl \rsid3543682
|
||||
\rsid6316520\rsid7364952\rsid8278432\rsid9589131\rsid10298217\rsid15678446\rsid15953651}{\mmathPr\mmathFont34\mbrkBin0\mbrkBinSub0\msmallFrac0\mdispDef1\mlMargin0\mrMargin0\mdefJc1\mwrapIndent1440\mintLim0\mnaryLim1}{\info{\author Adam Fourney}
|
||||
{\operator Adam Fourney}{\creatim\yr2025\mo2\dy9\hr22\min56}{\revtim\yr2025\mo2\dy9\hr22\min58}{\version1}{\edmins2}{\nofpages1}{\nofwords17}{\nofchars98}{\nofcharsws114}{\vern115}}{\*\xmlnstbl {\xmlns1 http://schemas.microsoft.com/office/word/2003/wordm
|
||||
l}}\paperw12240\paperh15840\margl1440\margr1440\margt1440\margb1440\gutter0\ltrsect
|
||||
\widowctrl\ftnbj\aenddoc\trackmoves0\trackformatting1\donotembedsysfont1\relyonvml0\donotembedlingdata0\grfdocevents0\validatexml1\showplaceholdtext0\ignoremixedcontent0\saveinvalidxml0\showxmlerrors1\noxlattoyen
|
||||
\expshrtn\noultrlspc\dntblnsbdb\nospaceforul\formshade\horzdoc\dgmargin\dghspace180\dgvspace180\dghorigin1440\dgvorigin1440\dghshow1\dgvshow1
|
||||
\jexpand\viewkind1\viewscale100\pgbrdrhead\pgbrdrfoot\splytwnine\ftnlytwnine\htmautsp\nolnhtadjtbl\useltbaln\alntblind\lytcalctblwd\lyttblrtgr\lnbrkrule\nobrkwrptbl\snaptogridincell\allowfieldendsel\wrppunct
|
||||
\asianbrkrule\rsidroot15678446\newtblstyruls\nogrowautofit\usenormstyforlist\noindnmbrts\felnbrelev\nocxsptable\indrlsweleven\noafcnsttbl\afelev\utinl\hwelev\spltpgpar\notcvasp\notbrkcnstfrctbl\notvatxbx\krnprsnet\cachedcolbal \nouicompat \fet0
|
||||
{\*\wgrffmtfilter 2450}\nofeaturethrottle1\ilfomacatclnup0\ltrpar \sectd \ltrsect\linex0\endnhere\sectlinegrid360\sectdefaultcl\sftnbj {\*\pnseclvl1\pnucrm\pnstart1\pnindent720\pnhang {\pntxta .}}{\*\pnseclvl2\pnucltr\pnstart1\pnindent720\pnhang
|
||||
{\pntxta .}}{\*\pnseclvl3\pndec\pnstart1\pnindent720\pnhang {\pntxta .}}{\*\pnseclvl4\pnlcltr\pnstart1\pnindent720\pnhang {\pntxta )}}{\*\pnseclvl5\pndec\pnstart1\pnindent720\pnhang {\pntxtb (}{\pntxta )}}{\*\pnseclvl6\pnlcltr\pnstart1\pnindent720\pnhang
|
||||
{\pntxtb (}{\pntxta )}}{\*\pnseclvl7\pnlcrm\pnstart1\pnindent720\pnhang {\pntxtb (}{\pntxta )}}{\*\pnseclvl8\pnlcltr\pnstart1\pnindent720\pnhang {\pntxtb (}{\pntxta )}}{\*\pnseclvl9\pnlcrm\pnstart1\pnindent720\pnhang {\pntxtb (}{\pntxta )}}
|
||||
\pard\plain \ltrpar\s24\ql \li0\ri0\sa80\widctlpar\wrapdefault\aspalpha\aspnum\faauto\adjustright\rin0\lin0\itap0\pararsid15678446\contextualspace \rtlch\fcs1 \af31503\afs56\alang1025 \ltrch\fcs0
|
||||
\fs56\expnd-2\expndtw-10\lang1033\langfe1033\kerning28\loch\af31502\hich\af31502\dbch\af31501\cgrid\langnp1033\langfenp1033 {\rtlch\fcs1 \af31503 \ltrch\fcs0 \insrsid15678446 \hich\af31502\dbch\af31501\loch\f31502 This is a
|
||||
\hich\af31502\dbch\af31501\loch\f31502 S\hich\af31502\dbch\af31501\loch\f31502 ample RT\hich\af31502\dbch\af31501\loch\f31502 F \hich\af31502\dbch\af31501\loch\f31502 File}{\rtlch\fcs1 \af31503 \ltrch\fcs0 \insrsid8278432
|
||||
\par }\pard\plain \ltrpar\ql \li0\ri0\sa160\sl278\slmult1\widctlpar\wrapdefault\aspalpha\aspnum\faauto\adjustright\rin0\lin0\itap0 \rtlch\fcs1 \af31507\afs24\alang1025 \ltrch\fcs0 \f31506\fs24\lang1033\langfe1033\kerning2\cgrid\langnp1033\langfenp1033 {
|
||||
\rtlch\fcs1 \af31507 \ltrch\fcs0 \insrsid15678446
|
||||
\par It is included to test if the MarkItDown sample plugin can correctly convert RTF files.
|
||||
\par }{\*\themedata 504b030414000600080000002100e9de0fbfff0000001c020000130000005b436f6e74656e745f54797065735d2e786d6cac91cb4ec3301045f748fc83e52d4a
|
||||
9cb2400825e982c78ec7a27cc0c8992416c9d8b2a755fbf74cd25442a820166c2cd933f79e3be372bd1f07b5c3989ca74aaff2422b24eb1b475da5df374fd9ad
|
||||
5689811a183c61a50f98f4babebc2837878049899a52a57be670674cb23d8e90721f90a4d2fa3802cb35762680fd800ecd7551dc18eb899138e3c943d7e503b6
|
||||
b01d583deee5f99824e290b4ba3f364eac4a430883b3c092d4eca8f946c916422ecab927f52ea42b89a1cd59c254f919b0e85e6535d135a8de20f20b8c12c3b0
|
||||
0c895fcf6720192de6bf3b9e89ecdbd6596cbcdd8eb28e7c365ecc4ec1ff1460f53fe813d3cc7f5b7f020000ffff0300504b030414000600080000002100a5d6
|
||||
a7e7c0000000360100000b0000005f72656c732f2e72656c73848fcf6ac3300c87ef85bd83d17d51d2c31825762fa590432fa37d00e1287f68221bdb1bebdb4f
|
||||
c7060abb0884a4eff7a93dfeae8bf9e194e720169aaa06c3e2433fcb68e1763dbf7f82c985a4a725085b787086a37bdbb55fbc50d1a33ccd311ba548b6309512
|
||||
0f88d94fbc52ae4264d1c910d24a45db3462247fa791715fd71f989e19e0364cd3f51652d73760ae8fa8c9ffb3c330cc9e4fc17faf2ce545046e37944c69e462
|
||||
a1a82fe353bd90a865aad41ed0b5b8f9d6fd010000ffff0300504b0304140006000800000021006b799616830000008a0000001c0000007468656d652f746865
|
||||
6d652f7468656d654d616e616765722e786d6c0ccc4d0ac3201040e17da17790d93763bb284562b2cbaebbf600439c1a41c7a0d29fdbd7e5e38337cedf14d59b
|
||||
4b0d592c9c070d8a65cd2e88b7f07c2ca71ba8da481cc52c6ce1c715e6e97818c9b48d13df49c873517d23d59085adb5dd20d6b52bd521ef2cdd5eb9246a3d8b
|
||||
4757e8d3f729e245eb2b260a0238fd010000ffff0300504b030414000600080000002100d3d1e707f007000012220000160000007468656d652f7468656d652f
|
||||
7468656d65312e786d6cec5a4b8fdbc811be07c87f20789745ea414903cb0b3d3d6bcfd8034b76b0c796d812dbd36413ecd6cc080b0381f794cb020b6c825c02
|
||||
e496431064812c90452ef931066c249b1f91ea2645754b2dcf030662043373215b5f557f5d555d556cf2e1175731752e70c6094bbaaeffc0731d9ccc59489265
|
||||
d77d391d57daaec3054a42445982bbee1a73f78b47bffcc5437424221c6307e4137e84ba6e24447a54adf2390c23fe80a53881df162c8b9180db6c590d337409
|
||||
7a635aad795e508d11495c274131a87dbe58903976a652a5fb68a37c44e136115c0ecc693691aab121a1b0e1b92f117ccd0734732e10edba304fc82ea7f84ab8
|
||||
0e455cc00f5dd7537f6ef5d1c32a3a2a84a83820abc98dd55f21570884e7353567b69c95937aa35abbe197fa15808a7ddca82dff4b7d0a80e6735869ce45d7e9
|
||||
3703af5d2bb01a28bfb4e8eeb4fcba89d7f4d7f738fb9da05f6b18fa1528d7dfd8c37be3ce68d834f00a94e39b7bf89e57eb77ea065e81727cb0876f8c7aadda
|
||||
c8c02b50444972be8f0e5aed7650a04bc882d1632bbc13045e6b58c0b728888632bae4140b968843b116a3d72c1b03400229122471c43ac50b348728eea58271
|
||||
6748784ad1da755294300ec35ecdf721f41a5eadfc571647471869d2921730e17b43928fc3e7194945d77d025a5d0df2fea79fdebdfdf1dddbbfbffbe69b776f
|
||||
ffea9c906524725586dc314a96badccf7ffaee3f7ff8b5f3efbffdf1e7ef7f6bc7731dffe12fbff9f08f7f7e4c3d6cb5ad29deffee870f3ffef0fef7dffeebcf
|
||||
df5bb4f73234d3e1531263ee3cc397ce0b16c30295294cfe7896dd4e621a21a24bf49225470992b358f48f4464a09fad1145165c1f9b767c9541aab1011faf5e
|
||||
1b842751b612c4a2f169141bc053c6689f65562b3c957369669eae92a57df26ca5e35e2074619b7b8012c3cba3550a3996d8540e226cd03ca328116889132c1c
|
||||
f91b3bc7d8b2baaf0831ec7a4ae619e36c219caf88d347c46a92299919d1b4153a2631f8656d2308fe366c73facae9336a5bf5105f9848d81b885ac84f3135cc
|
||||
f818ad048a6d2aa728a6bac14f90886c2427eb6caee3465c80a7979832671462ce6d32cf3358afe6f4a708b29bd5eda7741d9bc84c90739bce13c4988e1cb2f3
|
||||
4184e2d4869d9024d2b15ff2730851e49c3161839f327387c87bf0034a0ebafb15c186bbafcf062f21cbe994b601227f5965165f3ec6cc88dfc99a2e10b6a59a
|
||||
5e161b29b697116b74f4574b23b44f30a6e81285183b2fbfb430e8b3d4b0f996f49308b2ca31b605d61364c6aabc4f30875e493637fb79f2847023642778c90e
|
||||
f0395def249e354a62941dd2fc0cbcaedb7c34cb60335a283ca7f3731df88c400f08f16235ca730e3ab4e03ea8f52c42460193f7dc1eafebccf0df4df618eccb
|
||||
d7068d1bec4b90c1b79681c4aecb7cd43653448d09b6013345c439b1a55b1031dcbf1591c55589adac720b73d36edd00dd91d1f4c424b9a603fadf743e9640fc
|
||||
343d8f5db191b06ed9ed1c4a28c73b3dce21dc6e67336059483effc6668856c919865ab29fb5eefb9afbbec6fdbfef6b0eede7fb6ee650cf71dfcdb8d065dc77
|
||||
33c501cba7e966b60d0cf436f290213fec51473ff1c1939f05a17422d6149f7075f8c3e199261cc3a09453a79eb83c094c23b894650e263070cb0c29192763e2
|
||||
5744449308a57042e4bb52c99217aa97dc4919878323356cd52df174159fb2303ff054274c5e5e593912db71af09474ff9381c56891c1db48a41c94f9daa025f
|
||||
c576a90e5b3704a4ec6d4868939924ea1612adcde03524e4d9d9a761d1b1b0684bf51b57ed9902a8955e81876e071ed5bb6eb32109c149399f43831e4a3fe5ae
|
||||
de785739f3537afa90318d0880c3c57c2570345f7aba23b91e5c9e5c5d1e6a37f0b4414239250f2b9384b28c6af078048fc24574cad19bd0b8adaf3b5b971af4
|
||||
a429d47c10df5b1aadf6c758dcd5d720b79b1b68a2670a9a38975d37a8372164e628edba0b383886cb3885d8e1f2b90bd125bc7d998b2cdff077c92c69c6c510
|
||||
f12837b84a3ab97b622270e65012775db9fcd20d3451394471f36b90103e5b721d482b9f1b3970bae964bc58e0b9d0ddae8d484be7b790e1f35c61fd5589df1d
|
||||
2c25d90adc3d89c24b674657d90b0421d66cf9d28021e1f0fec0cfad191278215626b26dfced14a622f9eb6fa4540ce5e388a6112a2a8a9ecc73b8aa27251d75
|
||||
57da40bb2bd60c06d54c5214c2d9521658dda846352d4b57cee160d5bd5e485a4e4b9adb9a6964155935ed59cc98615306766c79b722afb1da9818729a5ee1f3
|
||||
d4bd9b723b9b5cb7d3279455020c5edaef6ea55fa3b69dcca02619efa76199b38b51b3766c16780db59b14092deb071bb53b762b6b84753a18bc53e507b9dda8
|
||||
85a1c5a6af5496566fcef597db6cf61a92c710badc15cd5f77d304ee6454f2f42c53be9db1705d5c529e279adce7b22795489abcc00b8784579b7eb2746fbe3d
|
||||
f257ae7ed10c28b41493b5ab14b4367ba6608197a2f986bd8d7029a16686d6bb1456c78ab67e575c6d28cb561df0ca843c5f3598b6b0145ced5b118ec83304ad
|
||||
ed44357679ee05da57a2c82f70e5ac32d275bff69abdc6a0d61c54bc76735469d41b5ea5ddecd52bbd66b3ee8f9abe37ecd7de003d11c57e33fff4610c6f82e8
|
||||
baf800428def7d04116f5e763d98b3b8cad4470e55e57df511845f3bfc110438126805b571a7dee907954ebd37ae3486fd76a53308fa956130680dc7c341b3dd
|
||||
19bf719d0b056ef4ea8346306a57027f30a834024fd26f772aad46add66bb47aed51a3f7a6703fac3ccfc1852dc07c8ad7a3ff020000ffff0300504b03041400
|
||||
06000800000021000dd1909fb60000001b010000270000007468656d652f7468656d652f5f72656c732f7468656d654d616e616765722e786d6c2e72656c7384
|
||||
8f4d0ac2301484f78277086f6fd3ba109126dd88d0add40384e4350d363f2451eced0dae2c082e8761be9969bb979dc9136332de3168aa1a083ae995719ac16d
|
||||
b8ec8e4052164e89d93b64b060828e6f37ed1567914b284d262452282e3198720e274a939cd08a54f980ae38a38f56e422a3a641c8bbd048f7757da0f19b017c
|
||||
c524bd62107bd5001996509affb3fd381a89672f1f165dfe514173d9850528a2c6cce0239baa4c04ca5bbabac4df000000ffff0300504b01022d001400060008
|
||||
0000002100e9de0fbfff0000001c0200001300000000000000000000000000000000005b436f6e74656e745f54797065735d2e786d6c504b01022d0014000600
|
||||
080000002100a5d6a7e7c0000000360100000b00000000000000000000000000300100005f72656c732f2e72656c73504b01022d00140006000800000021006b
|
||||
799616830000008a0000001c00000000000000000000000000190200007468656d652f7468656d652f7468656d654d616e616765722e786d6c504b01022d0014
|
||||
000600080000002100d3d1e707f0070000122200001600000000000000000000000000d60200007468656d652f7468656d652f7468656d65312e786d6c504b01
|
||||
022d00140006000800000021000dd1909fb60000001b0100002700000000000000000000000000fa0a00007468656d652f7468656d652f5f72656c732f7468656d654d616e616765722e786d6c2e72656c73504b050600000000050005005d010000f50b00000000}
|
||||
{\*\colorschememapping 3c3f786d6c2076657273696f6e3d22312e302220656e636f64696e673d225554462d3822207374616e64616c6f6e653d22796573223f3e0d0a3c613a636c724d
|
||||
617020786d6c6e733a613d22687474703a2f2f736368656d61732e6f70656e786d6c666f726d6174732e6f72672f64726177696e676d6c2f323030362f6d6169
|
||||
6e22206267313d226c743122207478313d22646b3122206267323d226c743222207478323d22646b322220616363656e74313d22616363656e74312220616363
|
||||
656e74323d22616363656e74322220616363656e74333d22616363656e74332220616363656e74343d22616363656e74342220616363656e74353d22616363656e74352220616363656e74363d22616363656e74362220686c696e6b3d22686c696e6b2220666f6c486c696e6b3d22666f6c486c696e6b222f3e}
|
||||
{\*\latentstyles\lsdstimax376\lsdlockeddef0\lsdsemihiddendef0\lsdunhideuseddef0\lsdqformatdef0\lsdprioritydef99{\lsdlockedexcept \lsdqformat1 \lsdpriority0 \lsdlocked0 Normal;\lsdqformat1 \lsdpriority9 \lsdlocked0 heading 1;
|
||||
\lsdsemihidden1 \lsdunhideused1 \lsdqformat1 \lsdpriority9 \lsdlocked0 heading 2;\lsdsemihidden1 \lsdunhideused1 \lsdqformat1 \lsdpriority9 \lsdlocked0 heading 3;\lsdsemihidden1 \lsdunhideused1 \lsdqformat1 \lsdpriority9 \lsdlocked0 heading 4;
|
||||
\lsdsemihidden1 \lsdunhideused1 \lsdqformat1 \lsdpriority9 \lsdlocked0 heading 5;\lsdsemihidden1 \lsdunhideused1 \lsdqformat1 \lsdpriority9 \lsdlocked0 heading 6;\lsdsemihidden1 \lsdunhideused1 \lsdqformat1 \lsdpriority9 \lsdlocked0 heading 7;
|
||||
\lsdsemihidden1 \lsdunhideused1 \lsdqformat1 \lsdpriority9 \lsdlocked0 heading 8;\lsdsemihidden1 \lsdunhideused1 \lsdqformat1 \lsdpriority9 \lsdlocked0 heading 9;\lsdsemihidden1 \lsdunhideused1 \lsdlocked0 index 1;
|
||||
\lsdsemihidden1 \lsdunhideused1 \lsdlocked0 index 2;\lsdsemihidden1 \lsdunhideused1 \lsdlocked0 index 3;\lsdsemihidden1 \lsdunhideused1 \lsdlocked0 index 4;\lsdsemihidden1 \lsdunhideused1 \lsdlocked0 index 5;
|
||||
\lsdsemihidden1 \lsdunhideused1 \lsdlocked0 index 6;\lsdsemihidden1 \lsdunhideused1 \lsdlocked0 index 7;\lsdsemihidden1 \lsdunhideused1 \lsdlocked0 index 8;\lsdsemihidden1 \lsdunhideused1 \lsdlocked0 index 9;
|
||||
\lsdsemihidden1 \lsdunhideused1 \lsdpriority39 \lsdlocked0 toc 1;\lsdsemihidden1 \lsdunhideused1 \lsdpriority39 \lsdlocked0 toc 2;\lsdsemihidden1 \lsdunhideused1 \lsdpriority39 \lsdlocked0 toc 3;
|
||||
\lsdsemihidden1 \lsdunhideused1 \lsdpriority39 \lsdlocked0 toc 4;\lsdsemihidden1 \lsdunhideused1 \lsdpriority39 \lsdlocked0 toc 5;\lsdsemihidden1 \lsdunhideused1 \lsdpriority39 \lsdlocked0 toc 6;
|
||||
\lsdsemihidden1 \lsdunhideused1 \lsdpriority39 \lsdlocked0 toc 7;\lsdsemihidden1 \lsdunhideused1 \lsdpriority39 \lsdlocked0 toc 8;\lsdsemihidden1 \lsdunhideused1 \lsdpriority39 \lsdlocked0 toc 9;\lsdsemihidden1 \lsdunhideused1 \lsdlocked0 Normal Indent;
|
||||
\lsdsemihidden1 \lsdunhideused1 \lsdlocked0 footnote text;\lsdsemihidden1 \lsdunhideused1 \lsdlocked0 annotation text;\lsdsemihidden1 \lsdunhideused1 \lsdlocked0 header;\lsdsemihidden1 \lsdunhideused1 \lsdlocked0 footer;
|
||||
\lsdsemihidden1 \lsdunhideused1 \lsdlocked0 index heading;\lsdsemihidden1 \lsdunhideused1 \lsdqformat1 \lsdpriority35 \lsdlocked0 caption;\lsdsemihidden1 \lsdunhideused1 \lsdlocked0 table of figures;
|
||||
\lsdsemihidden1 \lsdunhideused1 \lsdlocked0 envelope address;\lsdsemihidden1 \lsdunhideused1 \lsdlocked0 envelope return;\lsdsemihidden1 \lsdunhideused1 \lsdlocked0 footnote reference;\lsdsemihidden1 \lsdunhideused1 \lsdlocked0 annotation reference;
|
||||
\lsdsemihidden1 \lsdunhideused1 \lsdlocked0 line number;\lsdsemihidden1 \lsdunhideused1 \lsdlocked0 page number;\lsdsemihidden1 \lsdunhideused1 \lsdlocked0 endnote reference;\lsdsemihidden1 \lsdunhideused1 \lsdlocked0 endnote text;
|
||||
\lsdsemihidden1 \lsdunhideused1 \lsdlocked0 table of authorities;\lsdsemihidden1 \lsdunhideused1 \lsdlocked0 macro;\lsdsemihidden1 \lsdunhideused1 \lsdlocked0 toa heading;\lsdsemihidden1 \lsdunhideused1 \lsdlocked0 List;
|
||||
\lsdsemihidden1 \lsdunhideused1 \lsdlocked0 List Bullet;\lsdsemihidden1 \lsdunhideused1 \lsdlocked0 List Number;\lsdsemihidden1 \lsdunhideused1 \lsdlocked0 List 2;\lsdsemihidden1 \lsdunhideused1 \lsdlocked0 List 3;
|
||||
\lsdsemihidden1 \lsdunhideused1 \lsdlocked0 List 4;\lsdsemihidden1 \lsdunhideused1 \lsdlocked0 List 5;\lsdsemihidden1 \lsdunhideused1 \lsdlocked0 List Bullet 2;\lsdsemihidden1 \lsdunhideused1 \lsdlocked0 List Bullet 3;
|
||||
\lsdsemihidden1 \lsdunhideused1 \lsdlocked0 List Bullet 4;\lsdsemihidden1 \lsdunhideused1 \lsdlocked0 List Bullet 5;\lsdsemihidden1 \lsdunhideused1 \lsdlocked0 List Number 2;\lsdsemihidden1 \lsdunhideused1 \lsdlocked0 List Number 3;
|
||||
\lsdsemihidden1 \lsdunhideused1 \lsdlocked0 List Number 4;\lsdsemihidden1 \lsdunhideused1 \lsdlocked0 List Number 5;\lsdqformat1 \lsdpriority10 \lsdlocked0 Title;\lsdsemihidden1 \lsdunhideused1 \lsdlocked0 Closing;
|
||||
\lsdsemihidden1 \lsdunhideused1 \lsdlocked0 Signature;\lsdsemihidden1 \lsdunhideused1 \lsdpriority1 \lsdlocked0 Default Paragraph Font;\lsdsemihidden1 \lsdunhideused1 \lsdlocked0 Body Text;\lsdsemihidden1 \lsdunhideused1 \lsdlocked0 Body Text Indent;
|
||||
\lsdsemihidden1 \lsdunhideused1 \lsdlocked0 List Continue;\lsdsemihidden1 \lsdunhideused1 \lsdlocked0 List Continue 2;\lsdsemihidden1 \lsdunhideused1 \lsdlocked0 List Continue 3;\lsdsemihidden1 \lsdunhideused1 \lsdlocked0 List Continue 4;
|
||||
\lsdsemihidden1 \lsdunhideused1 \lsdlocked0 List Continue 5;\lsdsemihidden1 \lsdunhideused1 \lsdlocked0 Message Header;\lsdqformat1 \lsdpriority11 \lsdlocked0 Subtitle;\lsdsemihidden1 \lsdunhideused1 \lsdlocked0 Salutation;
|
||||
\lsdsemihidden1 \lsdunhideused1 \lsdlocked0 Date;\lsdsemihidden1 \lsdunhideused1 \lsdlocked0 Body Text First Indent;\lsdsemihidden1 \lsdunhideused1 \lsdlocked0 Body Text First Indent 2;\lsdsemihidden1 \lsdunhideused1 \lsdlocked0 Note Heading;
|
||||
\lsdsemihidden1 \lsdunhideused1 \lsdlocked0 Body Text 2;\lsdsemihidden1 \lsdunhideused1 \lsdlocked0 Body Text 3;\lsdsemihidden1 \lsdunhideused1 \lsdlocked0 Body Text Indent 2;\lsdsemihidden1 \lsdunhideused1 \lsdlocked0 Body Text Indent 3;
|
||||
\lsdsemihidden1 \lsdunhideused1 \lsdlocked0 Block Text;\lsdsemihidden1 \lsdunhideused1 \lsdlocked0 Hyperlink;\lsdsemihidden1 \lsdunhideused1 \lsdlocked0 FollowedHyperlink;\lsdqformat1 \lsdpriority22 \lsdlocked0 Strong;
|
||||
\lsdqformat1 \lsdpriority20 \lsdlocked0 Emphasis;\lsdsemihidden1 \lsdunhideused1 \lsdlocked0 Document Map;\lsdsemihidden1 \lsdunhideused1 \lsdlocked0 Plain Text;\lsdsemihidden1 \lsdunhideused1 \lsdlocked0 E-mail Signature;
|
||||
\lsdsemihidden1 \lsdunhideused1 \lsdlocked0 HTML Top of Form;\lsdsemihidden1 \lsdunhideused1 \lsdlocked0 HTML Bottom of Form;\lsdsemihidden1 \lsdunhideused1 \lsdlocked0 Normal (Web);\lsdsemihidden1 \lsdunhideused1 \lsdlocked0 HTML Acronym;
|
||||
\lsdsemihidden1 \lsdunhideused1 \lsdlocked0 HTML Address;\lsdsemihidden1 \lsdunhideused1 \lsdlocked0 HTML Cite;\lsdsemihidden1 \lsdunhideused1 \lsdlocked0 HTML Code;\lsdsemihidden1 \lsdunhideused1 \lsdlocked0 HTML Definition;
|
||||
\lsdsemihidden1 \lsdunhideused1 \lsdlocked0 HTML Keyboard;\lsdsemihidden1 \lsdunhideused1 \lsdlocked0 HTML Preformatted;\lsdsemihidden1 \lsdunhideused1 \lsdlocked0 HTML Sample;\lsdsemihidden1 \lsdunhideused1 \lsdlocked0 HTML Typewriter;
|
||||
\lsdsemihidden1 \lsdunhideused1 \lsdlocked0 HTML Variable;\lsdsemihidden1 \lsdunhideused1 \lsdlocked0 Normal Table;\lsdsemihidden1 \lsdunhideused1 \lsdlocked0 annotation subject;\lsdsemihidden1 \lsdunhideused1 \lsdlocked0 No List;
|
||||
\lsdsemihidden1 \lsdunhideused1 \lsdlocked0 Outline List 1;\lsdsemihidden1 \lsdunhideused1 \lsdlocked0 Outline List 2;\lsdsemihidden1 \lsdunhideused1 \lsdlocked0 Outline List 3;\lsdsemihidden1 \lsdunhideused1 \lsdlocked0 Table Simple 1;
|
||||
\lsdsemihidden1 \lsdunhideused1 \lsdlocked0 Table Simple 2;\lsdsemihidden1 \lsdunhideused1 \lsdlocked0 Table Simple 3;\lsdsemihidden1 \lsdunhideused1 \lsdlocked0 Table Classic 1;\lsdsemihidden1 \lsdunhideused1 \lsdlocked0 Table Classic 2;
|
||||
\lsdsemihidden1 \lsdunhideused1 \lsdlocked0 Table Classic 3;\lsdsemihidden1 \lsdunhideused1 \lsdlocked0 Table Classic 4;\lsdsemihidden1 \lsdunhideused1 \lsdlocked0 Table Colorful 1;\lsdsemihidden1 \lsdunhideused1 \lsdlocked0 Table Colorful 2;
|
||||
\lsdsemihidden1 \lsdunhideused1 \lsdlocked0 Table Colorful 3;\lsdsemihidden1 \lsdunhideused1 \lsdlocked0 Table Columns 1;\lsdsemihidden1 \lsdunhideused1 \lsdlocked0 Table Columns 2;\lsdsemihidden1 \lsdunhideused1 \lsdlocked0 Table Columns 3;
|
||||
\lsdsemihidden1 \lsdunhideused1 \lsdlocked0 Table Columns 4;\lsdsemihidden1 \lsdunhideused1 \lsdlocked0 Table Columns 5;\lsdsemihidden1 \lsdunhideused1 \lsdlocked0 Table Grid 1;\lsdsemihidden1 \lsdunhideused1 \lsdlocked0 Table Grid 2;
|
||||
\lsdsemihidden1 \lsdunhideused1 \lsdlocked0 Table Grid 3;\lsdsemihidden1 \lsdunhideused1 \lsdlocked0 Table Grid 4;\lsdsemihidden1 \lsdunhideused1 \lsdlocked0 Table Grid 5;\lsdsemihidden1 \lsdunhideused1 \lsdlocked0 Table Grid 6;
|
||||
\lsdsemihidden1 \lsdunhideused1 \lsdlocked0 Table Grid 7;\lsdsemihidden1 \lsdunhideused1 \lsdlocked0 Table Grid 8;\lsdsemihidden1 \lsdunhideused1 \lsdlocked0 Table List 1;\lsdsemihidden1 \lsdunhideused1 \lsdlocked0 Table List 2;
|
||||
\lsdsemihidden1 \lsdunhideused1 \lsdlocked0 Table List 3;\lsdsemihidden1 \lsdunhideused1 \lsdlocked0 Table List 4;\lsdsemihidden1 \lsdunhideused1 \lsdlocked0 Table List 5;\lsdsemihidden1 \lsdunhideused1 \lsdlocked0 Table List 6;
|
||||
\lsdsemihidden1 \lsdunhideused1 \lsdlocked0 Table List 7;\lsdsemihidden1 \lsdunhideused1 \lsdlocked0 Table List 8;\lsdsemihidden1 \lsdunhideused1 \lsdlocked0 Table 3D effects 1;\lsdsemihidden1 \lsdunhideused1 \lsdlocked0 Table 3D effects 2;
|
||||
\lsdsemihidden1 \lsdunhideused1 \lsdlocked0 Table 3D effects 3;\lsdsemihidden1 \lsdunhideused1 \lsdlocked0 Table Contemporary;\lsdsemihidden1 \lsdunhideused1 \lsdlocked0 Table Elegant;\lsdsemihidden1 \lsdunhideused1 \lsdlocked0 Table Professional;
|
||||
\lsdsemihidden1 \lsdunhideused1 \lsdlocked0 Table Subtle 1;\lsdsemihidden1 \lsdunhideused1 \lsdlocked0 Table Subtle 2;\lsdsemihidden1 \lsdunhideused1 \lsdlocked0 Table Web 1;\lsdsemihidden1 \lsdunhideused1 \lsdlocked0 Table Web 2;
|
||||
\lsdsemihidden1 \lsdunhideused1 \lsdlocked0 Table Web 3;\lsdsemihidden1 \lsdunhideused1 \lsdlocked0 Balloon Text;\lsdpriority39 \lsdlocked0 Table Grid;\lsdsemihidden1 \lsdunhideused1 \lsdlocked0 Table Theme;\lsdsemihidden1 \lsdlocked0 Placeholder Text;
|
||||
\lsdqformat1 \lsdpriority1 \lsdlocked0 No Spacing;\lsdpriority60 \lsdlocked0 Light Shading;\lsdpriority61 \lsdlocked0 Light List;\lsdpriority62 \lsdlocked0 Light Grid;\lsdpriority63 \lsdlocked0 Medium Shading 1;\lsdpriority64 \lsdlocked0 Medium Shading 2;
|
||||
\lsdpriority65 \lsdlocked0 Medium List 1;\lsdpriority66 \lsdlocked0 Medium List 2;\lsdpriority67 \lsdlocked0 Medium Grid 1;\lsdpriority68 \lsdlocked0 Medium Grid 2;\lsdpriority69 \lsdlocked0 Medium Grid 3;\lsdpriority70 \lsdlocked0 Dark List;
|
||||
\lsdpriority71 \lsdlocked0 Colorful Shading;\lsdpriority72 \lsdlocked0 Colorful List;\lsdpriority73 \lsdlocked0 Colorful Grid;\lsdpriority60 \lsdlocked0 Light Shading Accent 1;\lsdpriority61 \lsdlocked0 Light List Accent 1;
|
||||
\lsdpriority62 \lsdlocked0 Light Grid Accent 1;\lsdpriority63 \lsdlocked0 Medium Shading 1 Accent 1;\lsdpriority64 \lsdlocked0 Medium Shading 2 Accent 1;\lsdpriority65 \lsdlocked0 Medium List 1 Accent 1;\lsdsemihidden1 \lsdlocked0 Revision;
|
||||
\lsdqformat1 \lsdpriority34 \lsdlocked0 List Paragraph;\lsdqformat1 \lsdpriority29 \lsdlocked0 Quote;\lsdqformat1 \lsdpriority30 \lsdlocked0 Intense Quote;\lsdpriority66 \lsdlocked0 Medium List 2 Accent 1;\lsdpriority67 \lsdlocked0 Medium Grid 1 Accent 1;
|
||||
\lsdpriority68 \lsdlocked0 Medium Grid 2 Accent 1;\lsdpriority69 \lsdlocked0 Medium Grid 3 Accent 1;\lsdpriority70 \lsdlocked0 Dark List Accent 1;\lsdpriority71 \lsdlocked0 Colorful Shading Accent 1;\lsdpriority72 \lsdlocked0 Colorful List Accent 1;
|
||||
\lsdpriority73 \lsdlocked0 Colorful Grid Accent 1;\lsdpriority60 \lsdlocked0 Light Shading Accent 2;\lsdpriority61 \lsdlocked0 Light List Accent 2;\lsdpriority62 \lsdlocked0 Light Grid Accent 2;\lsdpriority63 \lsdlocked0 Medium Shading 1 Accent 2;
|
||||
\lsdpriority64 \lsdlocked0 Medium Shading 2 Accent 2;\lsdpriority65 \lsdlocked0 Medium List 1 Accent 2;\lsdpriority66 \lsdlocked0 Medium List 2 Accent 2;\lsdpriority67 \lsdlocked0 Medium Grid 1 Accent 2;\lsdpriority68 \lsdlocked0 Medium Grid 2 Accent 2;
|
||||
\lsdpriority69 \lsdlocked0 Medium Grid 3 Accent 2;\lsdpriority70 \lsdlocked0 Dark List Accent 2;\lsdpriority71 \lsdlocked0 Colorful Shading Accent 2;\lsdpriority72 \lsdlocked0 Colorful List Accent 2;\lsdpriority73 \lsdlocked0 Colorful Grid Accent 2;
|
||||
\lsdpriority60 \lsdlocked0 Light Shading Accent 3;\lsdpriority61 \lsdlocked0 Light List Accent 3;\lsdpriority62 \lsdlocked0 Light Grid Accent 3;\lsdpriority63 \lsdlocked0 Medium Shading 1 Accent 3;\lsdpriority64 \lsdlocked0 Medium Shading 2 Accent 3;
|
||||
\lsdpriority65 \lsdlocked0 Medium List 1 Accent 3;\lsdpriority66 \lsdlocked0 Medium List 2 Accent 3;\lsdpriority67 \lsdlocked0 Medium Grid 1 Accent 3;\lsdpriority68 \lsdlocked0 Medium Grid 2 Accent 3;\lsdpriority69 \lsdlocked0 Medium Grid 3 Accent 3;
|
||||
\lsdpriority70 \lsdlocked0 Dark List Accent 3;\lsdpriority71 \lsdlocked0 Colorful Shading Accent 3;\lsdpriority72 \lsdlocked0 Colorful List Accent 3;\lsdpriority73 \lsdlocked0 Colorful Grid Accent 3;\lsdpriority60 \lsdlocked0 Light Shading Accent 4;
|
||||
\lsdpriority61 \lsdlocked0 Light List Accent 4;\lsdpriority62 \lsdlocked0 Light Grid Accent 4;\lsdpriority63 \lsdlocked0 Medium Shading 1 Accent 4;\lsdpriority64 \lsdlocked0 Medium Shading 2 Accent 4;\lsdpriority65 \lsdlocked0 Medium List 1 Accent 4;
|
||||
\lsdpriority66 \lsdlocked0 Medium List 2 Accent 4;\lsdpriority67 \lsdlocked0 Medium Grid 1 Accent 4;\lsdpriority68 \lsdlocked0 Medium Grid 2 Accent 4;\lsdpriority69 \lsdlocked0 Medium Grid 3 Accent 4;\lsdpriority70 \lsdlocked0 Dark List Accent 4;
|
||||
\lsdpriority71 \lsdlocked0 Colorful Shading Accent 4;\lsdpriority72 \lsdlocked0 Colorful List Accent 4;\lsdpriority73 \lsdlocked0 Colorful Grid Accent 4;\lsdpriority60 \lsdlocked0 Light Shading Accent 5;\lsdpriority61 \lsdlocked0 Light List Accent 5;
|
||||
\lsdpriority62 \lsdlocked0 Light Grid Accent 5;\lsdpriority63 \lsdlocked0 Medium Shading 1 Accent 5;\lsdpriority64 \lsdlocked0 Medium Shading 2 Accent 5;\lsdpriority65 \lsdlocked0 Medium List 1 Accent 5;\lsdpriority66 \lsdlocked0 Medium List 2 Accent 5;
|
||||
\lsdpriority67 \lsdlocked0 Medium Grid 1 Accent 5;\lsdpriority68 \lsdlocked0 Medium Grid 2 Accent 5;\lsdpriority69 \lsdlocked0 Medium Grid 3 Accent 5;\lsdpriority70 \lsdlocked0 Dark List Accent 5;\lsdpriority71 \lsdlocked0 Colorful Shading Accent 5;
|
||||
\lsdpriority72 \lsdlocked0 Colorful List Accent 5;\lsdpriority73 \lsdlocked0 Colorful Grid Accent 5;\lsdpriority60 \lsdlocked0 Light Shading Accent 6;\lsdpriority61 \lsdlocked0 Light List Accent 6;\lsdpriority62 \lsdlocked0 Light Grid Accent 6;
|
||||
\lsdpriority63 \lsdlocked0 Medium Shading 1 Accent 6;\lsdpriority64 \lsdlocked0 Medium Shading 2 Accent 6;\lsdpriority65 \lsdlocked0 Medium List 1 Accent 6;\lsdpriority66 \lsdlocked0 Medium List 2 Accent 6;
|
||||
\lsdpriority67 \lsdlocked0 Medium Grid 1 Accent 6;\lsdpriority68 \lsdlocked0 Medium Grid 2 Accent 6;\lsdpriority69 \lsdlocked0 Medium Grid 3 Accent 6;\lsdpriority70 \lsdlocked0 Dark List Accent 6;\lsdpriority71 \lsdlocked0 Colorful Shading Accent 6;
|
||||
\lsdpriority72 \lsdlocked0 Colorful List Accent 6;\lsdpriority73 \lsdlocked0 Colorful Grid Accent 6;\lsdqformat1 \lsdpriority19 \lsdlocked0 Subtle Emphasis;\lsdqformat1 \lsdpriority21 \lsdlocked0 Intense Emphasis;
|
||||
\lsdqformat1 \lsdpriority31 \lsdlocked0 Subtle Reference;\lsdqformat1 \lsdpriority32 \lsdlocked0 Intense Reference;\lsdqformat1 \lsdpriority33 \lsdlocked0 Book Title;\lsdsemihidden1 \lsdunhideused1 \lsdpriority37 \lsdlocked0 Bibliography;
|
||||
\lsdsemihidden1 \lsdunhideused1 \lsdqformat1 \lsdpriority39 \lsdlocked0 TOC Heading;\lsdpriority41 \lsdlocked0 Plain Table 1;\lsdpriority42 \lsdlocked0 Plain Table 2;\lsdpriority43 \lsdlocked0 Plain Table 3;\lsdpriority44 \lsdlocked0 Plain Table 4;
|
||||
\lsdpriority45 \lsdlocked0 Plain Table 5;\lsdpriority40 \lsdlocked0 Grid Table Light;\lsdpriority46 \lsdlocked0 Grid Table 1 Light;\lsdpriority47 \lsdlocked0 Grid Table 2;\lsdpriority48 \lsdlocked0 Grid Table 3;\lsdpriority49 \lsdlocked0 Grid Table 4;
|
||||
\lsdpriority50 \lsdlocked0 Grid Table 5 Dark;\lsdpriority51 \lsdlocked0 Grid Table 6 Colorful;\lsdpriority52 \lsdlocked0 Grid Table 7 Colorful;\lsdpriority46 \lsdlocked0 Grid Table 1 Light Accent 1;\lsdpriority47 \lsdlocked0 Grid Table 2 Accent 1;
|
||||
\lsdpriority48 \lsdlocked0 Grid Table 3 Accent 1;\lsdpriority49 \lsdlocked0 Grid Table 4 Accent 1;\lsdpriority50 \lsdlocked0 Grid Table 5 Dark Accent 1;\lsdpriority51 \lsdlocked0 Grid Table 6 Colorful Accent 1;
|
||||
\lsdpriority52 \lsdlocked0 Grid Table 7 Colorful Accent 1;\lsdpriority46 \lsdlocked0 Grid Table 1 Light Accent 2;\lsdpriority47 \lsdlocked0 Grid Table 2 Accent 2;\lsdpriority48 \lsdlocked0 Grid Table 3 Accent 2;
|
||||
\lsdpriority49 \lsdlocked0 Grid Table 4 Accent 2;\lsdpriority50 \lsdlocked0 Grid Table 5 Dark Accent 2;\lsdpriority51 \lsdlocked0 Grid Table 6 Colorful Accent 2;\lsdpriority52 \lsdlocked0 Grid Table 7 Colorful Accent 2;
|
||||
\lsdpriority46 \lsdlocked0 Grid Table 1 Light Accent 3;\lsdpriority47 \lsdlocked0 Grid Table 2 Accent 3;\lsdpriority48 \lsdlocked0 Grid Table 3 Accent 3;\lsdpriority49 \lsdlocked0 Grid Table 4 Accent 3;
|
||||
\lsdpriority50 \lsdlocked0 Grid Table 5 Dark Accent 3;\lsdpriority51 \lsdlocked0 Grid Table 6 Colorful Accent 3;\lsdpriority52 \lsdlocked0 Grid Table 7 Colorful Accent 3;\lsdpriority46 \lsdlocked0 Grid Table 1 Light Accent 4;
|
||||
\lsdpriority47 \lsdlocked0 Grid Table 2 Accent 4;\lsdpriority48 \lsdlocked0 Grid Table 3 Accent 4;\lsdpriority49 \lsdlocked0 Grid Table 4 Accent 4;\lsdpriority50 \lsdlocked0 Grid Table 5 Dark Accent 4;
|
||||
\lsdpriority51 \lsdlocked0 Grid Table 6 Colorful Accent 4;\lsdpriority52 \lsdlocked0 Grid Table 7 Colorful Accent 4;\lsdpriority46 \lsdlocked0 Grid Table 1 Light Accent 5;\lsdpriority47 \lsdlocked0 Grid Table 2 Accent 5;
|
||||
\lsdpriority48 \lsdlocked0 Grid Table 3 Accent 5;\lsdpriority49 \lsdlocked0 Grid Table 4 Accent 5;\lsdpriority50 \lsdlocked0 Grid Table 5 Dark Accent 5;\lsdpriority51 \lsdlocked0 Grid Table 6 Colorful Accent 5;
|
||||
\lsdpriority52 \lsdlocked0 Grid Table 7 Colorful Accent 5;\lsdpriority46 \lsdlocked0 Grid Table 1 Light Accent 6;\lsdpriority47 \lsdlocked0 Grid Table 2 Accent 6;\lsdpriority48 \lsdlocked0 Grid Table 3 Accent 6;
|
||||
\lsdpriority49 \lsdlocked0 Grid Table 4 Accent 6;\lsdpriority50 \lsdlocked0 Grid Table 5 Dark Accent 6;\lsdpriority51 \lsdlocked0 Grid Table 6 Colorful Accent 6;\lsdpriority52 \lsdlocked0 Grid Table 7 Colorful Accent 6;
|
||||
\lsdpriority46 \lsdlocked0 List Table 1 Light;\lsdpriority47 \lsdlocked0 List Table 2;\lsdpriority48 \lsdlocked0 List Table 3;\lsdpriority49 \lsdlocked0 List Table 4;\lsdpriority50 \lsdlocked0 List Table 5 Dark;
|
||||
\lsdpriority51 \lsdlocked0 List Table 6 Colorful;\lsdpriority52 \lsdlocked0 List Table 7 Colorful;\lsdpriority46 \lsdlocked0 List Table 1 Light Accent 1;\lsdpriority47 \lsdlocked0 List Table 2 Accent 1;\lsdpriority48 \lsdlocked0 List Table 3 Accent 1;
|
||||
\lsdpriority49 \lsdlocked0 List Table 4 Accent 1;\lsdpriority50 \lsdlocked0 List Table 5 Dark Accent 1;\lsdpriority51 \lsdlocked0 List Table 6 Colorful Accent 1;\lsdpriority52 \lsdlocked0 List Table 7 Colorful Accent 1;
|
||||
\lsdpriority46 \lsdlocked0 List Table 1 Light Accent 2;\lsdpriority47 \lsdlocked0 List Table 2 Accent 2;\lsdpriority48 \lsdlocked0 List Table 3 Accent 2;\lsdpriority49 \lsdlocked0 List Table 4 Accent 2;
|
||||
\lsdpriority50 \lsdlocked0 List Table 5 Dark Accent 2;\lsdpriority51 \lsdlocked0 List Table 6 Colorful Accent 2;\lsdpriority52 \lsdlocked0 List Table 7 Colorful Accent 2;\lsdpriority46 \lsdlocked0 List Table 1 Light Accent 3;
|
||||
\lsdpriority47 \lsdlocked0 List Table 2 Accent 3;\lsdpriority48 \lsdlocked0 List Table 3 Accent 3;\lsdpriority49 \lsdlocked0 List Table 4 Accent 3;\lsdpriority50 \lsdlocked0 List Table 5 Dark Accent 3;
|
||||
\lsdpriority51 \lsdlocked0 List Table 6 Colorful Accent 3;\lsdpriority52 \lsdlocked0 List Table 7 Colorful Accent 3;\lsdpriority46 \lsdlocked0 List Table 1 Light Accent 4;\lsdpriority47 \lsdlocked0 List Table 2 Accent 4;
|
||||
\lsdpriority48 \lsdlocked0 List Table 3 Accent 4;\lsdpriority49 \lsdlocked0 List Table 4 Accent 4;\lsdpriority50 \lsdlocked0 List Table 5 Dark Accent 4;\lsdpriority51 \lsdlocked0 List Table 6 Colorful Accent 4;
|
||||
\lsdpriority52 \lsdlocked0 List Table 7 Colorful Accent 4;\lsdpriority46 \lsdlocked0 List Table 1 Light Accent 5;\lsdpriority47 \lsdlocked0 List Table 2 Accent 5;\lsdpriority48 \lsdlocked0 List Table 3 Accent 5;
|
||||
\lsdpriority49 \lsdlocked0 List Table 4 Accent 5;\lsdpriority50 \lsdlocked0 List Table 5 Dark Accent 5;\lsdpriority51 \lsdlocked0 List Table 6 Colorful Accent 5;\lsdpriority52 \lsdlocked0 List Table 7 Colorful Accent 5;
|
||||
\lsdpriority46 \lsdlocked0 List Table 1 Light Accent 6;\lsdpriority47 \lsdlocked0 List Table 2 Accent 6;\lsdpriority48 \lsdlocked0 List Table 3 Accent 6;\lsdpriority49 \lsdlocked0 List Table 4 Accent 6;
|
||||
\lsdpriority50 \lsdlocked0 List Table 5 Dark Accent 6;\lsdpriority51 \lsdlocked0 List Table 6 Colorful Accent 6;\lsdpriority52 \lsdlocked0 List Table 7 Colorful Accent 6;\lsdsemihidden1 \lsdunhideused1 \lsdlocked0 Mention;
|
||||
\lsdsemihidden1 \lsdunhideused1 \lsdlocked0 Smart Hyperlink;\lsdsemihidden1 \lsdunhideused1 \lsdlocked0 Hashtag;\lsdsemihidden1 \lsdunhideused1 \lsdlocked0 Unresolved Mention;\lsdsemihidden1 \lsdunhideused1 \lsdlocked0 Smart Link;}}{\*\datastore 01050000
|
||||
02000000180000004d73786d6c322e534158584d4c5265616465722e362e3000000000000000000000060000
|
||||
d0cf11e0a1b11ae1000000000000000000000000000000003e000300feff090006000000000000000000000001000000010000000000000000100000feffffff00000000feffffff0000000000000000ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff
|
||||
ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff
|
||||
ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff
|
||||
ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff
|
||||
fffffffffffffffffdfffffffeffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff
|
||||
ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff
|
||||
ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff
|
||||
ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff
|
||||
ffffffffffffffffffffffffffffffff52006f006f007400200045006e00740072007900000000000000000000000000000000000000000000000000000000000000000000000000000000000000000016000500ffffffffffffffffffffffff0c6ad98892f1d411a65f0040963251e5000000000000000000000000f0af
|
||||
5b31897bdb01feffffff00000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000ffffffffffffffffffffffff00000000000000000000000000000000000000000000000000000000
|
||||
00000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000ffffffffffffffffffffffff0000000000000000000000000000000000000000000000000000
|
||||
000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000ffffffffffffffffffffffff000000000000000000000000000000000000000000000000
|
||||
0000000000000000000000000000000000000000000000000105000000000000}}
|
||||
@@ -0,0 +1,43 @@
|
||||
#!/usr/bin/env -S python3 -m pytest
|
||||
import os
|
||||
|
||||
from markitdown import MarkItDown, StreamInfo
|
||||
from markitdown_sample_plugin import RtfConverter
|
||||
|
||||
# Directory of sample input documents, resolved relative to this test module.
TEST_FILES_DIR = os.path.join(
    os.path.dirname(__file__),
    "test_files",
)

# Phrases that each test expects to find in the text converted from test.rtf.
RTF_TEST_STRINGS = {
    "This is a Sample RTF File",
    "It is included to test if the MarkItDown sample plugin can correctly convert RTF files.",
}
|
||||
|
||||
|
||||
def test_converter() -> None:
    """Exercise RtfConverter directly, without going through MarkItDown.

    Opens test.rtf from TEST_FILES_DIR in binary mode, converts it, and
    checks that every phrase in RTF_TEST_STRINGS appears in the result's
    text_content.
    """
    rtf_path = os.path.join(TEST_FILES_DIR, "test.rtf")
    with open(rtf_path, "rb") as rtf_stream:
        rtf_converter = RtfConverter()
        # Describe the stream explicitly; the converter is handed a raw
        # binary stream rather than a path.
        rtf_info = StreamInfo(
            mimetype="text/rtf", extension=".rtf", filename="test.rtf"
        )
        result = rtf_converter.convert(
            file_stream=rtf_stream,
            stream_info=rtf_info,
        )

        for expected in RTF_TEST_STRINGS:
            assert expected in result.text_content
|
||||
|
||||
|
||||
def test_markitdown() -> None:
    """Convert the sample RTF through MarkItDown with plugins enabled.

    Every phrase in RTF_TEST_STRINGS must appear in the converted
    text_content.
    """
    markitdown = MarkItDown(enable_plugins=True)
    rtf_path = os.path.join(TEST_FILES_DIR, "test.rtf")
    converted = markitdown.convert(rtf_path)

    for expected in RTF_TEST_STRINGS:
        assert expected in converted.text_content
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Script entry point: run each test in this file in order, then
    # report success. A failing assertion aborts before the message.
    for run_test in (test_converter, test_markitdown):
        run_test()
    print("All tests passed.")
|
||||
@@ -0,0 +1,52 @@
|
||||
# MarkItDown
|
||||
|
||||
> [!IMPORTANT]
|
||||
> MarkItDown is a Python package and command-line utility for converting various files to Markdown (e.g., for indexing or text analysis).
|
||||
>
|
||||
> For more information, and full documentation, see the project [README.md](https://github.com/microsoft/markitdown) on GitHub.
|
||||
|
||||
## Installation
|
||||
|
||||
From PyPI:
|
||||
|
||||
```bash
|
||||
pip install 'markitdown[all]'
|
||||
```
|
||||
|
||||
From source:
|
||||
|
||||
```bash
|
||||
git clone git@github.com:microsoft/markitdown.git
|
||||
cd markitdown
|
||||
pip install -e 'packages/markitdown[all]'
|
||||
```
|
||||
|
||||
## Usage
|
||||
|
||||
### Command-Line
|
||||
|
||||
```bash
|
||||
markitdown path-to-file.pdf > document.md
|
||||
```
|
||||
|
||||
### Python API
|
||||
|
||||
```python
|
||||
from markitdown import MarkItDown
|
||||
|
||||
md = MarkItDown()
|
||||
result = md.convert("test.xlsx")
|
||||
print(result.text_content)
|
||||
```
|
||||
|
||||
### More Information
|
||||
|
||||
For more information, and full documentation, see the project [README.md](https://github.com/microsoft/markitdown) on GitHub.
|
||||
|
||||
## Trademarks
|
||||
|
||||
This project may contain trademarks or logos for projects, products, or services. Authorized use of Microsoft
|
||||
trademarks or logos is subject to and must follow
|
||||
[Microsoft's Trademark & Brand Guidelines](https://www.microsoft.com/en-us/legal/intellectualproperty/trademarks/usage/general).
|
||||
Use of Microsoft trademarks or logos in modified versions of this project must not cause confusion or imply Microsoft sponsorship.
|
||||
Any use of third-party trademarks or logos are subject to those third-party's policies.
|
||||
@@ -0,0 +1,232 @@
|
||||
# THIRD-PARTY SOFTWARE NOTICES AND INFORMATION
|
||||
|
||||
**Do Not Translate or Localize**
|
||||
|
||||
This project incorporates components from the projects listed below. The original copyright notices and the licenses
|
||||
under which MarkItDown received such components are set forth below. MarkItDown reserves all rights not expressly
|
||||
granted herein, whether by implication, estoppel or otherwise.
|
||||
|
||||
1. dwml (https://github.com/xiilei/dwml)
|
||||
|
||||
dwml NOTICES AND INFORMATION BEGIN HERE
|
||||
|
||||
-----------------------------------------
|
||||
|
||||
NOTE 1: What follows is a verbatim copy of dwml's LICENSE file, as it appeared on March 28th, 2025 - including
|
||||
placeholders for the copyright owner and year.
|
||||
|
||||
NOTE 2: The Apache License, Version 2.0, requires that modifications to the dwml source code be documented.
|
||||
The following section summarizes these changes. The full details are available in the MarkItDown source code
|
||||
repository under PR #1160 (https://github.com/microsoft/markitdown/pull/1160)
|
||||
|
||||
This project incorporates the `dwml/latex_dict.py` and `dwml/omml.py` files (which live in the
|
||||
`packages/markitdown/src/markitdown/converter_utils/docx/math` directory) without any additional logic modifications. However, we have
|
||||
reformatted the code with the `black` code formatter. From the `tests/docx.py` file, we have used only the `DOCXML_ROOT` XML
|
||||
namespaces; the rest of that file is not used.
|
||||
|
||||
-----------------------------------------
|
||||
|
||||
Apache License
|
||||
Version 2.0, January 2004
|
||||
http://www.apache.org/licenses/
|
||||
|
||||
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
|
||||
|
||||
1. Definitions.
|
||||
|
||||
"License" shall mean the terms and conditions for use, reproduction,
|
||||
and distribution as defined by Sections 1 through 9 of this document.
|
||||
|
||||
"Licensor" shall mean the copyright owner or entity authorized by
|
||||
the copyright owner that is granting the License.
|
||||
|
||||
"Legal Entity" shall mean the union of the acting entity and all
|
||||
other entities that control, are controlled by, or are under common
|
||||
control with that entity. For the purposes of this definition,
|
||||
"control" means (i) the power, direct or indirect, to cause the
|
||||
direction or management of such entity, whether by contract or
|
||||
otherwise, or (ii) ownership of fifty percent (50%) or more of the
|
||||
outstanding shares, or (iii) beneficial ownership of such entity.
|
||||
|
||||
"You" (or "Your") shall mean an individual or Legal Entity
|
||||
exercising permissions granted by this License.
|
||||
|
||||
"Source" form shall mean the preferred form for making modifications,
|
||||
including but not limited to software source code, documentation
|
||||
source, and configuration files.
|
||||
|
||||
"Object" form shall mean any form resulting from mechanical
|
||||
transformation or translation of a Source form, including but
|
||||
not limited to compiled object code, generated documentation,
|
||||
and conversions to other media types.
|
||||
|
||||
"Work" shall mean the work of authorship, whether in Source or
|
||||
Object form, made available under the License, as indicated by a
|
||||
copyright notice that is included in or attached to the work
|
||||
(an example is provided in the Appendix below).
|
||||
|
||||
"Derivative Works" shall mean any work, whether in Source or Object
|
||||
form, that is based on (or derived from) the Work and for which the
|
||||
editorial revisions, annotations, elaborations, or other modifications
|
||||
represent, as a whole, an original work of authorship. For the purposes
|
||||
of this License, Derivative Works shall not include works that remain
|
||||
separable from, or merely link (or bind by name) to the interfaces of,
|
||||
the Work and Derivative Works thereof.
|
||||
|
||||
"Contribution" shall mean any work of authorship, including
|
||||
the original version of the Work and any modifications or additions
|
||||
to that Work or Derivative Works thereof, that is intentionally
|
||||
submitted to Licensor for inclusion in the Work by the copyright owner
|
||||
or by an individual or Legal Entity authorized to submit on behalf of
|
||||
the copyright owner. For the purposes of this definition, "submitted"
|
||||
means any form of electronic, verbal, or written communication sent
|
||||
to the Licensor or its representatives, including but not limited to
|
||||
communication on electronic mailing lists, source code control systems,
|
||||
and issue tracking systems that are managed by, or on behalf of, the
|
||||
Licensor for the purpose of discussing and improving the Work, but
|
||||
excluding communication that is conspicuously marked or otherwise
|
||||
designated in writing by the copyright owner as "Not a Contribution."
|
||||
|
||||
"Contributor" shall mean Licensor and any individual or Legal Entity
|
||||
on behalf of whom a Contribution has been received by Licensor and
|
||||
subsequently incorporated within the Work.
|
||||
|
||||
2. Grant of Copyright License. Subject to the terms and conditions of
|
||||
this License, each Contributor hereby grants to You a perpetual,
|
||||
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
|
||||
copyright license to reproduce, prepare Derivative Works of,
|
||||
publicly display, publicly perform, sublicense, and distribute the
|
||||
Work and such Derivative Works in Source or Object form.
|
||||
|
||||
3. Grant of Patent License. Subject to the terms and conditions of
|
||||
this License, each Contributor hereby grants to You a perpetual,
|
||||
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
|
||||
(except as stated in this section) patent license to make, have made,
|
||||
use, offer to sell, sell, import, and otherwise transfer the Work,
|
||||
where such license applies only to those patent claims licensable
|
||||
by such Contributor that are necessarily infringed by their
|
||||
Contribution(s) alone or by combination of their Contribution(s)
|
||||
with the Work to which such Contribution(s) was submitted. If You
|
||||
institute patent litigation against any entity (including a
|
||||
cross-claim or counterclaim in a lawsuit) alleging that the Work
|
||||
or a Contribution incorporated within the Work constitutes direct
|
||||
or contributory patent infringement, then any patent licenses
|
||||
granted to You under this License for that Work shall terminate
|
||||
as of the date such litigation is filed.
|
||||
|
||||
4. Redistribution. You may reproduce and distribute copies of the
|
||||
Work or Derivative Works thereof in any medium, with or without
|
||||
modifications, and in Source or Object form, provided that You
|
||||
meet the following conditions:
|
||||
|
||||
(a) You must give any other recipients of the Work or
|
||||
Derivative Works a copy of this License; and
|
||||
|
||||
(b) You must cause any modified files to carry prominent notices
|
||||
stating that You changed the files; and
|
||||
|
||||
(c) You must retain, in the Source form of any Derivative Works
|
||||
that You distribute, all copyright, patent, trademark, and
|
||||
attribution notices from the Source form of the Work,
|
||||
excluding those notices that do not pertain to any part of
|
||||
the Derivative Works; and
|
||||
|
||||
(d) If the Work includes a "NOTICE" text file as part of its
|
||||
distribution, then any Derivative Works that You distribute must
|
||||
include a readable copy of the attribution notices contained
|
||||
within such NOTICE file, excluding those notices that do not
|
||||
pertain to any part of the Derivative Works, in at least one
|
||||
of the following places: within a NOTICE text file distributed
|
||||
as part of the Derivative Works; within the Source form or
|
||||
documentation, if provided along with the Derivative Works; or,
|
||||
within a display generated by the Derivative Works, if and
|
||||
wherever such third-party notices normally appear. The contents
|
||||
of the NOTICE file are for informational purposes only and
|
||||
do not modify the License. You may add Your own attribution
|
||||
notices within Derivative Works that You distribute, alongside
|
||||
or as an addendum to the NOTICE text from the Work, provided
|
||||
that such additional attribution notices cannot be construed
|
||||
as modifying the License.
|
||||
|
||||
You may add Your own copyright statement to Your modifications and
|
||||
may provide additional or different license terms and conditions
|
||||
for use, reproduction, or distribution of Your modifications, or
|
||||
for any such Derivative Works as a whole, provided Your use,
|
||||
reproduction, and distribution of the Work otherwise complies with
|
||||
the conditions stated in this License.
|
||||
|
||||
5. Submission of Contributions. Unless You explicitly state otherwise,
|
||||
any Contribution intentionally submitted for inclusion in the Work
|
||||
by You to the Licensor shall be under the terms and conditions of
|
||||
this License, without any additional terms or conditions.
|
||||
Notwithstanding the above, nothing herein shall supersede or modify
|
||||
the terms of any separate license agreement you may have executed
|
||||
with Licensor regarding such Contributions.
|
||||
|
||||
6. Trademarks. This License does not grant permission to use the trade
|
||||
names, trademarks, service marks, or product names of the Licensor,
|
||||
except as required for reasonable and customary use in describing the
|
||||
origin of the Work and reproducing the content of the NOTICE file.
|
||||
|
||||
7. Disclaimer of Warranty. Unless required by applicable law or
|
||||
agreed to in writing, Licensor provides the Work (and each
|
||||
Contributor provides its Contributions) on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
|
||||
implied, including, without limitation, any warranties or conditions
|
||||
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
|
||||
PARTICULAR PURPOSE. You are solely responsible for determining the
|
||||
appropriateness of using or redistributing the Work and assume any
|
||||
risks associated with Your exercise of permissions under this License.
|
||||
|
||||
8. Limitation of Liability. In no event and under no legal theory,
|
||||
whether in tort (including negligence), contract, or otherwise,
|
||||
unless required by applicable law (such as deliberate and grossly
|
||||
negligent acts) or agreed to in writing, shall any Contributor be
|
||||
liable to You for damages, including any direct, indirect, special,
|
||||
incidental, or consequential damages of any character arising as a
|
||||
result of this License or out of the use or inability to use the
|
||||
Work (including but not limited to damages for loss of goodwill,
|
||||
work stoppage, computer failure or malfunction, or any and all
|
||||
other commercial damages or losses), even if such Contributor
|
||||
has been advised of the possibility of such damages.
|
||||
|
||||
9. Accepting Warranty or Additional Liability. While redistributing
|
||||
the Work or Derivative Works thereof, You may choose to offer,
|
||||
and charge a fee for, acceptance of support, warranty, indemnity,
|
||||
or other liability obligations and/or rights consistent with this
|
||||
License. However, in accepting such obligations, You may act only
|
||||
on Your own behalf and on Your sole responsibility, not on behalf
|
||||
of any other Contributor, and only if You agree to indemnify,
|
||||
defend, and hold each Contributor harmless for any liability
|
||||
incurred by, or claims asserted against, such Contributor by reason
|
||||
of your accepting any such warranty or additional liability.
|
||||
|
||||
END OF TERMS AND CONDITIONS
|
||||
|
||||
APPENDIX: How to apply the Apache License to your work.
|
||||
|
||||
To apply the Apache License to your work, attach the following
|
||||
boilerplate notice, with the fields enclosed by brackets "{}"
|
||||
replaced with your own identifying information. (Don't include
|
||||
the brackets!) The text should be enclosed in the appropriate
|
||||
comment syntax for the file format. We also recommend that a
|
||||
file or class name and description of purpose be included on the
|
||||
same "printed page" as the copyright notice for easier
|
||||
identification within third-party archives.
|
||||
|
||||
Copyright {yyyy} {name of copyright owner}
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
|
||||
-----------------------------------------
|
||||
END OF dwml NOTICES AND INFORMATION
|
||||
@@ -26,25 +26,38 @@ classifiers = [
|
||||
dependencies = [
|
||||
"beautifulsoup4",
|
||||
"requests",
|
||||
"mammoth",
|
||||
"markdownify",
|
||||
"numpy",
|
||||
"magika~=0.6.1",
|
||||
"charset-normalizer",
|
||||
"defusedxml",
|
||||
]
|
||||
|
||||
[project.optional-dependencies]
|
||||
all = [
|
||||
"python-pptx",
|
||||
"mammoth~=1.11.0",
|
||||
"pandas",
|
||||
"openpyxl",
|
||||
"xlrd",
|
||||
"pdfminer.six",
|
||||
"puremagic",
|
||||
"pydub",
|
||||
"lxml",
|
||||
"pdfminer.six>=20251230",
|
||||
"pdfplumber>=0.11.9",
|
||||
"olefile",
|
||||
"youtube-transcript-api",
|
||||
"pydub",
|
||||
"SpeechRecognition",
|
||||
"pathvalidate",
|
||||
"charset-normalizer",
|
||||
"openai",
|
||||
"youtube-transcript-api~=1.0.0",
|
||||
"azure-ai-documentintelligence",
|
||||
"azure-identity"
|
||||
"azure-identity",
|
||||
]
|
||||
pptx = ["python-pptx"]
|
||||
docx = ["mammoth~=1.11.0", "lxml"]
|
||||
xlsx = ["pandas", "openpyxl"]
|
||||
xls = ["pandas", "xlrd"]
|
||||
pdf = ["pdfminer.six>=20251230", "pdfplumber>=0.11.9"]
|
||||
outlook = ["olefile"]
|
||||
audio-transcription = ["pydub", "SpeechRecognition"]
|
||||
youtube-transcription = ["youtube-transcript-api"]
|
||||
az-doc-intel = ["azure-ai-documentintelligence", "azure-identity"]
|
||||
|
||||
[project.urls]
|
||||
Documentation = "https://github.com/microsoft/markitdown#readme"
|
||||
@@ -57,12 +70,24 @@ path = "src/markitdown/__about__.py"
|
||||
[project.scripts]
|
||||
markitdown = "markitdown.__main__:main"
|
||||
|
||||
[tool.hatch.envs.types]
|
||||
[tool.hatch.envs.default]
|
||||
features = ["all"]
|
||||
|
||||
[tool.hatch.envs.hatch-test]
|
||||
features = ["all"]
|
||||
extra-dependencies = [
|
||||
"openai",
|
||||
]
|
||||
|
||||
[tool.hatch.envs.types]
|
||||
features = ["all"]
|
||||
extra-dependencies = [
|
||||
"openai",
|
||||
"mypy>=1.0.0",
|
||||
]
|
||||
|
||||
[tool.hatch.envs.types.scripts]
|
||||
check = "mypy --install-types --non-interactive {args:src/markitdown tests}"
|
||||
check = "mypy --install-types --non-interactive --ignore-missing-imports {args:src/markitdown tests}"
|
||||
|
||||
[tool.coverage.run]
|
||||
source_pkgs = ["markitdown", "tests"]
|
||||
@@ -0,0 +1,4 @@
|
||||
# SPDX-FileCopyrightText: 2024-present Adam Fourney <adamfo@microsoft.com>
|
||||
#
|
||||
# SPDX-License-Identifier: MIT
|
||||
__version__ = "0.1.6b2"
|
||||
@@ -0,0 +1,34 @@
|
||||
# SPDX-FileCopyrightText: 2024-present Adam Fourney <adamfo@microsoft.com>
|
||||
#
|
||||
# SPDX-License-Identifier: MIT
|
||||
|
||||
from .__about__ import __version__
|
||||
from ._markitdown import (
|
||||
MarkItDown,
|
||||
PRIORITY_SPECIFIC_FILE_FORMAT,
|
||||
PRIORITY_GENERIC_FILE_FORMAT,
|
||||
)
|
||||
from ._base_converter import DocumentConverterResult, DocumentConverter
|
||||
from ._stream_info import StreamInfo
|
||||
from ._exceptions import (
|
||||
MarkItDownException,
|
||||
MissingDependencyException,
|
||||
FailedConversionAttempt,
|
||||
FileConversionException,
|
||||
UnsupportedFormatException,
|
||||
)
|
||||
|
||||
__all__ = [
|
||||
"__version__",
|
||||
"MarkItDown",
|
||||
"DocumentConverter",
|
||||
"DocumentConverterResult",
|
||||
"MarkItDownException",
|
||||
"MissingDependencyException",
|
||||
"FailedConversionAttempt",
|
||||
"FileConversionException",
|
||||
"UnsupportedFormatException",
|
||||
"StreamInfo",
|
||||
"PRIORITY_SPECIFIC_FILE_FORMAT",
|
||||
"PRIORITY_GENERIC_FILE_FORMAT",
|
||||
]
|
||||
@@ -0,0 +1,223 @@
|
||||
# SPDX-FileCopyrightText: 2024-present Adam Fourney <adamfo@microsoft.com>
|
||||
#
|
||||
# SPDX-License-Identifier: MIT
|
||||
import argparse
|
||||
import sys
|
||||
import codecs
|
||||
from textwrap import dedent
|
||||
from importlib.metadata import entry_points
|
||||
from .__about__ import __version__
|
||||
from ._markitdown import MarkItDown, StreamInfo, DocumentConverterResult
|
||||
|
||||
|
||||
def main():
    """Entry point for the ``markitdown`` command-line interface.

    Parses the command line, validates the optional extension, MIME type and
    charset hints, and converts either the named source or standard input to
    Markdown. The conversion result is handed to ``_handle_output``.

    Exits with status 0 after listing plugins (``--list-plugins``), and exits
    through ``_exit_with_error`` when a hint is invalid or when Document
    Intelligence is requested without an endpoint or a filename.
    """
    parser = argparse.ArgumentParser(
        description="Convert various file formats to markdown.",
        prog="markitdown",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        usage=dedent(
            """
            SYNTAX:

            markitdown <OPTIONAL: FILENAME>
            If FILENAME is empty, markitdown reads from stdin.

            EXAMPLE:

            markitdown example.pdf

            OR

            cat example.pdf | markitdown

            OR

            markitdown < example.pdf

            OR to save to a file use

            markitdown example.pdf -o example.md

            OR

            markitdown example.pdf > example.md
            """
        ).strip(),
    )

    parser.add_argument(
        "-v",
        "--version",
        action="version",
        version=f"%(prog)s {__version__}",
        help="show the version number and exit",
    )

    parser.add_argument(
        "-o",
        "--output",
        help="Output file name. If not provided, output is written to stdout.",
    )

    parser.add_argument(
        "-x",
        "--extension",
        help="Provide a hint about the file extension (e.g., when reading from stdin).",
    )

    parser.add_argument(
        "-m",
        "--mime-type",
        help="Provide a hint about the file's MIME type.",
    )

    parser.add_argument(
        "-c",
        "--charset",
        help="Provide a hint about the file's charset (e.g., UTF-8).",
    )

    parser.add_argument(
        "-d",
        "--use-docintel",
        action="store_true",
        help="Use Document Intelligence to extract text instead of offline conversion. Requires a valid Document Intelligence Endpoint.",
    )

    parser.add_argument(
        "-e",
        "--endpoint",
        type=str,
        help="Document Intelligence Endpoint. Required if using Document Intelligence.",
    )

    parser.add_argument(
        "-p",
        "--use-plugins",
        action="store_true",
        help="Use 3rd-party plugins to convert files. Use --list-plugins to see installed plugins.",
    )

    parser.add_argument(
        "--list-plugins",
        action="store_true",
        # The flag declared above is spelled --use-plugins; name it exactly.
        help="List installed 3rd-party plugins. Plugins are loaded when using the -p or --use-plugins option.",
    )

    parser.add_argument(
        "--keep-data-uris",
        action="store_true",
        help="Keep data URIs (like base64-encoded images) in the output. By default, data URIs are truncated.",
    )

    parser.add_argument("filename", nargs="?")
    args = parser.parse_args()

    # Parse the extension hint: lower-case it and make sure it starts with "."
    extension_hint = args.extension
    if extension_hint is not None:
        extension_hint = extension_hint.strip().lower()
        if not extension_hint:
            extension_hint = None  # Blank hints are treated as absent
        elif not extension_hint.startswith("."):
            extension_hint = "." + extension_hint

    # Parse the mime type: it must contain exactly one "/"
    mime_type_hint = args.mime_type
    if mime_type_hint is not None:
        mime_type_hint = mime_type_hint.strip()
        if not mime_type_hint:
            mime_type_hint = None
        elif mime_type_hint.count("/") != 1:
            _exit_with_error(f"Invalid MIME type: {mime_type_hint}")

    # Parse the charset: replace it with the codec's canonical name
    charset_hint = args.charset
    if charset_hint is not None:
        charset_hint = charset_hint.strip()
        if not charset_hint:
            charset_hint = None
        else:
            try:
                charset_hint = codecs.lookup(charset_hint).name
            except LookupError:
                _exit_with_error(f"Invalid charset: {charset_hint}")

    # Only build a StreamInfo when at least one hint was actually supplied
    stream_info = None
    if (
        extension_hint is not None
        or mime_type_hint is not None
        or charset_hint is not None
    ):
        stream_info = StreamInfo(
            extension=extension_hint, mimetype=mime_type_hint, charset=charset_hint
        )

    if args.list_plugins:
        # List installed plugins, then exit
        print("Installed MarkItDown 3rd-party Plugins:\n")
        plugin_entry_points = list(entry_points(group="markitdown.plugin"))
        if not plugin_entry_points:
            print(" * No 3rd-party plugins installed.")
            print(
                "\nFind plugins by searching for the hashtag #markitdown-plugin on GitHub.\n"
            )
        else:
            for entry_point in plugin_entry_points:
                print(f" * {entry_point.name:<16}\t(package: {entry_point.value})")
            print(
                "\nUse the -p (or --use-plugins) option to enable 3rd-party plugins.\n"
            )
        sys.exit(0)

    if args.use_docintel:
        if args.endpoint is None:
            _exit_with_error(
                "Document Intelligence Endpoint is required when using Document Intelligence."
            )
        elif args.filename is None:
            _exit_with_error("Filename is required when using Document Intelligence.")

        markitdown = MarkItDown(
            enable_plugins=args.use_plugins, docintel_endpoint=args.endpoint
        )
    else:
        markitdown = MarkItDown(enable_plugins=args.use_plugins)

    if args.filename is None:
        # No filename: convert the raw bytes arriving on stdin
        result = markitdown.convert_stream(
            sys.stdin.buffer,
            stream_info=stream_info,
            keep_data_uris=args.keep_data_uris,
        )
    else:
        result = markitdown.convert(
            args.filename, stream_info=stream_info, keep_data_uris=args.keep_data_uris
        )

    _handle_output(args, result)
|
||||
|
||||
|
||||
def _handle_output(args, result: DocumentConverterResult):
    """Write the converted Markdown to the requested destination.

    Parameters:
    - args: The parsed command-line arguments; only `args.output` is read.
    - result: The conversion result whose `markdown` text is written.

    When `args.output` is set, the Markdown is written to that file as UTF-8.
    Otherwise it is printed to stdout.
    """
    if args.output:
        with open(args.output, "w", encoding="utf-8") as f:
            f.write(result.markdown)
        return

    # stdout may have been replaced by an object whose `encoding` is None or
    # missing (io.StringIO reports None); fall back to UTF-8 rather than
    # failing with a TypeError inside str.encode().
    encoding = getattr(sys.stdout, "encoding", None) or "utf-8"

    # Handle stdout encoding errors more gracefully: round-tripping through
    # the target encoding swaps unencodable characters for "?" up front.
    print(result.markdown.encode(encoding, errors="replace").decode(encoding))
|
||||
|
||||
|
||||
def _exit_with_error(message: str):
    """Report a fatal command-line error and terminate with exit status 1.

    Parameters:
    - message: The text to show to the user.

    The message goes to stderr so that it never ends up inside Markdown
    output that the caller redirected or piped from stdout.

    Raises:
    - SystemExit: Always, with exit code 1.
    """
    print(message, file=sys.stderr)
    sys.exit(1)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
@@ -0,0 +1,105 @@
|
||||
from typing import Any, BinaryIO, Optional
|
||||
from ._stream_info import StreamInfo
|
||||
|
||||
|
||||
class DocumentConverterResult:
    """Holds the Markdown produced by a conversion, plus optional metadata."""

    def __init__(
        self,
        markdown: str,
        *,
        title: Optional[str] = None,
    ):
        """
        Create a result from the converted text.

        Only the Markdown text is mandatory; the title (like any metadata
        added later) is keyword-only and optional.

        Parameters:
        - markdown: The converted Markdown text.
        - title: Optional title of the document.
        """
        self.title = title
        self.markdown = markdown

    def __str__(self) -> str:
        """Return the converted Markdown text."""
        return self.markdown

    @property
    def text_content(self) -> str:
        """Soft-deprecated alias for `markdown`. New code should migrate to using `markdown` or __str__."""
        return self.markdown

    @text_content.setter
    def text_content(self, value: str):
        """Soft-deprecated alias for `markdown`. New code should migrate to using `markdown` or __str__."""
        self.markdown = value
|
||||
|
||||
|
||||
class DocumentConverter:
    """Abstract superclass of all DocumentConverters."""

    def accepts(
        self,
        file_stream: BinaryIO,
        stream_info: StreamInfo,
        **kwargs: Any,  # Options to pass to the converter
    ) -> bool:
        """
        Return a quick determination on whether the converter should attempt converting the document.
        This is primarily based on `stream_info` (typically, `stream_info.mimetype`, `stream_info.extension`).
        In cases where the data is retrieved via HTTP, the `stream_info.url` might also be referenced to
        make a determination (e.g., special converters for Wikipedia, YouTube etc).
        Finally, it is conceivable that the `stream_info.filename` might be used in cases
        where the filename is well-known (e.g., `Dockerfile`, `Makefile`, etc).

        NOTE: The method signature is designed to match that of the convert() method. This provides some
        assurance that, if accepts() returns True, the convert() method will also be able to handle the document.

        IMPORTANT: In rare cases (e.g., OutlookMsgConverter), we need to read more from the stream to make a final
        determination. Read operations inevitably advance the position in file_stream. In these cases, the position
        MUST be reset before returning. This is because the convert() method may be called immediately
        after accepts(), and will expect the file_stream to be at the original position.

        E.g.,
            cur_pos = file_stream.tell()  # Save the current position
            data = file_stream.read(100)  # ... peek at the first 100 bytes, etc.
            file_stream.seek(cur_pos)     # Reset the position to the original position

        Parameters:
        - file_stream: The file-like object to convert. Must support seek(), tell(), and read() methods.
        - stream_info: The StreamInfo object containing metadata about the file (mimetype, extension, charset, etc.)
        - kwargs: Additional keyword arguments for the converter.

        Returns:
        - bool: True if the converter can handle the document, False otherwise.

        Raises:
        - NotImplementedError: Always, in this base class; subclasses must override.
        """
        raise NotImplementedError(
            f"The subclass, {type(self).__name__}, must implement the accepts() method to determine if they can handle the document."
        )

    def convert(
        self,
        file_stream: BinaryIO,
        stream_info: StreamInfo,
        **kwargs: Any,  # Options to pass to the converter
    ) -> DocumentConverterResult:
        """
        Convert a document to Markdown text.

        Parameters:
        - file_stream: The file-like object to convert. Must support seek(), tell(), and read() methods.
        - stream_info: The StreamInfo object containing metadata about the file (mimetype, extension, charset, etc.)
        - kwargs: Additional keyword arguments for the converter.

        Returns:
        - DocumentConverterResult: The result of the conversion, which includes the title and markdown content.

        Raises:
        - FileConversionException: If the mimetype is recognized, but the conversion fails for some other reason.
        - MissingDependencyException: If the converter requires a dependency that is not installed.
        - NotImplementedError: Always, in this base class; subclasses must override.
        """
        # Name the offending subclass, mirroring the error raised by accepts().
        raise NotImplementedError(
            f"The subclass, {type(self).__name__}, must implement the convert() method to convert the document."
        )
|
||||
@@ -0,0 +1,76 @@
|
||||
from typing import Optional, List, Any
|
||||
|
||||
MISSING_DEPENDENCY_MESSAGE = """{converter} recognized the input as a potential {extension} file, but the dependencies needed to read {extension} files have not been installed. To resolve this error, include the optional dependency [{feature}] or [all] when installing MarkItDown. For example:
|
||||
|
||||
* pip install markitdown[{feature}]
|
||||
* pip install markitdown[all]
|
||||
* pip install markitdown[{feature}, ...]
|
||||
* etc."""
|
||||
|
||||
|
||||
class MarkItDownException(Exception):
    """
    Base exception class for MarkItDown.
    """


class MissingDependencyException(MarkItDownException):
    """
    Converters shipped with MarkItDown may depend on optional
    dependencies. This exception is thrown when a converter's
    convert() method is called, but the required dependency is not
    installed. This is not necessarily a fatal error, as the converter
    will simply be skipped (an error will bubble up only if no other
    suitable converter is found).

    Error messages should clearly indicate which dependency is missing.
    """


class UnsupportedFormatException(MarkItDownException):
    """
    Thrown when no suitable converter was found for the given file.
    """


class FailedConversionAttempt(object):
    """
    Represents a single attempt to convert a file.
    """

    def __init__(self, converter: Any, exc_info: Optional[tuple] = None):
        """
        Parameters:
        - converter: The converter object that made the attempt.
        - exc_info: Optional exception tuple; index 0 is read as the exception
          class and index 1 as the exception value when building messages.
        """
        self.converter = converter
        self.exc_info = exc_info


class FileConversionException(MarkItDownException):
    """
    Thrown when a suitable converter was found, but the conversion
    process fails for any reason.
    """

    def __init__(
        self,
        message: Optional[str] = None,
        attempts: Optional[List[FailedConversionAttempt]] = None,
    ):
        """
        Parameters:
        - message: The exception message. When omitted, one is generated.
        - attempts: The failed attempts that led to this error; kept on
          `self.attempts` and summarized in the generated message.
        """
        self.attempts = attempts

        if message is None:
            if attempts is None:
                message = "File conversion failed."
            else:
                parts = [f"File conversion failed after {len(attempts)} attempts:\n"]
                for attempt in attempts:
                    converter_name = type(attempt.converter).__name__
                    if attempt.exc_info is None:
                        # Terminate this line too, so that the next attempt
                        # starts on its own line.
                        parts.append(
                            f" - {converter_name} provided no execution info.\n"
                        )
                    else:
                        parts.append(
                            f" - {converter_name} threw {attempt.exc_info[0].__name__} with message: {attempt.exc_info[1]}\n"
                        )
                message = "".join(parts)

        super().__init__(message)
|
||||
@@ -0,0 +1,783 @@
|
||||
import mimetypes
|
||||
import os
|
||||
import re
|
||||
import sys
|
||||
import shutil
|
||||
import traceback
|
||||
import io
|
||||
from dataclasses import dataclass
|
||||
from importlib.metadata import entry_points
|
||||
from typing import Any, List, Dict, Optional, Union, BinaryIO
|
||||
from pathlib import Path
|
||||
from urllib.parse import urlparse
|
||||
from warnings import warn
|
||||
import requests
|
||||
import magika
|
||||
import charset_normalizer
|
||||
import codecs
|
||||
|
||||
from ._stream_info import StreamInfo
|
||||
from ._uri_utils import parse_data_uri, file_uri_to_path
|
||||
|
||||
from .converters import (
|
||||
PlainTextConverter,
|
||||
HtmlConverter,
|
||||
RssConverter,
|
||||
WikipediaConverter,
|
||||
YouTubeConverter,
|
||||
IpynbConverter,
|
||||
BingSerpConverter,
|
||||
PdfConverter,
|
||||
DocxConverter,
|
||||
XlsxConverter,
|
||||
XlsConverter,
|
||||
PptxConverter,
|
||||
ImageConverter,
|
||||
AudioConverter,
|
||||
OutlookMsgConverter,
|
||||
ZipConverter,
|
||||
EpubConverter,
|
||||
DocumentIntelligenceConverter,
|
||||
CsvConverter,
|
||||
)
|
||||
|
||||
from ._base_converter import DocumentConverter, DocumentConverterResult
|
||||
|
||||
from ._exceptions import (
|
||||
FileConversionException,
|
||||
UnsupportedFormatException,
|
||||
FailedConversionAttempt,
|
||||
)
|
||||
|
||||
|
||||
# Lower priority values are tried first.

# Converters for one specific format (e.g., .docx, .pdf, .xlsx) or for
# specific pages (e.g., wikipedia).
PRIORITY_SPECIFIC_FILE_FORMAT = 0.0

# Near catch-all converters for mimetypes like text/*, etc.
PRIORITY_GENERIC_FILE_FORMAT = 10.0
|
||||
|
||||
|
||||
_plugins: Union[None, List[Any]] = None # If None, plugins have not been loaded yet.
|
||||
|
||||
|
||||
def _load_plugins() -> Union[None, List[Any]]:
|
||||
"""Lazy load plugins, exiting early if already loaded."""
|
||||
global _plugins
|
||||
|
||||
# Skip if we've already loaded plugins
|
||||
if _plugins is not None:
|
||||
return _plugins
|
||||
|
||||
# Load plugins
|
||||
_plugins = []
|
||||
for entry_point in entry_points(group="markitdown.plugin"):
|
||||
try:
|
||||
_plugins.append(entry_point.load())
|
||||
except Exception:
|
||||
tb = traceback.format_exc()
|
||||
warn(f"Plugin '{entry_point.name}' failed to load ... skipping:\n{tb}")
|
||||
|
||||
return _plugins
|
||||
|
||||
|
||||
@dataclass(frozen=True, kw_only=True)
class ConverterRegistration:
    """Immutable, keyword-only record pairing a converter with its priority."""

    # The converter instance that was registered.
    converter: DocumentConverter

    # The priority value the converter was registered with.
    priority: float
|
||||
|
||||
|
||||
class MarkItDown:
|
||||
"""(In preview) An extremely simple text-based document reader, suitable for LLM use.
|
||||
This reader will convert common file-types or webpages to Markdown."""
|
||||
|
||||
def __init__(
    self,
    *,
    enable_builtins: Union[None, bool] = None,
    enable_plugins: Union[None, bool] = None,
    **kwargs,
):
    """
    Set up the HTTP session, the Magika instance, and the converter registry.

    Parameters:
    - enable_builtins: Register the built-in converters. Treated as True when None.
    - enable_plugins: Register converters supplied by plugins. Off unless truthy.
    - kwargs: Forwarded to enable_builtins() and enable_plugins(). The
      `requests_session` entry, when present and not None, is used as the
      HTTP session instead of a newly created one.
    """
    self._builtins_enabled = False
    self._plugins_enabled = False

    session = kwargs.get("requests_session")
    if session is None:
        session = requests.Session()
        # Signal that we prefer markdown over HTML, etc. if the server supports it.
        # e.g., https://blog.cloudflare.com/markdown-for-agents/
        accept_header = "text/markdown, text/html;q=0.9, text/plain;q=0.8, */*;q=0.1"
        session.headers.update({"Accept": accept_header})
    self._requests_session = session

    self._magika = magika.Magika()

    # TODO - remove these (see enable_builtins)
    self._llm_client: Any = None
    self._llm_model: Optional[str] = None
    self._llm_prompt: Optional[str] = None
    self._exiftool_path: Optional[str] = None
    self._style_map: Optional[str] = None

    # Register the converters
    self._converters: List[ConverterRegistration] = []

    # Built-ins default to True when not specified
    wants_builtins = True if enable_builtins is None else bool(enable_builtins)
    if wants_builtins:
        self.enable_builtins(**kwargs)

    if enable_plugins:
        self.enable_plugins(**kwargs)
|
||||
|
||||
def enable_builtins(self, **kwargs) -> None:
    """
    Register the built-in converters on this instance.

    Built-in converters are enabled by default, so this only needs to be
    called (once) when they were initially disabled. Calling it again emits
    a RuntimeWarning and changes nothing.

    Options read from kwargs: `llm_client`, `llm_model`, `llm_prompt`,
    `exiftool_path`, `style_map`, `docintel_endpoint`, `docintel_credential`,
    `docintel_file_types` and `docintel_api_version`.
    """
    if self._builtins_enabled:
        warn("Built-in converters are already enabled.", RuntimeWarning)
        return

    # TODO: Move these into converter constructors
    self._llm_client = kwargs.get("llm_client")
    self._llm_model = kwargs.get("llm_model")
    self._llm_prompt = kwargs.get("llm_prompt")
    self._exiftool_path = kwargs.get("exiftool_path")
    self._style_map = kwargs.get("style_map")

    if self._exiftool_path is None:
        self._exiftool_path = os.getenv("EXIFTOOL_PATH")

    # Still none? Check well-known paths
    if self._exiftool_path is None:
        located = shutil.which("exiftool")
        if located:
            located = os.path.abspath(located)
            well_known_dirs = [
                "/usr/bin",
                "/usr/local/bin",
                "/opt",
                "/opt/bin",
                "/opt/local/bin",
                "/opt/homebrew/bin",
                "C:\\Windows\\System32",
                "C:\\Program Files",
                "C:\\Program Files (x86)",
            ]
            # Only accept an executable found on PATH when it lives in one
            # of the directories listed above.
            if os.path.dirname(located) in well_known_dirs:
                self._exiftool_path = located

    # Register converters for successful browsing operations
    # Later registrations are tried first / take higher priority than earlier registrations
    # To this end, the most specific converters should appear below the most generic converters
    self.register_converter(
        PlainTextConverter(), priority=PRIORITY_GENERIC_FILE_FORMAT
    )
    self.register_converter(
        ZipConverter(markitdown=self), priority=PRIORITY_GENERIC_FILE_FORMAT
    )
    self.register_converter(HtmlConverter(), priority=PRIORITY_GENERIC_FILE_FORMAT)

    # The remaining converters take no constructor arguments and use the
    # default priority; each is instantiated right before it is registered.
    for converter_cls in (
        RssConverter,
        WikipediaConverter,
        YouTubeConverter,
        BingSerpConverter,
        DocxConverter,
        XlsxConverter,
        XlsConverter,
        PptxConverter,
        AudioConverter,
        ImageConverter,
        IpynbConverter,
        PdfConverter,
        OutlookMsgConverter,
        EpubConverter,
        CsvConverter,
    ):
        self.register_converter(converter_cls())

    # Register Document Intelligence converter at the top of the stack if endpoint is provided
    endpoint = kwargs.get("docintel_endpoint")
    if endpoint is not None:
        docintel_args: Dict[str, Any] = {"endpoint": endpoint}

        # Forward each optional setting only when the caller supplied it.
        optional_settings = (
            ("docintel_credential", "credential"),
            ("docintel_file_types", "file_types"),
            ("docintel_api_version", "api_version"),
        )
        for kwarg_name, arg_name in optional_settings:
            value = kwargs.get(kwarg_name)
            if value is not None:
                docintel_args[arg_name] = value

        self.register_converter(
            DocumentIntelligenceConverter(**docintel_args),
        )

    self._builtins_enabled = True
|
||||
|
||||
def enable_plugins(self, **kwargs) -> None:
    """
    Register the converters contributed by installed plugins.

    Plugins are disabled by default, so this only needs to be called (once)
    when they were not enabled at construction time. Calling it again emits
    a RuntimeWarning and changes nothing.

    Each loaded plugin's register_converters() is called with this instance
    and kwargs. A plugin that raises is reported with a warning that carries
    the traceback, and the remaining plugins are still processed.
    """
    if self._plugins_enabled:
        warn("Plugins converters are already enabled.", RuntimeWarning)
        return

    # Load plugins
    available = _load_plugins()
    assert available is not None
    for plugin in available:
        try:
            plugin.register_converters(self, **kwargs)
        except Exception:
            warn(
                f"Plugin '{plugin}' failed to register converters:\n{traceback.format_exc()}"
            )

    self._plugins_enabled = True
|
||||
|
||||
def convert(
    self,
    source: Union[str, requests.Response, Path, BinaryIO],
    *,
    stream_info: Optional[StreamInfo] = None,
    **kwargs: Any,
) -> DocumentConverterResult:  # TODO: deal with kwargs
    """
    Dispatch to the conversion routine that matches the type of `source`.

    Args:
        - source: a local path (str or Path), a http:/https:/file:/data: URI,
          a requests.Response object, or a binary file-like object
        - stream_info: optional stream info to use for the conversion. If None, infer from source
        - kwargs: additional arguments to pass to the converter

    Raises:
        TypeError: if `source` is none of the supported kinds (text-mode
            streams are rejected as well).
    """
    if isinstance(source, str):
        # Strings carrying a known scheme are URIs; anything else is a local path.
        if source.startswith(("http:", "https:", "file:", "data:")):
            # Rename the url argument to mock_url
            # (Deprecated -- use stream_info)
            uri_kwargs = dict(kwargs)
            if "url" in uri_kwargs:
                uri_kwargs["mock_url"] = uri_kwargs.pop("url")
            return self.convert_uri(source, stream_info=stream_info, **uri_kwargs)
        return self.convert_local(source, stream_info=stream_info, **kwargs)

    if isinstance(source, Path):
        return self.convert_local(source, stream_info=stream_info, **kwargs)

    if isinstance(source, requests.Response):
        return self.convert_response(source, stream_info=stream_info, **kwargs)

    # Binary stream: anything with a callable read() that is not a text stream
    is_readable = callable(getattr(source, "read", None))
    if is_readable and not isinstance(source, io.TextIOBase):
        return self.convert_stream(source, stream_info=stream_info, **kwargs)

    raise TypeError(
        f"Invalid source type: {type(source)}. Expected str, requests.Response, BinaryIO."
    )
|
||||
|
||||
def convert_local(
    self,
    path: Union[str, Path],
    *,
    stream_info: Optional[StreamInfo] = None,
    file_extension: Optional[str] = None,  # Deprecated -- use stream_info
    url: Optional[str] = None,  # Deprecated -- use stream_info
    **kwargs: Any,
) -> DocumentConverterResult:
    """
    Convert a file on the local filesystem.

    The initial StreamInfo is derived from the path (local path, extension
    and base name) and then overlaid, in this order, with `stream_info`,
    `file_extension` and `url` when they are given.

    Args:
        - path: location of the file, as str or Path
        - stream_info: optional stream info overriding what the path implies
        - file_extension: deprecated extension override
        - url: deprecated url override
        - kwargs: additional arguments to pass to the converter
    """
    if isinstance(path, Path):
        path = str(path)

    # Start from what the path alone tells us
    info = StreamInfo(
        local_path=path,
        extension=os.path.splitext(path)[1],
        filename=os.path.basename(path),
    )

    # Caller-supplied stream info wins over the path-derived values
    if stream_info is not None:
        info = info.copy_and_update(stream_info)

    # Deprecated -- use stream_info
    legacy_overrides = {}
    if file_extension is not None:
        legacy_overrides["extension"] = file_extension
    if url is not None:
        legacy_overrides["url"] = url
    if legacy_overrides:
        info = info.copy_and_update(**legacy_overrides)

    with open(path, "rb") as handle:
        candidates = self._get_stream_info_guesses(
            file_stream=handle, base_guess=info
        )
        return self._convert(
            file_stream=handle, stream_info_guesses=candidates, **kwargs
        )
|
||||
|
||||
def convert_stream(
    self,
    stream: BinaryIO,
    *,
    stream_info: Optional[StreamInfo] = None,
    file_extension: Optional[str] = None,  # Deprecated -- use stream_info
    url: Optional[str] = None,  # Deprecated -- use stream_info
    **kwargs: Any,
) -> DocumentConverterResult:
    """
    Convert an already-open binary stream.

    Args:
        - stream: binary file-like object; if it is not seekable its whole
          content is first copied into an in-memory buffer
        - stream_info: optional stream info describing the content
        - file_extension: deprecated extension override
        - url: deprecated url override
        - kwargs: additional arguments to pass to the converter
    """
    # Seed the guess with whatever the caller supplied (an empty StreamInfo otherwise)
    seed = stream_info if stream_info is not None else StreamInfo()

    if file_extension is not None:
        # Deprecated -- use stream_info
        seed = seed.copy_and_update(extension=file_extension)

    if url is not None:
        # Deprecated -- use stream_info
        seed = seed.copy_and_update(url=url)

    # The conversion machinery rewinds the stream between attempts, so a
    # non-seekable stream has to be loaded into memory in full.
    if not stream.seekable():
        in_memory = io.BytesIO()
        while chunk := stream.read(4096):
            in_memory.write(chunk)
        in_memory.seek(0)
        stream = in_memory

    # Add guesses based on stream content
    candidates = self._get_stream_info_guesses(file_stream=stream, base_guess=seed)
    return self._convert(file_stream=stream, stream_info_guesses=candidates, **kwargs)
|
||||
|
||||
def convert_url(
    self,
    url: str,
    *,
    stream_info: Optional[StreamInfo] = None,
    file_extension: Optional[str] = None,
    mock_url: Optional[str] = None,
    **kwargs: Any,
) -> DocumentConverterResult:
    """Alias for convert_uri(); every argument is forwarded unchanged."""
    # convert_url will likely be deprecated in the future in favor of convert_uri
    forwarded = {
        "stream_info": stream_info,
        "file_extension": file_extension,
        "mock_url": mock_url,
    }
    forwarded.update(kwargs)
    return self.convert_uri(url, **forwarded)
|
||||
|
||||
def convert_uri(
    self,
    uri: str,
    *,
    stream_info: Optional[StreamInfo] = None,
    file_extension: Optional[str] = None,  # Deprecated -- use stream_info
    mock_url: Optional[str] = None,  # Mock the request as if it came from a different URL
    **kwargs: Any,
) -> DocumentConverterResult:
    """
    Convert the resource identified by a file:, data:, http: or https: URI.

    Args:
        - uri: the URI to convert (surrounding whitespace is ignored)
        - stream_info: optional stream info describing the content
        - file_extension: deprecated extension override
        - mock_url: treat the content as if it came from this URL
        - kwargs: additional arguments to pass to the converter

    Raises:
        ValueError: for a file: URI whose host is neither empty nor
            "localhost", and for any other URI scheme.
    """
    uri = uri.strip()

    # File URIs
    if uri.startswith("file:"):
        host, local_path = file_uri_to_path(uri)
        if host and host != "localhost":
            raise ValueError(
                f"Unsupported file URI: {uri}. Netloc must be empty or localhost."
            )
        return self.convert_local(
            local_path,
            stream_info=stream_info,
            file_extension=file_extension,
            url=mock_url,
            **kwargs,
        )

    # Data URIs
    if uri.startswith("data:"):
        mimetype, attributes, payload = parse_data_uri(uri)

        # What the URI declares is only a starting point; stream_info overrides it
        declared = StreamInfo(
            mimetype=mimetype,
            charset=attributes.get("charset"),
        )
        if stream_info is not None:
            declared = declared.copy_and_update(stream_info)

        return self.convert_stream(
            io.BytesIO(payload),
            stream_info=declared,
            file_extension=file_extension,
            url=mock_url,
            **kwargs,
        )

    # HTTP/HTTPS URIs
    if uri.startswith(("http:", "https:")):
        response = self._requests_session.get(uri, stream=True)
        response.raise_for_status()
        return self.convert_response(
            response,
            stream_info=stream_info,
            file_extension=file_extension,
            url=mock_url,
            **kwargs,
        )

    raise ValueError(
        f"Unsupported URI scheme: {uri.split(':')[0]}. Supported schemes are: file:, data:, http:, https:"
    )
|
||||
|
||||
def convert_response(
    self,
    response: requests.Response,
    *,
    stream_info: Optional[StreamInfo] = None,
    file_extension: Optional[str] = None,  # Deprecated -- use stream_info
    url: Optional[str] = None,  # Deprecated -- use stream_info
    **kwargs: Any,
) -> DocumentConverterResult:
    """
    Convert the body of an HTTP response.

    The initial StreamInfo is assembled from the Content-Type header
    (mimetype, charset), the Content-Disposition header (filename,
    extension), the response URL (filename/extension fallback, url), and is
    then overlaid with `stream_info`, `file_extension` and `url`.

    Args:
        - response: the requests.Response whose content is converted
        - stream_info: optional stream info overriding the header-derived values
        - file_extension: deprecated extension override
        - url: deprecated url override
        - kwargs: additional arguments to pass to the converter
    """
    # If there is a content-type header, get the mimetype and charset (if present)
    mimetype: Optional[str] = None
    charset: Optional[str] = None

    if "content-type" in response.headers:
        parts = response.headers["content-type"].split(";")
        mimetype = parts.pop(0).strip()
        for part in parts:
            name, sep, value = part.partition("=")
            # Parameter names are case-insensitive and the value may be a
            # quoted string (e.g. charset="utf-8"), so normalize both.
            if sep and name.strip().lower() == "charset":
                _charset = value.strip().strip("\"'")
                if len(_charset) > 0:
                    charset = _charset

    # If there is a content-disposition header, get the filename and possibly the extension
    filename: Optional[str] = None
    extension: Optional[str] = None
    if "content-disposition" in response.headers:
        m = re.search(r"filename=([^;]+)", response.headers["content-disposition"])
        if m:
            # Drop surrounding whitespace first so that a trailing space
            # cannot shield the closing quote from being stripped.
            filename = m.group(1).strip().strip("\"'")
            _, _extension = os.path.splitext(filename)
            if len(_extension) > 0:
                extension = _extension

    # If there is still no filename, try to read it from the url
    if filename is None:
        parsed_url = urlparse(response.url)
        _, _extension = os.path.splitext(parsed_url.path)
        if len(_extension) > 0:  # Looks like this might be a file!
            filename = os.path.basename(parsed_url.path)
            extension = _extension

    # Create an initial guess from all this information
    base_guess = StreamInfo(
        mimetype=mimetype,
        charset=charset,
        filename=filename,
        extension=extension,
        url=response.url,
    )

    # Update with any additional info from the arguments
    if stream_info is not None:
        base_guess = base_guess.copy_and_update(stream_info)
    if file_extension is not None:
        # Deprecated -- use stream_info
        base_guess = base_guess.copy_and_update(extension=file_extension)
    if url is not None:
        # Deprecated -- use stream_info
        base_guess = base_guess.copy_and_update(url=url)

    # Read into BytesIO so the converters get a seekable stream
    buffer = io.BytesIO()
    for chunk in response.iter_content(chunk_size=512):
        buffer.write(chunk)
    buffer.seek(0)

    # Convert
    guesses = self._get_stream_info_guesses(
        file_stream=buffer, base_guess=base_guess
    )
    return self._convert(file_stream=buffer, stream_info_guesses=guesses, **kwargs)
|
||||
|
||||
def _convert(
    self, *, file_stream: BinaryIO, stream_info_guesses: List[StreamInfo], **kwargs
) -> DocumentConverterResult:
    """
    Try every registered converter against every stream-info guess and
    return the first successful result.

    Args:
        - file_stream: stream to convert; its position is restored to the
          position it had on entry after every conversion attempt
        - stream_info_guesses: candidate descriptions of the stream, tried in
          order; an empty StreamInfo is always tried last
        - kwargs: additional arguments to pass to the converters

    Raises:
        FileConversionException: at least one converter accepted the stream
            but every attempt raised.
        UnsupportedFormatException: no converter produced a result and none
            raised.
    """
    res: Union[None, DocumentConverterResult] = None

    # Keep track of which converters throw exceptions
    failed_attempts: List[FailedConversionAttempt] = []

    # Create a copy of the page_converters list, sorted by priority.
    # We do this with each call to _convert because the priority of converters may change between calls.
    # The sort is guaranteed to be stable, so converters with the same priority will remain in the same order.
    sorted_registrations = sorted(self._converters, key=lambda x: x.priority)

    # Remember the initial stream position so that we can return to it
    cur_pos = file_stream.tell()

    for stream_info in stream_info_guesses + [StreamInfo()]:
        for converter_registration in sorted_registrations:
            converter = converter_registration.converter
            # Sanity check -- make sure the cur_pos is still the same
            assert (
                cur_pos == file_stream.tell()
            ), "File stream position should NOT change between guess iterations"

            # Fresh copy per attempt, so the per-guess legacy keys added
            # below never leak from one attempt into the next.
            _kwargs = {k: v for k, v in kwargs.items()}

            # Copy any additional global options
            if "llm_client" not in _kwargs and self._llm_client is not None:
                _kwargs["llm_client"] = self._llm_client

            if "llm_model" not in _kwargs and self._llm_model is not None:
                _kwargs["llm_model"] = self._llm_model

            if "llm_prompt" not in _kwargs and self._llm_prompt is not None:
                _kwargs["llm_prompt"] = self._llm_prompt

            if "style_map" not in _kwargs and self._style_map is not None:
                _kwargs["style_map"] = self._style_map

            if "exiftool_path" not in _kwargs and self._exiftool_path is not None:
                _kwargs["exiftool_path"] = self._exiftool_path

            # Add the list of converters for nested processing
            _kwargs["_parent_converters"] = self._converters

            # Add legacy kwargs
            if stream_info is not None:
                if stream_info.extension is not None:
                    _kwargs["file_extension"] = stream_info.extension

                if stream_info.url is not None:
                    _kwargs["url"] = stream_info.url

            # Check if the converter will accept the file, and if so, try to convert it
            _accepts = False
            try:
                _accepts = converter.accepts(file_stream, stream_info, **_kwargs)
            except NotImplementedError:
                # A converter that does not implement accepts() is treated as declining
                pass

            # accept() should not have changed the file stream position
            assert (
                cur_pos == file_stream.tell()
            ), f"{type(converter).__name__}.accept() should NOT change the file_stream position"

            # Attempt the conversion
            if _accepts:
                try:
                    res = converter.convert(file_stream, stream_info, **_kwargs)
                except Exception:
                    # Record the failure and keep going with the remaining converters
                    failed_attempts.append(
                        FailedConversionAttempt(
                            converter=converter, exc_info=sys.exc_info()
                        )
                    )
                finally:
                    # Rewind whether or not the conversion succeeded
                    file_stream.seek(cur_pos)

            if res is not None:
                # Normalize the content: strip trailing whitespace from every
                # line, then collapse runs of 3+ newlines into exactly two
                res.text_content = "\n".join(
                    [line.rstrip() for line in re.split(r"\r?\n", res.text_content)]
                )
                res.text_content = re.sub(r"\n{3,}", "\n\n", res.text_content)
                return res

    # If we got this far without success, report any exceptions
    if len(failed_attempts) > 0:
        raise FileConversionException(attempts=failed_attempts)

    # Nothing can handle it!
    raise UnsupportedFormatException(
        "Could not convert stream to Markdown. No converter attempted a conversion, suggesting that the filetype is simply not supported."
    )
|
||||
|
||||
def register_page_converter(self, converter: DocumentConverter) -> None:
    """DEPRECATED: Use register_converter instead.

    Emits a DeprecationWarning and then registers `converter` through
    register_converter() with that method's default priority.
    """
    notice = "register_page_converter is deprecated. Use register_converter instead."
    warn(notice, DeprecationWarning)
    self.register_converter(converter)
|
||||
|
||||
def register_converter(
    self,
    converter: DocumentConverter,
    *,
    priority: float = PRIORITY_SPECIFIC_FILE_FORMAT,
) -> None:
    """
    Register a DocumentConverter with a given priority.

    Lower priority values are tried first (i.e., they are higher priority).
    Most converters use the default, PRIORITY_SPECIFIC_FILE_FORMAT (== 0).
    The PlainTextConverter, HtmlConverter, and ZipConverter are the
    exception: they are registered with a generic-format priority (== 10)
    so that they are tried after the format-specific converters.

    Just prior to conversion, the converters are sorted by priority using a
    stable sort. Because each registration is inserted at the front of the
    list, converters with equal priority are tried most-recently-registered
    first.

    The order of built-in converters is tightly controlled, but plugins can
    register converters in any order; the priority field reasserts some
    control over the resulting order. Plugins may use any priority to appear
    before or after the built-ins. For example, a plugin converter with
    priority 9 runs after the format-specific built-ins but before the
    PlainTextConverter.

    Args:
        - converter: the converter instance to register
        - priority: sort key for the registration; lower runs earlier
    """
    registration = ConverterRegistration(converter=converter, priority=priority)
    # Front insertion is what makes the newest registration win among equals
    self._converters.insert(0, registration)
|
||||
|
||||
def _get_stream_info_guesses(
    self, file_stream: BinaryIO, base_guess: StreamInfo
) -> List[StreamInfo]:
    """
    Given a base guess, attempt to guess or expand on the stream info using the stream content (via magika).

    Args:
        - file_stream: stream to inspect; its position is restored before returning
        - base_guess: what is already known about the stream

    Returns:
        One or two StreamInfo guesses:
        - magika identified the content and agrees with the base guess:
          a single guess merging both (base values take precedence);
        - magika identified the content but disagrees: the enhanced base
          guess followed by a guess built from magika's prediction;
        - magika could not identify the content: the enhanced base guess only.
    """
    guesses: List[StreamInfo] = []

    # Enhance the base guess with information based on the extension or mimetype
    enhanced_guess = base_guess.copy_and_update()

    # If there's an extension and no mimetype, try to guess the mimetype
    if base_guess.mimetype is None and base_guess.extension is not None:
        _m, _ = mimetypes.guess_type(
            "placeholder" + base_guess.extension, strict=False
        )
        if _m is not None:
            enhanced_guess = enhanced_guess.copy_and_update(mimetype=_m)

    # If there's a mimetype and no extension, try to guess the extension
    if base_guess.mimetype is not None and base_guess.extension is None:
        _e = mimetypes.guess_all_extensions(base_guess.mimetype, strict=False)
        if len(_e) > 0:
            enhanced_guess = enhanced_guess.copy_and_update(extension=_e[0])

    # Call magika to guess from the stream
    cur_pos = file_stream.tell()
    try:
        result = self._magika.identify_stream(file_stream)
        if result.status == "ok" and result.prediction.output.label != "unknown":
            # If it's text, also guess the charset
            charset = None
            if result.prediction.output.is_text:
                # Read the first 4k to guess the charset
                file_stream.seek(cur_pos)
                stream_page = file_stream.read(4096)
                charset_result = charset_normalizer.from_bytes(stream_page).best()

                if charset_result is not None:
                    charset = self._normalize_charset(charset_result.encoding)

            # Normalize the first extension listed
            guessed_extension = None
            if len(result.prediction.output.extensions) > 0:
                guessed_extension = "." + result.prediction.output.extensions[0]

            # Determine if the guess is compatible with the base guess
            # (any known base field that contradicts magika makes it incompatible)
            compatible = True
            if (
                base_guess.mimetype is not None
                and base_guess.mimetype != result.prediction.output.mime_type
            ):
                compatible = False

            if (
                base_guess.extension is not None
                and base_guess.extension.lstrip(".")
                not in result.prediction.output.extensions
            ):
                compatible = False

            # NOTE: charset stays None when magika does not consider the
            # content text, so any base charset then counts as incompatible.
            if (
                base_guess.charset is not None
                and self._normalize_charset(base_guess.charset) != charset
            ):
                compatible = False

            if compatible:
                # Add the compatible base guess
                # (built from base_guess, not enhanced_guess; magika fills the gaps)
                guesses.append(
                    StreamInfo(
                        mimetype=base_guess.mimetype
                        or result.prediction.output.mime_type,
                        extension=base_guess.extension or guessed_extension,
                        charset=base_guess.charset or charset,
                        filename=base_guess.filename,
                        local_path=base_guess.local_path,
                        url=base_guess.url,
                    )
                )
            else:
                # The magika guess was incompatible with the base guess, so add both guesses
                guesses.append(enhanced_guess)
                guesses.append(
                    StreamInfo(
                        mimetype=result.prediction.output.mime_type,
                        extension=guessed_extension,
                        charset=charset,
                        filename=base_guess.filename,
                        local_path=base_guess.local_path,
                        url=base_guess.url,
                    )
                )
        else:
            # There were no other guesses, so just add the base guess
            guesses.append(enhanced_guess)
    finally:
        # Always rewind, even if magika or the charset detection raised
        file_stream.seek(cur_pos)

    return guesses
|
||||
|
||||
def _normalize_charset(self, charset: str | None) -> str | None:
|
||||
"""
|
||||
Normalize a charset string to a canonical form.
|
||||
"""
|
||||
if charset is None:
|
||||
return None
|
||||
try:
|
||||
return codecs.lookup(charset).name
|
||||
except LookupError:
|
||||
return charset
|
||||
@@ -0,0 +1,32 @@
|
||||
from dataclasses import dataclass, asdict
|
||||
from typing import Optional
|
||||
|
||||
|
||||
@dataclass(kw_only=True, frozen=True)
|
||||
class StreamInfo:
|
||||
"""The StreamInfo class is used to store information about a file stream.
|
||||
All fields can be None, and will depend on how the stream was opened.
|
||||
"""
|
||||
|
||||
mimetype: Optional[str] = None
|
||||
extension: Optional[str] = None
|
||||
charset: Optional[str] = None
|
||||
filename: Optional[
|
||||
str
|
||||
] = None # From local path, url, or Content-Disposition header
|
||||
local_path: Optional[str] = None # If read from disk
|
||||
url: Optional[str] = None # If read from url
|
||||
|
||||
def copy_and_update(self, *args, **kwargs):
|
||||
"""Copy the StreamInfo object and update it with the given StreamInfo
|
||||
instance and/or other keyword arguments."""
|
||||
new_info = asdict(self)
|
||||
|
||||
for si in args:
|
||||
assert isinstance(si, StreamInfo)
|
||||
new_info.update({k: v for k, v in asdict(si).items() if v is not None})
|
||||
|
||||
if len(kwargs) > 0:
|
||||
new_info.update(kwargs)
|
||||
|
||||
return StreamInfo(**new_info)
|
||||
@@ -0,0 +1,52 @@
|
||||
import base64
|
||||
import os
|
||||
from typing import Tuple, Dict
|
||||
from urllib.request import url2pathname
|
||||
from urllib.parse import urlparse, unquote_to_bytes
|
||||
|
||||
|
||||
def file_uri_to_path(file_uri: str) -> Tuple[str | None, str]:
|
||||
"""Convert a file URI to a local file path"""
|
||||
parsed = urlparse(file_uri)
|
||||
if parsed.scheme != "file":
|
||||
raise ValueError(f"Not a file URL: {file_uri}")
|
||||
|
||||
netloc = parsed.netloc if parsed.netloc else None
|
||||
path = os.path.abspath(url2pathname(parsed.path))
|
||||
return netloc, path
|
||||
|
||||
|
||||
def parse_data_uri(uri: str) -> Tuple[str | None, Dict[str, str], bytes]:
|
||||
if not uri.startswith("data:"):
|
||||
raise ValueError("Not a data URI")
|
||||
|
||||
header, _, data = uri.partition(",")
|
||||
if not _:
|
||||
raise ValueError("Malformed data URI, missing ',' separator")
|
||||
|
||||
meta = header[5:] # Strip 'data:'
|
||||
parts = meta.split(";")
|
||||
|
||||
is_base64 = False
|
||||
# Ends with base64?
|
||||
if parts[-1] == "base64":
|
||||
parts.pop()
|
||||
is_base64 = True
|
||||
|
||||
mime_type = None # Normally this would default to text/plain but we won't assume
|
||||
if len(parts) and len(parts[0]) > 0:
|
||||
# First part is the mime type
|
||||
mime_type = parts.pop(0)
|
||||
|
||||
attributes: Dict[str, str] = {}
|
||||
for part in parts:
|
||||
# Handle key=value pairs in the middle
|
||||
if "=" in part:
|
||||
key, value = part.split("=", 1)
|
||||
attributes[key] = value
|
||||
elif len(part) > 0:
|
||||
attributes[part] = ""
|
||||
|
||||
content = base64.b64decode(data) if is_base64 else unquote_to_bytes(data)
|
||||
|
||||
return mime_type, attributes, content
|
||||
@@ -0,0 +1,273 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
"""
|
||||
Adapted from https://github.com/xiilei/dwml/blob/master/dwml/latex_dict.py
|
||||
On 25/03/2025
|
||||
"""
|
||||
|
||||
from __future__ import unicode_literals
|
||||
|
||||
# Characters that escape_latex() prefixes with a backslash when they are not
# already preceded by one.
CHARS = ("{", "}", "_", "^", "#", "&", "$", "%", "~")

# Empty string, used as the separator when joining output fragments.
BLANK = ""
# A single backslash character.
BACKSLASH = "\\"
# presumably the LaTeX alignment marker -- confirm against its users
ALN = "&"
|
||||
|
||||
# Maps a Unicode combining/grouping character to a LaTeX template. Each
# template is a str.format string with one positional slot ({0}) for the
# decorated expression; "{{" and "}}" render as literal braces, so every
# template must keep them balanced or str.format raises ValueError.
CHR = {
    # Unicode : Latex Math Symbols
    # Top accents
    "\u0300": "\\grave{{{0}}}",
    "\u0301": "\\acute{{{0}}}",
    "\u0302": "\\hat{{{0}}}",
    "\u0303": "\\tilde{{{0}}}",
    "\u0304": "\\bar{{{0}}}",
    "\u0305": "\\overbar{{{0}}}",
    "\u0306": "\\breve{{{0}}}",
    "\u0307": "\\dot{{{0}}}",
    "\u0308": "\\ddot{{{0}}}",
    "\u0309": "\\ovhook{{{0}}}",
    "\u030a": "\\ocirc{{{0}}}",
    "\u030c": "\\check{{{0}}}",
    "\u0310": "\\candra{{{0}}}",
    "\u0312": "\\oturnedcomma{{{0}}}",
    "\u0315": "\\ocommatopright{{{0}}}",
    "\u031a": "\\droang{{{0}}}",
    "\u0338": "\\not{{{0}}}",
    "\u20d0": "\\leftharpoonaccent{{{0}}}",
    "\u20d1": "\\rightharpoonaccent{{{0}}}",
    "\u20d2": "\\vertoverlay{{{0}}}",
    "\u20d6": "\\overleftarrow{{{0}}}",
    "\u20d7": "\\vec{{{0}}}",
    "\u20db": "\\dddot{{{0}}}",
    "\u20dc": "\\ddddot{{{0}}}",
    "\u20e1": "\\overleftrightarrow{{{0}}}",
    "\u20e7": "\\annuity{{{0}}}",
    "\u20e9": "\\widebridgeabove{{{0}}}",
    "\u20f0": "\\asteraccent{{{0}}}",
    # Bottom accents
    "\u0330": "\\wideutilde{{{0}}}",
    "\u0331": "\\underbar{{{0}}}",
    "\u20e8": "\\threeunderdot{{{0}}}",
    "\u20ec": "\\underrightharpoondown{{{0}}}",
    "\u20ed": "\\underleftharpoondown{{{0}}}",
    "\u20ee": "\\underleftarrow{{{0}}}",
    "\u20ef": "\\underrightarrow{{{0}}}",
    # Over | group
    "\u23b4": "\\overbracket{{{0}}}",
    "\u23dc": "\\overparen{{{0}}}",
    "\u23de": "\\overbrace{{{0}}}",
    # Under| group
    "\u23b5": "\\underbracket{{{0}}}",
    "\u23dd": "\\underparen{{{0}}}",
    "\u23df": "\\underbrace{{{0}}}",
}
|
||||
|
||||
# Maps the Unicode character of an n-ary ("big") operator to its LaTeX command.
# Unlike CHR, these values contain no format placeholder.
CHR_BO = {
    # Big operators,
    "\u2140": "\\Bbbsum",
    "\u220f": "\\prod",
    "\u2210": "\\coprod",
    "\u2211": "\\sum",
    "\u222b": "\\int",
    "\u22c0": "\\bigwedge",
    "\u22c1": "\\bigvee",
    "\u22c2": "\\bigcap",
    "\u22c3": "\\bigcup",
    "\u2a00": "\\bigodot",
    "\u2a01": "\\bigoplus",
    "\u2a02": "\\bigotimes",
}
|
||||
|
||||
# Maps a single Unicode character found in math text to its LaTeX spelling.
# Commands carry a trailing space so that the following character cannot run
# into the command name; plain letters map to themselves without one.
T = {
    # Greek letters (Mathematical Italic Small, U+1D6FC..U+1D71B)
    "\U0001d6fc": "\\alpha ",
    "\U0001d6fd": "\\beta ",
    "\U0001d6fe": "\\gamma ",
    "\U0001d6ff": "\\delta ",
    "\U0001d700": "\\epsilon ",
    "\U0001d701": "\\zeta ",
    "\U0001d702": "\\eta ",
    "\U0001d703": "\\theta ",
    "\U0001d704": "\\iota ",
    "\U0001d705": "\\kappa ",
    "\U0001d706": "\\lambda ",
    "\U0001d707": "\\mu ",
    "\U0001d708": "\\nu ",
    "\U0001d709": "\\xi ",
    "\U0001d70a": "\\omicron ",
    "\U0001d70b": "\\pi ",
    "\U0001d70c": "\\rho ",
    "\U0001d70d": "\\varsigma ",
    "\U0001d70e": "\\sigma ",
    "\U0001d70f": "\\tau ",
    "\U0001d710": "\\upsilon ",
    "\U0001d711": "\\phi ",
    "\U0001d712": "\\chi ",
    "\U0001d713": "\\psi ",
    "\U0001d714": "\\omega ",
    "\U0001d715": "\\partial ",
    "\U0001d716": "\\varepsilon ",
    "\U0001d717": "\\vartheta ",
    "\U0001d718": "\\varkappa ",
    "\U0001d719": "\\varphi ",
    "\U0001d71a": "\\varrho ",
    "\U0001d71b": "\\varpi ",
    # Relation symbols
    "\u2190": "\\leftarrow ",
    "\u2191": "\\uparrow ",
    "\u2192": "\\rightarrow ",
    "\u2193": "\\downarrow ",
    "\u2194": "\\leftrightarrow ",
    "\u2195": "\\updownarrow ",
    "\u2196": "\\nwarrow ",
    "\u2197": "\\nearrow ",
    "\u2198": "\\searrow ",
    "\u2199": "\\swarrow ",
    "\u22ee": "\\vdots ",
    "\u22ef": "\\cdots ",
    "\u22f0": "\\adots ",
    "\u22f1": "\\ddots ",
    "\u2260": "\\ne ",
    "\u2264": "\\leq ",
    "\u2265": "\\geq ",
    "\u2266": "\\leqq ",
    "\u2267": "\\geqq ",
    "\u2268": "\\lneqq ",
    "\u2269": "\\gneqq ",
    "\u226a": "\\ll ",
    "\u226b": "\\gg ",
    "\u2208": "\\in ",
    "\u2209": "\\notin ",
    "\u220b": "\\ni ",
    "\u220c": "\\nni ",
    # Ordinary symbols
    "\u221e": "\\infty ",
    # Binary relations
    "\u00b1": "\\pm ",
    "\u2213": "\\mp ",
    # Italic, Latin, uppercase
    "\U0001d434": "A",
    "\U0001d435": "B",
    "\U0001d436": "C",
    "\U0001d437": "D",
    "\U0001d438": "E",
    "\U0001d439": "F",
    "\U0001d43a": "G",
    "\U0001d43b": "H",
    "\U0001d43c": "I",
    "\U0001d43d": "J",
    "\U0001d43e": "K",
    "\U0001d43f": "L",
    "\U0001d440": "M",
    "\U0001d441": "N",
    "\U0001d442": "O",
    "\U0001d443": "P",
    "\U0001d444": "Q",
    "\U0001d445": "R",
    "\U0001d446": "S",
    "\U0001d447": "T",
    "\U0001d448": "U",
    "\U0001d449": "V",
    "\U0001d44a": "W",
    "\U0001d44b": "X",
    "\U0001d44c": "Y",
    "\U0001d44d": "Z",
    # Italic, Latin, lowercase
    "\U0001d44e": "a",
    "\U0001d44f": "b",
    "\U0001d450": "c",
    "\U0001d451": "d",
    "\U0001d452": "e",
    "\U0001d453": "f",
    "\U0001d454": "g",
    # U+1D455 is a reserved code point: italic small h is encoded as
    # U+210E (PLANCK CONSTANT) instead.
    "\u210e": "h",
    "\U0001d456": "i",
    "\U0001d457": "j",
    "\U0001d458": "k",
    "\U0001d459": "l",
    "\U0001d45a": "m",
    "\U0001d45b": "n",
    "\U0001d45c": "o",
    "\U0001d45d": "p",
    "\U0001d45e": "q",
    "\U0001d45f": "r",
    "\U0001d460": "s",
    "\U0001d461": "t",
    "\U0001d462": "u",
    "\U0001d463": "v",
    "\U0001d464": "w",
    "\U0001d465": "x",
    "\U0001d466": "y",
    "\U0001d467": "z",
}
|
||||
|
||||
# Function name -> LaTeX template. The argument is spliced in at the literal
# "{fe}" marker (see FUNC_PLACE below).
FUNC = {
    "sin": "\\sin({fe})",
    "cos": "\\cos({fe})",
    "tan": "\\tan({fe})",
    "arcsin": "\\arcsin({fe})",
    "arccos": "\\arccos({fe})",
    "arctan": "\\arctan({fe})",
    "arccot": "\\arccot({fe})",
    "sinh": "\\sinh({fe})",
    "cosh": "\\cosh({fe})",
    "tanh": "\\tanh({fe})",
    "coth": "\\coth({fe})",
    "sec": "\\sec({fe})",
    "csc": "\\csc({fe})",
}

# The placeholder text that appears inside every FUNC template.
FUNC_PLACE = "{fe}"

# Two backslash characters, i.e. a LaTeX line break.
BRK = "\\\\"

# The templates below use str.format-style placeholders; "{{" / "}}" stand for
# literal braces. NOTE(review): how each one is rendered is decided by the
# importing module -- confirm there before changing a placeholder name.

# presumably the accent used when an element names none -- confirm
CHR_DEFAULT = {
    "ACC_VAL": "\\hat{{{0}}}",
}

# Bar position ("top" / "bot") -> template.
POS = {
    "top": "\\overline{{{0}}}",  # not sure
    "bot": "\\underline{{{0}}}",
}

# presumably the bar used when no position is given -- confirm
POS_DEFAULT = {
    "BAR_VAL": "\\overline{{{0}}}",
}

# Subscript and superscript templates.
SUB = "_{{{0}}}"

SUP = "^{{{0}}}"

# Fraction templates keyed by fraction type; each takes `num` and `den`.
F = {
    "bar": "\\frac{{{num}}}{{{den}}}",
    "skw": r"^{{{num}}}/_{{{den}}}",
    "noBar": "\\genfrac{{}}{{}}{{0pt}}{{}}{{{num}}}{{{den}}}",
    "lin": "{{{num}}}/{{{den}}}",
}
# Same template as F["bar"].
F_DEFAULT = "\\frac{{{num}}}{{{den}}}"

# Delimiter template: `left` and `right` are the delimiter characters placed
# after \left and \right, `text` is the enclosed expression.
D = "\\left{left}{text}\\right{right}"

# Default delimiter characters; "." after \left or \right is LaTeX's
# invisible (null) delimiter.
D_DEFAULT = {
    "left": "(",
    "right": ")",
    "null": ".",
}

# Radical with an explicit degree (`deg`) ...
RAD = "\\sqrt[{deg}]{{{text}}}"

# ... and without one.
RAD_DEFAULT = "\\sqrt{{{text}}}"

# Single centered-column array wrapping `text`.
ARR = "\\begin{{array}}{{c}}{text}\\end{{array}}"

# Limit-style function name -> template taking the lower limit `lim`.
LIM_FUNC = {
    "lim": "\\lim_{{{lim}}}",
    "max": "\\max_{{{lim}}}",
    "min": "\\min_{{{lim}}}",
}

# presumably a (search, replacement) pair applied to limit text -- confirm
LIM_TO = ("\\rightarrow", "\\to")

# Places `lim` above `text`.
LIM_UPP = "\\overset{{{lim}}}{{{text}}}"

# Matrix environment wrapping `text`.
M = "\\begin{{matrix}}{text}\\end{{matrix}}"
|
||||
@@ -0,0 +1,400 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
"""
|
||||
Office Math Markup Language (OMML)
|
||||
Adapted from https://github.com/xiilei/dwml/blob/master/dwml/omml.py
|
||||
On 25/03/2025
|
||||
"""
|
||||
|
||||
from defusedxml import ElementTree as ET
|
||||
|
||||
from .latex_dict import (
|
||||
CHARS,
|
||||
CHR,
|
||||
CHR_BO,
|
||||
CHR_DEFAULT,
|
||||
POS,
|
||||
POS_DEFAULT,
|
||||
SUB,
|
||||
SUP,
|
||||
F,
|
||||
F_DEFAULT,
|
||||
T,
|
||||
FUNC,
|
||||
D,
|
||||
D_DEFAULT,
|
||||
RAD,
|
||||
RAD_DEFAULT,
|
||||
ARR,
|
||||
LIM_FUNC,
|
||||
LIM_TO,
|
||||
LIM_UPP,
|
||||
M,
|
||||
BRK,
|
||||
BLANK,
|
||||
BACKSLASH,
|
||||
ALN,
|
||||
FUNC_PLACE,
|
||||
)
|
||||
|
||||
# Office Math namespace in ElementTree's "{uri}" prefix form; prepended to a
# local tag name to build a qualified tag, and stripped to recover the local name.
OMML_NS = "{http://schemas.openxmlformats.org/officeDocument/2006/math}"
|
||||
|
||||
|
||||
def load(stream):
    """Parse XML from `stream` and lazily yield oMath2Latex(element) for each
    oMath element matched by findall() on the parsed tree."""
    document = ET.parse(stream)
    yield from map(oMath2Latex, document.findall(OMML_NS + "oMath"))
|
||||
|
||||
|
||||
def load_string(string):
    """Parse XML from `string` and lazily yield oMath2Latex(element) for each
    oMath element matched by findall() on the root element."""
    root_element = ET.fromstring(string)
    yield from map(oMath2Latex, root_element.findall(OMML_NS + "oMath"))
|
||||
|
||||
|
||||
def escape_latex(strs):
    """Backslash-escape the characters listed in CHARS.

    Doubled backslashes in the input are first collapsed into a single one.
    A character from CHARS is left untouched when the character immediately
    before it is a backslash, i.e. when it already looks escaped.
    """
    previous = None
    pieces = []
    for ch in strs.replace(r"\\", "\\"):
        needs_escape = (ch in CHARS) and (previous != BACKSLASH)
        pieces.append(BACKSLASH + ch if needs_escape else ch)
        previous = ch
    return BLANK.join(pieces)
|
||||
|
||||
|
||||
def get_val(key, default=None, store=CHR):
    """Translate `key` through `store`.

    Returns `default` when `key` is None. Otherwise returns `store[key]` if
    present, and `key` itself when it is missing or when `store` is falsy.
    """
    if key is None:
        return default
    if not store:
        return key
    return store.get(key, key)
|
||||
|
||||
|
||||
class Tag2Method(object):
    """Base class that dispatches an XML element to a handler chosen by tag.

    Handlers are looked up in `self.tag2meth`, a mapping from local tag name
    to a callable invoked as handler(self, elm). This class does not define
    `tag2meth` itself; subclasses are expected to provide it.
    """

    def call_method(self, elm, stag=None):
        """Run the handler registered for `elm` and return its result.

        `stag` is the local tag name; when omitted it is derived from
        elm.tag. Returns None when no handler is registered.
        """
        lookup = self.tag2meth.get
        if stag is None:
            stag = elm.tag.replace(OMML_NS, "")
        handler = lookup(stag)
        return handler(self, elm) if handler else None

    def process_children_list(self, elm, include=None):
        """
        process children of the elm,return iterable

        Yields (local_tag, result, child) for each child in the Office Math
        namespace, optionally restricted to the tags in `include`. Children
        whose handler and process_unknow() fallback both give None are skipped.
        """
        for child in list(elm):
            if OMML_NS not in child.tag:
                continue
            stag = child.tag.replace(OMML_NS, "")
            if include and (stag not in include):
                continue
            result = self.call_method(child, stag=stag)
            if result is None:
                # No handler (or it returned nothing): give the fallback a chance
                result = self.process_unknow(child, stag)
            if result is not None:
                yield (stag, result, child)

    def process_children_dict(self, elm, include=None):
        """
        process children of the elm,return dict

        Keys are local tag names; when a tag occurs more than once the last
        result wins.
        """
        return {
            stag: result
            for stag, result, _child in self.process_children_list(elm, include)
        }

    def process_children(self, elm, include=None):
        """
        process children of the elm,return string

        Results that are Tag2Method instances are converted with str()
        before everything is joined together.
        """
        fragments = (
            str(result) if isinstance(result, Tag2Method) else result
            for _stag, result, _child in self.process_children_list(elm, include)
        )
        return BLANK.join(fragments)

    def process_unknow(self, elm, stag):
        """Fallback for children without a usable handler; this base version returns None."""
        return None
|
||||
|
||||
|
||||
class Pr(Tag2Method):
    """
    Common properties of an element (the ``*Pr`` children).

    On construction the children of the property element are processed:
    the ``val`` attribute of each ``chr``/``pos``/``begChr``/``endChr``/``type``
    child is recorded and exposed as an attribute of the same name, and a
    ``brk`` child contributes ``BRK`` to ``text``. Attributes that were not
    recorded read as ``None``.
    """

    # LaTeX text produced while processing the children (only ``brk`` emits any).
    text = ""

    # Child tags whose ``val`` attribute is captured by ``do_common``.
    __val_tags = ("chr", "pos", "begChr", "endChr", "type")

    __innerdict = None  # can't use the __dict__

    def __init__(self, elm):
        self.__innerdict = {}
        self.text = self.process_children(elm)

    def __str__(self):
        return self.text

    def __unicode__(self):
        # Previously ``self.__str__(self)``: the bound method already carries
        # ``self``, so the extra argument raised TypeError.
        return self.__str__()

    def __getattr__(self, name):
        # Only reached for names not found normally; unknown properties
        # therefore read as None instead of raising AttributeError.
        return self.__innerdict.get(name, None)

    def do_brk(self, elm):
        """Record a break and return ``BRK`` so it becomes part of ``text``."""
        self.__innerdict["brk"] = BRK
        return BRK

    def do_common(self, elm):
        """
        Store the ``val`` attribute of a recognised property child.

        Always returns ``None``, so these children add nothing to ``text``.
        """
        stag = elm.tag.replace(OMML_NS, "")
        if stag in self.__val_tags:
            t = elm.get("{0}val".format(OMML_NS))
            self.__innerdict[stag] = t
        return None

    # Tag name -> handler; consumed by ``Tag2Method.call_method``.
    tag2meth = {
        "brk": do_brk,
        "chr": do_common,
        "pos": do_common,
        "begChr": do_common,
        "endChr": do_common,
        "type": do_common,
    }
|
||||
|
||||
|
||||
class oMath2Latex(Tag2Method):
    """
    Convert oMath element of omml to latex

    The element's children are converted on construction; the result is
    available through the ``latex`` property and ``str()``.
    """

    # Per-character replacement table applied to run text in ``do_r``.
    _t_dict = T

    # Container tags without a dedicated handler: their children are
    # converted and concatenated as-is.
    __direct_tags = ("box", "sSub", "sSup", "sSubSup", "num", "den", "deg", "e")

    def __init__(self, element):
        self._latex = self.process_children(element)

    def __str__(self):
        return self.latex

    def __unicode__(self):
        # Previously ``self.__str__(self)``: the bound method already carries
        # ``self``, so the extra argument raised TypeError.
        return self.__str__()

    def process_unknow(self, elm, stag):
        """
        Handle tags that have no entry in ``tag2meth``.

        Direct container tags are flattened into the LaTeX of their children,
        tags ending in ``Pr`` become a ``Pr`` object, anything else is ignored.
        """
        if stag in self.__direct_tags:
            return self.process_children(elm)
        elif stag[-2:] == "Pr":
            return Pr(elm)
        else:
            return None

    @property
    def latex(self):
        """The LaTeX string produced for the wrapped element."""
        return self._latex

    def do_acc(self, elm):
        """
        the accent function
        """
        c_dict = self.process_children_dict(elm)
        latex_s = get_val(
            c_dict["accPr"].chr, default=CHR_DEFAULT.get("ACC_VAL"), store=CHR
        )
        return latex_s.format(c_dict["e"])

    def do_bar(self, elm):
        """
        the bar function
        """
        c_dict = self.process_children_dict(elm)
        pr = c_dict["barPr"]
        latex_s = get_val(pr.pos, default=POS_DEFAULT.get("BAR_VAL"), store=POS)
        return pr.text + latex_s.format(c_dict["e"])

    def do_d(self, elm):
        """
        the delimiter object
        """
        c_dict = self.process_children_dict(elm)
        pr = c_dict["dPr"]
        null = D_DEFAULT.get("null")
        s_val = get_val(pr.begChr, default=D_DEFAULT.get("left"), store=T)
        e_val = get_val(pr.endChr, default=D_DEFAULT.get("right"), store=T)
        # An empty delimiter character is rendered with the "null" placeholder.
        return pr.text + D.format(
            left=null if not s_val else escape_latex(s_val),
            text=c_dict["e"],
            right=null if not e_val else escape_latex(e_val),
        )

    def do_spre(self, elm):
        """
        the Pre-Sub-Superscript object -- Not support yet
        """
        pass

    def do_sub(self, elm):
        """Wrap the converted children in the ``SUB`` template."""
        text = self.process_children(elm)
        return SUB.format(text)

    def do_sup(self, elm):
        """Wrap the converted children in the ``SUP`` template."""
        text = self.process_children(elm)
        return SUP.format(text)

    def do_f(self, elm):
        """
        the fraction object
        """
        c_dict = self.process_children_dict(elm)
        pr = c_dict["fPr"]
        latex_s = get_val(pr.type, default=F_DEFAULT, store=F)
        return pr.text + latex_s.format(num=c_dict.get("num"), den=c_dict.get("den"))

    def do_func(self, elm):
        """
        the Function-Apply object (Examples:sin cos)
        """
        c_dict = self.process_children_dict(elm)
        func_name = c_dict.get("fName")
        # ``do_fname`` leaves FUNC_PLACE where the argument belongs.
        return func_name.replace(FUNC_PLACE, c_dict.get("e"))

    def do_fname(self, elm):
        """
        the func name

        Raises NotImplementedError when a run's text has no entry in ``FUNC``.
        """
        latex_chars = []
        for stag, t, e in self.process_children_list(elm):
            if stag == "r":
                if FUNC.get(t):
                    latex_chars.append(FUNC[t])
                else:
                    raise NotImplementedError("Not support func %s" % t)
            else:
                latex_chars.append(t)
        t = BLANK.join(latex_chars)
        return t if FUNC_PLACE in t else t + FUNC_PLACE  # do_func will replace this

    def do_groupchr(self, elm):
        """
        the Group-Character object
        """
        c_dict = self.process_children_dict(elm)
        pr = c_dict["groupChrPr"]
        latex_s = get_val(pr.chr)
        return pr.text + latex_s.format(c_dict["e"])

    def do_rad(self, elm):
        """
        the radical object
        """
        c_dict = self.process_children_dict(elm)
        text = c_dict.get("e")
        deg_text = c_dict.get("deg")
        if deg_text:
            return RAD.format(deg=deg_text, text=text)
        else:
            return RAD_DEFAULT.format(text=text)

    def do_eqarr(self, elm):
        """
        the Array object
        """
        return ARR.format(
            text=BRK.join(
                [t for stag, t, e in self.process_children_list(elm, include=("e",))]
            )
        )

    def do_limlow(self, elm):
        """
        the Lower-Limit object

        Raises NotImplementedError when the base text has no entry in ``LIM_FUNC``.
        """
        t_dict = self.process_children_dict(elm, include=("e", "lim"))
        latex_s = LIM_FUNC.get(t_dict["e"])
        if not latex_s:
            raise NotImplementedError("Not support lim %s" % t_dict["e"])
        else:
            return latex_s.format(lim=t_dict.get("lim"))

    def do_limupp(self, elm):
        """
        the Upper-Limit object
        """
        t_dict = self.process_children_dict(elm, include=("e", "lim"))
        return LIM_UPP.format(lim=t_dict.get("lim"), text=t_dict.get("e"))

    def do_lim(self, elm):
        """
        the lower limit of the limLow object and the upper limit of the limUpp function
        """
        return self.process_children(elm).replace(LIM_TO[0], LIM_TO[1])

    def do_m(self, elm):
        """
        the Matrix object
        """
        # Only the rows contribute; the matrix properties (mPr) are ignored.
        rows = [t for stag, t, e in self.process_children_list(elm) if stag == "mr"]
        return M.format(text=BRK.join(rows))

    def do_mr(self, elm):
        """
        a single row of the matrix m
        """
        return ALN.join(
            [t for stag, t, e in self.process_children_list(elm, include=("e",))]
        )

    def do_nary(self, elm):
        """
        the n-ary object
        """
        res = []
        bo = ""
        # When naryPr carries no m:chr value the operator is taken to be the
        # integral sign (OMML default). Without this default ``bo`` became
        # None and the concatenation below raised TypeError.
        integral = CHR_BO.get("\u222b", "\\int")
        for stag, t, e in self.process_children_list(elm):
            if stag == "naryPr":
                bo = get_val(t.chr, default=integral, store=CHR_BO)
            else:
                res.append(t)
        return bo + BLANK.join(res)

    def do_r(self, elm):
        """
        Get text from 'r' element,And try convert them to latex symbols

        A run without an ``m:t`` child yields an empty string.

        @todo text style support , (sty)
        @todo LaTeX pure-text (``text`` command) support
        """
        # ``findtext`` returns None when there is no m:t child; default to ""
        # so such a run converts to nothing instead of raising TypeError.
        text = elm.findtext("./{0}t".format(OMML_NS), default="")
        _str = [self._t_dict.get(s, s) for s in text]
        return escape_latex(BLANK.join(_str))

    # Tag name -> handler; consumed by ``Tag2Method.call_method``.
    tag2meth = {
        "acc": do_acc,
        "r": do_r,
        "bar": do_bar,
        "sub": do_sub,
        "sup": do_sup,
        "f": do_f,
        "func": do_func,
        "fName": do_fname,
        "groupChr": do_groupchr,
        "d": do_d,
        "rad": do_rad,
        "eqArr": do_eqarr,
        "limLow": do_limlow,
        "limUpp": do_limupp,
        "lim": do_lim,
        "m": do_m,
        "mr": do_mr,
        "nary": do_nary,
    }
|
||||
@@ -0,0 +1,156 @@
|
||||
import zipfile
|
||||
from io import BytesIO
|
||||
from typing import BinaryIO
|
||||
from xml.etree import ElementTree as ET
|
||||
|
||||
from bs4 import BeautifulSoup, Tag
|
||||
|
||||
from .math.omml import OMML_NS, oMath2Latex
|
||||
|
||||
# XML wrapper used to parse a single serialized math fragment on its own:
# a <w:document> root that declares the namespace prefixes the fragment may
# use, with ``{0}`` as the placeholder where the fragment is inserted.
MATH_ROOT_TEMPLATE = "".join(
    (
        "<w:document ",
        'xmlns:wpc="http://schemas.microsoft.com/office/word/2010/wordprocessingCanvas" ',
        'xmlns:mc="http://schemas.openxmlformats.org/markup-compatibility/2006" ',
        'xmlns:o="urn:schemas-microsoft-com:office:office" ',
        'xmlns:r="http://schemas.openxmlformats.org/officeDocument/2006/relationships" ',
        'xmlns:m="http://schemas.openxmlformats.org/officeDocument/2006/math" ',
        'xmlns:v="urn:schemas-microsoft-com:vml" ',
        'xmlns:wp14="http://schemas.microsoft.com/office/word/2010/wordprocessingDrawing" ',
        'xmlns:wp="http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing" ',
        'xmlns:w10="urn:schemas-microsoft-com:office:word" ',
        'xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main" ',
        'xmlns:w14="http://schemas.microsoft.com/office/word/2010/wordml" ',
        'xmlns:wpg="http://schemas.microsoft.com/office/word/2010/wordprocessingGroup" ',
        'xmlns:wpi="http://schemas.microsoft.com/office/word/2010/wordprocessingInk" ',
        'xmlns:wne="http://schemas.microsoft.com/office/word/2006/wordml" ',
        'xmlns:wps="http://schemas.microsoft.com/office/word/2010/wordprocessingShape" mc:Ignorable="w14 wp14">',
        "{0}</w:document>",
    )
)
|
||||
|
||||
|
||||
def _convert_omath_to_latex(tag: Tag) -> str:
    """
    Render one OMML (Office Math Markup Language) tag as LaTeX.

    Args:
        tag (Tag): BeautifulSoup tag holding the OMML element.

    Returns:
        str: LaTeX produced by ``oMath2Latex`` for that element.
    """
    # Serialize the tag into the namespace-declaring wrapper so it parses alone.
    wrapped_xml = MATH_ROOT_TEMPLATE.format(str(tag))
    # Locate the 'oMath' child of the wrapper root and convert it.
    omath_node = ET.fromstring(wrapped_xml).find(OMML_NS + "oMath")
    return oMath2Latex(omath_node).latex
|
||||
|
||||
|
||||
def _get_omath_tag_replacement(tag: Tag, block: bool = False) -> Tag:
    """
    Build the run that stands in for an OMML (Office Math Markup Language) element.

    Args:
        tag (Tag): BeautifulSoup tag for the "oMath" element.
        block (bool, optional): Wrap the LaTeX in ``$$`` (block mode) when True,
            in ``$`` (inline) otherwise. Defaults to False.

    Returns:
        Tag: A ``w:r`` tag containing a single ``w:t`` tag with the delimited LaTeX.
    """
    delimiter = "$$" if block else "$"
    text_tag = Tag(name="w:t")
    text_tag.string = f"{delimiter}{_convert_omath_to_latex(tag)}{delimiter}"
    run_tag = Tag(name="w:r")
    run_tag.append(text_tag)
    return run_tag
|
||||
|
||||
|
||||
def _replace_equations(tag: Tag):
    """
    Swap an OMML (Office Math Markup Language) element for its LaTeX form, in place.

    Args:
        tag (Tag): BeautifulSoup tag to replace; either "oMathPara" or "oMath".

    Raises:
        ValueError: If the tag is neither of those.
    """
    kind = tag.name
    if kind == "oMath":
        # A bare equation becomes an inline-math run.
        tag.replace_with(_get_omath_tag_replacement(tag, block=False))
        return
    if kind != "oMathPara":
        raise ValueError(f"Not supported tag: {tag.name}")

    # A math paragraph becomes a new paragraph holding one block-math run
    # per contained 'oMath'.
    paragraph = Tag(name="w:p")
    for equation in tag.find_all("oMath"):
        paragraph.append(_get_omath_tag_replacement(equation, block=True))
    tag.replace_with(paragraph)
|
||||
|
||||
|
||||
def _pre_process_math(content: bytes) -> bytes:
    """
    Rewrite the OMML (Office Math Markup Language) elements of one DOCX XML part as LaTeX.

    The returned bytes can be written back into the DOCX in place of the original part.

    Args:
        content (bytes): XML content of the part.

    Returns:
        bytes: The re-serialized XML with OMML elements replaced, encoded as bytes.
    """
    soup = BeautifulSoup(content.decode(), features="xml")
    # Math paragraphs go first; the second search then only sees the
    # equations still left in the tree.
    for tag_name in ("oMathPara", "oMath"):
        for node in soup.find_all(tag_name):
            _replace_equations(node)
    return str(soup).encode()
|
||||
|
||||
|
||||
def pre_process_docx(input_docx: BinaryIO) -> BinaryIO:
    """
    Run the pre-processing steps over a DOCX file, entirely in memory.

    Every member of the input archive is read, the XML parts listed below are
    passed through the math pre-processor (OMML -> LaTeX), and all members are
    written to a new archive. Nothing is written to disk.

    Args:
        input_docx (BinaryIO): Binary stream containing the DOCX file.

    Returns:
        BinaryIO: In-memory stream with the processed DOCX, positioned at the start.
    """
    # Archive members that get pre-processed; everything else is copied as-is.
    math_parts = (
        "word/document.xml",
        "word/footnotes.xml",
        "word/endnotes.xml",
    )
    result = BytesIO()
    with zipfile.ZipFile(input_docx, mode="r") as source_zip:
        members = {}
        for member_name in source_zip.namelist():
            members[member_name] = source_zip.read(member_name)
        with zipfile.ZipFile(result, mode="w") as target_zip:
            target_zip.comment = source_zip.comment
            for member_name, payload in members.items():
                if member_name not in math_parts:
                    target_zip.writestr(member_name, payload)
                    continue
                try:
                    # Further pre-processing steps, if any are added later,
                    # belong here.
                    target_zip.writestr(member_name, _pre_process_math(payload))
                except Exception:
                    # Best effort: keep the original part when processing fails.
                    target_zip.writestr(member_name, payload)
    result.seek(0)
    return result
|
||||
@@ -0,0 +1,48 @@
|
||||
# SPDX-FileCopyrightText: 2024-present Adam Fourney <adamfo@microsoft.com>
|
||||
#
|
||||
# SPDX-License-Identifier: MIT
|
||||
|
||||
from ._plain_text_converter import PlainTextConverter
|
||||
from ._html_converter import HtmlConverter
|
||||
from ._rss_converter import RssConverter
|
||||
from ._wikipedia_converter import WikipediaConverter
|
||||
from ._youtube_converter import YouTubeConverter
|
||||
from ._ipynb_converter import IpynbConverter
|
||||
from ._bing_serp_converter import BingSerpConverter
|
||||
from ._pdf_converter import PdfConverter
|
||||
from ._docx_converter import DocxConverter
|
||||
from ._xlsx_converter import XlsxConverter, XlsConverter
|
||||
from ._pptx_converter import PptxConverter
|
||||
from ._image_converter import ImageConverter
|
||||
from ._audio_converter import AudioConverter
|
||||
from ._outlook_msg_converter import OutlookMsgConverter
|
||||
from ._zip_converter import ZipConverter
|
||||
from ._doc_intel_converter import (
|
||||
DocumentIntelligenceConverter,
|
||||
DocumentIntelligenceFileType,
|
||||
)
|
||||
from ._epub_converter import EpubConverter
|
||||
from ._csv_converter import CsvConverter
|
||||
|
||||
# Public names re-exported by this package; one entry per name imported above.
__all__ = [
    "PlainTextConverter",
    "HtmlConverter",
    "RssConverter",
    "WikipediaConverter",
    "YouTubeConverter",
    "IpynbConverter",
    "BingSerpConverter",
    "PdfConverter",
    "DocxConverter",
    "XlsxConverter",
    "XlsConverter",
    "PptxConverter",
    "ImageConverter",
    "AudioConverter",
    "OutlookMsgConverter",
    "ZipConverter",
    "DocumentIntelligenceConverter",
    "DocumentIntelligenceFileType",
    "EpubConverter",
    "CsvConverter",
]
|
||||
@@ -0,0 +1,101 @@
|
||||
from typing import Any, BinaryIO
|
||||
|
||||
from ._exiftool import exiftool_metadata
|
||||
from ._transcribe_audio import transcribe_audio
|
||||
from .._base_converter import DocumentConverter, DocumentConverterResult
|
||||
from .._stream_info import StreamInfo
|
||||
from .._exceptions import MissingDependencyException
|
||||
|
||||
# MIME type prefixes the audio converter accepts (matched with ``startswith``
# against the lower-cased MIME type).
ACCEPTED_MIME_TYPE_PREFIXES = [
    "audio/x-wav",
    "audio/mpeg",
    "video/mp4",
]

# File extensions the audio converter accepts (compared against the
# lower-cased extension, leading dot included).
ACCEPTED_FILE_EXTENSIONS = [
    ".wav",
    ".mp3",
    ".m4a",
    ".mp4",
]
|
||||
|
||||
|
||||
class AudioConverter(DocumentConverter):
    """
    Converts audio files to markdown via extraction of metadata (if `exiftool` is installed), and speech transcription (if `speech_recognition` is installed).
    """

    def accepts(
        self,
        file_stream: BinaryIO,
        stream_info: StreamInfo,
        **kwargs: Any,  # Options to pass to the converter
    ) -> bool:
        """Return True when the extension or MIME type is one of the accepted audio types."""
        mimetype = (stream_info.mimetype or "").lower()
        extension = (stream_info.extension or "").lower()

        if extension in ACCEPTED_FILE_EXTENSIONS:
            return True

        return mimetype.startswith(tuple(ACCEPTED_MIME_TYPE_PREFIXES))

    def convert(
        self,
        file_stream: BinaryIO,
        stream_info: StreamInfo,
        **kwargs: Any,  # Options to pass to the converter
    ) -> DocumentConverterResult:
        """
        Build markdown from the stream's metadata and, when possible, its transcript.

        Args:
            file_stream: Stream holding the audio data.
            stream_info: Extension and MIME type used to pick the audio format.
            **kwargs: ``exiftool_path`` is forwarded to ``exiftool_metadata``.

        Returns:
            DocumentConverterResult whose markdown lists the selected metadata
            fields followed by an "Audio Transcript" section when a transcript
            was produced.
        """
        md_lines = []

        # Add metadata
        metadata = exiftool_metadata(
            file_stream, exiftool_path=kwargs.get("exiftool_path")
        )
        if metadata:
            for f in [
                "Title",
                "Artist",
                "Author",
                "Band",
                "Album",
                "Genre",
                "Track",
                "DateTimeOriginal",
                "CreateDate",
                # "Duration", -- Wrong values when read from memory
                "NumChannels",
                "SampleRate",
                "AvgBytesPerSec",
                "BitsPerSample",
            ]:
                if f in metadata:
                    md_lines.append(f"{f}: {metadata[f]}\n")
        md_content = "".join(md_lines)

        # Figure out the audio format for transcription.
        # Normalise exactly as ``accepts`` does (lower-case, MIME prefix match),
        # so anything accepted there (e.g. ".WAV", "audio/mpeg; ...") is also
        # recognised here instead of silently skipping transcription.
        mimetype = (stream_info.mimetype or "").lower()
        extension = (stream_info.extension or "").lower()
        if extension == ".wav" or mimetype.startswith("audio/x-wav"):
            audio_format = "wav"
        elif extension == ".mp3" or mimetype.startswith("audio/mpeg"):
            audio_format = "mp3"
        elif extension in [".mp4", ".m4a"] or mimetype.startswith("video/mp4"):
            audio_format = "mp4"
        else:
            audio_format = None

        # Transcribe
        if audio_format:
            try:
                transcript = transcribe_audio(file_stream, audio_format=audio_format)
                if transcript:
                    md_content += "\n\n### Audio Transcript:\n" + transcript
            except MissingDependencyException:
                # Transcription is optional; without its dependency only the
                # metadata is returned.
                pass

        # Return the result
        return DocumentConverterResult(markdown=md_content.strip())
|
||||
@@ -0,0 +1,120 @@
|
||||
import re
|
||||
import base64
|
||||
import binascii
|
||||
from urllib.parse import parse_qs, urlparse
|
||||
from typing import Any, BinaryIO
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
from .._base_converter import DocumentConverter, DocumentConverterResult
|
||||
from .._stream_info import StreamInfo
|
||||
from ._markdownify import _CustomMarkdownify
|
||||
|
||||
# MIME type prefixes treated as HTML (matched with ``startswith`` against the
# lower-cased MIME type).
ACCEPTED_MIME_TYPE_PREFIXES = [
    "text/html",
    "application/xhtml",
]

# File extensions treated as HTML (compared against the lower-cased extension).
ACCEPTED_FILE_EXTENSIONS = [
    ".html",
    ".htm",
]
|
||||
|
||||
|
||||
class BingSerpConverter(DocumentConverter):
    """
    Handle Bing results pages (only the organic search results).
    NOTE: It is better to use the Bing API
    """

    def accepts(
        self,
        file_stream: BinaryIO,
        stream_info: StreamInfo,
        **kwargs: Any,  # Options to pass to the converter
    ) -> bool:
        """
        Make sure we're dealing with HTML content *from* Bing.
        """

        url = stream_info.url or ""
        mimetype = (stream_info.mimetype or "").lower()
        extension = (stream_info.extension or "").lower()

        if not re.search(r"^https://www\.bing\.com/search\?q=", url):
            # Not a Bing SERP URL
            return False

        if extension in ACCEPTED_FILE_EXTENSIONS:
            return True

        # True for HTML content, False otherwise
        return mimetype.startswith(tuple(ACCEPTED_MIME_TYPE_PREFIXES))

    def convert(
        self,
        file_stream: BinaryIO,
        stream_info: StreamInfo,
        **kwargs: Any,  # Options to pass to the converter
    ) -> DocumentConverterResult:
        """
        Convert a Bing results page into a markdown list of its organic results.

        Args:
            file_stream: Stream holding the HTML of the results page.
            stream_info: Must carry the page URL (the query is read from its
                ``q`` parameter); ``charset`` selects the decoding, UTF-8 if unset.
            **kwargs: Forwarded to ``_CustomMarkdownify``.

        Returns:
            DocumentConverterResult with a heading naming the query followed by
            one markdown block per ``b_algo`` result; ``title`` is the page title.
        """
        assert stream_info.url is not None

        # Parse the query parameters
        parsed_params = parse_qs(urlparse(stream_info.url).query)
        query = parsed_params.get("q", [""])[0]

        # Parse the stream
        encoding = "utf-8" if stream_info.charset is None else stream_info.charset
        soup = BeautifulSoup(file_stream, "html.parser", from_encoding=encoding)

        # Clean up some formatting
        for tptt in soup.find_all(class_="tptt"):
            if hasattr(tptt, "string") and tptt.string:
                tptt.string += " "
        for slug in soup.find_all(class_="algoSlug_icon"):
            slug.extract()

        # Parse the algorithmic results
        _markdownify = _CustomMarkdownify(**kwargs)
        results = list()
        for result in soup.find_all(class_="b_algo"):
            if not hasattr(result, "find_all"):
                continue

            # Rewrite redirect urls
            for a in result.find_all("a", href=True):
                parsed_href = urlparse(a["href"])
                qs = parse_qs(parsed_href.query)

                # The destination is contained in the u parameter,
                # but appears to be base64 encoded, with some prefix
                if "u" in qs:
                    u = (
                        qs["u"][0][2:].strip() + "=="
                    )  # Python 3 doesn't care about extra padding

                    try:
                        # RFC 4648 / Base64URL" variant, which uses "-" and "_"
                        a["href"] = base64.b64decode(u, altchars="-_").decode("utf-8")
                    except ValueError:
                        # Leave the link untouched when it cannot be decoded.
                        # ValueError covers binascii.Error (malformed base64),
                        # UnicodeDecodeError (payload is not UTF-8) and the
                        # plain ValueError b64decode raises for non-ASCII
                        # input, which previously escaped and aborted the
                        # whole conversion.
                        pass

            # Convert to markdown
            md_result = _markdownify.convert_soup(result).strip()
            lines = [line.strip() for line in re.split(r"\n+", md_result)]
            results.append("\n".join([line for line in lines if len(line) > 0]))

        webpage_text = (
            f"## A Bing search for '{query}' found the following results:\n\n"
            + "\n\n".join(results)
        )

        return DocumentConverterResult(
            markdown=webpage_text,
            title=None if soup.title is None else soup.title.string,
        )
|
||||
@@ -0,0 +1,77 @@
|
||||
import csv
|
||||
import io
|
||||
from typing import BinaryIO, Any
|
||||
from charset_normalizer import from_bytes
|
||||
from .._base_converter import DocumentConverter, DocumentConverterResult
|
||||
from .._stream_info import StreamInfo
|
||||
|
||||
# MIME type prefixes treated as CSV (matched with ``startswith`` against the
# lower-cased MIME type).
ACCEPTED_MIME_TYPE_PREFIXES = [
    "text/csv",
    "application/csv",
]
# File extensions treated as CSV (compared against the lower-cased extension).
ACCEPTED_FILE_EXTENSIONS = [".csv"]
|
||||
|
||||
|
||||
class CsvConverter(DocumentConverter):
    """
    Converts CSV files to Markdown tables.
    """

    def __init__(self):
        super().__init__()

    def accepts(
        self,
        file_stream: BinaryIO,
        stream_info: StreamInfo,
        **kwargs: Any,  # Options to pass to the converter
    ) -> bool:
        """Return True when the extension or MIME type identifies CSV content."""
        mimetype = (stream_info.mimetype or "").lower()
        extension = (stream_info.extension or "").lower()
        if extension in ACCEPTED_FILE_EXTENSIONS:
            return True
        return mimetype.startswith(tuple(ACCEPTED_MIME_TYPE_PREFIXES))

    def convert(
        self,
        file_stream: BinaryIO,
        stream_info: StreamInfo,
        **kwargs: Any,  # Options to pass to the converter
    ) -> DocumentConverterResult:
        """
        Render the CSV in *file_stream* as a Markdown table.

        The first row becomes the header. Shorter rows are padded with empty
        cells and longer rows are truncated to the header width.

        Args:
            file_stream: Stream holding the CSV bytes.
            stream_info: ``charset`` is used for decoding when set; otherwise
                the encoding is detected with ``charset_normalizer``.
            **kwargs: Unused.

        Returns:
            DocumentConverterResult with the table, or empty markdown when the
            CSV has no rows.
        """
        # Read the file content
        raw = file_stream.read()
        if stream_info.charset:
            content = raw.decode(stream_info.charset)
        else:
            best_guess = from_bytes(raw).best()
            if best_guess is not None:
                content = str(best_guess)
            else:
                # ``best()`` returns None when no encoding could be detected;
                # ``str(None)`` would turn that into a table containing the
                # literal text "None". Fall back to a lossy UTF-8 decode.
                content = raw.decode("utf-8", errors="replace")

        # Parse CSV content
        rows = list(csv.reader(io.StringIO(content)))

        if not rows:
            return DocumentConverterResult(markdown="")

        header = rows[0]
        width = len(header)

        # Header row followed by the separator row
        markdown_table = [
            "| " + " | ".join(header) + " |",
            "| " + " | ".join(["---"] * width) + " |",
        ]

        # Data rows, padded or truncated to the header width (built as new
        # lists so the parsed rows are not modified)
        for row in rows[1:]:
            cells = (row + [""] * (width - len(row)))[:width]
            markdown_table.append("| " + " | ".join(cells) + " |")

        result = "\n".join(markdown_table)

        return DocumentConverterResult(markdown=result)
|
||||
@@ -0,0 +1,254 @@
|
||||
import sys
|
||||
import re
|
||||
import os
|
||||
from typing import BinaryIO, Any, List
|
||||
from enum import Enum
|
||||
|
||||
from .._base_converter import DocumentConverter, DocumentConverterResult
|
||||
from .._stream_info import StreamInfo
|
||||
from .._exceptions import MissingDependencyException
|
||||
|
||||
# Try loading optional (but in this case, required) dependencies
# Save reporting of any exceptions for later
# ``_dependency_exc_info`` stays None when every import below succeeds;
# otherwise it holds the ``sys.exc_info()`` triple of the ImportError.
_dependency_exc_info = None
try:
    from azure.ai.documentintelligence import DocumentIntelligenceClient
    from azure.ai.documentintelligence.models import (
        AnalyzeDocumentRequest,
        AnalyzeResult,
        DocumentAnalysisFeature,
    )
    from azure.core.credentials import AzureKeyCredential, TokenCredential
    from azure.identity import DefaultAzureCredential
except ImportError:
    # Preserve the error and stack trace for later
    _dependency_exc_info = sys.exc_info()

    # Define these types for type hinting when the package is not available
    # (empty stand-ins so the names used in this module still resolve)
    class AzureKeyCredential:
        pass

    class TokenCredential:
        pass

    class DocumentIntelligenceClient:
        pass

    class AnalyzeDocumentRequest:
        pass

    class AnalyzeResult:
        pass

    class DocumentAnalysisFeature:
        pass

    class DefaultAzureCredential:
        pass


# TODO: currently, there is a bug in the document intelligence SDK with importing the "ContentFormat" enum.
# This constant is a temporary fix until the bug is resolved.
CONTENT_FORMAT = "markdown"
|
||||
|
||||
|
||||
class DocumentIntelligenceFileType(str, Enum):
    """Enum of file types supported by the Document Intelligence Converter.

    Members are also ``str`` instances (the class mixes in ``str``), with the
    lower-case type name as their value.
    """

    # No OCR
    DOCX = "docx"
    PPTX = "pptx"
    XLSX = "xlsx"
    HTML = "html"
    # OCR
    PDF = "pdf"
    JPEG = "jpeg"
    PNG = "png"
    BMP = "bmp"
    TIFF = "tiff"
|
||||
|
||||
|
||||
def _get_mime_type_prefixes(types: List[DocumentIntelligenceFileType]) -> List[str]:
    """Collect the MIME type prefixes for *types*, in the order the types are given.

    Types without an entry in the table contribute nothing.
    """
    # (file type, MIME prefixes) pairs, compared with ``==`` against each input.
    mime_table = (
        (
            DocumentIntelligenceFileType.DOCX,
            ("application/vnd.openxmlformats-officedocument.wordprocessingml.document",),
        ),
        (
            DocumentIntelligenceFileType.PPTX,
            ("application/vnd.openxmlformats-officedocument.presentationml",),
        ),
        (
            DocumentIntelligenceFileType.XLSX,
            ("application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",),
        ),
        (DocumentIntelligenceFileType.HTML, ("text/html", "application/xhtml+xml")),
        (DocumentIntelligenceFileType.PDF, ("application/pdf", "application/x-pdf")),
        (DocumentIntelligenceFileType.JPEG, ("image/jpeg",)),
        (DocumentIntelligenceFileType.PNG, ("image/png",)),
        (DocumentIntelligenceFileType.BMP, ("image/bmp",)),
        (DocumentIntelligenceFileType.TIFF, ("image/tiff",)),
    )
    prefixes: List[str] = []
    for requested in types:
        for file_type, mime_prefixes in mime_table:
            if requested == file_type:
                prefixes.extend(mime_prefixes)
                break
    return prefixes
|
||||
|
||||
|
||||
def _get_file_extensions(types: List[DocumentIntelligenceFileType]) -> List[str]:
    """Collect the file extensions for *types*, in the order the types are given.

    Types without an entry in the table contribute nothing.
    """
    # (file type, extensions) pairs, compared with ``==`` against each input.
    extension_table = (
        (DocumentIntelligenceFileType.DOCX, (".docx",)),
        (DocumentIntelligenceFileType.PPTX, (".pptx",)),
        (DocumentIntelligenceFileType.XLSX, (".xlsx",)),
        (DocumentIntelligenceFileType.PDF, (".pdf",)),
        (DocumentIntelligenceFileType.JPEG, (".jpg", ".jpeg")),
        (DocumentIntelligenceFileType.PNG, (".png",)),
        (DocumentIntelligenceFileType.BMP, (".bmp",)),
        (DocumentIntelligenceFileType.TIFF, (".tiff",)),
        (DocumentIntelligenceFileType.HTML, (".html",)),
    )
    extensions: List[str] = []
    for requested in types:
        for file_type, suffixes in extension_table:
            if requested == file_type:
                extensions.extend(suffixes)
                break
    return extensions
|
||||
|
||||
|
||||
class DocumentIntelligenceConverter(DocumentConverter):
    """Specialized DocumentConverter that uses Document Intelligence to extract text from documents."""

    def __init__(
        self,
        *,
        endpoint: str,
        api_version: str = "2024-07-31-preview",
        credential: AzureKeyCredential | TokenCredential | None = None,
        file_types: List[DocumentIntelligenceFileType] = [
            DocumentIntelligenceFileType.DOCX,
            DocumentIntelligenceFileType.PPTX,
            DocumentIntelligenceFileType.XLSX,
            DocumentIntelligenceFileType.PDF,
            DocumentIntelligenceFileType.JPEG,
            DocumentIntelligenceFileType.PNG,
            DocumentIntelligenceFileType.BMP,
            DocumentIntelligenceFileType.TIFF,
        ],
    ):
        """
        Initialize the DocumentIntelligenceConverter.

        Args:
            endpoint (str): The endpoint for the Document Intelligence service.
            api_version (str): The API version to use. Defaults to "2024-07-31-preview".
            credential (AzureKeyCredential | TokenCredential | None): The credential to use for authentication.
                When None, an AzureKeyCredential is built from the AZURE_API_KEY environment
                variable if it is set; otherwise DefaultAzureCredential() is used.
            file_types (List[DocumentIntelligenceFileType]): The file types to accept. Defaults to
                DOCX, PPTX, XLSX, PDF, JPEG, PNG, BMP and TIFF (HTML is not enabled by default).

        Raises:
            MissingDependencyException: If the optional Azure dependencies failed to import.
        """

        super().__init__()

        # Take a private copy: the default list object is shared by every call
        # that omits `file_types`, and a caller-supplied list may be mutated
        # later. Storing either by reference would let one instance's accepted
        # types change behind its back.
        self._file_types = list(file_types)

        # Raise an error if the dependencies are not available.
        # This is different than other converters since this one isn't even instantiated
        # unless explicitly requested.
        if _dependency_exc_info is not None:
            raise MissingDependencyException(
                "DocumentIntelligenceConverter requires the optional dependency [az-doc-intel] (or [all]) to be installed. E.g., `pip install markitdown[az-doc-intel]`"
            ) from _dependency_exc_info[
                1
            ].with_traceback(  # type: ignore[union-attr]
                _dependency_exc_info[2]
            )

        if credential is None:
            if os.environ.get("AZURE_API_KEY") is None:
                credential = DefaultAzureCredential()
            else:
                credential = AzureKeyCredential(os.environ["AZURE_API_KEY"])

        self.endpoint = endpoint
        self.api_version = api_version
        self.doc_intel_client = DocumentIntelligenceClient(
            endpoint=self.endpoint,
            api_version=self.api_version,
            credential=credential,
        )

    def accepts(
        self,
        file_stream: BinaryIO,
        stream_info: StreamInfo,
        **kwargs: Any,  # Options to pass to the converter
    ) -> bool:
        """Return True when the stream's extension or MIME type belongs to a configured file type."""
        mimetype = (stream_info.mimetype or "").lower()
        extension = (stream_info.extension or "").lower()

        if extension in _get_file_extensions(self._file_types):
            return True

        for prefix in _get_mime_type_prefixes(self._file_types):
            if mimetype.startswith(prefix):
                return True

        return False

    def _analysis_features(self, stream_info: StreamInfo) -> List[str]:
        """
        Helper needed to determine which analysis features to use.
        Certain document analysis features are not available for
        office filetypes (.xlsx, .pptx, .html, .docx)

        Returns:
            An empty list for DOCX/PPTX/XLSX/HTML input (matched by extension
            or MIME prefix); otherwise the formula, high-resolution OCR and
            font-style features.
        """
        mimetype = (stream_info.mimetype or "").lower()
        extension = (stream_info.extension or "").lower()

        # Types that don't support ocr
        no_ocr_types = [
            DocumentIntelligenceFileType.DOCX,
            DocumentIntelligenceFileType.PPTX,
            DocumentIntelligenceFileType.XLSX,
            DocumentIntelligenceFileType.HTML,
        ]

        if extension in _get_file_extensions(no_ocr_types):
            return []

        for prefix in _get_mime_type_prefixes(no_ocr_types):
            if mimetype.startswith(prefix):
                return []

        return [
            DocumentAnalysisFeature.FORMULAS,  # enable formula extraction
            DocumentAnalysisFeature.OCR_HIGH_RESOLUTION,  # enable high resolution OCR
            DocumentAnalysisFeature.STYLE_FONT,  # enable font style extraction
        ]

    def convert(
        self,
        file_stream: BinaryIO,
        stream_info: StreamInfo,
        **kwargs: Any,  # Options to pass to the converter
    ) -> DocumentConverterResult:
        """Send the whole stream to the "prebuilt-layout" model and return its content as Markdown."""
        # Extract the text using Azure Document Intelligence
        poller = self.doc_intel_client.begin_analyze_document(
            model_id="prebuilt-layout",
            body=AnalyzeDocumentRequest(bytes_source=file_stream.read()),
            features=self._analysis_features(stream_info),
            output_content_format=CONTENT_FORMAT,  # TODO: replace with "ContentFormat.MARKDOWN" when the bug is fixed
        )
        result: AnalyzeResult = poller.result()

        # remove comments from the markdown content generated by Doc Intelligence and append to markdown string
        markdown_text = re.sub(r"<!--.*?-->", "", result.content, flags=re.DOTALL)
        return DocumentConverterResult(markdown=markdown_text)
|
||||
@@ -0,0 +1,83 @@
|
||||
import sys
|
||||
import io
|
||||
from warnings import warn
|
||||
|
||||
from typing import BinaryIO, Any
|
||||
|
||||
from ._html_converter import HtmlConverter
|
||||
from ..converter_utils.docx.pre_process import pre_process_docx
|
||||
from .._base_converter import DocumentConverterResult
|
||||
from .._stream_info import StreamInfo
|
||||
from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE
|
||||
|
||||
# Try loading optional (but in this case, required) dependencies.
# Save reporting of any exceptions for later: the import failure is not raised
# here, so this module can still be imported when `mammoth` is missing.
# `_dependency_exc_info` stays None on success; on failure it holds the
# `sys.exc_info()` triple that DocxConverter.convert chains into a
# MissingDependencyException.
_dependency_exc_info = None
try:
    import mammoth

except ImportError:
    # Preserve the error and stack trace for later
    _dependency_exc_info = sys.exc_info()


# MIME types handled by DocxConverter; `accepts` compares them with
# `str.startswith`, so each entry acts as a prefix.
ACCEPTED_MIME_TYPE_PREFIXES = [
    "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
]

# Lowercase, dot-prefixed extensions handled by DocxConverter.
ACCEPTED_FILE_EXTENSIONS = [".docx"]
|
||||
|
||||
|
||||
class DocxConverter(HtmlConverter):
    """
    Turns DOCX documents into Markdown by way of an intermediate HTML rendering.
    Headings, tables and similar structure are kept where the HTML carries them.
    """

    def __init__(self):
        super().__init__()
        # Separate HtmlConverter instance that performs the HTML -> Markdown step.
        self._html_converter = HtmlConverter()

    def accepts(
        self,
        file_stream: BinaryIO,
        stream_info: StreamInfo,
        **kwargs: Any,  # Options to pass to the converter
    ) -> bool:
        """Accept streams whose extension is .docx or whose MIME type has a DOCX prefix."""
        declared_mimetype = (stream_info.mimetype or "").lower()
        declared_extension = (stream_info.extension or "").lower()

        if declared_extension in ACCEPTED_FILE_EXTENSIONS:
            return True
        return any(
            declared_mimetype.startswith(candidate)
            for candidate in ACCEPTED_MIME_TYPE_PREFIXES
        )

    def convert(
        self,
        file_stream: BinaryIO,
        stream_info: StreamInfo,
        **kwargs: Any,  # Options to pass to the converter
    ) -> DocumentConverterResult:
        """
        Convert a DOCX stream to Markdown.

        The optional ``style_map`` keyword is handed to mammoth; all keyword
        arguments are also forwarded to the HTML converter.

        Raises:
            MissingDependencyException: If `mammoth` could not be imported.
        """
        # Surface the deferred import failure, chained to the original error.
        if _dependency_exc_info is not None:
            message = MISSING_DEPENDENCY_MESSAGE.format(
                converter=type(self).__name__,
                extension=".docx",
                feature="docx",
            )
            original_error = _dependency_exc_info[1].with_traceback(  # type: ignore[union-attr]
                _dependency_exc_info[2]
            )
            raise MissingDependencyException(message) from original_error

        mammoth_style_map = kwargs.get("style_map", None)
        prepared_stream = pre_process_docx(file_stream)
        html_text = mammoth.convert_to_html(
            prepared_stream, style_map=mammoth_style_map
        ).value
        return self._html_converter.convert_string(html_text, **kwargs)
|
||||
@@ -0,0 +1,146 @@
|
||||
import os
|
||||
import zipfile
|
||||
from defusedxml import minidom
|
||||
from xml.dom.minidom import Document
|
||||
|
||||
from typing import BinaryIO, Any, Dict, List
|
||||
|
||||
from ._html_converter import HtmlConverter
|
||||
from .._base_converter import DocumentConverterResult
|
||||
from .._stream_info import StreamInfo
|
||||
|
||||
# MIME types handled by EpubConverter. `accepts` matches them with
# `str.startswith`, so "application/epub" already covers
# "application/epub+zip"; the longer entry is kept for explicitness.
ACCEPTED_MIME_TYPE_PREFIXES = [
    "application/epub",
    "application/epub+zip",
    "application/x-epub+zip",
]

# Lowercase, dot-prefixed extensions handled by EpubConverter.
ACCEPTED_FILE_EXTENSIONS = [".epub"]

# Extension -> MIME type for the content documents inside the archive;
# extensions not listed here yield None via `.get` in EpubConverter.convert.
MIME_TYPE_MAPPING = {
    ".html": "text/html",
    ".xhtml": "application/xhtml+xml",
}
|
||||
|
||||
|
||||
class EpubConverter(HtmlConverter):
    """
    Converts EPUB files to Markdown. Style information (e.g., headings) and tables are preserved where possible.
    """

    def __init__(self):
        super().__init__()
        # Separate HtmlConverter instance used for each content document.
        self._html_converter = HtmlConverter()

    def accepts(
        self,
        file_stream: BinaryIO,
        stream_info: StreamInfo,
        **kwargs: Any,  # Options to pass to the converter
    ) -> bool:
        """Accept streams with an .epub extension or an EPUB MIME type prefix."""
        mimetype = (stream_info.mimetype or "").lower()
        extension = (stream_info.extension or "").lower()

        if extension in ACCEPTED_FILE_EXTENSIONS:
            return True

        for prefix in ACCEPTED_MIME_TYPE_PREFIXES:
            if mimetype.startswith(prefix):
                return True

        return False

    def convert(
        self,
        file_stream: BinaryIO,
        stream_info: StreamInfo,
        **kwargs: Any,  # Options to pass to the converter
    ) -> DocumentConverterResult:
        """
        Convert an EPUB archive to Markdown.

        Reads the package document named by META-INF/container.xml, extracts
        Dublin Core metadata (title, authors, language, publisher, date,
        description, identifier), then converts every spine item found in the
        archive, in spine order. The metadata block is placed first.

        Returns:
            DocumentConverterResult whose title is the first dc:title, if any.
        """
        with zipfile.ZipFile(file_stream, "r") as z:
            # Locate content.opf. Member handles are closed as soon as they
            # have been parsed instead of being left to the garbage collector.
            with z.open("META-INF/container.xml") as container_file:
                container_dom = minidom.parse(container_file)
            opf_path = container_dom.getElementsByTagName("rootfile")[0].getAttribute(
                "full-path"
            )

            # Parse content.opf
            with z.open(opf_path) as opf_file:
                opf_dom = minidom.parse(opf_file)
            metadata: Dict[str, Any] = {
                "title": self._get_text_from_node(opf_dom, "dc:title"),
                "authors": self._get_all_texts_from_nodes(opf_dom, "dc:creator"),
                "language": self._get_text_from_node(opf_dom, "dc:language"),
                "publisher": self._get_text_from_node(opf_dom, "dc:publisher"),
                "date": self._get_text_from_node(opf_dom, "dc:date"),
                "description": self._get_text_from_node(opf_dom, "dc:description"),
                "identifier": self._get_text_from_node(opf_dom, "dc:identifier"),
            }

            # Extract manifest items (ID → href mapping)
            manifest = {
                item.getAttribute("id"): item.getAttribute("href")
                for item in opf_dom.getElementsByTagName("item")
            }

            # Extract spine order (ID refs)
            spine_items = opf_dom.getElementsByTagName("itemref")
            spine_order = [item.getAttribute("idref") for item in spine_items]

            # Convert spine order to actual file paths
            base_path = "/".join(
                opf_path.split("/")[:-1]
            )  # Get base directory of content.opf
            spine = [
                f"{base_path}/{manifest[item_id]}" if base_path else manifest[item_id]
                for item_id in spine_order
                if item_id in manifest
            ]

            # Build the member-name lookup once; namelist() creates a new list
            # on every call and list membership is a linear scan.
            archive_members = set(z.namelist())

            # Extract and convert the content
            markdown_content: List[str] = []
            for member in spine:
                if member not in archive_members:
                    continue
                with z.open(member) as f:
                    filename = os.path.basename(member)
                    extension = os.path.splitext(filename)[1].lower()
                    mimetype = MIME_TYPE_MAPPING.get(extension)
                    converted_content = self._html_converter.convert(
                        f,
                        StreamInfo(
                            mimetype=mimetype,
                            extension=extension,
                            filename=filename,
                        ),
                    )
                    markdown_content.append(converted_content.markdown.strip())

            # Format and add the metadata
            metadata_markdown = []
            for key, value in metadata.items():
                if isinstance(value, list):
                    value = ", ".join(value)
                if value:
                    metadata_markdown.append(f"**{key.capitalize()}:** {value}")

            markdown_content.insert(0, "\n".join(metadata_markdown))

            return DocumentConverterResult(
                markdown="\n\n".join(markdown_content), title=metadata["title"]
            )

    def _get_text_from_node(self, dom: Document, tag_name: str) -> str | None:
        """Convenience function to extract a single occurrence of a tag (e.g., title).

        Returns the first extracted text for `tag_name`, or None when there is none.
        """
        texts = self._get_all_texts_from_nodes(dom, tag_name)
        return texts[0] if texts else None

    def _get_all_texts_from_nodes(self, dom: Document, tag_name: str) -> List[str]:
        """Helper function to extract all occurrences of a tag (e.g., multiple authors).

        Only the value of each element's first child is used. Elements that are
        empty, or whose first child carries no string value (for example a
        nested element, whose nodeValue is None), are skipped.
        """
        texts: List[str] = []
        for node in dom.getElementsByTagName(tag_name):
            value = getattr(node.firstChild, "nodeValue", None)
            # Every DOM node has a `nodeValue` attribute, but it is None for
            # element nodes; calling .strip() on it would raise AttributeError.
            if isinstance(value, str):
                texts.append(value.strip())
        return texts
|
||||
@@ -0,0 +1,52 @@
|
||||
import json
|
||||
import locale
|
||||
import subprocess
|
||||
from typing import Any, BinaryIO, Union
|
||||
|
||||
|
||||
def _parse_version(version: str) -> tuple:
|
||||
return tuple(map(int, (version.split("."))))
|
||||
|
||||
|
||||
def exiftool_metadata(
    file_stream: BinaryIO,
    *,
    exiftool_path: Union[str, None],
) -> Any:  # Need a better type for json data
    """Run ExifTool on the remaining bytes of `file_stream` and return its metadata.

    Args:
        file_stream: Binary stream to inspect. It is read from its current
            position to the end, then restored to that position.
        exiftool_path: Path to the ExifTool executable. When falsy, nothing is
            run and an empty dict is returned.

    Returns:
        The first object of ExifTool's `-json` output, or `{}` when no
        executable path was given.

    Raises:
        RuntimeError: If the ExifTool version cannot be verified, or is older
            than 12.24 (vulnerable to CVE-2021-22204).
    """
    # Nothing to do
    if not exiftool_path:
        return {}

    # Verify exiftool version
    try:
        version_output = subprocess.run(
            [exiftool_path, "-ver"],
            capture_output=True,
            text=True,
            check=True,
        ).stdout.strip()
        version = _parse_version(version_output)
        min_version = (12, 24)
        if version < min_version:
            raise RuntimeError(
                f"ExifTool version {version_output} is vulnerable to CVE-2021-22204. "
                "Please upgrade to version 12.24 or later."
            )
    except (subprocess.CalledProcessError, ValueError) as e:
        raise RuntimeError("Failed to verify ExifTool version.") from e

    # Run exiftool
    cur_pos = file_stream.tell()
    try:
        output = subprocess.run(
            [exiftool_path, "-json", "-"],
            input=file_stream.read(),
            capture_output=True,
            text=False,
        ).stdout

        # ExifTool writes its JSON output as UTF-8, independent of the system
        # locale. Decoding with the locale's preferred encoding garbles (or
        # fails on) non-ASCII metadata on non-UTF-8 systems.
        return json.loads(output.decode("utf-8"))[0]
    finally:
        # Leave the stream where the caller had it.
        file_stream.seek(cur_pos)
|
||||
@@ -0,0 +1,90 @@
|
||||
import io
|
||||
from typing import Any, BinaryIO, Optional
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
from .._base_converter import DocumentConverter, DocumentConverterResult
|
||||
from .._stream_info import StreamInfo
|
||||
from ._markdownify import _CustomMarkdownify
|
||||
|
||||
# MIME types handled by HtmlConverter. `accepts` matches them with
# `str.startswith`, so "application/xhtml" also covers "application/xhtml+xml".
ACCEPTED_MIME_TYPE_PREFIXES = [
    "text/html",
    "application/xhtml",
]

# Lowercase, dot-prefixed extensions handled by HtmlConverter.
ACCEPTED_FILE_EXTENSIONS = [
    ".html",
    ".htm",
]
|
||||
|
||||
|
||||
class HtmlConverter(DocumentConverter):
    """Converter for HTML / XHTML content (e.g. anything served as text/html)."""

    def accepts(
        self,
        file_stream: BinaryIO,
        stream_info: StreamInfo,
        **kwargs: Any,  # Options to pass to the converter
    ) -> bool:
        """Accept streams with an HTML extension or an HTML/XHTML MIME type prefix."""
        declared_mimetype = (stream_info.mimetype or "").lower()
        declared_extension = (stream_info.extension or "").lower()

        if declared_extension in ACCEPTED_FILE_EXTENSIONS:
            return True
        return any(
            declared_mimetype.startswith(candidate)
            for candidate in ACCEPTED_MIME_TYPE_PREFIXES
        )

    def convert(
        self,
        file_stream: BinaryIO,
        stream_info: StreamInfo,
        **kwargs: Any,  # Options to pass to the converter
    ) -> DocumentConverterResult:
        """
        Parse the stream as HTML and render it as Markdown.

        <script> and <style> elements are dropped. When a <body> exists only
        its content is rendered; otherwise the whole document is. Keyword
        arguments are forwarded to the Markdown renderer.
        """
        # Decode using the declared charset, falling back to UTF-8.
        charset = stream_info.charset if stream_info.charset is not None else "utf-8"
        soup = BeautifulSoup(file_stream, "html.parser", from_encoding=charset)

        # Strip javascript and stylesheet blocks before rendering.
        for unwanted in soup(["script", "style"]):
            unwanted.extract()

        # Prefer the main content when the document has a body.
        body_elm = soup.find("body")
        render_root = body_elm if body_elm else soup
        webpage_text = _CustomMarkdownify(**kwargs).convert_soup(render_root)

        assert isinstance(webpage_text, str)

        page_title = soup.title.string if soup.title is not None else None
        return DocumentConverterResult(
            # leading and trailing whitespace/newlines are removed
            markdown=webpage_text.strip(),
            title=page_title,
        )

    def convert_string(
        self, html_content: str, *, url: Optional[str] = None, **kwargs
    ) -> DocumentConverterResult:
        """
        Non-standard shortcut: convert an in-memory HTML string to Markdown.

        Many converters produce HTML as an intermediate form; this wraps the
        string in a UTF-8 byte stream and delegates to `convert`.
        """
        utf8_stream = io.BytesIO(html_content.encode("utf-8"))
        html_stream_info = StreamInfo(
            mimetype="text/html",
            extension=".html",
            charset="utf-8",
            url=url,
        )
        return self.convert(
            file_stream=utf8_stream,
            stream_info=html_stream_info,
            **kwargs,
        )
|
||||
@@ -0,0 +1,138 @@
|
||||
from typing import BinaryIO, Any, Union
|
||||
import base64
|
||||
import mimetypes
|
||||
from ._exiftool import exiftool_metadata
|
||||
from .._base_converter import DocumentConverter, DocumentConverterResult
|
||||
from .._stream_info import StreamInfo
|
||||
|
||||
# MIME types handled by ImageConverter; `accepts` matches them with
# `str.startswith`, so each entry acts as a prefix.
ACCEPTED_MIME_TYPE_PREFIXES = [
    "image/jpeg",
    "image/png",
]

# Lowercase, dot-prefixed extensions handled by ImageConverter.
ACCEPTED_FILE_EXTENSIONS = [".jpg", ".jpeg", ".png"]
|
||||
|
||||
|
||||
class ImageConverter(DocumentConverter):
    """
    Renders an image as Markdown text: selected metadata fields (when an
    `exiftool_path` is supplied) followed by an LLM-written description
    (when both `llm_client` and `llm_model` are supplied).
    """

    def accepts(
        self,
        file_stream: BinaryIO,
        stream_info: StreamInfo,
        **kwargs: Any,
    ) -> bool:
        """Accept streams with a JPEG/PNG extension or MIME type prefix."""
        declared_mimetype = (stream_info.mimetype or "").lower()
        declared_extension = (stream_info.extension or "").lower()

        if declared_extension in ACCEPTED_FILE_EXTENSIONS:
            return True
        return any(
            declared_mimetype.startswith(candidate)
            for candidate in ACCEPTED_MIME_TYPE_PREFIXES
        )

    def convert(
        self,
        file_stream: BinaryIO,
        stream_info: StreamInfo,
        **kwargs: Any,  # Options to pass to the converter
    ) -> DocumentConverterResult:
        """
        Build the Markdown for an image.

        Recognised keyword options: `exiftool_path`, `llm_client`,
        `llm_model` and `llm_prompt`.
        """
        # Metadata fields to report, in output order.
        reported_fields = (
            "ImageSize",
            "Title",
            "Caption",
            "Description",
            "Keywords",
            "Artist",
            "Author",
            "DateTimeOriginal",
            "CreateDate",
            "GPSPosition",
        )
        sections = []

        # Metadata section
        metadata = exiftool_metadata(
            file_stream, exiftool_path=kwargs.get("exiftool_path")
        )
        if metadata:
            sections.extend(
                f"{field}: {metadata[field]}\n"
                for field in reported_fields
                if field in metadata
            )

        # Description section, only when an LLM client and model are configured
        llm_client = kwargs.get("llm_client")
        llm_model = kwargs.get("llm_model")
        if llm_client is not None and llm_model is not None:
            llm_description = self._get_llm_description(
                file_stream,
                stream_info,
                client=llm_client,
                model=llm_model,
                prompt=kwargs.get("llm_prompt"),
            )
            if llm_description is not None:
                sections.append(
                    "\n# Description:\n" + llm_description.strip() + "\n"
                )

        return DocumentConverterResult(
            markdown="".join(sections),
        )

    def _get_llm_description(
        self,
        file_stream: BinaryIO,
        stream_info: StreamInfo,
        *,
        client,
        model,
        prompt=None,
    ) -> Union[None, str]:
        """
        Ask a chat-completions client to caption the image.

        The image is sent as a base64 data URI. Returns the message content of
        the first choice, or None if the stream could not be read/encoded. The
        stream position is restored after reading.
        """
        if prompt is None or not prompt.strip():
            prompt = "Write a detailed caption for this image."

        # Content type: declared MIME type, else guessed from the extension,
        # else a generic binary type.
        content_type = stream_info.mimetype
        if not content_type:
            content_type = mimetypes.guess_type(
                "_dummy" + (stream_info.extension or "")
            )[0]
        if not content_type:
            content_type = "application/octet-stream"

        # Base64-encode the remaining bytes without disturbing the position.
        start_pos = file_stream.tell()
        try:
            base64_image = base64.b64encode(file_stream.read()).decode("utf-8")
        except Exception:
            return None
        finally:
            file_stream.seek(start_pos)

        data_uri = f"data:{content_type};base64,{base64_image}"

        # One user message carrying the prompt text plus the image.
        image_part = {"type": "image_url", "image_url": {"url": data_uri}}
        text_part = {"type": "text", "text": prompt}
        messages = [{"role": "user", "content": [text_part, image_part]}]

        response = client.chat.completions.create(model=model, messages=messages)
        return response.choices[0].message.content
|
||||
@@ -0,0 +1,96 @@
|
||||
from typing import BinaryIO, Any
|
||||
import json
|
||||
|
||||
from .._base_converter import DocumentConverter, DocumentConverterResult
|
||||
from .._exceptions import FileConversionException
|
||||
from .._stream_info import StreamInfo
|
||||
|
||||
# MIME type prefixes that only make a stream a *candidate*: a matching stream
# is accepted by IpynbConverter.accepts only if its content also contains the
# notebook markers "nbformat" and "nbformat_minor".
CANDIDATE_MIME_TYPE_PREFIXES = [
    "application/json",
]

# Lowercase, dot-prefixed extensions accepted without inspecting the content.
ACCEPTED_FILE_EXTENSIONS = [".ipynb"]
|
||||
|
||||
|
||||
class IpynbConverter(DocumentConverter):
    """Converts Jupyter Notebook (.ipynb) files to Markdown."""

    def accepts(
        self,
        file_stream: BinaryIO,
        stream_info: StreamInfo,
        **kwargs: Any,  # Options to pass to the converter
    ) -> bool:
        """
        Accept .ipynb files, and JSON streams whose content looks like a notebook.

        For JSON candidates the stream is read to the end and then restored to
        its original position.
        """
        mimetype = (stream_info.mimetype or "").lower()
        extension = (stream_info.extension or "").lower()

        if extension in ACCEPTED_FILE_EXTENSIONS:
            return True

        for prefix in CANDIDATE_MIME_TYPE_PREFIXES:
            if mimetype.startswith(prefix):
                # Read further to see if it's a notebook
                cur_pos = file_stream.tell()
                try:
                    encoding = stream_info.charset or "utf-8"
                    notebook_content = file_stream.read().decode(encoding)
                    return (
                        "nbformat" in notebook_content
                        and "nbformat_minor" in notebook_content
                    )
                finally:
                    file_stream.seek(cur_pos)

        return False

    def convert(
        self,
        file_stream: BinaryIO,
        stream_info: StreamInfo,
        **kwargs: Any,  # Options to pass to the converter
    ) -> DocumentConverterResult:
        """Decode the stream (declared charset, else UTF-8), parse it as JSON and convert it."""
        # Parse and convert the notebook
        encoding = stream_info.charset or "utf-8"
        notebook_content = file_stream.read().decode(encoding=encoding)
        return self._convert(json.loads(notebook_content))

    def _convert(self, notebook_content: dict) -> DocumentConverterResult:
        """
        Helper function that converts notebook JSON content to Markdown.

        Markdown cells are emitted as-is, code cells inside ```python fences
        and raw cells inside plain fences; cells are separated by blank lines.
        The title is `metadata.title` when present, otherwise the first
        "# " heading found in a markdown cell.

        Raises:
            FileConversionException: If anything goes wrong while converting.
        """
        try:
            md_output = []
            title = None

            for cell in notebook_content.get("cells", []):
                cell_type = cell.get("cell_type", "")
                source = cell.get("source", [])
                # A cell's source may be one multi-line string instead of a
                # list of lines. Split it so the heading scan below sees whole
                # lines rather than single characters; joining the pieces
                # reproduces the original text exactly.
                if isinstance(source, str):
                    source_lines = source.splitlines(keepends=True)
                else:
                    source_lines = source

                if cell_type == "markdown":
                    md_output.append("".join(source_lines))

                    # Extract the first # heading as title if not already found
                    if title is None:
                        for line in source_lines:
                            if line.startswith("# "):
                                # Drop exactly the "# " marker. lstrip("# ")
                                # would strip a *set* of characters and eat
                                # any '#' that begins the title text itself.
                                title = line[2:].strip()
                                break

                elif cell_type == "code":
                    # Code cells are wrapped in Markdown code blocks
                    md_output.append(f"```python\n{''.join(source_lines)}\n```")
                elif cell_type == "raw":
                    md_output.append(f"```\n{''.join(source_lines)}\n```")

            md_text = "\n\n".join(md_output)

            # Check for title in notebook metadata
            title = notebook_content.get("metadata", {}).get("title", title)

            return DocumentConverterResult(
                markdown=md_text,
                title=title,
            )

        except Exception as e:
            raise FileConversionException(
                f"Error converting .ipynb file: {str(e)}"
            ) from e
|
||||
@@ -0,0 +1,50 @@
|
||||
from typing import BinaryIO, Union
|
||||
import base64
|
||||
import mimetypes
|
||||
from .._stream_info import StreamInfo
|
||||
|
||||
|
||||
def llm_caption(
    file_stream: BinaryIO, stream_info: StreamInfo, *, client, model, prompt=None
) -> Union[None, str]:
    """Ask a chat-completions client to caption the image held in `file_stream`.

    Args:
        file_stream: Binary stream with the image; read from its current
            position and restored to it afterwards.
        stream_info: Supplies the MIME type, or the extension it is guessed from.
        client: Object exposing `chat.completions.create(model=..., messages=...)`.
        model: Model identifier passed through to the client.
        prompt: Instruction text; a default caption request is used when it is
            None or blank.

    Returns:
        The message content of the first choice, or None if the stream could
        not be read/encoded.
    """
    if prompt is None or not prompt.strip():
        prompt = "Write a detailed caption for this image."

    # Content type: declared MIME type, else guessed from the extension,
    # else a generic binary type.
    content_type = stream_info.mimetype
    if not content_type:
        content_type = mimetypes.guess_type(
            "_dummy" + (stream_info.extension or "")
        )[0]
    if not content_type:
        content_type = "application/octet-stream"

    # Base64-encode the remaining bytes without disturbing the position.
    start_pos = file_stream.tell()
    try:
        base64_image = base64.b64encode(file_stream.read()).decode("utf-8")
    except Exception:
        return None
    finally:
        file_stream.seek(start_pos)

    data_uri = f"data:{content_type};base64,{base64_image}"

    # One user message carrying the prompt text plus the image.
    text_part = {"type": "text", "text": prompt}
    image_part = {"type": "image_url", "image_url": {"url": data_uri}}
    messages = [{"role": "user", "content": [text_part, image_part]}]

    response = client.chat.completions.create(model=model, messages=messages)
    return response.choices[0].message.content
|
||||
@@ -0,0 +1,126 @@
|
||||
import re
|
||||
import markdownify
|
||||
|
||||
from typing import Any, Optional
|
||||
from urllib.parse import quote, unquote, urlparse, urlunparse
|
||||
|
||||
|
||||
class _CustomMarkdownify(markdownify.MarkdownConverter):
    """
    A custom version of markdownify's MarkdownConverter. Changes include:

    - Altering the default heading style to use '#', '##', etc.
    - Removing javascript hyperlinks.
    - Truncating images with large data:uri sources.
    - Ensuring URIs are properly escaped, and do not conflict with Markdown syntax
    """

    def __init__(self, **options: Any):
        # Caller-supplied values win; these are only defaults.
        options["heading_style"] = options.get("heading_style", markdownify.ATX)
        options["keep_data_uris"] = options.get("keep_data_uris", False)
        # Explicitly cast options to the expected type if necessary
        super().__init__(**options)

    def convert_hn(
        self,
        n: int,
        el: Any,
        text: str,
        convert_as_inline: Optional[bool] = False,
        **kwargs,
    ) -> str:
        """Same as usual, but be sure to start with a new line"""
        if not convert_as_inline:
            if not re.search(r"^\n", text):
                return "\n" + super().convert_hn(n, el, text, convert_as_inline)  # type: ignore

        return super().convert_hn(n, el, text, convert_as_inline)  # type: ignore

    def convert_a(
        self,
        el: Any,
        text: str,
        convert_as_inline: Optional[bool] = False,
        **kwargs,
    ):
        """Same as usual converter, but removes Javascript links and escapes URIs.

        Links whose scheme is something other than http, https or file (and
        links whose href cannot be parsed) are rendered as their text only.
        """
        prefix, suffix, text = markdownify.chomp(text)  # type: ignore
        if not text:
            return ""

        # Inside <pre> the link markup is dropped and only the text is kept.
        if el.find_parent("pre") is not None:
            return text

        href = el.get("href")
        title = el.get("title")

        # Escape URIs and skip non-http or file schemes
        if href:
            try:
                parsed_url = urlparse(href)  # type: ignore
                if parsed_url.scheme and parsed_url.scheme.lower() not in ["http", "https", "file"]:  # type: ignore
                    return "%s%s%s" % (prefix, text, suffix)
                # unquote-then-quote normalizes the path without double-escaping
                href = urlunparse(parsed_url._replace(path=quote(unquote(parsed_url.path))))  # type: ignore
            except ValueError:  # It's not clear if this ever gets thrown
                return "%s%s%s" % (prefix, text, suffix)

        # For the replacement see #29: text nodes underscores are escaped
        if (
            self.options["autolinks"]
            and text.replace(r"\_", "_") == href
            and not title
            and not self.options["default_title"]
        ):
            # Shortcut syntax
            return "<%s>" % href
        if self.options["default_title"] and not title:
            title = href
        title_part = ' "%s"' % title.replace('"', r"\"") if title else ""
        return (
            "%s[%s](%s%s)%s" % (prefix, text, href, title_part, suffix)
            if href
            else text
        )

    def convert_img(
        self,
        el: Any,
        text: str,
        convert_as_inline: Optional[bool] = False,
        **kwargs,
    ) -> str:
        """Same as usual converter, but removes data URIs

        Returns the Markdown image `![alt](src "title")`, or just the alt text
        when rendering inline under a parent not listed in the
        `keep_inline_images_in` option.
        """

        alt = el.attrs.get("alt", None) or ""
        src = el.attrs.get("src", None) or el.attrs.get("data-src", None) or ""
        title = el.attrs.get("title", None) or ""
        title_part = ' "%s"' % title.replace('"', r"\"") if title else ""
        # Remove all line breaks from alt
        alt = alt.replace("\n", " ")
        if (
            convert_as_inline
            and el.parent.name not in self.options["keep_inline_images_in"]
        ):
            return alt

        # Remove dataURIs: keep only the part before the first comma
        # (e.g. "data:image/png;base64") followed by an ellipsis.
        if src.startswith("data:") and not self.options["keep_data_uris"]:
            src = src.split(",")[0] + "..."

        # The template needs exactly three placeholders; an empty format
        # string here raises "not all arguments converted".
        return "![%s](%s%s)" % (alt, src, title_part)

    def convert_input(
        self,
        el: Any,
        text: str,
        convert_as_inline: Optional[bool] = False,
        **kwargs,
    ) -> str:
        """Convert checkboxes to Markdown [x]/[ ] syntax; other inputs render as nothing."""

        if el.get("type") == "checkbox":
            return "[x] " if el.has_attr("checked") else "[ ] "
        return ""

    def convert_soup(self, soup: Any) -> str:
        """Typed pass-through to the parent implementation."""
        return super().convert_soup(soup)  # type: ignore
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user