Cleanup and refactor, in preparation for plugin support. (#318)

* Began moving converters into individual files.
* Significant cleanup and refactor.
* Moved everything to a packages subfolder.
* Added sample plugin.
* Added instructions to the README.md
* Bumped version, and added a note about compatibility.
This commit is contained in:
afourney
2025-02-10 15:21:44 -08:00
committed by GitHub
parent 73ba69d8cd
commit c73afcffea
60 changed files with 2755 additions and 1901 deletions
@@ -0,0 +1,52 @@
# MarkItDown
> [!IMPORTANT]
> MarkItDown is a Python package and command-line utility for converting various files to Markdown (e.g., for indexing, text analysis, etc.).
>
> For more information, and full documentation, see the project [README.md](https://github.com/microsoft/markitdown) on GitHub.
## Installation
From PyPI:
```bash
pip install markitdown
```
From source:
```bash
git clone git@github.com:microsoft/markitdown.git
cd markitdown
pip install -e packages/markitdown
```
## Usage
### Command-Line
```bash
markitdown path-to-file.pdf > document.md
```
### Python API
```python
from markitdown import MarkItDown
md = MarkItDown()
result = md.convert("test.xlsx")
print(result.text_content)
```
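### Plugins
Third-party plugins are disabled by default. A minimal sketch of enabling them, based on the constructor in this release:
```python
from markitdown import MarkItDown

# enable_plugins=True loads converters registered under the
# "markitdown.plugin" entry-point group
md = MarkItDown(enable_plugins=True)
result = md.convert("test.xlsx")
print(result.text_content)
```
Run `markitdown --list-plugins` to see which plugins are installed.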
### More Information
For more information, and full documentation, see the project [README.md](https://github.com/microsoft/markitdown) on GitHub.
## Trademarks
This project may contain trademarks or logos for projects, products, or services. Authorized use of Microsoft
trademarks or logos is subject to and must follow
[Microsoft's Trademark & Brand Guidelines](https://www.microsoft.com/en-us/legal/intellectualproperty/trademarks/usage/general).
Use of Microsoft trademarks or logos in modified versions of this project must not cause confusion or imply Microsoft sponsorship.
Any use of third-party trademarks or logos is subject to those third parties' policies.
@@ -0,0 +1,87 @@
[build-system]
requires = ["hatchling"]
build-backend = "hatchling.build"
[project]
name = "markitdown"
dynamic = ["version"]
description = 'Utility tool for converting various files to Markdown'
readme = "README.md"
requires-python = ">=3.10"
license = "MIT"
keywords = []
authors = [
{ name = "Adam Fourney", email = "adamfo@microsoft.com" },
]
classifiers = [
"Development Status :: 4 - Beta",
"Programming Language :: Python",
"Programming Language :: Python :: 3.10",
"Programming Language :: Python :: 3.11",
"Programming Language :: Python :: 3.12",
"Programming Language :: Python :: 3.13",
"Programming Language :: Python :: Implementation :: CPython",
"Programming Language :: Python :: Implementation :: PyPy",
]
dependencies = [
"beautifulsoup4",
"requests",
"mammoth",
"markdownify",
"numpy",
"python-pptx",
"pandas",
"openpyxl",
"xlrd",
"pdfminer.six",
"puremagic",
"pydub",
"olefile",
"youtube-transcript-api",
"SpeechRecognition",
"pathvalidate",
"charset-normalizer",
"openai",
"azure-ai-documentintelligence",
"azure-identity"
]
[project.urls]
Documentation = "https://github.com/microsoft/markitdown#readme"
Issues = "https://github.com/microsoft/markitdown/issues"
Source = "https://github.com/microsoft/markitdown"
[tool.hatch.version]
path = "src/markitdown/__about__.py"
[project.scripts]
markitdown = "markitdown.__main__:main"
[tool.hatch.envs.types]
extra-dependencies = [
"mypy>=1.0.0",
]
[tool.hatch.envs.types.scripts]
check = "mypy --install-types --non-interactive {args:src/markitdown tests}"
[tool.coverage.run]
source_pkgs = ["markitdown", "tests"]
branch = true
parallel = true
omit = [
"src/markitdown/__about__.py",
]
[tool.coverage.paths]
markitdown = ["src/markitdown", "*/markitdown/src/markitdown"]
tests = ["tests", "*/markitdown/tests"]
[tool.coverage.report]
exclude_lines = [
"no cov",
"if __name__ == .__main__.:",
"if TYPE_CHECKING:",
]
[tool.hatch.build.targets.sdist]
only-include = ["src/markitdown"]
@@ -0,0 +1,4 @@
# SPDX-FileCopyrightText: 2024-present Adam Fourney <adamfo@microsoft.com>
#
# SPDX-License-Identifier: MIT
__version__ = "0.0.2a1"
@@ -0,0 +1,22 @@
# SPDX-FileCopyrightText: 2024-present Adam Fourney <adamfo@microsoft.com>
#
# SPDX-License-Identifier: MIT
from ._markitdown import MarkItDown
from ._exceptions import (
MarkItDownException,
ConverterPrerequisiteException,
FileConversionException,
UnsupportedFormatException,
)
from .converters import DocumentConverter, DocumentConverterResult
__all__ = [
"MarkItDown",
"DocumentConverter",
"DocumentConverterResult",
"MarkItDownException",
"ConverterPrerequisiteException",
"FileConversionException",
"UnsupportedFormatException",
]
@@ -0,0 +1,139 @@
# SPDX-FileCopyrightText: 2024-present Adam Fourney <adamfo@microsoft.com>
#
# SPDX-License-Identifier: MIT
import argparse
import sys
from textwrap import dedent
from importlib.metadata import entry_points
from .__about__ import __version__
from ._markitdown import MarkItDown, DocumentConverterResult
def main():
parser = argparse.ArgumentParser(
description="Convert various file formats to markdown.",
prog="markitdown",
formatter_class=argparse.RawDescriptionHelpFormatter,
usage=dedent(
"""
SYNTAX:
markitdown <OPTIONAL: FILENAME>
If FILENAME is empty, markitdown reads from stdin.
EXAMPLE:
markitdown example.pdf
OR
cat example.pdf | markitdown
OR
markitdown < example.pdf
OR to save to a file use
markitdown example.pdf -o example.md
OR
markitdown example.pdf > example.md
"""
).strip(),
)
parser.add_argument(
"-v",
"--version",
action="version",
version=f"%(prog)s {__version__}",
help="show the version number and exit",
)
parser.add_argument(
"-o",
"--output",
help="Output file name. If not provided, output is written to stdout.",
)
parser.add_argument(
"-d",
"--use-docintel",
action="store_true",
help="Use Document Intelligence to extract text instead of offline conversion. Requires a valid Document Intelligence Endpoint.",
)
parser.add_argument(
"-e",
"--endpoint",
type=str,
help="Document Intelligence Endpoint. Required if using Document Intelligence.",
)
parser.add_argument(
"-p",
"--use-plugins",
action="store_true",
help="Use 3rd-party plugins to convert files. Use --list-plugins to see installed plugins.",
)
parser.add_argument(
"--list-plugins",
action="store_true",
help="List installed 3rd-party plugins. Plugins are loaded when using the -p or --use-plugin option.",
)
parser.add_argument("filename", nargs="?")
args = parser.parse_args()
if args.list_plugins:
# List installed plugins, then exit
print("Installed MarkItDown 3rd-party Plugins:\n")
plugin_entry_points = list(entry_points(group="markitdown.plugin"))
if len(plugin_entry_points) == 0:
print(" * No 3rd-party plugins installed.")
print(
"\nFind plugins by searching for the hashtag #markitdown-plugin on GitHub.\n"
)
else:
for entry_point in plugin_entry_points:
print(f" * {entry_point.name:<16}\t(package: {entry_point.value})")
print(
"\nUse the -p (or --use-plugins) option to enable 3rd-party plugins.\n"
)
sys.exit(0)
if args.use_docintel:
if args.endpoint is None:
raise ValueError(
"Document Intelligence Endpoint is required when using Document Intelligence."
)
elif args.filename is None:
raise ValueError("Filename is required when using Document Intelligence.")
markitdown = MarkItDown(
enable_plugins=args.use_plugins, docintel_endpoint=args.endpoint
)
else:
markitdown = MarkItDown(enable_plugins=args.use_plugins)
if args.filename is None:
result = markitdown.convert_stream(sys.stdin.buffer)
else:
result = markitdown.convert(args.filename)
_handle_output(args, result)
def _handle_output(args, result: DocumentConverterResult):
"""Handle output to stdout or file"""
if args.output:
with open(args.output, "w", encoding="utf-8") as f:
f.write(result.text_content)
else:
print(result.text_content)
if __name__ == "__main__":
main()
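For reference, the plugin flags above behave roughly as follows (an illustrative shell session; the no-plugins output is exactly what the code above prints):
```bash
$ markitdown --list-plugins
Installed MarkItDown 3rd-party Plugins:

 * No 3rd-party plugins installed.

Find plugins by searching for the hashtag #markitdown-plugin on GitHub.

$ markitdown -p example.pdf > example.md
```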
@@ -0,0 +1,37 @@
class MarkItDownException(BaseException):
"""
Base exception class for MarkItDown.
"""
pass
class ConverterPrerequisiteException(MarkItDownException):
"""
Thrown when instantiating a DocumentConverter in cases where
a required library or dependency is not installed, an API key
is not set, or some other prerequisite is not met.
This is not necessarily a fatal error. If thrown during
MarkItDown's plugin loading phase, the converter will simply be
skipped, and a warning will be issued.
"""
pass
class FileConversionException(MarkItDownException):
"""
Thrown when a suitable converter was found, but the conversion
process fails for any reason.
"""
pass
class UnsupportedFormatException(MarkItDownException):
"""
Thrown when no suitable converter was found for the given file.
"""
pass
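A short sketch of how callers might use this hierarchy (the file path is a placeholder):
```python
from markitdown import (
    MarkItDown,
    FileConversionException,
    UnsupportedFormatException,
)

md = MarkItDown()
try:
    result = md.convert("path/to/file.xyz")
except UnsupportedFormatException:
    print("No converter accepts this format.")
except FileConversionException as e:
    print(f"A converter matched, but the conversion failed: {e}")
```
Note that because `MarkItDownException` derives from `BaseException`, a bare `except Exception` will not catch these; catch the specific classes as above.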
@@ -0,0 +1,440 @@
import copy
import mimetypes
import os
import re
import tempfile
import warnings
import traceback
from importlib.metadata import entry_points
from typing import Any, List, Optional, Union
from pathlib import Path
from urllib.parse import urlparse
from warnings import warn
# File-format detection
import puremagic
import requests
from .converters import (
DocumentConverter,
DocumentConverterResult,
PlainTextConverter,
HtmlConverter,
RssConverter,
WikipediaConverter,
YouTubeConverter,
IpynbConverter,
BingSerpConverter,
PdfConverter,
DocxConverter,
XlsxConverter,
XlsConverter,
PptxConverter,
ImageConverter,
WavConverter,
Mp3Converter,
OutlookMsgConverter,
ZipConverter,
DocumentIntelligenceConverter,
)
from ._exceptions import (
FileConversionException,
UnsupportedFormatException,
ConverterPrerequisiteException,
)
# Override mimetype for CSV files to fix an issue on Windows
mimetypes.add_type("text/csv", ".csv")
PRIORITY_SPECIFIC_FILE_FORMAT = 0.0
PRIORITY_GENERIC_FILE_FORMAT = 10.0
_plugins: Union[None, List[Any]] = None
def _load_plugins() -> Union[None, List[Any]]:
"""Lazy load plugins, exiting early if already loaded."""
global _plugins
# Skip if we've already loaded plugins
if _plugins is not None:
return _plugins
# Load plugins
_plugins = []
for entry_point in entry_points(group="markitdown.plugin"):
try:
_plugins.append(entry_point.load())
except Exception:
tb = traceback.format_exc()
warn(f"Plugin '{entry_point.name}' failed to load ... skipping:\n{tb}")
return _plugins
class MarkItDown:
"""(In preview) An extremely simple text-based document reader, suitable for LLM use.
This reader will convert common file-types or webpages to Markdown."""
def __init__(
self,
*,
enable_builtins: Union[None, bool] = None,
enable_plugins: Union[None, bool] = None,
**kwargs,
):
self._builtins_enabled = False
self._plugins_enabled = False
requests_session = kwargs.get("requests_session")
if requests_session is None:
self._requests_session = requests.Session()
else:
self._requests_session = requests_session
# TODO - remove these (see enable_builtins)
self._llm_client = None
self._llm_model = None
self._exiftool_path = None
self._style_map = None
# Register the converters
self._page_converters: List[DocumentConverter] = []
if (
enable_builtins is None or enable_builtins
): # Default to True when not specified
self.enable_builtins(**kwargs)
if enable_plugins:
self.enable_plugins(**kwargs)
def enable_builtins(self, **kwargs) -> None:
"""
Enable and register built-in converters.
Built-in converters are enabled by default.
This method should only be called once, if built-ins were initially disabled.
"""
if not self._builtins_enabled:
# TODO: Move these into converter constructors
self._llm_client = kwargs.get("llm_client")
self._llm_model = kwargs.get("llm_model")
self._exiftool_path = kwargs.get("exiftool_path")
self._style_map = kwargs.get("style_map")
# Register converters for successful browsing operations
# Later registrations are tried first / take higher priority than earlier registrations
# To this end, the most specific converters should appear below the most generic converters
self.register_converter(PlainTextConverter())
self.register_converter(ZipConverter())
self.register_converter(HtmlConverter())
self.register_converter(RssConverter())
self.register_converter(WikipediaConverter())
self.register_converter(YouTubeConverter())
self.register_converter(BingSerpConverter())
self.register_converter(DocxConverter())
self.register_converter(XlsxConverter())
self.register_converter(XlsConverter())
self.register_converter(PptxConverter())
self.register_converter(WavConverter())
self.register_converter(Mp3Converter())
self.register_converter(ImageConverter())
self.register_converter(IpynbConverter())
self.register_converter(PdfConverter())
self.register_converter(OutlookMsgConverter())
# Register Document Intelligence converter at the top of the stack if endpoint is provided
docintel_endpoint = kwargs.get("docintel_endpoint")
if docintel_endpoint is not None:
self.register_converter(
DocumentIntelligenceConverter(endpoint=docintel_endpoint)
)
self._builtins_enabled = True
else:
warn("Built-in converters are already enabled.", RuntimeWarning)
def enable_plugins(self, **kwargs) -> None:
"""
Enable and register converters provided by plugins.
Plugins are disabled by default.
This method should only be called once, if plugins were initially disabled.
"""
if not self._plugins_enabled:
# Load plugins
for plugin in _load_plugins():
try:
plugin.register_converters(self, **kwargs)
except Exception:
tb = traceback.format_exc()
warn(f"Plugin '{plugin}' failed to register converters:\n{tb}")
self._plugins_enabled = True
else:
warn("Plugins converters are already enabled.", RuntimeWarning)
def convert(
self, source: Union[str, requests.Response, Path], **kwargs: Any
) -> DocumentConverterResult: # TODO: deal with kwargs
"""
Args:
- source: a local file path (as str or pathlib.Path), a URL string, or a requests.Response object
- extension: specifies the file extension to use when interpreting the file. If None, infer from source (path, uri, content-type, etc.)
"""
# Local path or url
if isinstance(source, str):
if (
source.startswith("http://")
or source.startswith("https://")
or source.startswith("file://")
):
return self.convert_url(source, **kwargs)
else:
return self.convert_local(source, **kwargs)
# Request response
elif isinstance(source, requests.Response):
return self.convert_response(source, **kwargs)
elif isinstance(source, Path):
return self.convert_local(source, **kwargs)
def convert_local(
self, path: Union[str, Path], **kwargs: Any
) -> DocumentConverterResult: # TODO: deal with kwargs
if isinstance(path, Path):
path = str(path)
# Prepare a list of extensions to try (in order of priority)
ext = kwargs.get("file_extension")
extensions = [ext] if ext is not None else []
# Get extension alternatives from the path and puremagic
base, ext = os.path.splitext(path)
self._append_ext(extensions, ext)
for g in self._guess_ext_magic(path):
self._append_ext(extensions, g)
# Convert
return self._convert(path, extensions, **kwargs)
# TODO what should stream's type be?
def convert_stream(
self, stream: Any, **kwargs: Any
) -> DocumentConverterResult: # TODO: deal with kwargs
# Prepare a list of extensions to try (in order of priority)
ext = kwargs.get("file_extension")
extensions = [ext] if ext is not None else []
# Save the file locally to a temporary file. It will be deleted before this method exits
handle, temp_path = tempfile.mkstemp()
fh = os.fdopen(handle, "wb")
result = None
try:
# Write to the temporary file
content = stream.read()
if isinstance(content, str):
fh.write(content.encode("utf-8"))
else:
fh.write(content)
fh.close()
# Use puremagic to check for more extension options
for g in self._guess_ext_magic(temp_path):
self._append_ext(extensions, g)
# Convert
result = self._convert(temp_path, extensions, **kwargs)
# Clean up
finally:
try:
fh.close()
except Exception:
pass
os.unlink(temp_path)
return result
def convert_url(
self, url: str, **kwargs: Any
) -> DocumentConverterResult: # TODO: fix kwargs type
# Send an HTTP request to the URL
response = self._requests_session.get(url, stream=True)
response.raise_for_status()
return self.convert_response(response, **kwargs)
def convert_response(
self, response: requests.Response, **kwargs: Any
) -> DocumentConverterResult: # TODO fix kwargs type
# Prepare a list of extensions to try (in order of priority)
ext = kwargs.get("file_extension")
extensions = [ext] if ext is not None else []
# Guess from the mimetype
content_type = response.headers.get("content-type", "").split(";")[0]
self._append_ext(extensions, mimetypes.guess_extension(content_type))
# Read the content disposition if there is one
content_disposition = response.headers.get("content-disposition", "")
m = re.search(r"filename=([^;]+)", content_disposition)
if m:
base, ext = os.path.splitext(m.group(1).strip("\"'"))
self._append_ext(extensions, ext)
# Read the extension from the path
base, ext = os.path.splitext(urlparse(response.url).path)
self._append_ext(extensions, ext)
# Save the file locally to a temporary file. It will be deleted before this method exits
handle, temp_path = tempfile.mkstemp()
fh = os.fdopen(handle, "wb")
result = None
try:
# Download the file
for chunk in response.iter_content(chunk_size=512):
fh.write(chunk)
fh.close()
# Use puremagic to check for more extension options
for g in self._guess_ext_magic(temp_path):
self._append_ext(extensions, g)
# Convert
result = self._convert(temp_path, extensions, url=response.url, **kwargs)
# Clean up
finally:
try:
fh.close()
except Exception:
pass
os.unlink(temp_path)
return result
def _convert(
self, local_path: str, extensions: List[Union[str, None]], **kwargs
) -> DocumentConverterResult:
error_trace = ""
# Create a copy of the page_converters list, sorted by priority.
# We do this with each call to _convert because the priority of converters may change between calls.
# The sort is guaranteed to be stable, so converters with the same priority will remain in the same order.
sorted_converters = sorted(self._page_converters, key=lambda x: x.priority)
for ext in extensions + [None]: # Try last with no extension
for converter in sorted_converters:
_kwargs = copy.deepcopy(kwargs)
# Overwrite file_extension appropriately
if ext is None:
if "file_extension" in _kwargs:
del _kwargs["file_extension"]
else:
_kwargs.update({"file_extension": ext})
# Copy any additional global options
if "llm_client" not in _kwargs and self._llm_client is not None:
_kwargs["llm_client"] = self._llm_client
if "llm_model" not in _kwargs and self._llm_model is not None:
_kwargs["llm_model"] = self._llm_model
if "style_map" not in _kwargs and self._style_map is not None:
_kwargs["style_map"] = self._style_map
if "exiftool_path" not in _kwargs and self._exiftool_path is not None:
_kwargs["exiftool_path"] = self._exiftool_path
# Add the list of converters for nested processing
_kwargs["_parent_converters"] = self._page_converters
# If we hit an error, log it and keep trying
res = None
try:
    res = converter.convert(local_path, **_kwargs)
except Exception:
    error_trace = ("\n\n" + traceback.format_exc()).strip()
if res is not None:
# Normalize the content
res.text_content = "\n".join(
[line.rstrip() for line in re.split(r"\r?\n", res.text_content)]
)
res.text_content = re.sub(r"\n{3,}", "\n\n", res.text_content)
return res
# If we got this far without success, report any exceptions
if len(error_trace) > 0:
raise FileConversionException(
f"Could not convert '{local_path}' to Markdown. File type was recognized as {extensions}. While converting the file, the following error was encountered:\n\n{error_trace}"
)
# Nothing can handle it!
raise UnsupportedFormatException(
f"Could not convert '{local_path}' to Markdown. The formats {extensions} are not supported."
)
def _append_ext(self, extensions, ext):
"""Append a unique non-None, non-empty extension to a list of extensions."""
if ext is None:
return
ext = ext.strip()
if ext == "":
return
if ext not in extensions:
    extensions.append(ext)
def _guess_ext_magic(self, path):
"""Use puremagic (a Python implementation of libmagic) to guess a file's extension based on the first few bytes."""
# Use puremagic to guess
try:
guesses = puremagic.magic_file(path)
# Fix for: https://github.com/microsoft/markitdown/issues/222
# If there are no guesses, then try again after trimming leading ASCII whitespaces.
# ASCII whitespace characters are those byte values in the sequence b' \t\n\r\x0b\f'
# (space, tab, newline, carriage return, vertical tab, form feed).
if len(guesses) == 0:
with open(path, "rb") as file:
while True:
char = file.read(1)
if not char: # End of file
break
if not char.isspace():
file.seek(file.tell() - 1)
break
try:
guesses = puremagic.magic_stream(file)
except puremagic.main.PureError:
pass
extensions = list()
for g in guesses:
ext = g.extension.strip()
if len(ext) > 0:
if not ext.startswith("."):
ext = "." + ext
if ext not in extensions:
extensions.append(ext)
return extensions
except FileNotFoundError:
pass
except IsADirectoryError:
pass
except PermissionError:
pass
return []
def register_page_converter(self, converter: DocumentConverter) -> None:
"""DEPRECATED: User register_converter instead."""
warn(
"register_page_converter is deprecated. Use register_converter instead.",
DeprecationWarning,
)
self.register_converter(converter)
def register_converter(self, converter: DocumentConverter) -> None:
"""Register a page text converter."""
self._page_converters.insert(0, converter)
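Based on `_load_plugins()` and `enable_plugins()` above, a third-party plugin is a package exposing an entry point in the `markitdown.plugin` group whose loaded module provides a `register_converters()` hook. A hypothetical sketch (all names illustrative, not the sample plugin from this commit):
```python
# Hypothetical plugin module, e.g. markitdown_sample_plugin/_plugin.py
from typing import Any, Union

from markitdown import DocumentConverter, DocumentConverterResult


class ExampleConverter(DocumentConverter):
    """Toy converter: treats .example files as plain text."""

    def convert(
        self, local_path: str, **kwargs: Any
    ) -> Union[None, DocumentConverterResult]:
        if kwargs.get("file_extension", "").lower() != ".example":
            return None  # bail so other converters can try
        with open(local_path, "rt", encoding="utf-8") as fh:
            return DocumentConverterResult(title=None, text_content=fh.read())


def register_converters(markitdown, **kwargs):
    # Invoked by MarkItDown.enable_plugins() for each loaded entry point
    markitdown.register_converter(ExampleConverter())
```
The plugin's own pyproject.toml would then point the `markitdown.plugin` entry-point group at this module, e.g. `sample_plugin = "markitdown_sample_plugin._plugin"` (again, a hypothetical name).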
@@ -0,0 +1,45 @@
# SPDX-FileCopyrightText: 2024-present Adam Fourney <adamfo@microsoft.com>
#
# SPDX-License-Identifier: MIT
from ._base import DocumentConverter, DocumentConverterResult
from ._plain_text_converter import PlainTextConverter
from ._html_converter import HtmlConverter
from ._rss_converter import RssConverter
from ._wikipedia_converter import WikipediaConverter
from ._youtube_converter import YouTubeConverter
from ._ipynb_converter import IpynbConverter
from ._bing_serp_converter import BingSerpConverter
from ._pdf_converter import PdfConverter
from ._docx_converter import DocxConverter
from ._xlsx_converter import XlsxConverter, XlsConverter
from ._pptx_converter import PptxConverter
from ._image_converter import ImageConverter
from ._wav_converter import WavConverter
from ._mp3_converter import Mp3Converter
from ._outlook_msg_converter import OutlookMsgConverter
from ._zip_converter import ZipConverter
from ._doc_intel_converter import DocumentIntelligenceConverter
__all__ = [
"DocumentConverter",
"DocumentConverterResult",
"PlainTextConverter",
"HtmlConverter",
"RssConverter",
"WikipediaConverter",
"YouTubeConverter",
"IpynbConverter",
"BingSerpConverter",
"PdfConverter",
"DocxConverter",
"XlsxConverter",
"XlsConverter",
"PptxConverter",
"ImageConverter",
"WavConverter",
"Mp3Converter",
"OutlookMsgConverter",
"ZipConverter",
"DocumentIntelligenceConverter",
]
@@ -0,0 +1,34 @@
from typing import Any, Union
class DocumentConverterResult:
"""The result of converting a document to text."""
def __init__(self, title: Union[str, None] = None, text_content: str = ""):
self.title: Union[str, None] = title
self.text_content: str = text_content
class DocumentConverter:
"""Abstract superclass of all DocumentConverters."""
def __init__(self, priority: float = 0.0):
self._priority = priority
def convert(
self, local_path: str, **kwargs: Any
) -> Union[None, DocumentConverterResult]:
raise NotImplementedError("Subclasses must implement this method")
@property
def priority(self) -> float:
    """Priority of the converter in markitdown's converter list. Lower values are tried first (converters are sorted ascending by priority before each conversion)."""
    return self._priority
@priority.setter
def priority(self, value: float):
    self._priority = value
@priority.deleter
def priority(self):
    raise AttributeError("Cannot delete the priority attribute")
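A sketch of how the priority knob composes with registration order (the converter here is hypothetical; the 0.0 and 10.0 reference points are the `PRIORITY_SPECIFIC_FILE_FORMAT` and `PRIORITY_GENERIC_FILE_FORMAT` constants from `_markitdown.py`):
```python
from markitdown import MarkItDown, DocumentConverter


class GreedyConverter(DocumentConverter):
    """Hypothetical converter, used only to illustrate priority."""

    def convert(self, local_path, **kwargs):
        return None  # returning None lets the next converter try


md = MarkItDown()
# _convert() sorts converters ascending by priority, so 0.0 runs before
# the generic 10.0 converters; among equal priorities the stable sort
# keeps list order, and register_converter() inserts at the front, so
# later registrations win ties.
md.register_converter(GreedyConverter(priority=0.0))
```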
@@ -0,0 +1,81 @@
# type: ignore
import base64
import binascii
import re
from typing import Union
from urllib.parse import parse_qs, urlparse
from bs4 import BeautifulSoup
from ._base import DocumentConverter, DocumentConverterResult
from ._markdownify import _CustomMarkdownify
class BingSerpConverter(DocumentConverter):
"""
Handle Bing results pages (only the organic search results).
NOTE: It is better to use the Bing API
"""
def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
# Bail if not a Bing SERP
extension = kwargs.get("file_extension", "")
if extension.lower() not in [".html", ".htm"]:
return None
url = kwargs.get("url", "")
if not re.search(r"^https://www\.bing\.com/search\?q=", url):
return None
# Parse the query parameters
parsed_params = parse_qs(urlparse(url).query)
query = parsed_params.get("q", [""])[0]
# Parse the file
soup = None
with open(local_path, "rt", encoding="utf-8") as fh:
soup = BeautifulSoup(fh.read(), "html.parser")
# Clean up some formatting
for tptt in soup.find_all(class_="tptt"):
if hasattr(tptt, "string") and tptt.string:
tptt.string += " "
for slug in soup.find_all(class_="algoSlug_icon"):
slug.extract()
# Parse the algorithmic results
_markdownify = _CustomMarkdownify()
results = list()
for result in soup.find_all(class_="b_algo"):
# Rewrite redirect urls
for a in result.find_all("a", href=True):
parsed_href = urlparse(a["href"])
qs = parse_qs(parsed_href.query)
# The destination is contained in the u parameter,
# but appears to be base64 encoded, with some prefix
if "u" in qs:
u = (
qs["u"][0][2:].strip() + "=="
) # Python 3 doesn't care about extra padding
try:
# RFC 4648 "Base64URL" variant, which uses "-" and "_"
a["href"] = base64.b64decode(u, altchars="-_").decode("utf-8")
except UnicodeDecodeError:
pass
except binascii.Error:
pass
# Convert to markdown
md_result = _markdownify.convert_soup(result).strip()
lines = [line.strip() for line in re.split(r"\n+", md_result)]
results.append("\n".join([line for line in lines if len(line) > 0]))
webpage_text = (
f"## A Bing search for '{query}' found the following results:\n\n"
+ "\n\n".join(results)
)
return DocumentConverterResult(
title=None if soup.title is None else soup.title.string,
text_content=webpage_text,
)
@@ -0,0 +1,85 @@
import re
from typing import Any, Union
# Azure imports
from azure.ai.documentintelligence import DocumentIntelligenceClient
from azure.ai.documentintelligence.models import (
AnalyzeDocumentRequest,
AnalyzeResult,
DocumentAnalysisFeature,
)
from azure.identity import DefaultAzureCredential
from ._base import DocumentConverter, DocumentConverterResult
# TODO: currently, there is a bug in the document intelligence SDK with importing the "ContentFormat" enum.
# This constant is a temporary fix until the bug is resolved.
CONTENT_FORMAT = "markdown"
class DocumentIntelligenceConverter(DocumentConverter):
"""Specialized DocumentConverter that uses Document Intelligence to extract text from documents."""
def __init__(
self,
endpoint: str,
api_version: str = "2024-07-31-preview",
):
self.endpoint = endpoint
self.api_version = api_version
self.doc_intel_client = DocumentIntelligenceClient(
endpoint=self.endpoint,
api_version=self.api_version,
credential=DefaultAzureCredential(),
)
def convert(
self, local_path: str, **kwargs: Any
) -> Union[None, DocumentConverterResult]:
# Bail if extension is not supported by Document Intelligence
extension = kwargs.get("file_extension", "")
docintel_extensions = [
".pdf",
".docx",
".xlsx",
".pptx",
".html",
".jpeg",
".jpg",
".png",
".bmp",
".tiff",
".heif",
]
if extension.lower() not in docintel_extensions:
return None
# Get the bytestring for the local path
with open(local_path, "rb") as f:
file_bytes = f.read()
# Certain document analysis features are not available for some filetypes (.xlsx, .pptx, .html)
if extension.lower() in [".xlsx", ".pptx", ".html"]:
analysis_features = []
else:
analysis_features = [
DocumentAnalysisFeature.FORMULAS, # enable formula extraction
DocumentAnalysisFeature.OCR_HIGH_RESOLUTION, # enable high resolution OCR
DocumentAnalysisFeature.STYLE_FONT, # enable font style extraction
]
# Extract the text using Azure Document Intelligence
poller = self.doc_intel_client.begin_analyze_document(
model_id="prebuilt-layout",
body=AnalyzeDocumentRequest(bytes_source=file_bytes),
features=analysis_features,
output_content_format=CONTENT_FORMAT, # TODO: replace with "ContentFormat.MARKDOWN" when the bug is fixed
)
result: AnalyzeResult = poller.result()
# remove comments from the markdown content generated by Doc Intelligence and append to markdown string
markdown_text = re.sub(r"<!--.*?-->", "", result.content, flags=re.DOTALL)
return DocumentConverterResult(
title=None,
text_content=markdown_text,
)
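A usage sketch, assuming an Azure Document Intelligence resource reachable via `DefaultAzureCredential` (the endpoint URL is a placeholder):
```python
from markitdown import MarkItDown

# docintel_endpoint is forwarded to DocumentIntelligenceConverter by
# enable_builtins(); that converter is registered last and therefore
# sits at the front of the converter list for the types it supports.
md = MarkItDown(docintel_endpoint="https://<your-resource>.cognitiveservices.azure.com/")
result = md.convert("test.pdf")
print(result.text_content)
```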
@@ -0,0 +1,31 @@
from typing import Union
import mammoth
from ._base import (
DocumentConverterResult,
)
from ._html_converter import HtmlConverter
class DocxConverter(HtmlConverter):
"""
Converts DOCX files to Markdown. Style information (e.g., headings) and tables are preserved where possible.
"""
def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
# Bail if not a DOCX
extension = kwargs.get("file_extension", "")
if extension.lower() != ".docx":
return None
result = None
with open(local_path, "rb") as docx_file:
style_map = kwargs.get("style_map", None)
result = mammoth.convert_to_html(docx_file, style_map=style_map)
html_content = result.value
result = self._convert(html_content)
return result
@@ -0,0 +1,51 @@
from typing import Any, Union
from bs4 import BeautifulSoup
from ._base import DocumentConverter, DocumentConverterResult
from ._markdownify import _CustomMarkdownify
class HtmlConverter(DocumentConverter):
"""Anything with content type text/html"""
def convert(
self, local_path: str, **kwargs: Any
) -> Union[None, DocumentConverterResult]:
# Bail if not html
extension = kwargs.get("file_extension", "")
if extension.lower() not in [".html", ".htm"]:
return None
result = None
with open(local_path, "rt", encoding="utf-8") as fh:
result = self._convert(fh.read())
return result
def _convert(self, html_content: str) -> Union[None, DocumentConverterResult]:
"""Helper function that converts an HTML string."""
# Parse the string
soup = BeautifulSoup(html_content, "html.parser")
# Remove javascript and style blocks
for script in soup(["script", "style"]):
script.extract()
# Print only the main content
body_elm = soup.find("body")
webpage_text = ""
if body_elm:
webpage_text = _CustomMarkdownify().convert_soup(body_elm)
else:
webpage_text = _CustomMarkdownify().convert_soup(soup)
assert isinstance(webpage_text, str)
# remove leading and trailing \n
webpage_text = webpage_text.strip()
return DocumentConverterResult(
title=None if soup.title is None else soup.title.string,
text_content=webpage_text,
)
@@ -0,0 +1,87 @@
import base64
import mimetypes
from typing import Union
from ._base import DocumentConverterResult
from ._media_converter import MediaConverter
class ImageConverter(MediaConverter):
"""
Converts images to markdown via extraction of metadata (if `exiftool` is installed), OCR (if `easyocr` is installed), and description via a multimodal LLM (if an llm_client is configured).
"""
def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
# Bail if not an image
extension = kwargs.get("file_extension", "")
if extension.lower() not in [".jpg", ".jpeg", ".png"]:
return None
md_content = ""
# Add metadata
metadata = self._get_metadata(local_path, kwargs.get("exiftool_path"))
if metadata:
for f in [
"ImageSize",
"Title",
"Caption",
"Description",
"Keywords",
"Artist",
"Author",
"DateTimeOriginal",
"CreateDate",
"GPSPosition",
]:
if f in metadata:
md_content += f"{f}: {metadata[f]}\n"
# Try describing the image with GPTV
llm_client = kwargs.get("llm_client")
llm_model = kwargs.get("llm_model")
if llm_client is not None and llm_model is not None:
md_content += (
"\n# Description:\n"
+ self._get_llm_description(
local_path,
extension,
llm_client,
llm_model,
prompt=kwargs.get("llm_prompt"),
).strip()
+ "\n"
)
return DocumentConverterResult(
title=None,
text_content=md_content,
)
def _get_llm_description(self, local_path, extension, client, model, prompt=None):
if prompt is None or prompt.strip() == "":
prompt = "Write a detailed caption for this image."
data_uri = ""
with open(local_path, "rb") as image_file:
content_type, encoding = mimetypes.guess_type("_dummy" + extension)
if content_type is None:
content_type = "image/jpeg"
image_base64 = base64.b64encode(image_file.read()).decode("utf-8")
data_uri = f"data:{content_type};base64,{image_base64}"
messages = [
{
"role": "user",
"content": [
{"type": "text", "text": prompt},
{
"type": "image_url",
"image_url": {
"url": data_uri,
},
},
],
}
]
response = client.chat.completions.create(model=model, messages=messages)
return response.choices[0].message.content
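A usage sketch for the LLM description path, assuming an OpenAI-style client with an API key in the environment (the model name is a placeholder; any client exposing `chat.completions.create` should fit):
```python
from openai import OpenAI

from markitdown import MarkItDown

# llm_client and llm_model are read by enable_builtins() and forwarded
# to converters through _convert()'s kwargs plumbing
md = MarkItDown(llm_client=OpenAI(), llm_model="gpt-4o")
result = md.convert("photo.jpg", llm_prompt="Write alt text for this image.")
print(result.text_content)
```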
@@ -0,0 +1,70 @@
import json
from typing import Any, Union
from ._base import (
DocumentConverter,
DocumentConverterResult,
)
from .._exceptions import FileConversionException
class IpynbConverter(DocumentConverter):
"""Converts Jupyter Notebook (.ipynb) files to Markdown."""
def convert(
self, local_path: str, **kwargs: Any
) -> Union[None, DocumentConverterResult]:
# Bail if not ipynb
extension = kwargs.get("file_extension", "")
if extension.lower() != ".ipynb":
return None
# Parse and convert the notebook
result = None
with open(local_path, "rt", encoding="utf-8") as fh:
notebook_content = json.load(fh)
result = self._convert(notebook_content)
return result
def _convert(self, notebook_content: dict) -> Union[None, DocumentConverterResult]:
"""Helper function that converts notebook JSON content to Markdown."""
try:
md_output = []
title = None
for cell in notebook_content.get("cells", []):
cell_type = cell.get("cell_type", "")
source_lines = cell.get("source", [])
if cell_type == "markdown":
md_output.append("".join(source_lines))
# Extract the first # heading as title if not already found
if title is None:
for line in source_lines:
if line.startswith("# "):
title = line.lstrip("# ").strip()
break
elif cell_type == "code":
# Code cells are wrapped in Markdown code blocks
md_output.append(f"```python\n{''.join(source_lines)}\n```")
elif cell_type == "raw":
md_output.append(f"```\n{''.join(source_lines)}\n```")
md_text = "\n\n".join(md_output)
# Check for title in notebook metadata
title = notebook_content.get("metadata", {}).get("title", title)
return DocumentConverterResult(
title=title,
text_content=md_text,
)
except Exception as e:
raise FileConversionException(
f"Error converting .ipynb file: {str(e)}"
) from e
@@ -0,0 +1,87 @@
import re
import markdownify
from typing import Any
from urllib.parse import quote, unquote, urlparse, urlunparse
class _CustomMarkdownify(markdownify.MarkdownConverter):
"""
A custom version of markdownify's MarkdownConverter. Changes include:
- Altering the default heading style to use '#', '##', etc.
- Removing javascript hyperlinks.
- Truncating images with large data:uri sources.
- Ensuring URIs are properly escaped, and do not conflict with Markdown syntax
"""
def __init__(self, **options: Any):
options["heading_style"] = options.get("heading_style", markdownify.ATX)
# Explicitly cast options to the expected type if necessary
super().__init__(**options)
def convert_hn(self, n: int, el: Any, text: str, convert_as_inline: bool) -> str:
"""Same as usual, but be sure to start with a new line"""
if not convert_as_inline:
if not re.search(r"^\n", text):
return "\n" + super().convert_hn(n, el, text, convert_as_inline) # type: ignore
return super().convert_hn(n, el, text, convert_as_inline) # type: ignore
def convert_a(self, el: Any, text: str, convert_as_inline: bool):
"""Same as usual converter, but removes Javascript links and escapes URIs."""
prefix, suffix, text = markdownify.chomp(text) # type: ignore
if not text:
return ""
href = el.get("href")
title = el.get("title")
# Escape URIs and skip non-http or file schemes
if href:
try:
parsed_url = urlparse(href) # type: ignore
if parsed_url.scheme and parsed_url.scheme.lower() not in ["http", "https", "file"]: # type: ignore
return "%s%s%s" % (prefix, text, suffix)
href = urlunparse(parsed_url._replace(path=quote(unquote(parsed_url.path)))) # type: ignore
except ValueError: # It's not clear if this ever gets thrown
return "%s%s%s" % (prefix, text, suffix)
# For the replacement see #29: underscores in text nodes are escaped
if (
self.options["autolinks"]
and text.replace(r"\_", "_") == href
and not title
and not self.options["default_title"]
):
# Shortcut syntax
return "<%s>" % href
if self.options["default_title"] and not title:
title = href
title_part = ' "%s"' % title.replace('"', r"\"") if title else ""
return (
"%s[%s](%s%s)%s" % (prefix, text, href, title_part, suffix)
if href
else text
)
def convert_img(self, el: Any, text: str, convert_as_inline: bool) -> str:
"""Same as usual converter, but removes data URIs"""
alt = el.attrs.get("alt", None) or ""
src = el.attrs.get("src", None) or ""
title = el.attrs.get("title", None) or ""
title_part = ' "%s"' % title.replace('"', r"\"") if title else ""
if (
convert_as_inline
and el.parent.name not in self.options["keep_inline_images_in"]
):
return alt
# Remove dataURIs
if src.startswith("data:"):
src = src.split(",")[0] + "..."
return "![%s](%s%s)" % (alt, src, title_part)
def convert_soup(self, soup: Any) -> str:
return super().convert_soup(soup) # type: ignore
@@ -0,0 +1,36 @@
import subprocess
import shutil
import json
from warnings import warn
from ._base import DocumentConverter
class MediaConverter(DocumentConverter):
"""
Abstract class for multi-modal media (e.g., images and audio)
"""
def _get_metadata(self, local_path, exiftool_path=None):
if not exiftool_path:
which_exiftool = shutil.which("exiftool")
if which_exiftool:
warn(
f"""Implicit discovery of 'exiftool' is disabled. If you would like to continue to use exiftool in MarkItDown, please set the exiftool_path parameter in the MarkItDown consructor. E.g.,
md = MarkItDown(exiftool_path="{which_exiftool}")
This warning will be removed in future releases.
""",
DeprecationWarning,
)
return None
else:
try:
result = subprocess.run(
[exiftool_path, "-json", local_path], capture_output=True, text=True
).stdout
return json.loads(result)[0]
except Exception:
return None
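Following the deprecation warning above, the explicit opt-in looks like this (a sketch assuming exiftool is installed; substitute your own path):
```python
import shutil

from markitdown import MarkItDown

# Pass exiftool's location explicitly; implicit PATH discovery is
# deprecated per the warning in MediaConverter._get_metadata()
md = MarkItDown(exiftool_path=shutil.which("exiftool"))
result = md.convert("track.mp3")
print(result.text_content)
```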
@@ -0,0 +1,84 @@
import os
import tempfile
from typing import Union
from ._base import DocumentConverterResult
from ._wav_converter import WavConverter
from warnings import resetwarnings, catch_warnings
# Optional Transcription support
IS_AUDIO_TRANSCRIPTION_CAPABLE = False
try:
# Using warnings' catch_warnings to catch
# pydub's warning of ffmpeg or avconv missing
with catch_warnings(record=True) as w:
import pydub
if w:
raise ModuleNotFoundError
import speech_recognition as sr
IS_AUDIO_TRANSCRIPTION_CAPABLE = True
except ModuleNotFoundError:
pass
finally:
resetwarnings()
class Mp3Converter(WavConverter):
"""
Converts MP3 files to markdown via extraction of metadata (if `exiftool` is installed), and speech transcription (if `speech_recognition` AND `pydub` are installed).
"""
def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
# Bail if not an MP3
extension = kwargs.get("file_extension", "")
if extension.lower() != ".mp3":
return None
md_content = ""
# Add metadata
metadata = self._get_metadata(local_path, kwargs.get("exiftool_path"))
if metadata:
for f in [
"Title",
"Artist",
"Author",
"Band",
"Album",
"Genre",
"Track",
"DateTimeOriginal",
"CreateDate",
"Duration",
]:
if f in metadata:
md_content += f"{f}: {metadata[f]}\n"
# Transcribe
if IS_AUDIO_TRANSCRIPTION_CAPABLE:
handle, temp_path = tempfile.mkstemp(suffix=".wav")
os.close(handle)
try:
sound = pydub.AudioSegment.from_mp3(local_path)
sound.export(temp_path, format="wav")
try:
transcript = super()._transcribe_audio(temp_path).strip()
md_content += "\n\n### Audio Transcript:\n" + (
"[No speech detected]" if transcript == "" else transcript
)
except Exception:
md_content += "\n\n### Audio Transcript:\nError. Could not transcribe this audio."
finally:
os.unlink(temp_path)
# Return the result
return DocumentConverterResult(
title=None,
text_content=md_content.strip(),
)
@@ -0,0 +1,76 @@
import olefile
from typing import Any, Union
from ._base import DocumentConverter, DocumentConverterResult
from .._exceptions import FileConversionException
class OutlookMsgConverter(DocumentConverter):
"""Converts Outlook .msg files to markdown by extracting email metadata and content.
Uses the olefile package to parse the .msg file structure and extract:
- Email headers (From, To, Subject)
- Email body content
"""
def convert(
self, local_path: str, **kwargs: Any
) -> Union[None, DocumentConverterResult]:
# Bail if not a MSG file
extension = kwargs.get("file_extension", "")
if extension.lower() != ".msg":
return None
try:
msg = olefile.OleFileIO(local_path)
# Extract email metadata
md_content = "# Email Message\n\n"
# Get headers
headers = {
"From": self._get_stream_data(msg, "__substg1.0_0C1F001F"),
"To": self._get_stream_data(msg, "__substg1.0_0E04001F"),
"Subject": self._get_stream_data(msg, "__substg1.0_0037001F"),
}
# Add headers to markdown
for key, value in headers.items():
if value:
md_content += f"**{key}:** {value}\n"
md_content += "\n## Content\n\n"
# Get email body
body = self._get_stream_data(msg, "__substg1.0_1000001F")
if body:
md_content += body
msg.close()
return DocumentConverterResult(
title=headers.get("Subject"), text_content=md_content.strip()
)
except Exception as e:
raise FileConversionException(
f"Could not convert MSG file '{local_path}': {str(e)}"
)
def _get_stream_data(
self, msg: olefile.OleFileIO, stream_path: str
) -> Union[str, None]:
"""Helper to safely extract and decode stream data from the MSG file."""
try:
if msg.exists(stream_path):
data = msg.openstream(stream_path).read()
# Try UTF-16 first (common for .msg files)
try:
return data.decode("utf-16-le").strip()
except UnicodeDecodeError:
# Fall back to UTF-8
try:
return data.decode("utf-8").strip()
except UnicodeDecodeError:
# Last resort - ignore errors
return data.decode("utf-8", errors="ignore").strip()
except Exception:
pass
return None
@@ -0,0 +1,21 @@
import pdfminer
import pdfminer.high_level
from typing import Union
from ._base import DocumentConverter, DocumentConverterResult
class PdfConverter(DocumentConverter):
"""
Converts PDFs to Markdown. Most style information is ignored, so the results are essentially plain-text.
"""
def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
# Bail if not a PDF
extension = kwargs.get("file_extension", "")
if extension.lower() != ".pdf":
return None
return DocumentConverterResult(
title=None,
text_content=pdfminer.high_level.extract_text(local_path),
)
@@ -0,0 +1,33 @@
import mimetypes
from charset_normalizer import from_path
from typing import Any, Union
from ._base import DocumentConverter, DocumentConverterResult
class PlainTextConverter(DocumentConverter):
"""Anything with content type text/plain"""
def convert(
self, local_path: str, **kwargs: Any
) -> Union[None, DocumentConverterResult]:
# Guess the content type from any file extension that might be around
content_type, _ = mimetypes.guess_type(
"__placeholder" + kwargs.get("file_extension", "")
)
# Only accept text files
if content_type is None:
return None
elif all(
not content_type.lower().startswith(type_prefix)
for type_prefix in ["text/", "application/json"]
):
return None
text_content = str(from_path(local_path).best())
return DocumentConverterResult(
title=None,
text_content=text_content,
)
@@ -0,0 +1,180 @@
import base64
import pptx
import re
import html
from typing import Union
from ._base import DocumentConverterResult, DocumentConverter
from ._html_converter import HtmlConverter
class PptxConverter(HtmlConverter):
"""
Converts PPTX files to Markdown. Supports headings, tables, and images with alt text.
"""
def _get_llm_description(
self, llm_client, llm_model, image_blob, content_type, prompt=None
):
if prompt is None or prompt.strip() == "":
prompt = "Write a detailed alt text for this image with less than 50 words."
image_base64 = base64.b64encode(image_blob).decode("utf-8")
data_uri = f"data:{content_type};base64,{image_base64}"
messages = [
{
"role": "user",
"content": [
{
"type": "image_url",
"image_url": {
"url": data_uri,
},
},
{"type": "text", "text": prompt},
],
}
]
response = llm_client.chat.completions.create(
model=llm_model, messages=messages
)
return response.choices[0].message.content
def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
# Bail if not a PPTX
extension = kwargs.get("file_extension", "")
if extension.lower() != ".pptx":
return None
md_content = ""
presentation = pptx.Presentation(local_path)
slide_num = 0
for slide in presentation.slides:
slide_num += 1
md_content += f"\n\n<!-- Slide number: {slide_num} -->\n"
title = slide.shapes.title
for shape in slide.shapes:
# Pictures
if self._is_picture(shape):
# https://github.com/scanny/python-pptx/pull/512#issuecomment-1713100069
llm_description = None
alt_text = None
llm_client = kwargs.get("llm_client")
llm_model = kwargs.get("llm_model")
if llm_client is not None and llm_model is not None:
try:
llm_description = self._get_llm_description(
llm_client,
llm_model,
shape.image.blob,
shape.image.content_type,
)
except Exception:
# Unable to describe with LLM
pass
if not llm_description:
try:
alt_text = shape._element._nvXxPr.cNvPr.attrib.get(
"descr", ""
)
except Exception:
# Unable to get alt text
pass
# A placeholder name
filename = re.sub(r"\W", "", shape.name) + ".jpg"
md_content += (
"\n!["
+ (llm_description or alt_text or shape.name)
+ "]("
+ filename
+ ")\n"
)
# Tables
if self._is_table(shape):
html_table = "<html><body><table>"
first_row = True
for row in shape.table.rows:
html_table += "<tr>"
for cell in row.cells:
if first_row:
html_table += "<th>" + html.escape(cell.text) + "</th>"
else:
html_table += "<td>" + html.escape(cell.text) + "</td>"
html_table += "</tr>"
first_row = False
html_table += "</table></body></html>"
md_content += (
"\n" + self._convert(html_table).text_content.strip() + "\n"
)
# Charts
if shape.has_chart:
md_content += self._convert_chart_to_markdown(shape.chart)
# Text areas
elif shape.has_text_frame:
if shape == title:
md_content += "# " + shape.text.lstrip() + "\n"
else:
md_content += shape.text + "\n"
md_content = md_content.strip()
if slide.has_notes_slide:
md_content += "\n\n### Notes:\n"
notes_frame = slide.notes_slide.notes_text_frame
if notes_frame is not None:
md_content += notes_frame.text
md_content = md_content.strip()
return DocumentConverterResult(
title=None,
text_content=md_content.strip(),
)
def _is_picture(self, shape):
if shape.shape_type == pptx.enum.shapes.MSO_SHAPE_TYPE.PICTURE:
return True
if shape.shape_type == pptx.enum.shapes.MSO_SHAPE_TYPE.PLACEHOLDER:
if hasattr(shape, "image"):
return True
return False
def _is_table(self, shape):
if shape.shape_type == pptx.enum.shapes.MSO_SHAPE_TYPE.TABLE:
return True
return False
def _convert_chart_to_markdown(self, chart):
md = "\n\n### Chart"
if chart.has_title:
md += f": {chart.chart_title.text_frame.text}"
md += "\n\n"
data = []
category_names = [c.label for c in chart.plots[0].categories]
series_names = [s.name for s in chart.series]
data.append(["Category"] + series_names)
for idx, category in enumerate(category_names):
row = [category]
for series in chart.series:
row.append(series.values[idx])
data.append(row)
markdown_table = []
for row in data:
markdown_table.append("| " + " | ".join(map(str, row)) + " |")
header = markdown_table[0]
separator = "|" + "|".join(["---"] * len(data[0])) + "|"
return md + "\n".join([header, separator] + markdown_table[1:])
@@ -0,0 +1,143 @@
import traceback
from xml.dom import minidom
from typing import Union
from bs4 import BeautifulSoup
from ._markdownify import _CustomMarkdownify
from ._base import DocumentConverter, DocumentConverterResult
class RssConverter(DocumentConverter):
"""Convert RSS / Atom type to markdown"""
def convert(
self, local_path: str, **kwargs
) -> Union[None, DocumentConverterResult]:
# Bail if not RSS type
extension = kwargs.get("file_extension", "")
if extension.lower() not in [".xml", ".rss", ".atom"]:
return None
try:
doc = minidom.parse(local_path)
except BaseException as _:
return None
result = None
if doc.getElementsByTagName("rss"):
# An RSS feed must have a root element of <rss>
result = self._parse_rss_type(doc)
elif doc.getElementsByTagName("feed"):
root = doc.getElementsByTagName("feed")[0]
if root.getElementsByTagName("entry"):
# An Atom feed must have a root element of <feed> and at least one <entry>
result = self._parse_atom_type(doc)
else:
return None
else:
# not rss or atom
return None
return result
def _parse_atom_type(
self, doc: minidom.Document
) -> Union[None, DocumentConverterResult]:
"""Parse the type of an Atom feed.
Returns None if the feed type is not recognized or something goes wrong.
"""
try:
root = doc.getElementsByTagName("feed")[0]
title = self._get_data_by_tag_name(root, "title")
subtitle = self._get_data_by_tag_name(root, "subtitle")
entries = root.getElementsByTagName("entry")
md_text = f"# {title}\n"
if subtitle:
md_text += f"{subtitle}\n"
for entry in entries:
entry_title = self._get_data_by_tag_name(entry, "title")
entry_summary = self._get_data_by_tag_name(entry, "summary")
entry_updated = self._get_data_by_tag_name(entry, "updated")
entry_content = self._get_data_by_tag_name(entry, "content")
if entry_title:
md_text += f"\n## {entry_title}\n"
if entry_updated:
md_text += f"Updated on: {entry_updated}\n"
if entry_summary:
md_text += self._parse_content(entry_summary)
if entry_content:
md_text += self._parse_content(entry_content)
return DocumentConverterResult(
title=title,
text_content=md_text,
)
except BaseException as _:
return None
def _parse_rss_type(
self, doc: minidom.Document
) -> Union[None, DocumentConverterResult]:
"""Parse the type of an RSS feed.
Returns None if the feed type is not recognized or something goes wrong.
"""
try:
root = doc.getElementsByTagName("rss")[0]
channel = root.getElementsByTagName("channel")
if not channel:
return None
channel = channel[0]
channel_title = self._get_data_by_tag_name(channel, "title")
channel_description = self._get_data_by_tag_name(channel, "description")
items = channel.getElementsByTagName("item")
md_text = ""
if channel_title:
    md_text += f"# {channel_title}\n"
if channel_description:
    md_text += f"{channel_description}\n"
if not items:
items = []
for item in items:
title = self._get_data_by_tag_name(item, "title")
description = self._get_data_by_tag_name(item, "description")
pubDate = self._get_data_by_tag_name(item, "pubDate")
content = self._get_data_by_tag_name(item, "content:encoded")
if title:
md_text += f"\n## {title}\n"
if pubDate:
md_text += f"Published on: {pubDate}\n"
if description:
md_text += self._parse_content(description)
if content:
md_text += self._parse_content(content)
return DocumentConverterResult(
title=channel_title,
text_content=md_text,
)
except BaseException as _:
print(traceback.format_exc())
return None
def _parse_content(self, content: str) -> str:
"""Parse the content of an RSS feed item"""
try:
# using bs4 because many RSS feeds have HTML-styled content
soup = BeautifulSoup(content, "html.parser")
return _CustomMarkdownify().convert_soup(soup)
except BaseException as _:
return content
def _get_data_by_tag_name(
self, element: minidom.Element, tag_name: str
) -> Union[str, None]:
"""Get data from first child element with the given tag name.
Returns None when no such element is found.
"""
nodes = element.getElementsByTagName(tag_name)
if not nodes:
return None
fc = nodes[0].firstChild
if fc:
return fc.data
return None
@@ -0,0 +1,67 @@
from typing import Union
from ._base import DocumentConverterResult
from ._media_converter import MediaConverter
# Optional Transcription support
IS_AUDIO_TRANSCRIPTION_CAPABLE = False
try:
import speech_recognition as sr
IS_AUDIO_TRANSCRIPTION_CAPABLE = True
except ModuleNotFoundError:
pass
class WavConverter(MediaConverter):
"""
Converts WAV files to markdown via extraction of metadata (if `exiftool` is installed), and speech transcription (if `speech_recognition` is installed).
"""
def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
# Bail if not a WAV
extension = kwargs.get("file_extension", "")
if extension.lower() != ".wav":
return None
md_content = ""
# Add metadata
metadata = self._get_metadata(local_path, kwargs.get("exiftool_path"))
if metadata:
for f in [
"Title",
"Artist",
"Author",
"Band",
"Album",
"Genre",
"Track",
"DateTimeOriginal",
"CreateDate",
"Duration",
]:
if f in metadata:
md_content += f"{f}: {metadata[f]}\n"
# Transcribe
if IS_AUDIO_TRANSCRIPTION_CAPABLE:
try:
transcript = self._transcribe_audio(local_path)
md_content += "\n\n### Audio Transcript:\n" + (
"[No speech detected]" if transcript == "" else transcript
)
except Exception:
md_content += (
"\n\n### Audio Transcript:\nError. Could not transcribe this audio."
)
return DocumentConverterResult(
title=None,
text_content=md_content.strip(),
)
def _transcribe_audio(self, local_path) -> str:
recognizer = sr.Recognizer()
with sr.AudioFile(local_path) as source:
audio = recognizer.record(source)
return recognizer.recognize_google(audio).strip()
@@ -0,0 +1,56 @@
import re
from typing import Any, Union
from bs4 import BeautifulSoup
from ._base import DocumentConverter, DocumentConverterResult
from ._markdownify import _CustomMarkdownify
class WikipediaConverter(DocumentConverter):
"""Handle Wikipedia pages separately, focusing only on the main document content."""
def convert(
self, local_path: str, **kwargs: Any
) -> Union[None, DocumentConverterResult]:
# Bail if not Wikipedia
extension = kwargs.get("file_extension", "")
if extension.lower() not in [".html", ".htm"]:
return None
url = kwargs.get("url", "")
if not re.search(r"^https?:\/\/[a-zA-Z]{2,3}\.wikipedia.org\/", url):
return None
# Parse the file
soup = None
with open(local_path, "rt", encoding="utf-8") as fh:
soup = BeautifulSoup(fh.read(), "html.parser")
# Remove javascript and style blocks
for script in soup(["script", "style"]):
script.extract()
# Print only the main content
body_elm = soup.find("div", {"id": "mw-content-text"})
title_elm = soup.find("span", {"class": "mw-page-title-main"})
webpage_text = ""
main_title = None if soup.title is None else soup.title.string
if body_elm:
# What's the title
if title_elm and len(title_elm) > 0:
main_title = title_elm.string # type: ignore
assert isinstance(main_title, str)
# Convert the page
webpage_text = f"# {main_title}\n\n" + _CustomMarkdownify().convert_soup(
body_elm
)
else:
webpage_text = _CustomMarkdownify().convert_soup(soup)
return DocumentConverterResult(
title=main_title,
text_content=webpage_text,
)
@@ -0,0 +1,54 @@
from typing import Union
import pandas as pd
from ._base import DocumentConverterResult
from ._html_converter import HtmlConverter
class XlsxConverter(HtmlConverter):
"""
Converts XLSX files to Markdown, with each sheet presented as a separate Markdown table.
"""
def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
# Bail if not an XLSX
extension = kwargs.get("file_extension", "")
if extension.lower() != ".xlsx":
return None
sheets = pd.read_excel(local_path, sheet_name=None, engine="openpyxl")
md_content = ""
for s in sheets:
md_content += f"## {s}\n"
html_content = sheets[s].to_html(index=False)
md_content += self._convert(html_content).text_content.strip() + "\n\n"
return DocumentConverterResult(
title=None,
text_content=md_content.strip(),
)
class XlsConverter(HtmlConverter):
"""
Converts XLS files to Markdown, with each sheet presented as a separate Markdown table.
"""
def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
# Bail if not a XLS
extension = kwargs.get("file_extension", "")
if extension.lower() != ".xls":
return None
sheets = pd.read_excel(local_path, sheet_name=None, engine="xlrd")
md_content = ""
for s in sheets:
md_content += f"## {s}\n"
html_content = sheets[s].to_html(index=False)
md_content += self._convert(html_content).text_content.strip() + "\n\n"
return DocumentConverterResult(
title=None,
text_content=md_content.strip(),
)
@@ -0,0 +1,148 @@
import json
import re
from typing import Any, Union, Dict, List
from urllib.parse import parse_qs, urlparse
from bs4 import BeautifulSoup
from ._base import DocumentConverter, DocumentConverterResult
# Optional YouTube transcription support
IS_YOUTUBE_TRANSCRIPT_CAPABLE = False
try:
    from youtube_transcript_api import YouTubeTranscriptApi
    IS_YOUTUBE_TRANSCRIPT_CAPABLE = True
except ModuleNotFoundError:
    pass
class YouTubeConverter(DocumentConverter):
"""Handle YouTube specially, focusing on the video title, description, and transcript."""
def convert(
self, local_path: str, **kwargs: Any
) -> Union[None, DocumentConverterResult]:
# Bail if not YouTube
extension = kwargs.get("file_extension", "")
if extension.lower() not in [".html", ".htm"]:
return None
url = kwargs.get("url", "")
if not url.startswith("https://www.youtube.com/watch?"):
return None
# Parse the file
soup = None
with open(local_path, "rt", encoding="utf-8") as fh:
soup = BeautifulSoup(fh.read(), "html.parser")
# Read the meta tags
assert soup.title is not None and soup.title.string is not None
metadata: Dict[str, str] = {"title": soup.title.string}
for meta in soup(["meta"]):
for a in meta.attrs:
if a in ["itemprop", "property", "name"]:
metadata[meta[a]] = meta.get("content", "")
break
# We can also try to read the full description. This is more prone to breaking, since it reaches into the page implementation
try:
for script in soup(["script"]):
content = script.text
if "ytInitialData" in content:
lines = re.split(r"\r?\n", content)
obj_start = lines[0].find("{")
obj_end = lines[0].rfind("}")
if obj_start >= 0 and obj_end >= 0:
data = json.loads(lines[0][obj_start : obj_end + 1])
attrdesc = self._findKey(data, "attributedDescriptionBodyText") # type: ignore
if attrdesc:
metadata["description"] = str(attrdesc["content"])
break
except Exception:
pass
# Start preparing the page
webpage_text = "# YouTube\n"
title = self._get(metadata, ["title", "og:title", "name"]) # type: ignore
assert isinstance(title, str)
if title:
webpage_text += f"\n## {title}\n"
stats = ""
views = self._get(metadata, ["interactionCount"]) # type: ignore
if views:
stats += f"- **Views:** {views}\n"
keywords = self._get(metadata, ["keywords"]) # type: ignore
if keywords:
stats += f"- **Keywords:** {keywords}\n"
runtime = self._get(metadata, ["duration"]) # type: ignore
if runtime:
stats += f"- **Runtime:** {runtime}\n"
if len(stats) > 0:
webpage_text += f"\n### Video Metadata\n{stats}\n"
description = self._get(metadata, ["description", "og:description"]) # type: ignore
if description:
webpage_text += f"\n### Description\n{description}\n"
if IS_YOUTUBE_TRANSCRIPT_CAPABLE:
transcript_text = ""
parsed_url = urlparse(url) # type: ignore
params = parse_qs(parsed_url.query) # type: ignore
if "v" in params:
assert isinstance(params["v"][0], str)
video_id = str(params["v"][0])
try:
youtube_transcript_languages = kwargs.get(
"youtube_transcript_languages", ("en",)
)
# Must be a single transcript.
transcript = YouTubeTranscriptApi.get_transcript(video_id, languages=youtube_transcript_languages) # type: ignore
transcript_text = " ".join([part["text"] for part in transcript]) # type: ignore
# Alternative formatting:
# formatter = TextFormatter()
# formatter.format_transcript(transcript)
except Exception:
pass
if transcript_text:
webpage_text += f"\n### Transcript\n{transcript_text}\n"
title = title if title else soup.title.string
assert isinstance(title, str)
return DocumentConverterResult(
title=title,
text_content=webpage_text,
)
def _get(
self,
metadata: Dict[str, str],
keys: List[str],
default: Union[str, None] = None,
) -> Union[str, None]:
for k in keys:
if k in metadata:
return metadata[k]
return default
def _findKey(self, json: Any, key: str) -> Union[str, None]: # TODO: Fix json type
if isinstance(json, list):
for elm in json:
ret = self._findKey(elm, key)
if ret is not None:
return ret
elif isinstance(json, dict):
for k in json:
if k == key:
return json[k]
else:
ret = self._findKey(json[k], key)
if ret is not None:
return ret
return None
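The `_findKey` helper above is a plain depth-first search over the nested dict/list structure produced by `json.loads`, returning the first value stored under the requested key. A self-contained sketch of the same pattern (standalone function and toy data, for illustration only):

```python
from typing import Any, Union

def find_key(obj: Any, key: str) -> Union[Any, None]:
    """Depth-first search for the first value stored under `key`."""
    if isinstance(obj, list):
        for elm in obj:
            ret = find_key(elm, key)
            if ret is not None:
                return ret
    elif isinstance(obj, dict):
        for k, v in obj.items():
            if k == key:
                return v
            ret = find_key(v, key)
            if ret is not None:
                return ret
    return None

data = {"contents": [{"x": 1}, {"attributedDescriptionBodyText": {"content": "hi"}}]}
print(find_key(data, "attributedDescriptionBodyText"))  # {'content': 'hi'}
```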
@@ -0,0 +1,135 @@
import os
import zipfile
import shutil
from typing import Any, Union
from ._base import DocumentConverter, DocumentConverterResult
class ZipConverter(DocumentConverter):
"""Converts ZIP files to markdown by extracting and converting all contained files.
The converter extracts the ZIP contents to a temporary directory, processes each file
using appropriate converters based on file extensions, and then combines the results
into a single markdown document. The temporary directory is cleaned up after processing.
Example output format:
```markdown
Content from the zip file `example.zip`:
## File: docs/readme.txt
This is the content of readme.txt
Multiple lines are preserved
## File: images/example.jpg
ImageSize: 1920x1080
DateTimeOriginal: 2024-02-15 14:30:00
Description: A beautiful landscape photo
## File: data/report.xlsx
## Sheet1
| Column1 | Column2 | Column3 |
|---------|---------|---------|
| data1 | data2 | data3 |
| data4 | data5 | data6 |
```
Key features:
- Maintains original file structure in headings
- Processes nested files recursively
- Uses appropriate converters for each file type
- Preserves formatting of converted content
- Cleans up temporary files after processing
"""
def convert(
self, local_path: str, **kwargs: Any
) -> Union[None, DocumentConverterResult]:
# Bail if not a ZIP
extension = kwargs.get("file_extension", "")
if extension.lower() != ".zip":
return None
# Get parent converters list if available
parent_converters = kwargs.get("_parent_converters", [])
if not parent_converters:
return DocumentConverterResult(
title=None,
text_content=f"[ERROR] No converters available to process zip contents from: {local_path}",
)
extracted_zip_folder_name = (
f"extracted_{os.path.basename(local_path).replace('.zip', '_zip')}"
)
extraction_dir = os.path.normpath(
os.path.join(os.path.dirname(local_path), extracted_zip_folder_name)
)
md_content = f"Content from the zip file `{os.path.basename(local_path)}`:\n\n"
try:
# Extract the zip file safely
with zipfile.ZipFile(local_path, "r") as zipObj:
# Safeguard against path traversal
for member in zipObj.namelist():
    member_path = os.path.normpath(os.path.join(extraction_dir, member))
    # Compare path components, not raw string prefixes:
    # os.path.commonprefix is character-based and would accept a
    # sibling directory whose name merely starts with extraction_dir.
    if member_path != extraction_dir and not member_path.startswith(
        extraction_dir + os.sep
    ):
        raise ValueError(
            f"Path traversal detected in zip file: {member}"
        )
# Extract all files safely
zipObj.extractall(path=extraction_dir)
# Process each extracted file
for root, dirs, files in os.walk(extraction_dir):
for name in files:
file_path = os.path.join(root, name)
relative_path = os.path.relpath(file_path, extraction_dir)
# Get file extension
_, file_extension = os.path.splitext(name)
# Update kwargs for the file
file_kwargs = kwargs.copy()
file_kwargs["file_extension"] = file_extension
file_kwargs["_parent_converters"] = parent_converters
# Try converting the file using available converters
for converter in parent_converters:
# Skip the zip converter to avoid infinite recursion
if isinstance(converter, ZipConverter):
continue
result = converter.convert(file_path, **file_kwargs)
if result is not None:
md_content += f"\n## File: {relative_path}\n\n"
md_content += result.text_content + "\n\n"
break
# Clean up extracted files if specified
if kwargs.get("cleanup_extracted", True):
shutil.rmtree(extraction_dir)
return DocumentConverterResult(title=None, text_content=md_content.strip())
except zipfile.BadZipFile:
return DocumentConverterResult(
title=None,
text_content=f"[ERROR] Invalid or corrupted zip file: {local_path}",
)
except ValueError as ve:
return DocumentConverterResult(
title=None,
text_content=f"[ERROR] Security error in zip file {local_path}: {str(ve)}",
)
except Exception as e:
return DocumentConverterResult(
title=None,
text_content=f"[ERROR] Failed to process zip file {local_path}: {str(e)}",
)
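A hedged usage sketch for the ZIP path (the archive name is hypothetical; `cleanup_extracted` is the keyword argument read above, and it is assumed here that keyword arguments passed to `convert` reach the individual converters):

```python
from markitdown import MarkItDown

md = MarkItDown()
# Each file inside the archive is converted by whichever registered
# converter accepts its extension, and rendered under a
# "## File: <relative path>" heading in the combined output.
result = md.convert("bundle.zip", cleanup_extracted=True)  # hypothetical archive
print(result.text_content)
```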
+3
View File
@@ -0,0 +1,3 @@
# SPDX-FileCopyrightText: 2024-present Adam Fourney <adamfo@microsoft.com>
#
# SPDX-License-Identifier: MIT
Binary file not shown.
Binary file not shown.
After | Size: 463 KiB
@@ -0,0 +1,10 @@
{
"key1": "string_value",
"key2": 1234,
"key3": [
"list_value1",
"list_value2"
],
"5b64c88c-b3c3-4510-bcb8-da0b200602d8": "uuid_key",
"uuid_value": "9700dc99-6685-40b4-9a3a-5e406dcb37f3"
}
Binary file not shown.
Binary file not shown.
Binary file not shown.
File diff suppressed because one or more lines are too long
Binary file not shown.
Binary file not shown.
After | Size: 145 KiB
@@ -0,0 +1,4 @@
名前,年齢,住所
佐藤太郎,30,東京
三木英子,25,大阪
髙橋淳,35,名古屋
@@ -0,0 +1,89 @@
{
"cells": [
{
"cell_type": "markdown",
"id": "0f61db80",
"metadata": {},
"source": [
"# Test Notebook"
]
},
{
"cell_type": "code",
"execution_count": 11,
"id": "3f2a5bbd",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"markitdown\n"
]
}
],
"source": [
"print('markitdown')"
]
},
{
"cell_type": "markdown",
"id": "9b9c0468",
"metadata": {},
"source": [
"## Code Cell Below"
]
},
{
"cell_type": "code",
"execution_count": 10,
"id": "37d8088a",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"42\n"
]
}
],
"source": [
"# comment in code\n",
"print(42)"
]
},
{
"cell_type": "markdown",
"id": "2e3177bd",
"metadata": {},
"source": [
"End\n",
"\n",
"---"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.8"
},
"title": "Test Notebook Title"
},
"nbformat": 4,
"nbformat_minor": 5
}
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
@@ -0,0 +1,334 @@
#!/usr/bin/env python3 -m pytest
import io
import os
import shutil
import pytest
import requests
from warnings import catch_warnings, resetwarnings
from markitdown import MarkItDown
# Don't run these tests in CI
skip_remote = bool(os.environ.get("GITHUB_ACTIONS"))
# Don't run the llm tests without a key and the client library
skip_llm = not os.environ.get("OPENAI_API_KEY")
try:
import openai
except ModuleNotFoundError:
skip_llm = True
# Skip exiftool tests if not installed
skip_exiftool = shutil.which("exiftool") is None
TEST_FILES_DIR = os.path.join(os.path.dirname(__file__), "test_files")
JPG_TEST_EXIFTOOL = {
"Author": "AutoGen Authors",
"Title": "AutoGen: Enabling Next-Gen LLM Applications via Multi-Agent Conversation",
"Description": "AutoGen enables diverse LLM-based applications",
"ImageSize": "1615x1967",
"DateTimeOriginal": "2024:03:14 22:10:00",
}
PDF_TEST_URL = "https://arxiv.org/pdf/2308.08155v2.pdf"
PDF_TEST_STRINGS = [
"While there is contemporaneous exploration of multi-agent approaches"
]
YOUTUBE_TEST_URL = "https://www.youtube.com/watch?v=V2qZ_lgxTzg"
YOUTUBE_TEST_STRINGS = [
"## AutoGen FULL Tutorial with Python (Step-By-Step)",
"This is an intermediate tutorial for installing and using AutoGen locally",
"PT15M4S",
"the model we're going to be using today is GPT 3.5 turbo", # From the transcript
]
XLSX_TEST_STRINGS = [
"## 09060124-b5e7-4717-9d07-3c046eb",
"6ff4173b-42a5-4784-9b19-f49caff4d93d",
"affc7dad-52dc-4b98-9b5d-51e65d8a8ad0",
]
XLS_TEST_STRINGS = [
"## 09060124-b5e7-4717-9d07-3c046eb",
"6ff4173b-42a5-4784-9b19-f49caff4d93d",
"affc7dad-52dc-4b98-9b5d-51e65d8a8ad0",
]
DOCX_TEST_STRINGS = [
"314b0a30-5b04-470b-b9f7-eed2c2bec74a",
"49e168b7-d2ae-407f-a055-2167576f39a1",
"## d666f1f7-46cb-42bd-9a39-9a39cf2a509f",
"# Abstract",
"# Introduction",
"AutoGen: Enabling Next-Gen LLM Applications via Multi-Agent Conversation",
]
MSG_TEST_STRINGS = [
"# Email Message",
"**From:** test.sender@example.com",
"**To:** test.recipient@example.com",
"**Subject:** Test Email Message",
"## Content",
"This is the body of the test email message",
]
DOCX_COMMENT_TEST_STRINGS = [
"314b0a30-5b04-470b-b9f7-eed2c2bec74a",
"49e168b7-d2ae-407f-a055-2167576f39a1",
"## d666f1f7-46cb-42bd-9a39-9a39cf2a509f",
"# Abstract",
"# Introduction",
"AutoGen: Enabling Next-Gen LLM Applications via Multi-Agent Conversation",
"This is a test comment. 12df-321a",
"Yet another comment in the doc. 55yiyi-asd09",
]
PPTX_TEST_STRINGS = [
"2cdda5c8-e50e-4db4-b5f0-9722a649f455",
"04191ea8-5c73-4215-a1d3-1cfb43aaaf12",
"44bf7d06-5e7a-4a40-a2e1-a2e42ef28c8a",
"1b92870d-e3b5-4e65-8153-919f4ff45592",
"AutoGen: Enabling Next-Gen LLM Applications via Multi-Agent Conversation",
"a3f6004b-6f4f-4ea8-bee3-3741f4dc385f", # chart title
"2003", # chart value
]
BLOG_TEST_URL = "https://microsoft.github.io/autogen/blog/2023/04/21/LLM-tuning-math"
BLOG_TEST_STRINGS = [
"Large language models (LLMs) are powerful tools that can generate natural language texts for various applications, such as chatbots, summarization, translation, and more. GPT-4 is currently the state of the art LLM in the world. Is model selection irrelevant? What about inference parameters?",
"an example where high cost can easily prevent a generic complex",
]
RSS_TEST_STRINGS = [
"The Official Microsoft Blog",
"In the case of AI, it is absolutely true that the industry is moving incredibly fast",
]
WIKIPEDIA_TEST_URL = "https://en.wikipedia.org/wiki/Microsoft"
WIKIPEDIA_TEST_STRINGS = [
"Microsoft entered the operating system (OS) business in 1980 with its own version of [Unix]",
'Microsoft was founded by [Bill Gates](/wiki/Bill_Gates "Bill Gates")',
]
WIKIPEDIA_TEST_EXCLUDES = [
"You are encouraged to create an account and log in",
"154 languages",
"move to sidebar",
]
SERP_TEST_URL = "https://www.bing.com/search?q=microsoft+wikipedia"
SERP_TEST_STRINGS = [
"](https://en.wikipedia.org/wiki/Microsoft",
"Microsoft Corporation is **an American multinational corporation and technology company headquartered** in Redmond",
"19952007: Foray into the Web, Windows 95, Windows XP, and Xbox",
]
SERP_TEST_EXCLUDES = [
"https://www.bing.com/ck/a?!&&p=",
"data:image/svg+xml,%3Csvg%20width%3D",
]
CSV_CP932_TEST_STRINGS = [
"名前,年齢,住所",
"佐藤太郎,30,東京",
"三木英子,25,大阪",
"髙橋淳,35,名古屋",
]
LLM_TEST_STRINGS = [
"5bda1dd6",
]
JSON_TEST_STRINGS = [
"5b64c88c-b3c3-4510-bcb8-da0b200602d8",
"9700dc99-6685-40b4-9a3a-5e406dcb37f3",
]
# --- Helper Functions ---
def validate_strings(result, expected_strings, exclude_strings=None):
"""Validate presence or absence of specific strings."""
text_content = result.text_content.replace("\\", "")
for string in expected_strings:
assert string in text_content
if exclude_strings:
for string in exclude_strings:
assert string not in text_content
@pytest.mark.skipif(
skip_remote,
reason="do not run tests that query external urls",
)
def test_markitdown_remote() -> None:
markitdown = MarkItDown()
# By URL
result = markitdown.convert(PDF_TEST_URL)
for test_string in PDF_TEST_STRINGS:
assert test_string in result.text_content
# By stream
response = requests.get(PDF_TEST_URL)
result = markitdown.convert_stream(
io.BytesIO(response.content), file_extension=".pdf", url=PDF_TEST_URL
)
for test_string in PDF_TEST_STRINGS:
assert test_string in result.text_content
# Youtube
# TODO: This test randomly fails for some reason. Haven't been able to repro it yet. Disabling until I can debug the issue
# result = markitdown.convert(YOUTUBE_TEST_URL)
# for test_string in YOUTUBE_TEST_STRINGS:
# assert test_string in result.text_content
def test_markitdown_local() -> None:
markitdown = MarkItDown()
# Test XLSX processing
result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test.xlsx"))
validate_strings(result, XLSX_TEST_STRINGS)
# Test XLS processing
result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test.xls"))
validate_strings(result, XLS_TEST_STRINGS)
# Test DOCX processing
result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test.docx"))
validate_strings(result, DOCX_TEST_STRINGS)
# Test DOCX processing, with comments
result = markitdown.convert(
os.path.join(TEST_FILES_DIR, "test_with_comment.docx"),
style_map="comment-reference => ",
)
validate_strings(result, DOCX_COMMENT_TEST_STRINGS)
# Test DOCX processing, with comments and setting style_map on init
markitdown_with_style_map = MarkItDown(style_map="comment-reference => ")
result = markitdown_with_style_map.convert(
os.path.join(TEST_FILES_DIR, "test_with_comment.docx")
)
validate_strings(result, DOCX_COMMENT_TEST_STRINGS)
# Test PPTX processing
result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test.pptx"))
validate_strings(result, PPTX_TEST_STRINGS)
# Test HTML processing
result = markitdown.convert(
os.path.join(TEST_FILES_DIR, "test_blog.html"), url=BLOG_TEST_URL
)
validate_strings(result, BLOG_TEST_STRINGS)
# Test ZIP file processing
result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test_files.zip"))
validate_strings(result, XLSX_TEST_STRINGS)
# Test Wikipedia processing
result = markitdown.convert(
os.path.join(TEST_FILES_DIR, "test_wikipedia.html"), url=WIKIPEDIA_TEST_URL
)
validate_strings(result, WIKIPEDIA_TEST_STRINGS, WIKIPEDIA_TEST_EXCLUDES)
# Test Bing processing
result = markitdown.convert(
os.path.join(TEST_FILES_DIR, "test_serp.html"), url=SERP_TEST_URL
)
validate_strings(result, SERP_TEST_STRINGS, SERP_TEST_EXCLUDES)
# Test RSS processing
result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test_rss.xml"))
text_content = result.text_content.replace("\\", "")
for test_string in RSS_TEST_STRINGS:
assert test_string in text_content
# Test non-UTF-8 encoding
result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test_mskanji.csv"))
validate_strings(result, CSV_CP932_TEST_STRINGS)
# Test MSG (Outlook email) processing
result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test_outlook_msg.msg"))
validate_strings(result, MSG_TEST_STRINGS)
# Test JSON processing
result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test.json"))
validate_strings(result, JSON_TEST_STRINGS)
# Test input with leading blank characters
input_data = b" \n\n\n<html><body><h1>Test</h1></body></html>"
result = markitdown.convert_stream(io.BytesIO(input_data))
assert "# Test" in result.text_content
@pytest.mark.skipif(
skip_exiftool,
reason="do not run if exiftool is not installed",
)
def test_markitdown_exiftool() -> None:
# Test the automatic discovery of exiftool throws a warning
# and is disabled
try:
with catch_warnings(record=True) as w:
markitdown = MarkItDown()
result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test.jpg"))
assert len(w) == 1
assert w[0].category is DeprecationWarning
assert result.text_content.strip() == ""
finally:
resetwarnings()
# Test explicitly setting the location of exiftool
which_exiftool = shutil.which("exiftool")
markitdown = MarkItDown(exiftool_path=which_exiftool)
result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test.jpg"))
for key in JPG_TEST_EXIFTOOL:
target = f"{key}: {JPG_TEST_EXIFTOOL[key]}"
assert target in result.text_content
# Test setting the exiftool path through an environment variable
os.environ["EXIFTOOL_PATH"] = which_exiftool
markitdown = MarkItDown()
result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test.jpg"))
for key in JPG_TEST_EXIFTOOL:
target = f"{key}: {JPG_TEST_EXIFTOOL[key]}"
assert target in result.text_content
@pytest.mark.skipif(
skip_llm,
reason="do not run llm tests without a key",
)
def test_markitdown_llm() -> None:
client = openai.OpenAI()
markitdown = MarkItDown(llm_client=client, llm_model="gpt-4o")
result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test_llm.jpg"))
for test_string in LLM_TEST_STRINGS:
assert test_string in result.text_content
# This is not super precise. It would also accept "red square", "blue circle",
# "the square is not blue", etc. But it's sufficient for this test.
for test_string in ["red", "circle", "blue", "square"]:
assert test_string in result.text_content.lower()
if __name__ == "__main__":
"""Runs this file's tests from the command line."""
# test_markitdown_remote()
# test_markitdown_local()
test_markitdown_exiftool()
# test_markitdown_deprecation()
# test_markitdown_llm()