From 14bd8d319ac9be7ce0d2bfd00651805fe9b6aeff Mon Sep 17 00:00:00 2001 From: diya155 <159163630+diya155@users.noreply.github.com> Date: Tue, 17 Dec 2024 09:16:40 +0530 Subject: [PATCH 1/7] Update README.md --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 7079dbf..b6476fb 100644 --- a/README.md +++ b/README.md @@ -118,3 +118,4 @@ trademarks or logos is subject to and must follow [Microsoft's Trademark & Brand Guidelines](https://www.microsoft.com/en-us/legal/intellectualproperty/trademarks/usage/general). Use of Microsoft trademarks or logos in modified versions of this project must not cause confusion or imply Microsoft sponsorship. Any use of third-party trademarks or logos are subject to those third-party's policies. +// From 07fe457a9033baf9a534258105488f0b57853f7d Mon Sep 17 00:00:00 2001 From: lumin <71011125+l-melon@users.noreply.github.com> Date: Tue, 17 Dec 2024 11:15:16 +0900 Subject: [PATCH 2/7] feat: add devcontainer configuration and installation script Add a devcontainer configuration to streamline the development environment setup. Introduce an `install.sh` script to install the project in editable mode. Update the Dockerfile to use the `python:3.13-slim-bullseye` base image and install dependencies using `apt-get` for better compatibility. --- .devcontainer/devcontainer.json | 30 ++++++++++++++++++++++++++++++ Dockerfile | 6 ++++-- install.sh | 7 +++++++ 3 files changed, 41 insertions(+), 2 deletions(-) create mode 100644 .devcontainer/devcontainer.json create mode 100644 install.sh diff --git a/.devcontainer/devcontainer.json b/.devcontainer/devcontainer.json new file mode 100644 index 0000000..2ce479b --- /dev/null +++ b/.devcontainer/devcontainer.json @@ -0,0 +1,30 @@ +// For format details, see https://aka.ms/devcontainer.json. 
For config options, see the +// README at: https://github.com/devcontainers/templates/tree/main/src/docker-existing-dockerfile +{ + "name": "Existing Dockerfile", + "build": { + // Sets the run context to one level up instead of the .devcontainer folder. + "context": "..", + // Update the 'dockerFile' property if you aren't using the standard 'Dockerfile' filename. + "dockerfile": "../Dockerfile" + }, + + // Features to add to the dev container. More info: https://containers.dev/features. + // "features": {}, + "features": { + "ghcr.io/devcontainers-extra/features/hatch:2": {} + }, + + // Use 'forwardPorts' to make a list of ports inside the container available locally. + // "forwardPorts": [], + + // Uncomment the next line to run commands after the container is created. + // "postCreateCommand": "cat /etc/os-release", + "postCreateCommand": "./install.sh", + + // Configure tool-specific properties. + // "customizations": {}, + + // Uncomment to connect as an existing user other than the container default. More info: https://aka.ms/dev-containers-non-root. + "remoteUser": "root" +} diff --git a/Dockerfile b/Dockerfile index 492ad8a..f9c0bef 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,9 +1,11 @@ -FROM python:3.13-alpine +FROM python:3.13-slim-bullseye USER root # Runtime dependency -RUN apk add --no-cache ffmpeg +RUN apt-get update && apt-get install -y --no-install-recommends \ + ffmpeg \ + && rm -rf /var/lib/apt/lists/* RUN pip install markitdown diff --git a/install.sh b/install.sh new file mode 100644 index 0000000..7133c15 --- /dev/null +++ b/install.sh @@ -0,0 +1,7 @@ +#!/bin/sh + +# Install the current project in editable mode +echo "Installing the project in editable mode..." +pip install -e . + +echo "Setup complete!" 
From e0a30295ffdee37d0dbaaa6568cfdbfa48ecfe30 Mon Sep 17 00:00:00 2001 From: lumin <71011125+l-melon@users.noreply.github.com> Date: Tue, 17 Dec 2024 11:36:15 +0900 Subject: [PATCH 3/7] docs: update README with Devcontainer instructions Add instructions for using the Devcontainer to run tests. Remove the install script, as it is no longer needed. Update trademark section for clarity. --- .devcontainer/devcontainer.json | 1 - README.md | 11 +++++++++-- install.sh | 7 ------- 3 files changed, 9 insertions(+), 10 deletions(-) delete mode 100644 install.sh diff --git a/.devcontainer/devcontainer.json b/.devcontainer/devcontainer.json index 2ce479b..f12fbcb 100644 --- a/.devcontainer/devcontainer.json +++ b/.devcontainer/devcontainer.json @@ -20,7 +20,6 @@ // Uncomment the next line to run commands after the container is created. // "postCreateCommand": "cat /etc/os-release", - "postCreateCommand": "./install.sh", // Configure tool-specific properties. // "customizations": {}, diff --git a/README.md b/README.md index 7079dbf..1de6cdc 100644 --- a/README.md +++ b/README.md @@ -103,6 +103,13 @@ hatch shell hatch test ``` +Alternative method: using Devcontainer +- Reopen project in the Devcontainer (via the Command Palette: `Reopen in Container`) +- Once inside the container, run: +```sh +hatch test +``` + ### Running Pre-commit Checks Please run the pre-commit checks before submitting a PR. @@ -113,8 +120,8 @@ pre-commit run --all-files ## Trademarks -This project may contain trademarks or logos for projects, products, or services. Authorized use of Microsoft -trademarks or logos is subject to and must follow +This project may contain trademarks or logos for projects, products, or services. Authorized use of Microsoft +trademarks or logos is subject to and must follow [Microsoft's Trademark & Brand Guidelines](https://www.microsoft.com/en-us/legal/intellectualproperty/trademarks/usage/general). 
Use of Microsoft trademarks or logos in modified versions of this project must not cause confusion or imply Microsoft sponsorship. Any use of third-party trademarks or logos are subject to those third-party's policies. diff --git a/install.sh b/install.sh deleted file mode 100644 index 7133c15..0000000 --- a/install.sh +++ /dev/null @@ -1,7 +0,0 @@ -#!/bin/sh - -# Install the current project in editable mode -echo "Installing the project in editable mode..." -pip install -e . - -echo "Setup complete!" From 3bcf2bdae74cd316bafbde6b89d4981e9bd143f7 Mon Sep 17 00:00:00 2001 From: gagb Date: Tue, 17 Dec 2024 16:54:17 -0800 Subject: [PATCH 4/7] Update README.md --- README.md | 1 - 1 file changed, 1 deletion(-) diff --git a/README.md b/README.md index 58348e1..1de6cdc 100644 --- a/README.md +++ b/README.md @@ -125,4 +125,3 @@ trademarks or logos is subject to and must follow [Microsoft's Trademark & Brand Guidelines](https://www.microsoft.com/en-us/legal/intellectualproperty/trademarks/usage/general). Use of Microsoft trademarks or logos in modified versions of this project must not cause confusion or imply Microsoft sponsorship. Any use of third-party trademarks or logos are subject to those third-party's policies. -// From 1e7806a7ac191a9d95be3209addb4b187b0ca6a4 Mon Sep 17 00:00:00 2001 From: gagb Date: Tue, 17 Dec 2024 17:21:39 -0800 Subject: [PATCH 5/7] Simplify --- README.md | 112 ++++++++++++++++++++++-------------------------------- 1 file changed, 45 insertions(+), 67 deletions(-) diff --git a/README.md b/README.md index 1de6cdc..ae5aef2 100644 --- a/README.md +++ b/README.md @@ -2,65 +2,47 @@ [![PyPI](https://img.shields.io/pypi/v/markitdown.svg)](https://pypi.org/project/markitdown/) -The MarkItDown library is a utility tool for converting various files to Markdown (e.g., for indexing, text analysis, etc.) +MarkItDown is a utility for converting various files to Markdown (e.g., for indexing, text analysis, etc). 
+It supports: +- PDF +- PowerPoint +- Word +- Excel +- Images (EXIF metadata and OCR) +- Audio (EXIF metadata and speech transcription) +- HTML +- Text-based formats (CSV, JSON, XML) +- ZIP files (iterates over contents) -It presently supports: +To install MarkItDown, use pip: `pip install markitdown`. Alternatively, you can install it from the source: `pip install -e .`. -- PDF (.pdf) -- PowerPoint (.pptx) -- Word (.docx) -- Excel (.xlsx) -- Images (EXIF metadata, and OCR) -- Audio (EXIF metadata, and speech transcription) -- HTML (special handling of Wikipedia, etc.) -- Various other text-based formats (csv, json, xml, etc.) -- ZIP (Iterates over contents and converts each file) +## Usage -# Installation - -You can install `markitdown` using pip: - -```python -pip install markitdown -``` - -or from the source - -```sh -pip install -e . -``` - -# Usage -The API is simple: - -```python -from markitdown import MarkItDown - -markitdown = MarkItDown() -result = markitdown.convert("test.xlsx") -print(result.text_content) -``` - -To use this as a command-line utility, install it and then run it like this: - -```bash -markitdown path-to-file.pdf -``` - -This will output Markdown to standard output. You can save it like this: +### Command-Line ```bash markitdown path-to-file.pdf > document.md ``` -You can pipe content to standard input by omitting the argument: +You can also pipe content: ```bash cat path-to-file.pdf | markitdown ``` -You can also configure markitdown to use Large Language Models to describe images. To do so you must provide `llm_client` and `llm_model` parameters to MarkItDown object, according to your specific client. 
+### Python API +Basic usage in Python: + +```python +from markitdown import MarkItDown + +md = MarkItDown() +result = md.convert("test.xlsx") +print(result.text_content) +``` + +To use Large Language Models for image descriptions, provide `llm_client` and `llm_model`: ```python from markitdown import MarkItDown @@ -72,7 +54,7 @@ result = md.convert("example.jpg") print(result.text_content) ``` -You can also use the project as Docker Image: +### Docker ```sh docker build -t markitdown:latest . @@ -93,30 +75,26 @@ This project has adopted the [Microsoft Open Source Code of Conduct](https://ope For more information see the [Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) or contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with any additional questions or comments. -### Running Tests +### Running Tests and Checks -To run tests, install `hatch` using `pip` or other methods as described [here](https://hatch.pypa.io/dev/install). +- Install `hatch` in your environment and run tests: + ```sh + pip install hatch # Other ways of installing hatch: https://hatch.pypa.io/dev/install/ + hatch shell + hatch test + ``` -```sh -pip install hatch -hatch shell -hatch test -``` + (Alternative) Use the Devcontainer which has all the dependencies installed: + ```sh + # Reopen the project in Devcontainer and run: + hatch test + ``` -Alternative method: using Devcontainer -- Reopen project in the Devcontainer (via the Command Palette: `Reopen in Container`) -- Once inside the container, run: -```sh -hatch test -``` - -### Running Pre-commit Checks - -Please run the pre-commit checks before submitting a PR. 
- -```sh -pre-commit run --all-files -``` +- Run pre-commit checks before submitting a PR: + ```sh + # pip install pre-commit + pre-commit run --all-files + ``` ## Trademarks From de1b54d79fa1ae0693e8c29a353601c058c820eb Mon Sep 17 00:00:00 2001 From: gagb Date: Tue, 17 Dec 2024 17:25:13 -0800 Subject: [PATCH 6/7] Update README.md --- README.md | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/README.md b/README.md index ae5aef2..0aae9b7 100644 --- a/README.md +++ b/README.md @@ -90,11 +90,7 @@ contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with any additio hatch test ``` -- Run pre-commit checks before submitting a PR: - ```sh - # pip install pre-commit - pre-commit run --all-files - ``` +- Run pre-commit checks before submitting a PR: `pre-commit run --all-files` ## Trademarks From 524aa0da753f42a54f2c160d66ab605353505611 Mon Sep 17 00:00:00 2001 From: gagb Date: Tue, 17 Dec 2024 17:25:40 -0800 Subject: [PATCH 7/7] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 0aae9b7..75c2ba0 100644 --- a/README.md +++ b/README.md @@ -14,7 +14,7 @@ It supports: - Text-based formats (CSV, JSON, XML) - ZIP files (iterates over contents) -To install MarkItDown, use pip: `pip install markitdown`. Alternatively, you can install it from the source: `pip install -e .`. +To install MarkItDown, use pip: `pip install markitdown`. Alternatively, you can install it from the source: `pip install -e .` ## Usage