9 Commits

Author SHA1 Message Date
afourney 4a5340f93b Bump version for release. (#1564) 2026-02-20 11:40:57 -08:00
Bas Nijholt 6b0fd15e60 Remove onnxruntime<=1.20.1 Windows pin (#1551) 2026-02-16 15:05:37 -08:00
afourney 2b6ec9f315 Add text/markdown to Accept header (#1554) 2026-02-13 11:53:01 -08:00
lesyk c83de14a9c [MS] Extend table support for wide tables (#1552)
* feat: enhance PDF table extraction to support complex forms and add new test cases
* feat: enhance PDF table extraction with adaptive column clustering and add comprehensive test cases
* fix: correct formatting and improve assertions in PDF table tests
2026-02-13 10:45:39 -08:00
lesyk 7fdaefb724 Fix: PDF parsing doesn't support partially numbered lists (#1525)
* Fix: PDF parsing doesn't support partially numbered lists

* Refactor: Move import of PARTIAL_NUMBERING_PATTERN to the top of the test file

* Refactor: Improve assertion formatting in partial numbering tests
2026-01-08 15:15:22 -08:00
lesyk 251dddcf0c [MS] Update PDF table extraction to support aligned Markdown (#1499)
* Added PDF table extraction feature with aligned Markdown (#1419)

* Add PDF test files and enhance extraction tests

- Added a medical report scan PDF for testing scanned PDF handling.
- Included a retail purchase receipt PDF to validate receipt extraction functionality.
- Introduced a multipage invoice PDF to test extraction of complex invoice structures.
- Added a borderless table PDF for testing inventory reconciliation report extraction.
- Implemented comprehensive tests for PDF table extraction, ensuring proper structure and data integrity.
- Enhanced existing tests to validate the order and presence of extracted content across various PDF types.

* fix: update dependencies for PDF processing and improve table extraction logic

* Bumped version of pdfminer.six
---------

Authored-by: Ashok <ashh010101@gmail.com>
2026-01-07 16:38:45 -08:00
afourney dde250a456 Bump versions of mammoth and pdfminer.six (#1492)
* Updated pyproject to require a minimum version of pdfminer.six to ensure CVE-2025-64512 is patched.
2025-12-01 10:11:24 -08:00
afourney 3d4fe3cdcc Upgrade mammoth to 1.11.0 (#1452) 2025-10-20 16:07:39 -07:00
afourney 447c047731 Test if mammoth resolves rlinks. (#1451) 2025-10-20 15:54:05 -07:00
23 changed files with 2647 additions and 34 deletions
+3
View File
@@ -1,2 +1,5 @@
packages/markitdown/tests/test_files/** linguist-vendored
packages/markitdown-sample-plugin/tests/test_files/** linguist-vendored
# Treat PDF files as binary to prevent line ending conversion
*.pdf binary
+1
View File
@@ -52,6 +52,7 @@ coverage.xml
.hypothesis/
.pytest_cache/
cover/
.test-logs/
# Translations
*.mo
+6 -6
View File
@@ -30,30 +30,30 @@ dependencies = [
"magika~=0.6.1",
"charset-normalizer",
"defusedxml",
"onnxruntime<=1.20.1; sys_platform == 'win32'",
]
[project.optional-dependencies]
all = [
"python-pptx",
"mammoth~=1.10.0",
"mammoth~=1.11.0",
"pandas",
"openpyxl",
"xlrd",
"lxml",
"pdfminer.six",
"pdfminer.six>=20251230",
"pdfplumber>=0.11.9",
"olefile",
"pydub",
"SpeechRecognition",
"youtube-transcript-api~=1.0.0",
"azure-ai-documentintelligence",
"azure-identity"
"azure-identity",
]
pptx = ["python-pptx"]
docx = ["mammoth", "lxml"]
docx = ["mammoth~=1.11.0", "lxml"]
xlsx = ["pandas", "openpyxl"]
xls = ["pandas", "xlrd"]
pdf = ["pdfminer.six"]
pdf = ["pdfminer.six>=20251230", "pdfplumber>=0.11.9"]
outlook = ["olefile"]
audio-transcription = ["pydub", "SpeechRecognition"]
youtube-transcription = ["youtube-transcript-api"]
@@ -1,4 +1,4 @@
# SPDX-FileCopyrightText: 2024-present Adam Fourney <adamfo@microsoft.com>
#
# SPDX-License-Identifier: MIT
__version__ = "0.1.3"
__version__ = "0.1.5"
@@ -107,6 +107,13 @@ class MarkItDown:
requests_session = kwargs.get("requests_session")
if requests_session is None:
self._requests_session = requests.Session()
# Signal that we prefer markdown over HTML, etc. if the server supports it.
# e.g., https://blog.cloudflare.com/markdown-for-agents/
self._requests_session.headers.update(
{
"Accept": "text/markdown, text/html;q=0.9, text/plain;q=0.8, */*;q=0.1"
}
)
else:
self._requests_session = requests_session
@@ -15,13 +15,6 @@ from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE
_dependency_exc_info = None
try:
import mammoth
import mammoth.docx.files
def mammoth_files_open(self, uri):
warn("DOCX: processing of r:link resources (e.g., linked images) is disabled.")
return io.BytesIO(b"")
mammoth.docx.files.Files.open = mammoth_files_open
except ImportError:
# Preserve the error and stack trace for later
@@ -1,22 +1,69 @@
import sys
import io
import re
from typing import BinaryIO, Any
from .._base_converter import DocumentConverter, DocumentConverterResult
from .._stream_info import StreamInfo
from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE
# Pattern for MasterFormat-style partial numbering (e.g., ".1", ".2", ".10")
PARTIAL_NUMBERING_PATTERN = re.compile(r"^\.\d+$")
# Try loading optional (but in this case, required) dependencies
# Save reporting of any exceptions for later
def _merge_partial_numbering_lines(text: str) -> str:
"""
Post-process extracted text to merge MasterFormat-style partial numbering
with the following text line.
MasterFormat documents use partial numbering like:
.1 The intent of this Request for Proposal...
.2 Available information relative to...
Some PDF extractors split these into separate lines:
.1
The intent of this Request for Proposal...
This function merges them back together.
"""
lines = text.split("\n")
result_lines: list[str] = []
i = 0
while i < len(lines):
line = lines[i]
stripped = line.strip()
# Check if this line is ONLY a partial numbering
if PARTIAL_NUMBERING_PATTERN.match(stripped):
# Look for the next non-empty line to merge with
j = i + 1
while j < len(lines) and not lines[j].strip():
j += 1
if j < len(lines):
# Merge the partial numbering with the next line
next_line = lines[j].strip()
result_lines.append(f"{stripped} {next_line}")
i = j + 1 # Skip past the merged line
else:
# No next line to merge with, keep as is
result_lines.append(line)
i += 1
else:
result_lines.append(line)
i += 1
return "\n".join(result_lines)
# Load dependencies
_dependency_exc_info = None
try:
import pdfminer
import pdfminer.high_level
import pdfplumber
except ImportError:
# Preserve the error and stack trace for later
_dependency_exc_info = sys.exc_info()
@@ -28,16 +75,435 @@ ACCEPTED_MIME_TYPE_PREFIXES = [
ACCEPTED_FILE_EXTENSIONS = [".pdf"]
def _to_markdown_table(table: list[list[str]], include_separator: bool = True) -> str:
"""Convert a 2D list (rows/columns) into a nicely aligned Markdown table.
Args:
table: 2D list of cell values
include_separator: If True, include header separator row (standard markdown).
If False, output simple pipe-separated rows.
"""
if not table:
return ""
# Normalize None → ""
table = [[cell if cell is not None else "" for cell in row] for row in table]
# Filter out empty rows
table = [row for row in table if any(cell.strip() for cell in row)]
if not table:
return ""
# Column widths
col_widths = [max(len(str(cell)) for cell in col) for col in zip(*table)]
def fmt_row(row: list[str]) -> str:
return (
"|"
+ "|".join(str(cell).ljust(width) for cell, width in zip(row, col_widths))
+ "|"
)
if include_separator:
header, *rows = table
md = [fmt_row(header)]
md.append("|" + "|".join("-" * w for w in col_widths) + "|")
for row in rows:
md.append(fmt_row(row))
else:
md = [fmt_row(row) for row in table]
return "\n".join(md)
def _cluster_sorted_positions(positions: list[float], tolerance: float) -> list[float]:
    """Collapse ascending positions into the start point of each cluster.

    A position opens a new cluster when it lies more than ``tolerance`` past
    the most recent cluster start; otherwise it is absorbed into that cluster.
    """
    starts: list[float] = []
    for pos in positions:
        if not starts or pos - starts[-1] > tolerance:
            starts.append(pos)
    return starts


def _format_padded_table(table_data: list[list[str]], num_cols: int) -> list[str]:
    """Render rows as padded ``| a | b |`` lines, first row as the header.

    Every column is at least 3 characters wide so the dashed separator row
    emitted after the header always has room for its dashes.
    """
    col_widths = [
        max(3, max(len(row[col]) for row in table_data)) for col in range(num_cols)
    ]

    def fmt_row(cells: list[str]) -> str:
        padded = (cell.ljust(col_widths[i]) for i, cell in enumerate(cells))
        return "| " + " | ".join(padded) + " |"

    header, *body = table_data
    separator = "| " + " | ".join("-" * width for width in col_widths) + " |"
    return [fmt_row(header), separator, *(fmt_row(row) for row in body)]


def _extract_form_content_from_words(page: Any) -> str | None:
    """
    Extract form-style content from a PDF page by analyzing word positions.
    This handles borderless forms/tables where words are aligned in columns.

    Returns markdown with proper table formatting:
    - Tables have pipe-separated columns with header separator rows
    - Non-table content is rendered as plain text

    Args:
        page: Object exposing ``extract_words(...)`` (returning dicts with
            ``text``, ``x0``, ``x1`` and ``top`` keys) and, optionally, a
            ``width`` attribute (612 is assumed when absent).

    Returns:
        The page content as a string, or None if the page doesn't appear to
        be a form-style document, indicating that the caller should use a
        different text extraction path.
    """
    words = page.extract_words(keep_blank_chars=True, x_tolerance=3, y_tolerance=3)
    if not words:
        return None

    # Group words into visual rows by snapping "top" to a 5-unit grid.
    y_tolerance = 5
    rows_by_y: dict[float, list[dict]] = {}
    for word in words:
        y_key = round(word["top"] / y_tolerance) * y_tolerance
        rows_by_y.setdefault(y_key, []).append(word)

    page_width = getattr(page, "width", 612)

    # First pass: describe each row (top to bottom, words left to right).
    row_info: list[dict] = []
    for y_key in sorted(rows_by_y):
        row_words = sorted(rows_by_y[y_key], key=lambda w: w["x0"])
        line_width = row_words[-1]["x1"] - row_words[0]["x0"]
        combined_text = " ".join(w["text"] for w in row_words)

        # Distinct x-position groups (columns) within this row.
        x_groups = _cluster_sorted_positions([w["x0"] for w in row_words], 50)

        # MasterFormat-style partial numbering (e.g., ".1", ".2") marks a
        # list item, which must not be treated as a table row.
        first_word = row_words[0]["text"].strip()

        row_info.append(
            {
                "y_key": y_key,
                "words": row_words,
                "text": combined_text,
                "x_groups": x_groups,
                # Wide line with a lot of text: running prose.
                "is_paragraph": line_width > page_width * 0.55
                and len(combined_text) > 60,
                "num_columns": len(x_groups),
                "has_partial_numbering": bool(
                    PARTIAL_NUMBERING_PATTERN.match(first_word)
                ),
            }
        )

    # Collect ALL x-positions from rows with 3+ columns (table-like rows);
    # together they describe the global column structure.
    all_table_x_positions = sorted(
        x
        for info in row_info
        if info["num_columns"] >= 3 and not info["is_paragraph"]
        for x in info["x_groups"]
    )
    if not all_table_x_positions:
        return None

    # Adaptive clustering tolerance: 70th percentile of the significant
    # (> 5) gaps between consecutive x-positions, clamped to [25, 50].
    gaps = [
        right - left
        for left, right in zip(all_table_x_positions, all_table_x_positions[1:])
        if right - left > 5
    ]
    if len(gaps) >= 3:
        sorted_gaps = sorted(gaps)
        adaptive_tolerance = max(
            25, min(50, sorted_gaps[int(len(sorted_gaps) * 0.70)])
        )
    else:
        # Too few gaps for a meaningful percentile: conservative fallback.
        adaptive_tolerance = 35

    global_columns = _cluster_sorted_positions(
        all_table_x_positions, adaptive_tolerance
    )

    # Single column, not a form
    if len(global_columns) <= 1:
        return None

    content_width = global_columns[-1] - global_columns[0]

    # Very narrow columns (< 30 on average) are likely dense text.
    if content_width / len(global_columns) < 30:
        return None

    # If density is too high (> 10 columns per 72 units), likely not a form.
    # NOTE(review): given the average-width check above this cannot trigger;
    # kept so the two thresholds can be tuned independently.
    columns_per_inch = len(global_columns) / (content_width / 72)
    if columns_per_inch > 10:
        return None

    # Allow more columns on wider pages (scaled from a 612-wide page), but
    # never fewer than 15.
    adaptive_max_columns = max(15, int(20 * (page_width / 612)))
    if len(global_columns) > adaptive_max_columns:
        return None

    # Classify each row: a table row has words aligned (within 40) with at
    # least two of the global columns. Paragraphs and partially numbered
    # list items are never table rows.
    for info in row_info:
        if info["is_paragraph"] or info["has_partial_numbering"]:
            info["is_table_row"] = False
            continue

        aligned_columns: set[int] = set()
        for word in info["words"]:
            for col_idx, col_x in enumerate(global_columns):
                if abs(word["x0"] - col_x) < 40:
                    aligned_columns.add(col_idx)
                    break
        info["is_table_row"] = len(aligned_columns) >= 2

    # Table regions are maximal runs of consecutive table rows, stored as
    # half-open (start_idx, end_idx) index ranges into row_info.
    table_regions: list[tuple[int, int]] = []
    region_start: int | None = None
    for row_idx, info in enumerate(row_info):
        if info["is_table_row"]:
            if region_start is None:
                region_start = row_idx
        elif region_start is not None:
            table_regions.append((region_start, row_idx))
            region_start = None
    if region_start is not None:
        table_regions.append((region_start, len(row_info)))

    # Require at least 20% of the rows to be table rows.
    total_table_rows = sum(end - start for start, end in table_regions)
    if total_table_rows / len(row_info) < 0.2:
        return None

    num_cols = len(global_columns)

    def extract_cells(info: dict) -> list[str]:
        """Distribute a row's words over the global columns."""
        cells = [""] * num_cols
        for word in info["words"]:
            # A word belongs to the first column whose right-hand neighbour
            # starts more than 20 past the word; otherwise the last column.
            assigned_col = num_cols - 1
            for col_idx in range(num_cols - 1):
                if word["x0"] < global_columns[col_idx + 1] - 20:
                    assigned_col = col_idx
                    break
            if cells[assigned_col]:
                cells[assigned_col] += " " + word["text"]
            else:
                cells[assigned_col] = word["text"]
        return cells

    # Emit rows in page order: plain text for the rows between regions and
    # one formatted table per region.
    result_lines: list[str] = []
    cursor = 0
    for start, end in table_regions:
        result_lines.extend(info["text"] for info in row_info[cursor:start])
        table_data = [extract_cells(info) for info in row_info[start:end]]
        result_lines.extend(_format_padded_table(table_data, num_cols))
        cursor = end
    result_lines.extend(info["text"] for info in row_info[cursor:])

    return "\n".join(result_lines)
def _extract_tables_from_words(page: Any) -> list[list[list[str]]]:
"""
Extract tables from a PDF page by analyzing word positions.
This handles borderless tables where words are aligned in columns.
This function is designed for structured tabular data (like invoices),
not for multi-column text layouts in scientific documents.
"""
words = page.extract_words(keep_blank_chars=True, x_tolerance=3, y_tolerance=3)
if not words:
return []
# Group words by their Y position (rows)
y_tolerance = 5
rows_by_y: dict[float, list[dict]] = {}
for word in words:
y_key = round(word["top"] / y_tolerance) * y_tolerance
if y_key not in rows_by_y:
rows_by_y[y_key] = []
rows_by_y[y_key].append(word)
# Sort rows by Y position
sorted_y_keys = sorted(rows_by_y.keys())
# Find potential column boundaries by analyzing x positions across all rows
all_x_positions = []
for words_in_row in rows_by_y.values():
for word in words_in_row:
all_x_positions.append(word["x0"])
if not all_x_positions:
return []
# Cluster x positions to find column starts
all_x_positions.sort()
x_tolerance_col = 20
column_starts: list[float] = []
for x in all_x_positions:
if not column_starts or x - column_starts[-1] > x_tolerance_col:
column_starts.append(x)
# Need at least 3 columns but not too many (likely text layout, not table)
if len(column_starts) < 3 or len(column_starts) > 10:
return []
# Find rows that span multiple columns (potential table rows)
table_rows = []
for y_key in sorted_y_keys:
words_in_row = sorted(rows_by_y[y_key], key=lambda w: w["x0"])
# Assign words to columns
row_data = [""] * len(column_starts)
for word in words_in_row:
# Find the closest column
best_col = 0
min_dist = float("inf")
for i, col_x in enumerate(column_starts):
dist = abs(word["x0"] - col_x)
if dist < min_dist:
min_dist = dist
best_col = i
if row_data[best_col]:
row_data[best_col] += " " + word["text"]
else:
row_data[best_col] = word["text"]
# Only include rows that have content in multiple columns
non_empty = sum(1 for cell in row_data if cell.strip())
if non_empty >= 2:
table_rows.append(row_data)
# Validate table quality - tables should have:
# 1. Enough rows (at least 3 including header)
# 2. Short cell content (tables have concise data, not paragraphs)
# 3. Consistent structure across rows
if len(table_rows) < 3:
return []
# Check if cells contain short, structured data (not long text)
long_cell_count = 0
total_cell_count = 0
for row in table_rows:
for cell in row:
if cell.strip():
total_cell_count += 1
# If cell has more than 30 chars, it's likely prose text
if len(cell.strip()) > 30:
long_cell_count += 1
# If more than 30% of cells are long, this is probably not a table
if total_cell_count > 0 and long_cell_count / total_cell_count > 0.3:
return []
return [table_rows]
class PdfConverter(DocumentConverter):
"""
Converts PDFs to Markdown. Most style information is ignored, so the results are essentially plain-text.
Converts PDFs to Markdown.
Supports extracting tables into aligned Markdown format (via pdfplumber).
Falls back to pdfminer if pdfplumber is missing or fails.
"""
def accepts(
self,
file_stream: BinaryIO,
stream_info: StreamInfo,
**kwargs: Any, # Options to pass to the converter
**kwargs: Any,
) -> bool:
mimetype = (stream_info.mimetype or "").lower()
extension = (stream_info.extension or "").lower()
@@ -55,9 +521,8 @@ class PdfConverter(DocumentConverter):
self,
file_stream: BinaryIO,
stream_info: StreamInfo,
**kwargs: Any, # Options to pass to the converter
**kwargs: Any,
) -> DocumentConverterResult:
# Check the dependencies
if _dependency_exc_info is not None:
raise MissingDependencyException(
MISSING_DEPENDENCY_MESSAGE.format(
@@ -65,13 +530,58 @@ class PdfConverter(DocumentConverter):
extension=".pdf",
feature="pdf",
)
) from _dependency_exc_info[
1
].with_traceback( # type: ignore[union-attr]
) from _dependency_exc_info[1].with_traceback(
_dependency_exc_info[2]
)
) # type: ignore[union-attr]
assert isinstance(file_stream, io.IOBase) # for mypy
return DocumentConverterResult(
markdown=pdfminer.high_level.extract_text(file_stream),
)
assert isinstance(file_stream, io.IOBase)
markdown_chunks: list[str] = []
# Read file stream into BytesIO for compatibility with pdfplumber
pdf_bytes = io.BytesIO(file_stream.read())
try:
# Track how many pages are form-style vs plain text
form_pages = 0
plain_pages = 0
with pdfplumber.open(pdf_bytes) as pdf:
for page in pdf.pages:
# Try form-style word position extraction
page_content = _extract_form_content_from_words(page)
# If extraction returns None, this page is not form-style
if page_content is None:
plain_pages += 1
# Extract text using pdfplumber's basic extraction for this page
text = page.extract_text()
if text and text.strip():
markdown_chunks.append(text.strip())
else:
form_pages += 1
if page_content.strip():
markdown_chunks.append(page_content)
# If most pages are plain text, use pdfminer for better text handling
if plain_pages > form_pages and plain_pages > 0:
pdf_bytes.seek(0)
markdown = pdfminer.high_level.extract_text(pdf_bytes)
else:
# Build markdown from chunks
markdown = "\n\n".join(markdown_chunks).strip()
except Exception:
# Fallback if pdfplumber fails
pdf_bytes.seek(0)
markdown = pdfminer.high_level.extract_text(pdf_bytes)
# Fallback if still empty
if not markdown:
pdf_bytes.seek(0)
markdown = pdfminer.high_level.extract_text(pdf_bytes)
# Post-process to merge MasterFormat-style partial numbering with following text
markdown = _merge_partial_numbering_lines(markdown)
return DocumentConverterResult(markdown=markdown)
@@ -0,0 +1,97 @@
%PDF-1.4
%“Œ‹ž ReportLab Generated PDF document http://www.reportlab.com
1 0 obj
<<
/F1 2 0 R /F2 3 0 R /F3 4 0 R /F4 5 0 R
>>
endobj
2 0 obj
<<
/BaseFont /Helvetica /Encoding /WinAnsiEncoding /Name /F1 /Subtype /Type1 /Type /Font
>>
endobj
3 0 obj
<<
/BaseFont /Helvetica-Bold /Encoding /WinAnsiEncoding /Name /F2 /Subtype /Type1 /Type /Font
>>
endobj
4 0 obj
<<
/BaseFont /Courier /Encoding /WinAnsiEncoding /Name /F3 /Subtype /Type1 /Type /Font
>>
endobj
5 0 obj
<<
/BaseFont /Courier-Bold /Encoding /WinAnsiEncoding /Name /F4 /Subtype /Type1 /Type /Font
>>
endobj
6 0 obj
<<
/BitsPerComponent 8 /ColorSpace /DeviceRGB /Filter [ /ASCII85Decode /FlateDecode ] /Height 70 /Length 4491 /Subtype /Image
/Type /XObject /Width 200
>>
stream
Gb"/lq,^Nc)M\9OkX:DBZ5YT>'!op&`0lHCEL`PXM2DFT$QuCdPsfSJ4#%$gW49i\1e&eZ\Acg:2bhaPc+^Q$/Shs#,1&Qu>83CBh729[%A$M]]Z8KL]Cu-OpO.1`\pCPboa2!3#sCC+4Yg#.W)"\K&i)doY5WCH.J-G0E$]%J"BRoZ88okcKEP@C7S%JEA:t(e6:OLb-"MZ3=$fAIE$]%J"BRoZ88okcKEP@C7S%JEA:t(e6:OKV=+u3F\<8ml>&.H@]EH:@TAeD]);ilGV3,!4G2kP0XRN"#r/]G!Kqo1c9Redk1d%Hh[t82=;lMkgMFr4J2#lTY[siCuNDGT]h^te4%1fj10r$&D--;(2UPbQ(Ze:VUL"2G=%qPZVOlc,tegG*BO,:$mI%mCAJ^8q2gWSn>)Ui.KQ!A5_(Z;l?_%(Xb28AGIU0SY?bVp%B6=$P2*I!1?W8WL>aAVWc$%-nk=D8ZjE$stW]LJ-LqIj9qZoFV/lj$Uf)=b`nfl*ANkt_qpb2t'P`;D#\h6o+g'M+"j4Sf:d#_jjZdTS>mnQJm^7>(S3Bq"m(kH5i3;0`YK<5e.0$k"4XA4UX)rfXH+2OamR360'cX$&"Dp"DdSkh^Q;?+H)fBc@YMp?\]UZuQ*lgt@kDS'dARs/]`Roa+]eXm%&TJ4e[RCKq6kQ:5Zpa[hLTEE,M/UR\C]SS.K'HJL7F)F5Ts63hWCKs+aTqi3TN!,P7#o$@'a?^`M.9&=d,$WJ*\b^*N1fR$JFV.s`.W*WfCgS\9h@C2/uC<b(1#bB73rR3UmcP"%)_DZ#=)TA1KkfUBT8F;=Yoc[BR6[ZkS\Y$n/.@mf!9WK1Hj]o3[f>DiFrdD&a`iRQ\df2(Xc53$=i@@upbP,MJ@.sDgm%`^.8+5u//"Hhn%^kIb$5o\2B7%r>DD\NZ:L;otY0]:)'6l[M<&ctoM"($Q_XW1'!4OB>g/3dF]mD],eF*&&'itQ;2e$/VWZ/QmdogQ0&d7ePkDGP[PZkk8TtUWkaJYa$Q)c6I+l<preqG)K\U>pY5H])D-lHdp52<d:Isd8)X0&b+pKUugDNb2NIW.aD_PLN/i(r9N&<3?,2br'%?gT'_i;n9VUeeM#>ko&@JS]_pP&PR0@L`i*pbXB"rgcI`#'#>-Njfe@8+ZC7hHU>Qm2oCj$^ATs<7[sZ@5*,@qslQ\p1m1#p6XrGL'F^?ok?+\fDe1,#<0n78&1'&KK/85E7IuiRklZ$<tM_`dQfdI@$2&Sj]k*=&V0n1RVEA-6,(U`EJ9674P)af%Z>l=<9-YKka`e!9&oUk]3u7Y);o;!X[W(:<D3M@"gDG)[;",CR0eT^r/fRnB+ob8JM\tp9\JoC\=uqFPX9nU`?:Y,eJf!6E95r@KI_iekuY/-+j6DIrXFQW>i@m+VqQff?4r\fn@@4QXN6[dWdtV8:B`3X2:bgH!rR8-r^sf)#EN%"`F/q0Heh_C7H6l'.@I3l<Jr.Q!as3DB-9V*+/'h,_<T8?^2u*t.p$h8d%"Dd\P<5M`MEg>7W_M8qB0Sd$'o&pWH!XFNS.JRZ%[WY$N:rl5tLIb;#&1u\'nOCIB]161$bC,Uuf;ZK6dl()epY<39X_LaOAXU:WAZiYqZk5Tq+hN`O"QdZ7[jLdf^cf`?9i4T#=]JO$'0fC4#Y=^M%VOouL.PZ6/V;r+XoF1Ls*YXu`6'4?,j47_u_U=.T*IX0ed;@5JN=Qlc\gS!W?r;#%jA)NSUh\`='l:HWsF<K@<`EqO<,Ht[H.@PGU,p6$s&YbEgb;cfG9YK,6Fh]@t(EUD@78Ob6ui6[#pIBoZn<UH)N"PrIeP3Y!qa-k8bP>_rQ_q'7l3]f,=As5FN;rm6/&IWa@[9HFr3YtU'N%=ZPr)s`!!&o!IbLLsBH7VnG&"_&hSn3;Nr^mpSZk^2i_aD8<g*:f)-)1j6@3KSHbb_c1PDAXpnkGE:H3Fs0m?uXff0>H]^Oi^Wq(2*ak3>^mA^!FkG4$-Vq(BH"
U+YSR;%(5j(bnT,&RrR1d]\O5_42^f/Xa:4msf,Oms&5F6()XE"p6mS/Yc\Ga&`hC/3XdsM;'cTMl(uV@DiFY5AA_VWS4T'&^D<.7.S,`B?:^&!Q[ZVaCi0^$[E#=Xt_;^\;l;M#]`$4;sLf^6$u5)gpB'7TO-@*HXbXF]H[ID>=n%&8-T:f)&:]?hm\RN5/B5sdW)PM[X@>2Jq/jQ%m%0Pk%J`<],L8cn3_`B)dE8ng`*C-";2ro.7o.B2:3d$4r#LEqr#8((eIunkG+V@25V=%+_!:Z/,UQca-*F<WEc]8T!rP(2A>g9*GW$LPXTF:ER(d"o8oX31"!B8VtcoXnY$9m&'30Um_8lI0aO.LY_m,l[3VfKlBY*[!$I#3=:"_\lGs6U"<,-kF4HFN$6[_SQpG)7_H3mnKRB'C9#q8EY(Vaqi(D&r$*Jr?OPiaP#RRYeN0)sia9W*TKT)#N9#q8EY(Vaqi(D&r$*Jr?OPiaP#k^2Zeu*-9hp_.0@)f6Em^OKjQe!N6R'K"tCBm#IB,_F0s/@WUCJX)`/6>DF6M_)Kk.s>$UZPX\r/#8I8q/RiHL2liUAA0Yf`%Ld=068s:8FSq=\;#qPO.M6gNJ<fcJ_B7NfVHbGb(Z#%=9b&C:^$<Xe!_fU:_#gd*5iB@GeEYKud7*YgG3W,jj8\q'/4@l'Y-g\)rX-m;H77/R]Zb%A!YN(tcDfkrJ&o7($5o=b[,R0&h>j\UuBER`krd-YJajg!X[eDSKG#rM875C*#IQWm+O@aOf*TU2U('gKn4DA1c#r\17+;`V`#Kgpm8RC[Ee$Ac&[r[O5Rd>4]?HeBRku2;R,jekQW/J-QCRW&^t>C\3e,:[W(TT!T:2JEND71MtHndISe*l)E$(\gO^@6Z"5Q<4O)uM0mY8/2q_>,e#m"SaP0HaXDih_2UgJZH0.io.<EHam+)Ba'CJ3$,Ve\2W\TOCDf>!]XNj5RmJj5Qb:1EKuS*^?b+;s)^?6l>JE0QrGSUpouIlZac9kX41>cja=/SDr<cA`ZGg.-dcfEr]UahTX\[1!?g.Q+Bc7gR:Jn+!PFgH?<j_Cphp%2cK2o3EmuaRlLL,1SFd.EYSt<=je7H>"[/Y4/S?T:Ij;Z5Vm!N#O9oGbTmC/.8"?)WA+NO#l!d__>E^^P^^5SPK3f#a-$jDf&=>7p3h\GbV.%e"aa`Nr8:-((uUWjP?5h<QpA5fPW)4^9]4ZVP3n=aOl2n\TH8,2g(<%d(S%%7D#]P?GegI?jC3uo^tm=sN6Bp.;hI!R:5>*[4qX1g^KT=J\eG*=-r!s:Xj;^b_=6X9pT)tmOPN+SP\Y9d?!CS[k5Z"1@+N1#RUIImf:>$etSOp!A$%9Npa7^LW;"'>&DLN*LT#=""p0WOpjXfI:C##@j@lDSU=Xe\&,@B>QE*Z4EfD\=1"h4F8$&QPkBC:BBC"p$N?^/:S9o*P:eF4j>`WT<EG,f\ln6T<>&V*-UR?1+=iZ&Y:YJ)Y_O1Q!(3MOc:&,lEK0KT>gBrM!Oo7g$R0Z=@n</A>l[op[I#4)k.C3f6pb.hq#9_d636^F1(A%Us@pB(WNXIQ#TKD-`)0%k[fj0?XEDjbhd[m6#LpLW.'sd9_sqo4)9,(HjMXDMbKZE5`!!P4XRB@/4TISu+m2&%RiFj^;=JGE6IZZZ3Iq9u;tP9Ze)spB!TH+!k0kjm?9TaEqM'"N]I#K68.sENFpG-:BpL"k5<Kf;Cll]p\0.VBgJXYV7bkGELTah(>RWGsjL14<<sE:X3JW]):L"_a3kcBH&.(2Ui6d<sTVSq6Hb2oGX:N_j+E^^?#]Brd-YC/q4&j48+1kK<)&.;T[(aqqDC]Z:"7NhLIO5&BeU-SXc>m&CBpaKs!LDjG%ZThfU"?o'^kNBIDSK`iDtti"$XI@8UFlc6'SU-*bNc0g0qZU.!3k5:g&J5%_GC<>>?gMnQALpYLh_[CK2YgZMfbcG=f9!FE+GUto#YS.Ms0oqhppBVFIgKikS7
3+[l77Q]oKP@6W$g?K-&@CV\IU2l9TjknpF,mi!B#31b$WjNK[_SbrBad@)rY(?tsa<O6[4Wd_rrCQDK9Fn>?noi;W9O[OS+IIT/JfXc1#&UNmnp:7_c6/aW-".*cBVjc)"DgbY/sJI7I:YTo4n^?6D*J='=Z.q\rms\C-WjI*igXS;Vfo_`Y\`_c8K6r=SE5*WPmG+p6'k&&0eDn]e<kl$//^-6n?[M0Wg:@`F!W*m'jM%_,JfY,&JA=T)'Qh]O:`+1#oOo&Q&lRj>R;8k_3L)o&mP_\+Z6EUKR,;fQ&lT(!/=AJY_cg)ZFd_iV[)$d`W(]Q<7PapAWM]!Y4qoZ_Z9e(Tts:W\F3=&Fc,Mp(`=cJUG]7+gkp\4)_AlnCa6gYn$_U<4&\@+ERR^ZIFpqbN%;:=3k=P0?fC_:0&Ug9TYUE[kI#Bq;+mc:C\/7H:]h-hq^[P:`jZ\paVNd3BCYF4eruZ)J"F0tYQ"^`hf0;~>endstream
endobj
7 0 obj
<<
/Contents 11 0 R /MediaBox [ 0 0 216 792 ] /Parent 10 0 R /Resources <<
/Font 1 0 R /ProcSet [ /PDF /Text /ImageB /ImageC /ImageI ] /XObject <<
/FormXob.2a351979d8c75d073b2ea4bfb74718f9 6 0 R
>>
>> /Rotate 0 /Trans <<
>>
/Type /Page
>>
endobj
8 0 obj
<<
/PageMode /UseNone /Pages 10 0 R /Type /Catalog
>>
endobj
9 0 obj
<<
/Author (\(anonymous\)) /CreationDate (D:20251205104951+01'00') /Creator (\(unspecified\)) /Keywords () /ModDate (D:20251205104951+01'00') /Producer (ReportLab PDF Library - www.reportlab.com)
/Subject (\(unspecified\)) /Title (\(anonymous\)) /Trapped /False
>>
endobj
10 0 obj
<<
/Count 1 /Kids [ 7 0 R ] /Type /Pages
>>
endobj
11 0 obj
<<
/Filter [ /ASCII85Decode /FlateDecode ] /Length 1981
>>
stream
GauI8D/\/e&:hOi=55KJ8;^%$X%4*F0ZR4[TQ-.JlIl=8,25emeWsH3DOUP#SK,Tt>!%\QTf-@>5)IUB4PgI,!l6hOHqp".7kqY?VEc$;f0G$C+?kE+IduY+D6B[cY<`?1&+Bf$SSWa.DI28'F?:CpG_mY"TO]hkXiFku&",h"4G/8GlCSK3UDT`3W#q'Qc;4>t6\tbspa(/l?]"D>nboQo,(,[\*-A;J=Ru^j=[Nu\:iMk7q<+PGo*gWpT4_C)j)t7Oc[5MlffWrhj!99;,0?]r3R(ns^B^I*KQ#f[([aS1g);Q.rrBep&6)sVJs=\.1^pkCa(tBfECI75_;C0LC.)*n<3;@eFZT<Brd%CYO%*fRCl_%R2PLtnF>0lg_SFKN$O.X\o%U7_58YJ,X[`p,%PUL^1]EgT]T\4*B3hOrEZA:[=ui88pZGht%klE^OC$2=@$GiMoTO<eR\C2;10r\O.%_7=c.`)*@0N>,CLh>2ZDq?"(LrLS)ajJ<DG(:N]5MPuT)E6J8.)!Ud7D>Je7M&(V1i'>Z@qg3/WJ@PpL4nr1qU$V_#jqpM+M<[H(LE:pW6uTQK`^P%Q'T&[*Y1\T_7.O:+n^Y)3+d56\hGsIrnqB85q[4L1WG#"Uo.d]Zr"jG`qiT=AU8b5]p3(N2IfnWHVO&$rnNe6[_$[o(m=2Sq-C[bbNOS,qIb?:TGYHhQjjcCe!9%*cuscgU*Eea^B?#^HtoE9p(jd_GR#E1\LTm:MVS5e]+<LZRQ]0^iZNnTs\Iq5l6H+?80%j?IX^UR28jY=Vr!:#Jf=D$QdR#4X6Q%Z^E6hq4[p#rHu/mN!PgeQCn_hEI9M3(_b.pAid<e0?KUakkhL+Sq1A+!S(V0h+OF8nn#&[1+6p+D5^<OP@s\HS\itN&+apX>;a8<=<fVm-cM(u=Q31UTuXZiRNk/X^o=e`8?ha=(l,J:AYGq&k'81mIs68U9).dPb@tBY#5s6I1(;=p-FNV8JLO6-b%6]BUEO#*P:YXQ,+2T8!GeA%OFD*^H'I<qo6Q\KGcc$9<-q;oCHo2eF[F/t'nG3p8bj]<0qUd^A*Un<D3^J]5-EeYaHZL0]dZ(aldZ?U]EL]o1@j;L=l_$._u&B5KRKtY900d3EO'mY6&5WB,D7o]o+7,#h.[N58L+)*ks!_/dIq7L<$Q/>:Ym/3(NJmP3]c2J81f'[9A229?.>nW.Y"uioK$/X(RLnTFa0nhiu#_V(M%6pL-[3&IEZO^iW'pcgSC4%cs*UfWL8=h@<SF-6Ml.SK#\%/6pL;XKVP08]+YR4.1^h^3g+iL6p2jNFKi##9N\7TS:EE')be-k57a"IMZoUV#>]Eoq_3uS@i.]*ai31P3"'$;S,Q%'"=$Vq"-_!pX<>bA]?nd=dOpZa\$"k!cpI9L2SO3gBd7]TKi)s)3F/ADnb^3N)iF')M[\Fq1\^PAl):!YpJp!B\s/2LkZr*`(o%fTO.qa?N[7\P_Dj!-3=OAO3]DtNKn-R1Hc#$?$h;RW[;B(k%DrOQ4lA(kZ`8[,.5E\H/%&9RE1k-pKk^'?Wseh?':/9RD3&&Xf\j=9;Sdd#l!b5li$Q.?@#FtJ9r"D*THt>o=+h,ei<3VCI\e<F.YjMklHmQ252@7%.$dl?FX[.5Ru_<cOnWObGU$sud3sn0Nm?VSip=&P_8=3<b5l"NFchqcT66k!jof;<?"kHRj[>Q+FB(V85-;*H\(QoM*>m2@9WKA`dV0$2F]lQ!^cKY?-F<<RYBD9:P`&#<:DJpSA0L]L_Q`8=5'6'r`p_54_;lcH+H=)4\l8B7YE#pX>K&Mf4jEn:L@C'pmu(T(NAo?onFtPTH*Mah:OJII8OF6<oMipM;1-5S+stnT,o"n]+UmpI>_OX,SeHS'86i`]nN=oW_Hjm1lb%agT!)1^3rJWom\/,?BYNjThVR,cQ'opa8Q#<G9<qeSN'GRRO*(AC(K$'<9uMACIm=MV?Mk2Q*3P"~>endstream
endobj
xref
0 12
0000000000 65535 f
0000000073 00000 n
0000000134 00000 n
0000000241 00000 n
0000000353 00000 n
0000000458 00000 n
0000000568 00000 n
0000005249 00000 n
0000005507 00000 n
0000005576 00000 n
0000005859 00000 n
0000005919 00000 n
trailer
<<
/ID
[<4800d64fefba4dd902e51197c7da4e88><4800d64fefba4dd902e51197c7da4e88>]
% ReportLab generated PDF document -- digest (http://www.reportlab.com)
/Info 9 0 R
/Root 8 0 R
/Size 12
>>
startxref
7992
%%EOF
File diff suppressed because one or more lines are too long
@@ -0,0 +1,81 @@
TECHMART ELECTRONICS
4567 Innovation Blvd
San Francisco, CA 94103
(415) 555-0199
===================================
Store #0342 - Downtown SF
11/23/2024 14:32:18 PST
TXN: TXN-98765-2024
Cashier: Emily Rodriguez
Register: POS-07
-----------------------------------
Wireless Noise-Cancelling
Headphones - Premium Black
AUDIO-5521 1 @ $349.99
Member Discount $-50.00
$299.99
USB-C Hub 7-in-1 Adapter
with HDMI & Ethernet
ACC-8834 2 @ $79.99
$159.98
Portable SSD 2TB
Thunderbolt 3 Compatible
STOR-2241 1 @ $289.00
Member Discount $-29.00
$260.00
Ergonomic Wireless Mouse
Rechargeable Battery
ACC-9012 1 @ $59.99
$59.99
Screen Cleaning Kit
Professional Grade
CARE-1156 3 @ $12.99
$38.97
HDMI 2.1 Cable 6ft
8K Resolution Support
CABLE-7789 2 @ $24.99
Member Discount $-5.00
$44.98
-----------------------------------
SUBTOTAL $863.91
Member Discount (15%)-$84.00
Sales Tax (8.5%) $66.23
Rewards Applied -$25.00
===================================
TOTAL $821.14
===================================
PAYMENT METHOD
Visa Card ending in 4782
Auth: 847392
Ref: REF-20241123-98765
-----------------------------------
REWARDS MEMBER
Sarah Mitchell
ID: TM-447821
Points Earned: 821
Total Points: 3,247
Next Reward: $50 gift card
at 5,000 pts (1,753 to go)
-----------------------------------
RETURN POLICY
Returns within 30 days
Receipt required
Electronics must be unopened
*TXN98765202411231432*
Thank you for shopping!
www.techmart.example.com
===================================
@@ -0,0 +1,76 @@
ZAVA AUTO REPAIR
Certified Collision Repair
123 Main Street, Redmond, WA 98052
Phone: (425) 000-0000
Preliminary Estimate (ID: EST-1008)
| Customer Information | | | Vehicle Information | |
| -------------------- | ------------------- | --- | ------------------- | ----------------- |
| Insured name | Gabriel Diaz | | Year | 2022 |
| Claim # | SF-1008 | | Make | Jeep |
| Policy # | POL-2022-555 | | Model | Grand Cherokee |
| Phone | (425) 111-1111 | | Trim | Limited |
| Email | gabriel@contoso.com | | VIN | 1C4RJFBG2NC123456 |
| | | | Color | White |
| | | | Odometer | 9,800 |
| Repair Order # | RO-20221108 | | Estimator | Ellis Turner |
Estimate Totals
| | | Hours | Rate | Cost |
| ---------------- | --- | ----- | ---- | ----- |
| Parts | | | | 2,100 |
| Body Labor | | 2 | 150 | 300 |
| Paint Labor | | 1.5 | 150 | 225 |
| Mechanical Labor | | - | - | - |
Supplies
| | Paint Supplies | | | 60 |
| ------------- | ------------------------ | --- | ------ | ------ |
| | Body Supplies | | | 30 |
| Other Charges | | | | 15 |
| Subtotal | | | | 2,730 |
| Sales Tax | | | 10.20% | 278.46 |
| GRAND TOTAL | | | | 5,738 |
| Note | Minor rear bumper repair | | | |
This is a preliminary estimate for the visible damage of the vehicle. Additional damage / repairs / parts may be found
after the vehicle has been disassembled and damaged parts have been removed. Suspension damages may be
present, but can not be determined until an alignment on the vehicle has been done. Parts Prices may vary due to
models and vehicle maker price updates. Please be advised if vehicle owner elects to have vehicle sent to service for
any mechanical concerns, ALL service departments charge a vehicle diagnostic charge. If the mechanical concern is
deemed not related to an insurance claim, vehicle owner will be reponsible for charges.
ZAVA AUTO REPAIR
Certified Collision Repair
123 Main Street, Redmond, WA 98052
Phone: (425) 000-0000
Preliminary Estimate (ID: EST-1008)
Customer Information Vehicle Information
| Insured name | Bruce Wayne | | Year | 2025 |
| -------------- | -------------------------- | --- | --------- | ------------ |
| Claim # | | 999 | Make | Batman |
| Policy # | IM-BATMAN | | Model | Batmobile |
| Phone | (416) 555-1234 | | Trim | Limited |
| Email | batman@wayneindustries.com | | VIN | XXX |
| | | | Color | Black |
| | | | Odometer | 1 |
| Repair Order # | RO-20221108 | | Estimator | Ellis Turner |
Estimate Totals
| | | Hours | Rate | Cost |
| ---------------- | --- | ----- | ---- | ------ |
| Parts | | | | 99,999 |
| Body Labor | | 2 | 150 | 300 |
| Paint Labor | | 1.5 | 150 | 225 |
| Mechanical Labor | | - | - | - |
Supplies
| | Paint Supplies | | | 60 |
| ------------- | ------------------------ | --- | ------ | --------- |
| | Body Supplies | | | 30 |
| Other Charges | | | | 15 |
| Subtotal | | | | 100,629 |
| Sales Tax | | | 10.20% | 10264.158 |
| GRAND TOTAL | | | | 211,522 |
| Note | Minor rear bumper repair | | | |
This is a preliminary estimate for the visible damage of the vehicle. Additional damage / repairs / parts may be found
after the vehicle has been disassembled and damaged parts have been removed. Suspension damages may be
present, but can not be determined until an alignment on the vehicle has been done. Parts Prices may vary due to
models and vehicle maker price updates. Please be advised if vehicle owner elects to have vehicle sent to service for
any mechanical concerns, ALL service departments charge a vehicle diagnostic charge. If the mechanical concern is
deemed not related to an insurance claim, vehicle owner will be reponsible for charges.
@@ -0,0 +1,44 @@
INVENTORY RECONCILIATION REPORT
Report ID: SPARSE-2024-INV-1234
Warehouse: Distribution Center East
Report Date: 2024-11-15
Prepared By: Sarah Martinez
| Product Code | Location | Expected | Actual | Variance | Status |
| ------------ | -------- | -------- | ------ | -------- | -------- |
| SKU-8847 | A-12 | 450 | | | |
| | B-07 | | 289 | -23 | |
| SKU-9201 | | 780 | 778 | | OK |
| | C-15 | | | +15 | |
| SKU-4563 | D-22 | | 156 | | CRITICAL |
| | | 180 | | -24 | |
| SKU-7728 | A-08 | 920 | | | |
| | | | 935 | +15 | OK |
Variance Analysis:
Summary Statistics:
Total Variance Cost: $4,287.50
Critical Items: 1
Overall Accuracy: 97.2%
Detailed Analysis by Category:
The inventory reconciliation reveals several key findings. The primary variance driver is SKU-4563,
which shows a -24 unit discrepancy requiring immediate investigation. Location B-07 handling of
SKU-8847 also demonstrates significant variance. Cross-location verification protocols should be
reviewed to prevent future discrepancies. The overall accuracy rate of 97.2% meets our target
threshold, but critical items require expedited resolution to maintain operational efficiency.
Extended Inventory Review:
| Product Code | Category | Unit Cost | Total Value | Last Audit | Notes |
| ------------ | ----------- | --------- | ----------- | ---------- | ---------- |
| SKU-8847 | Electronics | $45.00 | $13,005.00 | 2024-10-15 | |
| SKU-9201 | Hardware | $32.50 | $25,285.00 | 2024-10-22 | Verified |
| SKU-4563 | Software | $120.00 | $18,720.00 | | Critical |
| SKU-7728 | Accessories | $15.75 | $14,726.25 | 2024-11-01 | |
| SKU-3345 | Electronics | $67.00 | $22,445.00 | 2024-10-18 | |
| SKU-5512 | Hardware | $89.00 | $31,150.00 | | Pending |
| SKU-6678 | Software | $200.00 | $42,000.00 | 2024-10-25 | High Value |
| SKU-7789 | Accessories | $8.50 | $5,950.00 | 2024-11-05 | |
| SKU-2234 | Electronics | $125.00 | $35,000.00 | | |
| SKU-1123 | Hardware | $55.00 | $27,500.00 | 2024-10-30 | Verified |
Recommendations:
1. Immediate review of SKU-4563 handling procedures. 2. Implement additional verification for critical
items. 3. Schedule follow-up audit for high-value products (SKU-6678, SKU-2234).
Approval:
@@ -0,0 +1,62 @@
BOOKING ORDER
Print Date 12/15/2024 14:30:22
Page 1 of 1
STARLIGHT CINEMAS
Orders
| Order / Rev: | 2024-12-5678 | | | Cinema: | | Downtown Multiplex |
| ------------ | -------------- | --- | --- | ---------------- | --- | ------------------ |
| Alt Order #: | SC-WINTER-2024 | | | Primary Contact: | | Sarah Johnson |
Product Desc: Holiday Movie Marathon Package Location: NYC-01
| Estimate: | EST-456 | | | Region: | | NORTHEAST |
| -------------------- | ----------------------- | --- | --- | ------- | --- | --------- |
| Booking Dates: | 12/20/2024 - 12/31/2024 | | | | | |
| Original Date / Rev: | 12/01/24 / 12/10/24 | | | | | |
| Order Type: | Premium Package | | | | | |
Booking Agency
| Name: | Premier Entertainment Group | | | | | |
| ---------------- | --------------------------- | --- | --- | -------------- | --- | --------- |
| | | | | Billing Type: | | Net 30 |
| Contact: | Michael Chen | | | | | |
| | | | | Payment Terms: | | Corporate |
| Billing Contact: | accounting@premierent.com | | | | | |
| | | | | Commission: | | 10% |
555 Broadway Suite 1200
New York, NY 10012
Customer
| Name: | Universal Studios Distribution | | | | | |
| -------------- | ------------------------------ | --- | --- | --- | --- | --- |
| Category: | Film Distributor | | | | | |
| Contact Email: | bookings@universalstudios.com | | | | | |
| Customer ID: | CUST-98765 | | | | | |
| Revenue Code: | FILM-PREMIUM | | | | | |
Booking Summary
| Start Date | End Date | # Shows | Gross Amount | Net Amount | | |
| ---------- | -------- | ------- | ------------ | ---------- | --- | --- |
| 12/20/24 | 12/31/24 | 48 | $12,500.00 | $11,250.00 | | |
Totals
| Month | # Shows | Gross Amount | | Net Amount | | Occupancy |
| ------------- | ------- | ------------ | --- | ---------- | --- | --------- |
| December 2024 | 48 | $12,500.00 | | $11,250.00 | | 85% |
| Totals | 48 | $12,500.00 | | $11,250.00 | | 85% |
Account Representatives
Representative Territory Region Start Date / End Date Commission %
| Sarah Johnson | NYC Metro | NORTHEAST | 12/20/24 - 12/31/24 | | 100% | |
| ------------- | --------- | --------- | ------------------- | --- | ---- | --- |
Show Schedule Details
Ln Screen Start End Movie Title Format Showtime Days Shows Rate Type Total
1 SCR-1 12/20/24 12/25/24 Holiday Spectacular IMAX 3D 7:00 PM Daily 12 $250 PM $3,000
(Runtime: 142 min); Holiday Season Premium
2 SCR-2 12/20/24 12/31/24 Winter Wonderland Standard 4:30 PM Daily 24 $150 MT $3,600
(Runtime: 98 min); Matinee Special
3 SCR-1 12/26/24 12/31/24 New Year Mystery 4DX 9:30 PM Daily 12 $300 PM $3,600
(Runtime: 116 min); Premium Experience
Show Details
| Show Screen | Date Range | Title | Showtime | Days Type | Rate | Revenue |
| ----------- | ---------- | ----- | -------- | --------- | ---- | ------- |
1 SCR-1 12/20-12/25 Holiday Spectacular 7:00 PM Daily PM $250 $3,000
This booking order is subject to cinema availability and standard terms.
2 SCR-2 12/20-12/31 Winter Wonderland 4:30 PM Daily MT $150 $3,600
All showtimes are approximate and subject to change.
3 SCR-1 12/26-12/31 New Year Mystery 9:30 PM Daily PM $300 $3,600
| Total Revenue: | | | | | | $12,500.00 |
| -------------- | --- | --- | --- | --- | --- | ---------- |
@@ -0,0 +1,65 @@
1
Introduction
Large language models (LLMs) are becoming a crucial building block in developing powerful agents
that utilize LLMs for reasoning, tool usage, and adapting to new observations (Yao et al., 2022; Xi
et al., 2023; Wang et al., 2023b) in many real-world tasks. Given the expanding tasks that could
benefit from LLMs and the growing task complexity, an intuitive approach to scale up the power of
agents is to use multiple agents that cooperate. Prior work suggests that multiple agents can help
encourage divergent thinking (Liang et al., 2023), improve factuality and reasoning (Du et al., 2023),
and provide validation (Wu et al., 2023). In light of the intuition and early evidence of promise, it is
intriguing to ask the following question: how can we facilitate the development of LLM applications
that could span a broad spectrum of domains and complexities based on the multi-agent approach?
Our insight is to use multi-agent conversations to achieve it. There are at least three reasons con-
firming its general feasibility and utility thanks to recent advances in LLMs: First, because chat-
optimized LLMs (e.g., GPT-4) show the ability to incorporate feedback, LLM agents can cooperate
through conversations with each other or human(s), e.g., a dialog where agents provide and seek rea-
soning, observations, critiques, and validation. Second, because a single LLM can exhibit a broad
range of capabilities (especially when configured with the correct prompt and inference settings),
conversations between differently configured agents can help combine these broad LLM capabilities
in a modular and complementary manner. Third, LLMs have demonstrated ability to solve complex
tasks when the tasks are broken into simpler subtasks. Multi-agent conversations can enable this
partitioning and integration in an intuitive manner. How can we leverage the above insights and
support different applications with the common requirement of coordinating multiple agents, poten-
tially backed by LLMs, humans, or tools exhibiting different capacities? We desire a multi-agent
conversation framework with generic abstraction and effective implementation that has the flexibil-
ity to satisfy different application needs. Achieving this requires addressing two critical questions:
(1) How can we design individual agents that are capable, reusable, customizable, and effective in
multi-agent collaboration? (2) How can we develop a straightforward, unified interface that can
accommodate a wide range of agent conversation patterns? In practice, applications of varying
complexities may need distinct sets of agents with specific capabilities, and may require different
conversation patterns, such as single- or multi-turn dialogs, different human involvement modes, and
static vs. dynamic conversation. Moreover, developers may prefer the flexibility to program agent
interactions in natural language or code. Failing to adequately address these two questions would
limit the frameworks scope of applicability and generality.
While there is contemporaneous exploration of multi-agent approaches,3 we present AutoGen, a
generalized multi-agent conversation framework (Figure 1), based on the following new concepts.
1 Customizable and conversable agents. AutoGen uses a generic design of agents that can lever-
age LLMs, human inputs, tools, or a combination of them. The result is that developers can
easily and quickly create agents with different roles (e.g., agents to write code, execute code,
wire in human feedback, validate outputs, etc.) by selecting and configuring a subset of built-in
capabilities. The agents backend can also be readily extended to allow more custom behaviors.
To make these agents suitable for multi-agent conversation, every agent is made conversable
they can receive, react, and respond to messages. When configured properly, an agent can hold
multiple turns of conversations with other agents autonomously or solicit human inputs at cer-
tain rounds, enabling human agency and automation. The conversable agent design leverages the
strong capability of the most advanced LLMs in taking feedback and making progress via chat
and also allows combining capabilities of LLMs in a modular fashion. (Section 2.1)
2 Conversation programming. A fundamental insight of AutoGen is to simplify and unify com-
plex LLM application workflows as multi-agent conversations. So AutoGen adopts a program-
ming paradigm centered around these inter-agent conversations. We refer to this paradigm as
conversation programming, which streamlines the development of intricate applications via two
primary steps: (1) defining a set of conversable agents with specific capabilities and roles (as
described above); (2) programming the interaction behavior between agents via conversation-
centric computation and control. Both steps can be achieved via a fusion of natural and pro-
gramming languages to build applications with a wide range of conversation patterns and agent
behaviors. AutoGen provides ready-to-use implementations and also allows easy extension and
experimentation for both steps. (Section 2.2)
3We refer to Appendix A for a detailed discussion.
2
@@ -0,0 +1,74 @@
%PDF-1.3
%“Œ‹ž ReportLab Generated PDF document http://www.reportlab.com
1 0 obj
<<
/F1 2 0 R /F2 3 0 R
>>
endobj
2 0 obj
<<
/BaseFont /Helvetica /Encoding /WinAnsiEncoding /Name /F1 /Subtype /Type1 /Type /Font
>>
endobj
3 0 obj
<<
/BaseFont /Helvetica-Bold /Encoding /WinAnsiEncoding /Name /F2 /Subtype /Type1 /Type /Font
>>
endobj
4 0 obj
<<
/Contents 8 0 R /MediaBox [ 0 0 612 792 ] /Parent 7 0 R /Resources <<
/Font 1 0 R /ProcSet [ /PDF /Text /ImageB /ImageC /ImageI ]
>> /Rotate 0 /Trans <<
>>
/Type /Page
>>
endobj
5 0 obj
<<
/PageMode /UseNone /Pages 7 0 R /Type /Catalog
>>
endobj
6 0 obj
<<
/Author (anonymous) /CreationDate (D:20260108192537+01'00') /Creator (ReportLab PDF Library - www.reportlab.com) /Keywords () /ModDate (D:20260108192537+01'00') /Producer (ReportLab PDF Library - www.reportlab.com)
/Subject (unspecified) /Title (untitled) /Trapped /False
>>
endobj
7 0 obj
<<
/Count 1 /Kids [ 4 0 R ] /Type /Pages
>>
endobj
8 0 obj
<<
/Filter [ /ASCII85Decode /FlateDecode ] /Length 670
>>
stream
Gat$td;IYl'Rf-pcJpsZ/27V[H_WEoW#\5sVS2I3Jt]?;R+`$Ms*f.>6<=3APUNhTmQL<9F,pFup'KGk=TR,7^>/u!#kAE+l;?UQ8Fg(+-O>;^54HWJ*kXdl'VdsI]Y^$-G(GWPR)iGMeWbg3)F'+jfWpCb"rU?d?8?q_r!E2N'0sM)J>=XD.jgunBuga\Wi4MX$WV/b)1F@bC8Nj8(0*)"ZK06BSqlu1$[^37A;/aK=mfgqg$&i),2OH&%^\"B1%B\dd_V>$5OtPri4rcEe3LoBUeL6QAPnpQr+R-t0f]ZSYc?BTAKQ?A&+J#J*N*=6;'?@Cp*>auj0",hDS3bH4[hVs3O="&bk&U@>+8c1&c2iDg6R*%q%iEZq'-!FNSB8#C*'po69R8$S(:.=-$N6'!_[1/jV<$@V3Z_"gd!g!MJMT)mTUN4cWjUQQj]HT_m]0*R=YgTmcl@k>*b/SBce9?.m,bEi#?PI:=r_6G.auM&FtP,>O7T%Z<$f#=g6(2+d@;8?"$8cdI38ZZ>hq5b2_pQY:M\.Kod,pl)ZX7a7Gc'Mf_'SB1X3*L[-51a8`h4)KjJQjLfm/3TIeQY?2+?^.r^HNafjHp<5,1M=W'N>8sb=dB#FC5M`7L91"BC@CfEckPe`M5O:#!Fj$K]s(Gs8rW$>H7gK~>endstream
endobj
xref
0 9
0000000000 65535 f
0000000073 00000 n
0000000114 00000 n
0000000221 00000 n
0000000333 00000 n
0000000526 00000 n
0000000594 00000 n
0000000890 00000 n
0000000949 00000 n
trailer
<<
/ID
[<5467fcd5093f18002be6af3fb13ce6c3><5467fcd5093f18002be6af3fb13ce6c3>]
% ReportLab generated PDF document -- digest (http://www.reportlab.com)
/Info 6 0 R
/Root 5 0 R
/Size 9
>>
startxref
1709
%%EOF
@@ -0,0 +1,74 @@
%PDF-1.3
%“Œ‹ž ReportLab Generated PDF document (opensource)
1 0 obj
<<
/F1 2 0 R /F2 3 0 R
>>
endobj
2 0 obj
<<
/BaseFont /Helvetica /Encoding /WinAnsiEncoding /Name /F1 /Subtype /Type1 /Type /Font
>>
endobj
3 0 obj
<<
/BaseFont /Helvetica-Bold /Encoding /WinAnsiEncoding /Name /F2 /Subtype /Type1 /Type /Font
>>
endobj
4 0 obj
<<
/Contents 8 0 R /MediaBox [ 0 0 612 792 ] /Parent 7 0 R /Resources <<
/Font 1 0 R /ProcSet [ /PDF /Text /ImageB /ImageC /ImageI ]
>> /Rotate 0 /Trans <<
>>
/Type /Page
>>
endobj
5 0 obj
<<
/PageMode /UseNone /Pages 7 0 R /Type /Catalog
>>
endobj
6 0 obj
<<
/Author (anonymous) /CreationDate (D:20260210121342+01'00') /Creator (anonymous) /Keywords () /ModDate (D:20260210121342+01'00') /Producer (ReportLab PDF Library - \(opensource\))
/Subject (unspecified) /Title (untitled) /Trapped /False
>>
endobj
7 0 obj
<<
/Count 1 /Kids [ 4 0 R ] /Type /Pages
>>
endobj
8 0 obj
<<
/Filter [ /ASCII85Decode /FlateDecode ] /Length 2414
>>
stream
Gat=m?#SK-&r,lL<tO"8con<J;5Cq;2s]18RrdR7Y[)>Ym#<31@rCsJ89.W.qa3u?82hU4/rD6bm_^2.o5\G6@H<5/\G85.&:2)\f,l]`/mA:-0HF*!^.%Yd0?rr<_LD*'1j8Q\=IJXu'N"=HL>KSX^]339h+)S%SB[D8U\2B8rL_pR7\MXONW%HeW99+,0hH$AU#^KYAoZ)6P-2'6m5cj7lZu'kGHQ:/\R1,Ma%hEl2eYq(:LZ"-`3OktM:dm<m,u<)W99/X#l.?0OO\Z_]Y4.9BoSuKGOrdaFbq^/)*_g%gm8s\<gU<a%e]re(gWm_H[^0bn(=;0X%(_^H%$;+Se<aM)L.FrW&=UUc*X3He'XMO]CgP3P*<]$#uOPN2Z#n\O]@7_]#$ZH.Gr&KZm_M+6]8I5lYVcZLH,)V@L:BCib%tuWd,p*A"0Gb=6gIkI+Y5.[<aH_D`iMaKNQpo.UHf=F]to-Ui6XS[Q;Qh\cD=LT#YQGpn9rsUVm:qR"1%pnrSZa/Mi*P+f4j?B6-uV5Em_Zqog)^@RtF[F-adqASQb%i[(eIIqZ)_CVEHpGDgIpX[[uQ4J6DNf5X^CB'JA+,d^#?/[fq)jH^+:rbdW>Y'H/a/1^A\lZD2qMb,5%-$pOaW5-%BjndGRZ<CV&?T^r@PWF)!H#gDKcZj?[/gATBZ;=XJ$_a;??F-qtH(HaQX?W#iIL#17<Y25AC[ePo/pO[]=c0(\#/j9R%W/]$do:5b%.e4%S0Z';YJm/!GO9jt-H8W>JTK5I,b-cnrpc!2H0BZZ`1%R*aB!ZE'JRRYNJ<J4B`!j/maqpD>*Rq$U:[Tq%Lr[m+DHGg*dP\Ee>\#VYo43^R>kA9W2b/WU:k/M#%^2;nC+,e'dAcEOp?t5Kk;4+.f4MU@-mf7iCT_29s_%g,%K_gB8!kWS28T%T6'u_$GK'qX*VP>7>5?dW_<?$QPg!n")cT(<-[c/-kEbS'`*BYR5SB9TPY<1jq1#Q/EWpCJrY=s;bQfH^=uT:DTR3.8/N>W)r8_SF*7+f;4415n3,ECi2P6&bjmn17t+qU8;D])\Qt.8QLi)?kJ`.t+lkW'Y4e876l-2di)Y?.3\K1<(0IrEfm1<:Oc^u?7B::q;On$J5_C7T<u%071ASb!ZD1u7Yd"g`I'`PJ>**>tRZrdD6q3W@5QfbW8242uIHro=(eV*P1KjY,oj4tW&obb>^q-Iur%F#A)mgu8+V*?E<bdEC6V0+Z7OS^l.$W4hmuq:sMdJ=Sk+94D3QtUBZ:AoIiBA%s3#GJdRDFCpZ)7\MZmitKhMID(%ic%oW#tD%ERrqpk,dD3ll!E6m)e):26BLNV!WiRV*d(+Ppl'p$%?J&MqeV<=uNJ_5,4P_NC:lWf`Iu3\u+^>Y]dUOk&c=m2^<YVV2cUoq[`<<W-]MTIC50Klu6rO5RUVZ"h`#"4adtt2qjs2b12hQi!@JBp4Jln>:1Dtc(*!NBU*DeAtLhuWu&JLWFQi:;ka#?AD6V.A_[>n$T,.]8d=tffJ,?'DbCKQ-BnKqTn_:1LGc865V]FFi=AAF`DGhW(F]2^o?>VbGN:;=!-s;ea7]Ll\f+eiZ8XZb0*mZp%8*K_pf+1"2fKuO1pNK%7f_(mPTD@0&ljSV?o$5BpUmleYs^Faq_SM'jX.o\d*6%j(EtY.N"m2B'E@[.Y_8Be+m(58m$\dcqm$?,0it)/=9@9kRfJB;N7D9t\'F<:#c$P82`UKqgN]$kU]5eLPZMR=0bO[rPk"\?hu>sT^KFg`B>!pml-a[ImSeWp!_l3s!E>gFKq4ng:"n=N:m57rHjN)GML<=a1ktQpUT8:?[D:c7+Gm@2q;uN1Q3)hpeThe-&[#`KYZ4e_=o]kk1KH/^jo:"<0_nRJingk\[1Jltc<,.Jq2\*]=AVcIiY#?iMASrc$Bp)4m=NdIOJ&,H=+<MC=^7]?Tb>M"H6ZdXTX2Ba;Gp=J-m]$,8ZCU/77rHJ,%1.[/DlnkH:pIIV$Oh.
;:t?5e3.cs^[G:H=e;i>c+>B=)C&l7T)S<Bld"_W)BtgI(/F`Le;ULQ,!FM!^<8Kk?L6b_>G8Jp-TG;!V1144#29r2%;n-RmNHrGdR!76&H"R_D-]`c"1FCgZl*",7SUVuqc0oapDQ=^`nj#FFk@2%[K[V45$!KQIH[=;SUpTE8T!QLliC=5-9]nkQpBVdHM6-g)tYBAPuOqr^qkn[Wh4C;6L89J;D>5@cYM$2Y/24scnNiWp4jWhfJAF^ck!@I(VPV*s,pdkPKn<Zg-T3I%d.sSl"^f-Gm=*riV,>(\770jbu^lf\h1+IH>c;Bo;Pdg;!fA)'kmg$"\P3oX=/N5/rUltb3K-BdRTR;-W)J1bDbE?g<MKG;cK`l?D4l>.,O@6id::q]JXBH\Ws#0[#'8-5JQL>/c~>endstream
endobj
xref
0 9
0000000000 65535 f
0000000061 00000 n
0000000102 00000 n
0000000209 00000 n
0000000321 00000 n
0000000514 00000 n
0000000582 00000 n
0000000843 00000 n
0000000902 00000 n
trailer
<<
/ID
[<e319d5c305edb8c0fb6be9e44c6178fa><e319d5c305edb8c0fb6be9e44c6178fa>]
% ReportLab generated PDF document -- digest (opensource)
/Info 6 0 R
/Root 5 0 R
/Size 9
>>
startxref
3407
%%EOF
BIN
View File
Binary file not shown.
+45 -3
View File
@@ -288,6 +288,47 @@ def test_input_as_strings() -> None:
assert "# Test" in result.text_content
def test_doc_rlink() -> None:
    """Regression test for CVE-2025-11849.

    Converts ``rlink.docx`` while a file with known content exists at
    ``<root>/tmp/test_rlink.txt`` and asserts that the base64 encoding of
    that content does not show up in the converted output, i.e. that the
    linked file was not embedded as a data URI.

    The test is skipped when the ``tmp`` directory under the filesystem root
    does not exist. The target file is removed again once the conversion has
    been checked.

    Raises:
        ValueError: If the target file already exists with unexpected
            content (the test refuses to overwrite it).
    """
    markitdown = MarkItDown()

    # Document with rlink
    # NOTE(review): presumably the link inside the document points at
    # rlink_file_path below -- confirm against the fixture.
    docx_file = os.path.join(TEST_FILES_DIR, "rlink.docx")

    # Directory containing the target rlink file
    rlink_tmp_dir = os.path.abspath(os.sep + "tmp")

    # Ensure the tmp directory exists. pytest.skip() raises, so no explicit
    # return is needed after it.
    if not os.path.exists(rlink_tmp_dir):
        pytest.skip(f"Skipping rlink test; {rlink_tmp_dir} directory does not exist.")

    rlink_file_path = os.path.join(rlink_tmp_dir, "test_rlink.txt")
    rlink_content = "de658225-569e-4e3d-9ed2-cfb6abf927fc"
    b64_prefix = (
        "ZGU2NTgyMjUtNTY5ZS00ZTNkLTllZDItY2ZiNmFiZjk"  # base64 prefix of rlink_content
    )

    if os.path.exists(rlink_file_path):
        # Never clobber a file that this test did not write itself.
        with open(rlink_file_path, "r", encoding="utf-8") as f:
            existing_content = f.read()
        if existing_content != rlink_content:
            raise ValueError(
                f"Existing {rlink_file_path} content does not match expected content."
            )
    else:
        with open(rlink_file_path, "w", encoding="utf-8") as f:
            f.write(rlink_content)

    try:
        result = markitdown.convert(docx_file, keep_data_uris=True).text_content
        # Make sure the target file was NOT embedded in the output
        assert (
            b64_prefix not in result
        ), f"Content of {rlink_file_path} was embedded in the converted output"
    finally:
        # Clean up the target file whether or not the assertion held.
        os.remove(rlink_file_path)
@pytest.mark.skipif(
skip_remote,
reason="do not run tests that query external urls",
@@ -301,9 +342,9 @@ def test_markitdown_remote() -> None:
assert test_string in result.text_content
# Youtube
result = markitdown.convert(YOUTUBE_TEST_URL)
for test_string in YOUTUBE_TEST_STRINGS:
assert test_string in result.text_content
# result = markitdown.convert(YOUTUBE_TEST_URL)
# for test_string in YOUTUBE_TEST_STRINGS:
# assert test_string in result.text_content
@pytest.mark.skipif(
@@ -452,6 +493,7 @@ if __name__ == "__main__":
test_markitdown_remote,
test_speech_transcription,
test_exceptions,
test_doc_rlink,
test_markitdown_exiftool,
test_markitdown_llm_parameters,
test_markitdown_llm,
@@ -0,0 +1,171 @@
#!/usr/bin/env python3 -m pytest
"""Tests for MasterFormat-style partial numbering in PDF conversion."""
import os
import re
import pytest
from markitdown import MarkItDown
from markitdown.converters._pdf_converter import PARTIAL_NUMBERING_PATTERN
# Directory named "test_files" located next to this test module; the tests
# below build their input paths from it.
TEST_FILES_DIR = os.path.join(os.path.dirname(__file__), "test_files")
class TestMasterFormatPartialNumbering:
    """Test handling of MasterFormat-style partial numbering (.1, .2, etc.)."""

    # File name (inside TEST_FILES_DIR) of the document every
    # conversion-based test in this class runs against.
    _SAMPLE_PDF = "masterformat_partial_numbering.pdf"

    # Matches text that is nothing but a partial numbering (".1", ".12", ...).
    # Used with fullmatch() so every test shares one definition of "isolated"
    # instead of separate hard-coded lists of numbers.
    _ISOLATED_NUMBERING = re.compile(r"\.\d+")

    def _convert_sample(self) -> str:
        """Convert the sample PDF with a fresh MarkItDown and return its text."""
        pdf_path = os.path.join(TEST_FILES_DIR, self._SAMPLE_PDF)
        return MarkItDown().convert(pdf_path).text_content

    def test_partial_numbering_pattern_regex(self):
        """Test that the partial numbering regex pattern correctly matches."""
        # Should match partial numbering patterns
        for token in (".1", ".2", ".10", ".99"):
            assert (
                PARTIAL_NUMBERING_PATTERN.match(token) is not None
            ), f"Expected {token!r} to match the partial numbering pattern"

        # Should NOT match other patterns
        for token in ("1.", "1.2", ".1.2", "text", ".a", ""):
            assert (
                PARTIAL_NUMBERING_PATTERN.match(token) is None
            ), f"Expected {token!r} not to match the partial numbering pattern"

    def test_masterformat_partial_numbering_not_split(self):
        """Test that MasterFormat partial numbering stays with associated text.

        MasterFormat documents use partial numbering like:
            .1 The intent of this Request for Proposal...
            .2 Available information relative to...

        These should NOT be split into separate table columns, but kept
        as coherent text lines with the number followed by its description.
        """
        text_content = self._convert_sample()

        # Partial numberings should NOT appear isolated on their own lines.
        # If they're isolated, it means the parser incorrectly split them
        # from their text. Pipes are removed first so that a table row that
        # holds only a numbering (e.g. "| .1 |") is caught as well.
        isolated_numberings = [
            line.strip()
            for line in text_content.split("\n")
            if self._ISOLATED_NUMBERING.fullmatch(line.replace("|", "").strip())
        ]

        assert len(isolated_numberings) == 0, (
            f"Partial numberings should not be isolated from their text. "
            f"Found isolated: {isolated_numberings}"
        )

        # Verify that partial numberings appear WITH following text on the same line
        # Look for patterns like ".1 The intent" or ".1 Some text"
        partial_with_text = re.findall(r"\.\d+\s+\w+", text_content)
        assert (
            len(partial_with_text) > 0
        ), "Expected to find partial numberings followed by text on the same line"

    def test_masterformat_content_preserved(self):
        """Test that MasterFormat document content is fully preserved."""
        text_content = self._convert_sample()

        # Verify key content from the MasterFormat document is preserved
        expected_content = [
            "RFP for Construction Management Services",
            "Section 00 00 43",
            "Instructions to Respondents",
            "Ken Sargent House",
            "INTENT",
            "Request for Proposal",
            "KEN SARGENT HOUSE",
            "GRANDE PRAIRIE, ALBERTA",
            "Section 00 00 45",
        ]
        for content in expected_content:
            assert (
                content in text_content
            ), f"Expected content '{content}' not found in extracted text"

        # Verify partial numbering is followed by text on the same line
        # .1 should be followed by "The intent" on the same line
        assert re.search(
            r"\.1\s+The intent", text_content
        ), "Partial numbering .1 should be followed by 'The intent' text"

        # .2 should be followed by "Available information" on the same line
        assert re.search(
            r"\.2\s+Available information", text_content
        ), "Partial numbering .2 should be followed by 'Available information' text"

        # Ensure text content is not empty and has reasonable length
        assert (
            len(text_content.strip()) > 100
        ), "MasterFormat document should have substantial text content"

    def test_merge_partial_numbering_with_empty_lines_between(self):
        """Test that partial numberings merge correctly even with empty lines between.

        When PDF extractors produce output like:
            .1

            The intent of this Request...

        The merge logic should still combine them properly.
        """
        text_content = self._convert_sample()

        # The merged result should have .1 and .2 followed by text
        # Check that we don't have patterns like ".1\n\nThe intent" (unmerged)
        lines = text_content.split("\n")
        for i, line in enumerate(lines):
            stripped = line.strip()
            if not self._ISOLATED_NUMBERING.fullmatch(stripped):
                continue

            # An isolated partial numbering was found. If one of the next two
            # lines carries text, that text should have been merged into it.
            following = next(
                (
                    candidate.strip()
                    for candidate in lines[i + 1 : i + 3]
                    if candidate.strip()
                ),
                None,
            )
            if following is not None:
                pytest.fail(
                    f"Partial numbering '{stripped}' on line {i} was not "
                    f"merged with following text '{following[:30]}...'"
                )

    def test_multiple_partial_numberings_all_merged(self):
        """Test that all partial numberings in a document are properly merged."""
        text_content = self._convert_sample()

        # Count occurrences of merged partial numberings (number followed by text)
        merged_count = len(re.findall(r"\.\d+\s+[A-Za-z]", text_content))

        # Count isolated partial numberings (number alone on a line)
        isolated_count = sum(
            1
            for line in text_content.split("\n")
            if self._ISOLATED_NUMBERING.fullmatch(line.strip())
        )

        assert (
            merged_count >= 2
        ), f"Expected at least 2 merged partial numberings, found {merged_count}"
        assert (
            isolated_count == 0
        ), f"Found {isolated_count} isolated partial numberings that weren't merged"
File diff suppressed because it is too large Load Diff