Compare commits
9 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| 4a5340f93b | |||
| 6b0fd15e60 | |||
| 2b6ec9f315 | |||
| c83de14a9c | |||
| 7fdaefb724 | |||
| 251dddcf0c | |||
| dde250a456 | |||
| 3d4fe3cdcc | |||
| 447c047731 |
@@ -1,2 +1,5 @@
|
||||
packages/markitdown/tests/test_files/** linguist-vendored
|
||||
packages/markitdown-sample-plugin/tests/test_files/** linguist-vendored
|
||||
|
||||
# Treat PDF files as binary to prevent line ending conversion
|
||||
*.pdf binary
|
||||
|
||||
@@ -52,6 +52,7 @@ coverage.xml
|
||||
.hypothesis/
|
||||
.pytest_cache/
|
||||
cover/
|
||||
.test-logs/
|
||||
|
||||
# Translations
|
||||
*.mo
|
||||
|
||||
@@ -30,30 +30,30 @@ dependencies = [
|
||||
"magika~=0.6.1",
|
||||
"charset-normalizer",
|
||||
"defusedxml",
|
||||
"onnxruntime<=1.20.1; sys_platform == 'win32'",
|
||||
]
|
||||
|
||||
[project.optional-dependencies]
|
||||
all = [
|
||||
"python-pptx",
|
||||
"mammoth~=1.10.0",
|
||||
"mammoth~=1.11.0",
|
||||
"pandas",
|
||||
"openpyxl",
|
||||
"xlrd",
|
||||
"lxml",
|
||||
"pdfminer.six",
|
||||
"pdfminer.six>=20251230",
|
||||
"pdfplumber>=0.11.9",
|
||||
"olefile",
|
||||
"pydub",
|
||||
"SpeechRecognition",
|
||||
"youtube-transcript-api~=1.0.0",
|
||||
"azure-ai-documentintelligence",
|
||||
"azure-identity"
|
||||
"azure-identity",
|
||||
]
|
||||
pptx = ["python-pptx"]
|
||||
docx = ["mammoth", "lxml"]
|
||||
docx = ["mammoth~=1.11.0", "lxml"]
|
||||
xlsx = ["pandas", "openpyxl"]
|
||||
xls = ["pandas", "xlrd"]
|
||||
pdf = ["pdfminer.six"]
|
||||
pdf = ["pdfminer.six>=20251230", "pdfplumber>=0.11.9"]
|
||||
outlook = ["olefile"]
|
||||
audio-transcription = ["pydub", "SpeechRecognition"]
|
||||
youtube-transcription = ["youtube-transcript-api"]
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
# SPDX-FileCopyrightText: 2024-present Adam Fourney <adamfo@microsoft.com>
|
||||
#
|
||||
# SPDX-License-Identifier: MIT
|
||||
__version__ = "0.1.3"
|
||||
__version__ = "0.1.5"
|
||||
|
||||
@@ -107,6 +107,13 @@ class MarkItDown:
|
||||
requests_session = kwargs.get("requests_session")
|
||||
if requests_session is None:
|
||||
self._requests_session = requests.Session()
|
||||
# Signal that we prefer markdown over HTML, etc. if the server supports it.
|
||||
# e.g., https://blog.cloudflare.com/markdown-for-agents/
|
||||
self._requests_session.headers.update(
|
||||
{
|
||||
"Accept": "text/markdown, text/html;q=0.9, text/plain;q=0.8, */*;q=0.1"
|
||||
}
|
||||
)
|
||||
else:
|
||||
self._requests_session = requests_session
|
||||
|
||||
|
||||
@@ -15,13 +15,6 @@ from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE
|
||||
_dependency_exc_info = None
|
||||
try:
|
||||
import mammoth
|
||||
import mammoth.docx.files
|
||||
|
||||
def mammoth_files_open(self, uri):
|
||||
warn("DOCX: processing of r:link resources (e.g., linked images) is disabled.")
|
||||
return io.BytesIO(b"")
|
||||
|
||||
mammoth.docx.files.Files.open = mammoth_files_open
|
||||
|
||||
except ImportError:
|
||||
# Preserve the error and stack trace for later
|
||||
|
||||
@@ -1,22 +1,69 @@
|
||||
import sys
|
||||
import io
|
||||
|
||||
import re
|
||||
from typing import BinaryIO, Any
|
||||
|
||||
|
||||
from .._base_converter import DocumentConverter, DocumentConverterResult
|
||||
from .._stream_info import StreamInfo
|
||||
from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE
|
||||
|
||||
# Pattern for MasterFormat-style partial numbering (e.g., ".1", ".2", ".10")
|
||||
PARTIAL_NUMBERING_PATTERN = re.compile(r"^\.\d+$")
|
||||
|
||||
# Try loading optional (but in this case, required) dependencies
|
||||
# Save reporting of any exceptions for later
|
||||
|
||||
def _merge_partial_numbering_lines(text: str) -> str:
    """
    Re-attach MasterFormat-style partial numbers to the text they introduce.

    MasterFormat documents number their clauses with a leading dot:
        .1 The intent of this Request for Proposal...
        .2 Available information relative to...

    PDF text extraction sometimes emits the number on a line of its own:
        .1
        The intent of this Request for Proposal...

    Any line that, once stripped, consists solely of such a number is joined
    (with a single space) to the next non-blank line. Blank lines between the
    two are dropped. A number with no following non-blank line is kept as is.
    """
    source = text.split("\n")
    total = len(source)
    merged: list[str] = []
    pos = 0

    while pos < total:
        current = source[pos]
        marker = current.strip()
        pos += 1

        # Ordinary line: pass through untouched.
        if not PARTIAL_NUMBERING_PATTERN.match(marker):
            merged.append(current)
            continue

        # The line is ONLY a partial number; find the next line with content.
        target = next((k for k in range(pos, total) if source[k].strip()), None)
        if target is None:
            # Nothing left to merge with, so keep the original line.
            merged.append(current)
            continue

        merged.append(f"{marker} {source[target].strip()}")
        pos = target + 1  # Resume after the line that was merged in

    return "\n".join(merged)
|
||||
|
||||
|
||||
# Load dependencies
|
||||
_dependency_exc_info = None
|
||||
try:
|
||||
import pdfminer
|
||||
import pdfminer.high_level
|
||||
import pdfplumber
|
||||
except ImportError:
|
||||
# Preserve the error and stack trace for later
|
||||
_dependency_exc_info = sys.exc_info()
|
||||
|
||||
|
||||
@@ -28,16 +75,435 @@ ACCEPTED_MIME_TYPE_PREFIXES = [
|
||||
ACCEPTED_FILE_EXTENSIONS = [".pdf"]
|
||||
|
||||
|
||||
def _to_markdown_table(table: list[list[str]], include_separator: bool = True) -> str:
|
||||
"""Convert a 2D list (rows/columns) into a nicely aligned Markdown table.
|
||||
|
||||
Args:
|
||||
table: 2D list of cell values
|
||||
include_separator: If True, include header separator row (standard markdown).
|
||||
If False, output simple pipe-separated rows.
|
||||
"""
|
||||
if not table:
|
||||
return ""
|
||||
|
||||
# Normalize None → ""
|
||||
table = [[cell if cell is not None else "" for cell in row] for row in table]
|
||||
|
||||
# Filter out empty rows
|
||||
table = [row for row in table if any(cell.strip() for cell in row)]
|
||||
|
||||
if not table:
|
||||
return ""
|
||||
|
||||
# Column widths
|
||||
col_widths = [max(len(str(cell)) for cell in col) for col in zip(*table)]
|
||||
|
||||
def fmt_row(row: list[str]) -> str:
|
||||
return (
|
||||
"|"
|
||||
+ "|".join(str(cell).ljust(width) for cell, width in zip(row, col_widths))
|
||||
+ "|"
|
||||
)
|
||||
|
||||
if include_separator:
|
||||
header, *rows = table
|
||||
md = [fmt_row(header)]
|
||||
md.append("|" + "|".join("-" * w for w in col_widths) + "|")
|
||||
for row in rows:
|
||||
md.append(fmt_row(row))
|
||||
else:
|
||||
md = [fmt_row(row) for row in table]
|
||||
|
||||
return "\n".join(md)
|
||||
|
||||
|
||||
def _extract_form_content_from_words(page: Any) -> str | None:
|
||||
"""
|
||||
Extract form-style content from a PDF page by analyzing word positions.
|
||||
This handles borderless forms/tables where words are aligned in columns.
|
||||
|
||||
Returns markdown with proper table formatting:
|
||||
- Tables have pipe-separated columns with header separator rows
|
||||
- Non-table content is rendered as plain text
|
||||
|
||||
Returns None if the page doesn't appear to be a form-style document,
|
||||
indicating that pdfminer should be used instead for better text spacing.
|
||||
"""
|
||||
words = page.extract_words(keep_blank_chars=True, x_tolerance=3, y_tolerance=3)
|
||||
if not words:
|
||||
return None
|
||||
|
||||
# Group words by their Y position (rows)
|
||||
y_tolerance = 5
|
||||
rows_by_y: dict[float, list[dict]] = {}
|
||||
for word in words:
|
||||
y_key = round(word["top"] / y_tolerance) * y_tolerance
|
||||
if y_key not in rows_by_y:
|
||||
rows_by_y[y_key] = []
|
||||
rows_by_y[y_key].append(word)
|
||||
|
||||
# Sort rows by Y position
|
||||
sorted_y_keys = sorted(rows_by_y.keys())
|
||||
page_width = page.width if hasattr(page, "width") else 612
|
||||
|
||||
# First pass: analyze each row
|
||||
row_info: list[dict] = []
|
||||
for y_key in sorted_y_keys:
|
||||
row_words = sorted(rows_by_y[y_key], key=lambda w: w["x0"])
|
||||
if not row_words:
|
||||
continue
|
||||
|
||||
first_x0 = row_words[0]["x0"]
|
||||
last_x1 = row_words[-1]["x1"]
|
||||
line_width = last_x1 - first_x0
|
||||
combined_text = " ".join(w["text"] for w in row_words)
|
||||
|
||||
# Count distinct x-position groups (columns)
|
||||
x_positions = [w["x0"] for w in row_words]
|
||||
x_groups: list[float] = []
|
||||
for x in sorted(x_positions):
|
||||
if not x_groups or x - x_groups[-1] > 50:
|
||||
x_groups.append(x)
|
||||
|
||||
# Determine row type
|
||||
is_paragraph = line_width > page_width * 0.55 and len(combined_text) > 60
|
||||
|
||||
# Check for MasterFormat-style partial numbering (e.g., ".1", ".2")
|
||||
# These should be treated as list items, not table rows
|
||||
has_partial_numbering = False
|
||||
if row_words:
|
||||
first_word = row_words[0]["text"].strip()
|
||||
if PARTIAL_NUMBERING_PATTERN.match(first_word):
|
||||
has_partial_numbering = True
|
||||
|
||||
row_info.append(
|
||||
{
|
||||
"y_key": y_key,
|
||||
"words": row_words,
|
||||
"text": combined_text,
|
||||
"x_groups": x_groups,
|
||||
"is_paragraph": is_paragraph,
|
||||
"num_columns": len(x_groups),
|
||||
"has_partial_numbering": has_partial_numbering,
|
||||
}
|
||||
)
|
||||
|
||||
# Collect ALL x-positions from rows with 3+ columns (table-like rows)
|
||||
# This gives us the global column structure
|
||||
all_table_x_positions: list[float] = []
|
||||
for info in row_info:
|
||||
if info["num_columns"] >= 3 and not info["is_paragraph"]:
|
||||
all_table_x_positions.extend(info["x_groups"])
|
||||
|
||||
if not all_table_x_positions:
|
||||
return None
|
||||
|
||||
# Compute adaptive column clustering tolerance based on gap analysis
|
||||
all_table_x_positions.sort()
|
||||
|
||||
# Calculate gaps between consecutive x-positions
|
||||
gaps = []
|
||||
for i in range(len(all_table_x_positions) - 1):
|
||||
gap = all_table_x_positions[i + 1] - all_table_x_positions[i]
|
||||
if gap > 5: # Only significant gaps
|
||||
gaps.append(gap)
|
||||
|
||||
# Determine optimal tolerance using statistical analysis
|
||||
if gaps and len(gaps) >= 3:
|
||||
# Use 70th percentile of gaps as threshold (balances precision/recall)
|
||||
sorted_gaps = sorted(gaps)
|
||||
percentile_70_idx = int(len(sorted_gaps) * 0.70)
|
||||
adaptive_tolerance = sorted_gaps[percentile_70_idx]
|
||||
|
||||
# Clamp tolerance to reasonable range [25, 50]
|
||||
adaptive_tolerance = max(25, min(50, adaptive_tolerance))
|
||||
else:
|
||||
# Fallback to conservative value
|
||||
adaptive_tolerance = 35
|
||||
|
||||
# Compute global column boundaries using adaptive tolerance
|
||||
global_columns: list[float] = []
|
||||
for x in all_table_x_positions:
|
||||
if not global_columns or x - global_columns[-1] > adaptive_tolerance:
|
||||
global_columns.append(x)
|
||||
|
||||
# Adaptive max column check based on page characteristics
|
||||
# Calculate average column width
|
||||
if len(global_columns) > 1:
|
||||
content_width = global_columns[-1] - global_columns[0]
|
||||
avg_col_width = content_width / len(global_columns)
|
||||
|
||||
# Forms with very narrow columns (< 30px) are likely dense text
|
||||
if avg_col_width < 30:
|
||||
return None
|
||||
|
||||
# Compute adaptive max based on columns per inch
|
||||
# Typical forms have 3-8 columns per inch
|
||||
columns_per_inch = len(global_columns) / (content_width / 72)
|
||||
|
||||
# If density is too high (> 10 cols/inch), likely not a form
|
||||
if columns_per_inch > 10:
|
||||
return None
|
||||
|
||||
# Adaptive max: allow more columns for wider pages
|
||||
# Standard letter is 612pt wide, so scale accordingly
|
||||
adaptive_max_columns = int(20 * (page_width / 612))
|
||||
adaptive_max_columns = max(15, adaptive_max_columns) # At least 15
|
||||
|
||||
if len(global_columns) > adaptive_max_columns:
|
||||
return None
|
||||
else:
|
||||
# Single column, not a form
|
||||
return None
|
||||
|
||||
# Now classify each row as table row or not
|
||||
# A row is a table row if it has words that align with 2+ of the global columns
|
||||
for info in row_info:
|
||||
if info["is_paragraph"]:
|
||||
info["is_table_row"] = False
|
||||
continue
|
||||
|
||||
# Rows with partial numbering (e.g., ".1", ".2") are list items, not table rows
|
||||
if info["has_partial_numbering"]:
|
||||
info["is_table_row"] = False
|
||||
continue
|
||||
|
||||
# Count how many global columns this row's words align with
|
||||
aligned_columns: set[int] = set()
|
||||
for word in info["words"]:
|
||||
word_x = word["x0"]
|
||||
for col_idx, col_x in enumerate(global_columns):
|
||||
if abs(word_x - col_x) < 40:
|
||||
aligned_columns.add(col_idx)
|
||||
break
|
||||
|
||||
# If row uses 2+ of the established columns, it's a table row
|
||||
info["is_table_row"] = len(aligned_columns) >= 2
|
||||
|
||||
# Find table regions (consecutive table rows)
|
||||
table_regions: list[tuple[int, int]] = [] # (start_idx, end_idx)
|
||||
i = 0
|
||||
while i < len(row_info):
|
||||
if row_info[i]["is_table_row"]:
|
||||
start_idx = i
|
||||
while i < len(row_info) and row_info[i]["is_table_row"]:
|
||||
i += 1
|
||||
end_idx = i
|
||||
table_regions.append((start_idx, end_idx))
|
||||
else:
|
||||
i += 1
|
||||
|
||||
# Check if enough rows are table rows (at least 20%)
|
||||
total_table_rows = sum(end - start for start, end in table_regions)
|
||||
if len(row_info) > 0 and total_table_rows / len(row_info) < 0.2:
|
||||
return None
|
||||
|
||||
# Build output - collect table data first, then format with proper column widths
|
||||
result_lines: list[str] = []
|
||||
num_cols = len(global_columns)
|
||||
|
||||
# Helper function to extract cells from a row
|
||||
def extract_cells(info: dict) -> list[str]:
|
||||
cells: list[str] = ["" for _ in range(num_cols)]
|
||||
for word in info["words"]:
|
||||
word_x = word["x0"]
|
||||
# Find the correct column using boundary ranges
|
||||
assigned_col = num_cols - 1 # Default to last column
|
||||
for col_idx in range(num_cols - 1):
|
||||
col_end = global_columns[col_idx + 1]
|
||||
if word_x < col_end - 20:
|
||||
assigned_col = col_idx
|
||||
break
|
||||
if cells[assigned_col]:
|
||||
cells[assigned_col] += " " + word["text"]
|
||||
else:
|
||||
cells[assigned_col] = word["text"]
|
||||
return cells
|
||||
|
||||
# Process rows, collecting table data for proper formatting
|
||||
idx = 0
|
||||
while idx < len(row_info):
|
||||
info = row_info[idx]
|
||||
|
||||
# Check if this row starts a table region
|
||||
table_region = None
|
||||
for start, end in table_regions:
|
||||
if idx == start:
|
||||
table_region = (start, end)
|
||||
break
|
||||
|
||||
if table_region:
|
||||
start, end = table_region
|
||||
# Collect all rows in this table
|
||||
table_data: list[list[str]] = []
|
||||
for table_idx in range(start, end):
|
||||
cells = extract_cells(row_info[table_idx])
|
||||
table_data.append(cells)
|
||||
|
||||
# Calculate column widths for this table
|
||||
if table_data:
|
||||
col_widths = [
|
||||
max(len(row[col]) for row in table_data) for col in range(num_cols)
|
||||
]
|
||||
# Ensure minimum width of 3 for separator dashes
|
||||
col_widths = [max(w, 3) for w in col_widths]
|
||||
|
||||
# Format header row
|
||||
header = table_data[0]
|
||||
header_str = (
|
||||
"| "
|
||||
+ " | ".join(
|
||||
cell.ljust(col_widths[i]) for i, cell in enumerate(header)
|
||||
)
|
||||
+ " |"
|
||||
)
|
||||
result_lines.append(header_str)
|
||||
|
||||
# Format separator row
|
||||
separator = (
|
||||
"| "
|
||||
+ " | ".join("-" * col_widths[i] for i in range(num_cols))
|
||||
+ " |"
|
||||
)
|
||||
result_lines.append(separator)
|
||||
|
||||
# Format data rows
|
||||
for row in table_data[1:]:
|
||||
row_str = (
|
||||
"| "
|
||||
+ " | ".join(
|
||||
cell.ljust(col_widths[i]) for i, cell in enumerate(row)
|
||||
)
|
||||
+ " |"
|
||||
)
|
||||
result_lines.append(row_str)
|
||||
|
||||
idx = end # Skip to end of table region
|
||||
else:
|
||||
# Check if we're inside a table region (not at start)
|
||||
in_table = False
|
||||
for start, end in table_regions:
|
||||
if start < idx < end:
|
||||
in_table = True
|
||||
break
|
||||
|
||||
if not in_table:
|
||||
# Non-table content
|
||||
result_lines.append(info["text"])
|
||||
idx += 1
|
||||
|
||||
return "\n".join(result_lines)
|
||||
|
||||
|
||||
def _extract_tables_from_words(page: Any) -> list[list[list[str]]]:
|
||||
"""
|
||||
Extract tables from a PDF page by analyzing word positions.
|
||||
This handles borderless tables where words are aligned in columns.
|
||||
|
||||
This function is designed for structured tabular data (like invoices),
|
||||
not for multi-column text layouts in scientific documents.
|
||||
"""
|
||||
words = page.extract_words(keep_blank_chars=True, x_tolerance=3, y_tolerance=3)
|
||||
if not words:
|
||||
return []
|
||||
|
||||
# Group words by their Y position (rows)
|
||||
y_tolerance = 5
|
||||
rows_by_y: dict[float, list[dict]] = {}
|
||||
for word in words:
|
||||
y_key = round(word["top"] / y_tolerance) * y_tolerance
|
||||
if y_key not in rows_by_y:
|
||||
rows_by_y[y_key] = []
|
||||
rows_by_y[y_key].append(word)
|
||||
|
||||
# Sort rows by Y position
|
||||
sorted_y_keys = sorted(rows_by_y.keys())
|
||||
|
||||
# Find potential column boundaries by analyzing x positions across all rows
|
||||
all_x_positions = []
|
||||
for words_in_row in rows_by_y.values():
|
||||
for word in words_in_row:
|
||||
all_x_positions.append(word["x0"])
|
||||
|
||||
if not all_x_positions:
|
||||
return []
|
||||
|
||||
# Cluster x positions to find column starts
|
||||
all_x_positions.sort()
|
||||
x_tolerance_col = 20
|
||||
column_starts: list[float] = []
|
||||
for x in all_x_positions:
|
||||
if not column_starts or x - column_starts[-1] > x_tolerance_col:
|
||||
column_starts.append(x)
|
||||
|
||||
# Need at least 3 columns but not too many (likely text layout, not table)
|
||||
if len(column_starts) < 3 or len(column_starts) > 10:
|
||||
return []
|
||||
|
||||
# Find rows that span multiple columns (potential table rows)
|
||||
table_rows = []
|
||||
for y_key in sorted_y_keys:
|
||||
words_in_row = sorted(rows_by_y[y_key], key=lambda w: w["x0"])
|
||||
|
||||
# Assign words to columns
|
||||
row_data = [""] * len(column_starts)
|
||||
for word in words_in_row:
|
||||
# Find the closest column
|
||||
best_col = 0
|
||||
min_dist = float("inf")
|
||||
for i, col_x in enumerate(column_starts):
|
||||
dist = abs(word["x0"] - col_x)
|
||||
if dist < min_dist:
|
||||
min_dist = dist
|
||||
best_col = i
|
||||
|
||||
if row_data[best_col]:
|
||||
row_data[best_col] += " " + word["text"]
|
||||
else:
|
||||
row_data[best_col] = word["text"]
|
||||
|
||||
# Only include rows that have content in multiple columns
|
||||
non_empty = sum(1 for cell in row_data if cell.strip())
|
||||
if non_empty >= 2:
|
||||
table_rows.append(row_data)
|
||||
|
||||
# Validate table quality - tables should have:
|
||||
# 1. Enough rows (at least 3 including header)
|
||||
# 2. Short cell content (tables have concise data, not paragraphs)
|
||||
# 3. Consistent structure across rows
|
||||
if len(table_rows) < 3:
|
||||
return []
|
||||
|
||||
# Check if cells contain short, structured data (not long text)
|
||||
long_cell_count = 0
|
||||
total_cell_count = 0
|
||||
for row in table_rows:
|
||||
for cell in row:
|
||||
if cell.strip():
|
||||
total_cell_count += 1
|
||||
# If cell has more than 30 chars, it's likely prose text
|
||||
if len(cell.strip()) > 30:
|
||||
long_cell_count += 1
|
||||
|
||||
# If more than 30% of cells are long, this is probably not a table
|
||||
if total_cell_count > 0 and long_cell_count / total_cell_count > 0.3:
|
||||
return []
|
||||
|
||||
return [table_rows]
|
||||
|
||||
|
||||
class PdfConverter(DocumentConverter):
|
||||
"""
|
||||
Converts PDFs to Markdown. Most style information is ignored, so the results are essentially plain-text.
|
||||
Converts PDFs to Markdown.
|
||||
Supports extracting tables into aligned Markdown format (via pdfplumber).
|
||||
Falls back to pdfminer if pdfplumber is missing or fails.
|
||||
"""
|
||||
|
||||
def accepts(
|
||||
self,
|
||||
file_stream: BinaryIO,
|
||||
stream_info: StreamInfo,
|
||||
**kwargs: Any, # Options to pass to the converter
|
||||
**kwargs: Any,
|
||||
) -> bool:
|
||||
mimetype = (stream_info.mimetype or "").lower()
|
||||
extension = (stream_info.extension or "").lower()
|
||||
@@ -55,9 +521,8 @@ class PdfConverter(DocumentConverter):
|
||||
self,
|
||||
file_stream: BinaryIO,
|
||||
stream_info: StreamInfo,
|
||||
**kwargs: Any, # Options to pass to the converter
|
||||
**kwargs: Any,
|
||||
) -> DocumentConverterResult:
|
||||
# Check the dependencies
|
||||
if _dependency_exc_info is not None:
|
||||
raise MissingDependencyException(
|
||||
MISSING_DEPENDENCY_MESSAGE.format(
|
||||
@@ -65,13 +530,58 @@ class PdfConverter(DocumentConverter):
|
||||
extension=".pdf",
|
||||
feature="pdf",
|
||||
)
|
||||
) from _dependency_exc_info[
|
||||
1
|
||||
].with_traceback( # type: ignore[union-attr]
|
||||
) from _dependency_exc_info[1].with_traceback(
|
||||
_dependency_exc_info[2]
|
||||
)
|
||||
) # type: ignore[union-attr]
|
||||
|
||||
assert isinstance(file_stream, io.IOBase) # for mypy
|
||||
return DocumentConverterResult(
|
||||
markdown=pdfminer.high_level.extract_text(file_stream),
|
||||
)
|
||||
assert isinstance(file_stream, io.IOBase)
|
||||
|
||||
markdown_chunks: list[str] = []
|
||||
|
||||
# Read file stream into BytesIO for compatibility with pdfplumber
|
||||
pdf_bytes = io.BytesIO(file_stream.read())
|
||||
|
||||
try:
|
||||
# Track how many pages are form-style vs plain text
|
||||
form_pages = 0
|
||||
plain_pages = 0
|
||||
|
||||
with pdfplumber.open(pdf_bytes) as pdf:
|
||||
for page in pdf.pages:
|
||||
# Try form-style word position extraction
|
||||
page_content = _extract_form_content_from_words(page)
|
||||
|
||||
# If extraction returns None, this page is not form-style
|
||||
if page_content is None:
|
||||
plain_pages += 1
|
||||
# Extract text using pdfplumber's basic extraction for this page
|
||||
text = page.extract_text()
|
||||
if text and text.strip():
|
||||
markdown_chunks.append(text.strip())
|
||||
else:
|
||||
form_pages += 1
|
||||
if page_content.strip():
|
||||
markdown_chunks.append(page_content)
|
||||
|
||||
# If most pages are plain text, use pdfminer for better text handling
|
||||
if plain_pages > form_pages and plain_pages > 0:
|
||||
pdf_bytes.seek(0)
|
||||
markdown = pdfminer.high_level.extract_text(pdf_bytes)
|
||||
else:
|
||||
# Build markdown from chunks
|
||||
markdown = "\n\n".join(markdown_chunks).strip()
|
||||
|
||||
except Exception:
|
||||
# Fallback if pdfplumber fails
|
||||
pdf_bytes.seek(0)
|
||||
markdown = pdfminer.high_level.extract_text(pdf_bytes)
|
||||
|
||||
# Fallback if still empty
|
||||
if not markdown:
|
||||
pdf_bytes.seek(0)
|
||||
markdown = pdfminer.high_level.extract_text(pdf_bytes)
|
||||
|
||||
# Post-process to merge MasterFormat-style partial numbering with following text
|
||||
markdown = _merge_partial_numbering_lines(markdown)
|
||||
|
||||
return DocumentConverterResult(markdown=markdown)
|
||||
|
||||
BIN
Binary file not shown.
+97
@@ -0,0 +1,97 @@
|
||||
%PDF-1.4
|
||||
%“Œ‹ž ReportLab Generated PDF document http://www.reportlab.com
|
||||
1 0 obj
|
||||
<<
|
||||
/F1 2 0 R /F2 3 0 R /F3 4 0 R /F4 5 0 R
|
||||
>>
|
||||
endobj
|
||||
2 0 obj
|
||||
<<
|
||||
/BaseFont /Helvetica /Encoding /WinAnsiEncoding /Name /F1 /Subtype /Type1 /Type /Font
|
||||
>>
|
||||
endobj
|
||||
3 0 obj
|
||||
<<
|
||||
/BaseFont /Helvetica-Bold /Encoding /WinAnsiEncoding /Name /F2 /Subtype /Type1 /Type /Font
|
||||
>>
|
||||
endobj
|
||||
4 0 obj
|
||||
<<
|
||||
/BaseFont /Courier /Encoding /WinAnsiEncoding /Name /F3 /Subtype /Type1 /Type /Font
|
||||
>>
|
||||
endobj
|
||||
5 0 obj
|
||||
<<
|
||||
/BaseFont /Courier-Bold /Encoding /WinAnsiEncoding /Name /F4 /Subtype /Type1 /Type /Font
|
||||
>>
|
||||
endobj
|
||||
6 0 obj
|
||||
<<
|
||||
/BitsPerComponent 8 /ColorSpace /DeviceRGB /Filter [ /ASCII85Decode /FlateDecode ] /Height 70 /Length 4491 /Subtype /Image
|
||||
/Type /XObject /Width 200
|
||||
>>
|
||||
stream
|
||||
Gb"/lq,^Nc)M\9OkX:DBZ5YT>'!op&`0lHCEL`PXM2DFT$QuCdPsfSJ4#%$gW49i\1e&eZ\Acg:2bhaPc+^Q$/Shs#,1&Qu>83CBh729[%A$M]]Z8KL]Cu-OpO.1`\pCPboa2!3#sCC+4Yg#.W)"\K&i)doY5WCH.J-G0E$]%J"BRoZ88okcKEP@C7S%JEA:t(e6:OLb-"MZ3=$fAIE$]%J"BRoZ88okcKEP@C7S%JEA:t(e6:OKV=+u3F\<8ml>&.H@]EH:@TAeD]);ilGV3,!4G2kP0XRN"#r/]G!Kqo1c9Redk1d%Hh[t82=;lMkgMFr4J2#lTY[siCuNDGT]h^te4%1fj10r$&D--;(2UPbQ(Ze:VUL"2G=%qPZVOlc,tegG*BO,:$mI%mCAJ^8q2gWSn>)Ui.KQ!A5_(Z;l?_%(Xb28AGIU0SY?bVp%B6=$P2*I!1?W8WL>aAVWc$%-nk=D8ZjE$stW]LJ-LqIj9qZoFV/lj$Uf)=b`nfl*ANkt_qpb2t'P`;D#\h6o+g'M+"j4Sf:d#_jjZdTS>mnQJm^7>(S3Bq"m(kH5i3;0`YK<5e.0$k"4XA4UX)rfXH+2OamR360'cX$&"Dp"DdSkh^Q;?+H)fBc@YMp?\]UZuQ*lgt@kDS'dARs/]`Roa+]eXm%&TJ4e[RCKq6kQ:5Zpa[hLTEE,M/UR\C]SS.K'HJL7F)F5Ts63hWCKs+aTqi3TN!,P7#o$@'a?^`M.9&=d,$WJ*\b^*N1fR$JFV.s`.W*WfCgS\9h@C2/uC<b(1#bB73rR3UmcP"%)_DZ#=)TA1KkfUBT8F;=Yoc[BR6[ZkS\Y$n/.@mf!9WK1Hj]o3[f>DiFrdD&a`iRQ\df2(Xc53$=i@@upbP,MJ@.sDgm%`^.8+5u//"Hhn%^kIb$5o\2B7%r>DD\NZ:L;otY0]:)'6l[M<&ctoM"($Q_XW1'!4OB>g/3dF]mD],eF*&&'itQ;2e$/VWZ/QmdogQ0&d7ePkDGP[PZkk8TtUWkaJYa$Q)c6I+l<preqG)K\U>pY5H])D-lHdp52<d:Isd8)X0&b+pKUugDNb2NIW.aD_PLN/i(r9N&<3?,2br'%?gT'_i;n9VUeeM#>ko&@JS]_pP&PR0@L`i*pbXB"rgcI`#'#>-Njfe@8+ZC7hHU>Qm2oCj$^ATs<7[sZ@5*,@qslQ\p1m1#p6XrGL'F^?ok?+\fDe1,#<0n78&1'&KK/85E7IuiRklZ$<tM_`dQfdI@$2&Sj]k*=&V0n1RVEA-6,(U`EJ9674P)af%Z>l=<9-YKka`e!9&oUk]3u7Y);o;!X[W(:<D3M@"gDG)[;",CR0eT^r/fRnB+ob8JM\tp9\JoC\=uqFPX9nU`?:Y,eJf!6E95r@KI_iekuY/-+j6DIrXFQW>i@m+VqQff?4r\fn@@4QXN6[dWdtV8:B`3X2:bgH!rR8-r^sf)#EN%"`F/q0Heh_C7H6l'.@I3l<Jr.Q!as3DB-9V*+/'h,_<T8?^2u*t.p$h8d%"Dd\P<5M`MEg>7W_M8qB0Sd$'o&pWH!XFNS.JRZ%[WY$N:rl5tLIb;#&1u\'nOCIB]161$bC,Uuf;ZK6dl()epY<39X_LaOAXU:WAZiYqZk5Tq+hN`O"QdZ7[jLdf^cf`?9i4T#=]JO$'0fC4#Y=^M%VOouL.PZ6/V;r+XoF1Ls*YXu`6'4?,j47_u_U=.T*IX0ed;@5JN=Qlc\gS!W?r;#%jA)NSUh\`='l:HWsF<K@<`EqO<,Ht[H.@PGU,p6$s&YbEgb;cfG9YK,6Fh]@t(EUD@78Ob6ui6[#pIBoZn<UH)N"PrIeP3Y!qa-k8bP>_rQ_q'7l3]f,=As5FN;rm6/&IWa@[9HFr3YtU'N%=ZPr)s`!!&o!IbLLsBH7VnG&"_&hSn3;Nr^mpSZk^2i_aD8<g*:f)-)1j6@3KSHbb_c1PDAXpnkGE:H3Fs0m?uXff0>H]^Oi^Wq(2*ak3>^mA^!FkG4$-Vq(BH"
U+YSR;%(5j(bnT,&RrR1d]\O5_42^f/Xa:4msf,Oms&5F6()XE"p6mS/Yc\Ga&`hC/3XdsM;'cTMl(uV@DiFY5AA_VWS4T'&^D<.7.S,`B?:^&!Q[ZVaCi0^$[E#=Xt_;^\;l;M#]`$4;sLf^6$u5)gpB'7TO-@*HXbXF]H[ID>=n%&8-T:f)&:]?hm\RN5/B5sdW)PM[X@>2Jq/jQ%m%0Pk%J`<],L8cn3_`B)dE8ng`*C-";2ro.7o.B2:3d$4r#LEqr#8((eIunkG+V@25V=%+_!:Z/,UQca-*F<WEc]8T!rP(2A>g9*GW$LPXTF:ER(d"o8oX31"!B8VtcoXnY$9m&'30Um_8lI0aO.LY_m,l[3VfKlBY*[!$I#3=:"_\lGs6U"<,-kF4HFN$6[_SQpG)7_H3mnKRB'C9#q8EY(Vaqi(D&r$*Jr?OPiaP#RRYeN0)sia9W*TKT)#N9#q8EY(Vaqi(D&r$*Jr?OPiaP#k^2Zeu*-9hp_.0@)f6Em^OKjQe!N6R'K"tCBm#IB,_F0s/@WUCJX)`/6>DF6M_)Kk.s>$UZPX\r/#8I8q/RiHL2liUAA0Yf`%Ld=068s:8FSq=\;#qPO.M6gNJ<fcJ_B7NfVHbGb(Z#%=9b&C:^$<Xe!_fU:_#gd*5iB@GeEYKud7*YgG3W,jj8\q'/4@l'Y-g\)rX-m;H77/R]Zb%A!YN(tcDfkrJ&o7($5o=b[,R0&h>j\UuBER`krd-YJajg!X[eDSKG#rM875C*#IQWm+O@aOf*TU2U('gKn4DA1c#r\17+;`V`#Kgpm8RC[Ee$Ac&[r[O5Rd>4]?HeBRku2;R,jekQW/J-QCRW&^t>C\3e,:[W(TT!T:2JEND71MtHndISe*l)E$(\gO^@6Z"5Q<4O)uM0mY8/2q_>,e#m"SaP0HaXDih_2UgJZH0.io.<EHam+)Ba'CJ3$,Ve\2W\TOCDf>!]XNj5RmJj5Qb:1EKuS*^?b+;s)^?6l>JE0QrGSUpouIlZac9kX41>cja=/SDr<cA`ZGg.-dcfEr]UahTX\[1!?g.Q+Bc7gR:Jn+!PFgH?<j_Cphp%2cK2o3EmuaRlLL,1SFd.EYSt<=je7H>"[/Y4/S?T:Ij;Z5Vm!N#O9oGbTmC/.8"?)WA+NO#l!d__>E^^P^^5SPK3f#a-$jDf&=>7p3h\GbV.%e"aa`Nr8:-((uUWjP?5h<QpA5fPW)4^9]4ZVP3n=aOl2n\TH8,2g(<%d(S%%7D#]P?GegI?jC3uo^tm=sN6Bp.;hI!R:5>*[4qX1g^KT=J\eG*=-r!s:Xj;^b_=6X9pT)tmOPN+SP\Y9d?!CS[k5Z"1@+N1#RUIImf:>$etSOp!A$%9Npa7^LW;"'>&DLN*LT#=""p0WOpjXfI:C##@j@lDSU=Xe\&,@B>QE*Z4EfD\=1"h4F8$&QPkBC:BBC"p$N?^/:S9o*P:eF4j>`WT<EG,f\ln6T<>&V*-UR?1+=iZ&Y:YJ)Y_O1Q!(3MOc:&,lEK0KT>gBrM!Oo7g$R0Z=@n</A>l[op[I#4)k.C3f6pb.hq#9_d636^F1(A%Us@pB(WNXIQ#TKD-`)0%k[fj0?XEDjbhd[m6#LpLW.'sd9_sqo4)9,(HjMXDMbKZE5`!!P4XRB@/4TISu+m2&%RiFj^;=JGE6IZZZ3Iq9u;tP9Ze)spB!TH+!k0kjm?9TaEqM'"N]I#K68.sENFpG-:BpL"k5<Kf;Cll]p\0.VBgJXYV7bkGELTah(>RWGsjL14<<sE:X3JW]):L"_a3kcBH&.(2Ui6d<sTVSq6Hb2oGX:N_j+E^^?#]Brd-YC/q4&j48+1kK<)&.;T[(aqqDC]Z:"7NhLIO5&BeU-SXc>m&CBpaKs!LDjG%ZThfU"?o'^kNBIDSK`iDtti"$XI@8UFlc6'SU-*bNc0g0qZU.!3k5:g&J5%_GC<>>?gMnQALpYLh_[CK2YgZMfbcG=f9!FE+GUto#YS.Ms0oqhppBVFIgKikS7
3+[l77Q]oKP@6W$g?K-&@CV\IU2l9TjknpF,mi!B#31b$WjNK[_SbrBad@)rY(?tsa<O6[4Wd_rrCQDK9Fn>?noi;W9O[OS+IIT/JfXc1#&UNmnp:7_c6/aW-".*cBVjc)"DgbY/sJI7I:YTo4n^?6D*J='=Z.q\rms\C-WjI*igXS;Vfo_`Y\`_c8K6r=SE5*WPmG+p6'k&&0eDn]e<kl$//^-6n?[M0Wg:@`F!W*m'jM%_,JfY,&JA=T)'Qh]O:`+1#oOo&Q&lRj>R;8k_3L)o&mP_\+Z6EUKR,;fQ&lT(!/=AJY_cg)ZFd_iV[)$d`W(]Q<7PapAWM]!Y4qoZ_Z9e(Tts:W\F3=&Fc,Mp(`=cJUG]7+gkp\4)_AlnCa6gYn$_U<4&\@+ERR^ZIFpqbN%;:=3k=P0?fC_:0&Ug9TYUE[kI#Bq;+mc:C\/7H:]h-hq^[P:`jZ\paVNd3BCYF4eruZ)J"F0tYQ"^`hf0;~>endstream
|
||||
endobj
|
||||
7 0 obj
|
||||
<<
|
||||
/Contents 11 0 R /MediaBox [ 0 0 216 792 ] /Parent 10 0 R /Resources <<
|
||||
/Font 1 0 R /ProcSet [ /PDF /Text /ImageB /ImageC /ImageI ] /XObject <<
|
||||
/FormXob.2a351979d8c75d073b2ea4bfb74718f9 6 0 R
|
||||
>>
|
||||
>> /Rotate 0 /Trans <<
|
||||
|
||||
>>
|
||||
/Type /Page
|
||||
>>
|
||||
endobj
|
||||
8 0 obj
|
||||
<<
|
||||
/PageMode /UseNone /Pages 10 0 R /Type /Catalog
|
||||
>>
|
||||
endobj
|
||||
9 0 obj
|
||||
<<
|
||||
/Author (\(anonymous\)) /CreationDate (D:20251205104951+01'00') /Creator (\(unspecified\)) /Keywords () /ModDate (D:20251205104951+01'00') /Producer (ReportLab PDF Library - www.reportlab.com)
|
||||
/Subject (\(unspecified\)) /Title (\(anonymous\)) /Trapped /False
|
||||
>>
|
||||
endobj
|
||||
10 0 obj
|
||||
<<
|
||||
/Count 1 /Kids [ 7 0 R ] /Type /Pages
|
||||
>>
|
||||
endobj
|
||||
11 0 obj
|
||||
<<
|
||||
/Filter [ /ASCII85Decode /FlateDecode ] /Length 1981
|
||||
>>
|
||||
stream
|
||||
GauI8D/\/e&:hOi=55KJ8;^%$X%4*F0ZR4[TQ-.JlIl=8,25emeWsH3DOUP#SK,Tt>!%\QTf-@>5)IUB4PgI,!l6hOHqp".7kqY?VEc$;f0G$C+?kE+IduY+D6B[cY<`?1&+Bf$SSWa.DI28'F?:CpG_mY"TO]hkXiFku&",h"4G/8GlCSK3UDT`3W#q'Qc;4>t6\tbspa(/l?]"D>nboQo,(,[\*-A;J=Ru^j=[Nu\:iMk7q<+PGo*gWpT4_C)j)t7Oc[5MlffWrhj!99;,0?]r3R(ns^B^I*KQ#f[([aS1g);Q.rrBep&6)sVJs=\.1^pkCa(tBfECI75_;C0LC.)*n<3;@eFZT<Brd%CYO%*fRCl_%R2PLtnF>0lg_SFKN$O.X\o%U7_58YJ,X[`p,%PUL^1]EgT]T\4*B3hOrEZA:[=ui88pZGht%klE^OC$2=@$GiMoTO<eR\C2;10r\O.%_7=c.`)*@0N>,CLh>2ZDq?"(LrLS)ajJ<DG(:N]5MPuT)E6J8.)!Ud7D>Je7M&(V1i'>Z@qg3/WJ@PpL4nr1qU$V_#jqpM+M<[H(LE:pW6uTQK`^P%Q'T&[*Y1\T_7.O:+n^Y)3+d56\hGsIrnqB85q[4L1WG#"Uo.d]Zr"jG`qiT=AU8b5]p3(N2IfnWHVO&$rnNe6[_$[o(m=2Sq-C[bbNOS,qIb?:TGYHhQjjcCe!9%*cuscgU*Eea^B?#^HtoE9p(jd_GR#E1\LTm:MVS5e]+<LZRQ]0^iZNnTs\Iq5l6H+?80%j?IX^UR28jY=Vr!:#Jf=D$QdR#4X6Q%Z^E6hq4[p#rHu/mN!PgeQCn_hEI9M3(_b.pAid<e0?KUakkhL+Sq1A+!S(V0h+OF8nn#&[1+6p+D5^<OP@s\HS\itN&+apX>;a8<=<fVm-cM(u=Q31UTuXZiRNk/X^o=e`8?ha=(l,J:AYGq&k'81mIs68U9).dPb@tBY#5s6I1(;=p-FNV8JLO6-b%6]BUEO#*P:YXQ,+2T8!GeA%OFD*^H'I<qo6Q\KGcc$9<-q;oCHo2eF[F/t'nG3p8bj]<0qUd^A*Un<D3^J]5-EeYaHZL0]dZ(aldZ?U]EL]o1@j;L=l_$._u&B5KRKtY900d3EO'mY6&5WB,D7o]o+7,#h.[N58L+)*ks!_/dIq7L<$Q/>:Ym/3(NJmP3]c2J81f'[9A229?.>nW.Y"uioK$/X(RLnTFa0nhiu#_V(M%6pL-[3&IEZO^iW'pcgSC4%cs*UfWL8=h@<SF-6Ml.SK#\%/6pL;XKVP08]+YR4.1^h^3g+iL6p2jNFKi##9N\7TS:EE')be-k57a"IMZoUV#>]Eoq_3uS@i.]*ai31P3"'$;S,Q%'"=$Vq"-_!pX<>bA]?nd=dOpZa\$"k!cpI9L2SO3gBd7]TKi)s)3F/ADnb^3N)iF')M[\Fq1\^PAl):!YpJp!B\s/2LkZr*`(o%fTO.qa?N[7\P_Dj!-3=OAO3]DtNKn-R1Hc#$?$h;RW[;B(k%DrOQ4lA(kZ`8[,.5E\H/%&9RE1k-pKk^'?Wseh?':/9RD3&&Xf\j=9;Sdd#l!b5li$Q.?@#FtJ9r"D*THt>o=+h,ei<3VCI\e<F.YjMklHmQ252@7%.$dl?FX[.5Ru_<cOnWObGU$sud3sn0Nm?VSip=&P_8=3<b5l"NFchqcT66k!jof;<?"kHRj[>Q+FB(V85-;*H\(QoM*>m2@9WKA`dV0$2F]lQ!^cKY?-F<<RYBD9:P`&#<:DJpSA0L]L_Q`8=5'6'r`p_54_;lcH+H=)4\l8B7YE#pX>K&Mf4jEn:L@C'pmu(T(NAo?onFtPTH*Mah:OJII8OF6<oMipM;1-5S+stnT,o"n]+UmpI>_OX,SeHS'86i`]nN=oW_Hjm1lb%agT!)1^3rJWom\/,?BYNjThVR,cQ'opa8Q#<G9<qeSN'GRRO*(AC(K$'<9uMACIm=MV?Mk2Q*3P"~>endstream
|
||||
endobj
|
||||
xref
|
||||
0 12
|
||||
0000000000 65535 f
|
||||
0000000073 00000 n
|
||||
0000000134 00000 n
|
||||
0000000241 00000 n
|
||||
0000000353 00000 n
|
||||
0000000458 00000 n
|
||||
0000000568 00000 n
|
||||
0000005249 00000 n
|
||||
0000005507 00000 n
|
||||
0000005576 00000 n
|
||||
0000005859 00000 n
|
||||
0000005919 00000 n
|
||||
trailer
|
||||
<<
|
||||
/ID
|
||||
[<4800d64fefba4dd902e51197c7da4e88><4800d64fefba4dd902e51197c7da4e88>]
|
||||
% ReportLab generated PDF document -- digest (http://www.reportlab.com)
|
||||
|
||||
/Info 9 0 R
|
||||
/Root 8 0 R
|
||||
/Size 12
|
||||
>>
|
||||
startxref
|
||||
7992
|
||||
%%EOF
|
||||
Binary file not shown.
+115
File diff suppressed because one or more lines are too long
Vendored
Vendored
+81
@@ -0,0 +1,81 @@
|
||||
TECHMART ELECTRONICS
|
||||
4567 Innovation Blvd
|
||||
San Francisco, CA 94103
|
||||
(415) 555-0199
|
||||
|
||||
===================================
|
||||
|
||||
Store #0342 - Downtown SF
|
||||
11/23/2024 14:32:18 PST
|
||||
TXN: TXN-98765-2024
|
||||
Cashier: Emily Rodriguez
|
||||
Register: POS-07
|
||||
|
||||
-----------------------------------
|
||||
|
||||
Wireless Noise-Cancelling
|
||||
Headphones - Premium Black
|
||||
AUDIO-5521 1 @ $349.99
|
||||
Member Discount $-50.00
|
||||
$299.99
|
||||
USB-C Hub 7-in-1 Adapter
|
||||
with HDMI & Ethernet
|
||||
ACC-8834 2 @ $79.99
|
||||
$159.98
|
||||
Portable SSD 2TB
|
||||
Thunderbolt 3 Compatible
|
||||
STOR-2241 1 @ $289.00
|
||||
Member Discount $-29.00
|
||||
$260.00
|
||||
Ergonomic Wireless Mouse
|
||||
Rechargeable Battery
|
||||
ACC-9012 1 @ $59.99
|
||||
$59.99
|
||||
Screen Cleaning Kit
|
||||
Professional Grade
|
||||
CARE-1156 3 @ $12.99
|
||||
$38.97
|
||||
HDMI 2.1 Cable 6ft
|
||||
8K Resolution Support
|
||||
CABLE-7789 2 @ $24.99
|
||||
Member Discount $-5.00
|
||||
$44.98
|
||||
-----------------------------------
|
||||
|
||||
SUBTOTAL $863.91
|
||||
Member Discount (15%)-$84.00
|
||||
Sales Tax (8.5%) $66.23
|
||||
Rewards Applied -$25.00
|
||||
===================================
|
||||
TOTAL $821.14
|
||||
===================================
|
||||
|
||||
PAYMENT METHOD
|
||||
Visa Card ending in 4782
|
||||
Auth: 847392
|
||||
Ref: REF-20241123-98765
|
||||
|
||||
-----------------------------------
|
||||
|
||||
REWARDS MEMBER
|
||||
Sarah Mitchell
|
||||
ID: TM-447821
|
||||
Points Earned: 821
|
||||
Total Points: 3,247
|
||||
Next Reward: $50 gift card
|
||||
at 5,000 pts (1,753 to go)
|
||||
|
||||
-----------------------------------
|
||||
|
||||
RETURN POLICY
|
||||
Returns within 30 days
|
||||
Receipt required
|
||||
Electronics must be unopened
|
||||
|
||||
*TXN98765202411231432*
|
||||
|
||||
Thank you for shopping!
|
||||
www.techmart.example.com
|
||||
|
||||
===================================
|
||||
|
||||
+76
@@ -0,0 +1,76 @@
|
||||
ZAVA AUTO REPAIR
|
||||
Certified Collision Repair
|
||||
123 Main Street, Redmond, WA 98052
|
||||
Phone: (425) 000-0000
|
||||
Preliminary Estimate (ID: EST-1008)
|
||||
| Customer Information | | | Vehicle Information | |
|
||||
| -------------------- | ------------------- | --- | ------------------- | ----------------- |
|
||||
| Insured name | Gabriel Diaz | | Year | 2022 |
|
||||
| Claim # | SF-1008 | | Make | Jeep |
|
||||
| Policy # | POL-2022-555 | | Model | Grand Cherokee |
|
||||
| Phone | (425) 111-1111 | | Trim | Limited |
|
||||
| Email | gabriel@contoso.com | | VIN | 1C4RJFBG2NC123456 |
|
||||
| | | | Color | White |
|
||||
| | | | Odometer | 9,800 |
|
||||
| Repair Order # | RO-20221108 | | Estimator | Ellis Turner |
|
||||
Estimate Totals
|
||||
| | | Hours | Rate | Cost |
|
||||
| ---------------- | --- | ----- | ---- | ----- |
|
||||
| Parts | | | | 2,100 |
|
||||
| Body Labor | | 2 | 150 | 300 |
|
||||
| Paint Labor | | 1.5 | 150 | 225 |
|
||||
| Mechanical Labor | | - | - | - |
|
||||
Supplies
|
||||
| | Paint Supplies | | | 60 |
|
||||
| ------------- | ------------------------ | --- | ------ | ------ |
|
||||
| | Body Supplies | | | 30 |
|
||||
| Other Charges | | | | 15 |
|
||||
| Subtotal | | | | 2,730 |
|
||||
| Sales Tax | | | 10.20% | 278.46 |
|
||||
| GRAND TOTAL | | | | 5,738 |
|
||||
| Note | Minor rear bumper repair | | | |
|
||||
This is a preliminary estimate for the visible damage of the vehicle. Additional damage / repairs / parts may be found
|
||||
after the vehicle has been disassembled and damaged parts have been removed. Suspension damages may be
|
||||
present, but can not be determined until an alignment on the vehicle has been done. Parts Prices may vary due to
|
||||
models and vehicle maker price updates. Please be advised if vehicle owner elects to have vehicle sent to service for
|
||||
any mechanical concerns, ALL service departments charge a vehicle diagnostic charge. If the mechanical concern is
|
||||
deemed not related to an insurance claim, vehicle owner will be reponsible for charges.
|
||||
|
||||
ZAVA AUTO REPAIR
|
||||
Certified Collision Repair
|
||||
123 Main Street, Redmond, WA 98052
|
||||
Phone: (425) 000-0000
|
||||
Preliminary Estimate (ID: EST-1008)
|
||||
Customer Information Vehicle Information
|
||||
| Insured name | Bruce Wayne | | Year | 2025 |
|
||||
| -------------- | -------------------------- | --- | --------- | ------------ |
|
||||
| Claim # | | 999 | Make | Batman |
|
||||
| Policy # | IM-BATMAN | | Model | Batmobile |
|
||||
| Phone | (416) 555-1234 | | Trim | Limited |
|
||||
| Email | batman@wayneindustries.com | | VIN | XXX |
|
||||
| | | | Color | Black |
|
||||
| | | | Odometer | 1 |
|
||||
| Repair Order # | RO-20221108 | | Estimator | Ellis Turner |
|
||||
Estimate Totals
|
||||
| | | Hours | Rate | Cost |
|
||||
| ---------------- | --- | ----- | ---- | ------ |
|
||||
| Parts | | | | 99,999 |
|
||||
| Body Labor | | 2 | 150 | 300 |
|
||||
| Paint Labor | | 1.5 | 150 | 225 |
|
||||
| Mechanical Labor | | - | - | - |
|
||||
Supplies
|
||||
| | Paint Supplies | | | 60 |
|
||||
| ------------- | ------------------------ | --- | ------ | --------- |
|
||||
| | Body Supplies | | | 30 |
|
||||
| Other Charges | | | | 15 |
|
||||
| Subtotal | | | | 100,629 |
|
||||
| Sales Tax | | | 10.20% | 10264.158 |
|
||||
| GRAND TOTAL | | | | 211,522 |
|
||||
| Note | Minor rear bumper repair | | | |
|
||||
|
||||
This is a preliminary estimate for the visible damage of the vehicle. Additional damage / repairs / parts may be found
|
||||
after the vehicle has been disassembled and damaged parts have been removed. Suspension damages may be
|
||||
present, but can not be determined until an alignment on the vehicle has been done. Parts Prices may vary due to
|
||||
models and vehicle maker price updates. Please be advised if vehicle owner elects to have vehicle sent to service for
|
||||
any mechanical concerns, ALL service departments charge a vehicle diagnostic charge. If the mechanical concern is
|
||||
deemed not related to an insurance claim, vehicle owner will be reponsible for charges.
|
||||
Vendored
+44
@@ -0,0 +1,44 @@
|
||||
INVENTORY RECONCILIATION REPORT
|
||||
Report ID: SPARSE-2024-INV-1234
|
||||
Warehouse: Distribution Center East
|
||||
Report Date: 2024-11-15
|
||||
Prepared By: Sarah Martinez
|
||||
| Product Code | Location | Expected | Actual | Variance | Status |
|
||||
| ------------ | -------- | -------- | ------ | -------- | -------- |
|
||||
| SKU-8847 | A-12 | 450 | | | |
|
||||
| | B-07 | | 289 | -23 | |
|
||||
| SKU-9201 | | 780 | 778 | | OK |
|
||||
| | C-15 | | | +15 | |
|
||||
| SKU-4563 | D-22 | | 156 | | CRITICAL |
|
||||
| | | 180 | | -24 | |
|
||||
| SKU-7728 | A-08 | 920 | | | |
|
||||
| | | | 935 | +15 | OK |
|
||||
Variance Analysis:
|
||||
Summary Statistics:
|
||||
Total Variance Cost: $4,287.50
|
||||
Critical Items: 1
|
||||
Overall Accuracy: 97.2%
|
||||
Detailed Analysis by Category:
|
||||
The inventory reconciliation reveals several key findings. The primary variance driver is SKU-4563,
|
||||
which shows a -24 unit discrepancy requiring immediate investigation. Location B-07 handling of
|
||||
SKU-8847 also demonstrates significant variance. Cross-location verification protocols should be
|
||||
|
||||
reviewed to prevent future discrepancies. The overall accuracy rate of 97.2% meets our target
|
||||
threshold, but critical items require expedited resolution to maintain operational efficiency.
|
||||
Extended Inventory Review:
|
||||
| Product Code | Category | Unit Cost | Total Value | Last Audit | Notes |
|
||||
| ------------ | ----------- | --------- | ----------- | ---------- | ---------- |
|
||||
| SKU-8847 | Electronics | $45.00 | $13,005.00 | 2024-10-15 | |
|
||||
| SKU-9201 | Hardware | $32.50 | $25,285.00 | 2024-10-22 | Verified |
|
||||
| SKU-4563 | Software | $120.00 | $18,720.00 | | Critical |
|
||||
| SKU-7728 | Accessories | $15.75 | $14,726.25 | 2024-11-01 | |
|
||||
| SKU-3345 | Electronics | $67.00 | $22,445.00 | 2024-10-18 | |
|
||||
| SKU-5512 | Hardware | $89.00 | $31,150.00 | | Pending |
|
||||
| SKU-6678 | Software | $200.00 | $42,000.00 | 2024-10-25 | High Value |
|
||||
| SKU-7789 | Accessories | $8.50 | $5,950.00 | 2024-11-05 | |
|
||||
| SKU-2234 | Electronics | $125.00 | $35,000.00 | | |
|
||||
| SKU-1123 | Hardware | $55.00 | $27,500.00 | 2024-10-30 | Verified |
|
||||
Recommendations:
|
||||
1. Immediate review of SKU-4563 handling procedures. 2. Implement additional verification for critical
|
||||
items. 3. Schedule follow-up audit for high-value products (SKU-6678, SKU-2234).
|
||||
Approval:
|
||||
+62
@@ -0,0 +1,62 @@
|
||||
BOOKING ORDER
|
||||
Print Date 12/15/2024 14:30:22
|
||||
Page 1 of 1
|
||||
STARLIGHT CINEMAS
|
||||
Orders
|
||||
| Order / Rev: | 2024-12-5678 | | | Cinema: | | Downtown Multiplex |
|
||||
| ------------ | -------------- | --- | --- | ---------------- | --- | ------------------ |
|
||||
| Alt Order #: | SC-WINTER-2024 | | | Primary Contact: | | Sarah Johnson |
|
||||
Product Desc: Holiday Movie Marathon Package Location: NYC-01
|
||||
| Estimate: | EST-456 | | | Region: | | NORTHEAST |
|
||||
| -------------------- | ----------------------- | --- | --- | ------- | --- | --------- |
|
||||
| Booking Dates: | 12/20/2024 - 12/31/2024 | | | | | |
|
||||
| Original Date / Rev: | 12/01/24 / 12/10/24 | | | | | |
|
||||
| Order Type: | Premium Package | | | | | |
|
||||
Booking Agency
|
||||
| Name: | Premier Entertainment Group | | | | | |
|
||||
| ---------------- | --------------------------- | --- | --- | -------------- | --- | --------- |
|
||||
| | | | | Billing Type: | | Net 30 |
|
||||
| Contact: | Michael Chen | | | | | |
|
||||
| | | | | Payment Terms: | | Corporate |
|
||||
| Billing Contact: | accounting@premierent.com | | | | | |
|
||||
| | | | | Commission: | | 10% |
|
||||
555 Broadway Suite 1200
|
||||
New York, NY 10012
|
||||
Customer
|
||||
| Name: | Universal Studios Distribution | | | | | |
|
||||
| -------------- | ------------------------------ | --- | --- | --- | --- | --- |
|
||||
| Category: | Film Distributor | | | | | |
|
||||
| Contact Email: | bookings@universalstudios.com | | | | | |
|
||||
| Customer ID: | CUST-98765 | | | | | |
|
||||
| Revenue Code: | FILM-PREMIUM | | | | | |
|
||||
Booking Summary
|
||||
| Start Date | End Date | # Shows | Gross Amount | Net Amount | | |
|
||||
| ---------- | -------- | ------- | ------------ | ---------- | --- | --- |
|
||||
| 12/20/24 | 12/31/24 | 48 | $12,500.00 | $11,250.00 | | |
|
||||
Totals
|
||||
| Month | # Shows | Gross Amount | | Net Amount | | Occupancy |
|
||||
| ------------- | ------- | ------------ | --- | ---------- | --- | --------- |
|
||||
| December 2024 | 48 | $12,500.00 | | $11,250.00 | | 85% |
|
||||
| Totals | 48 | $12,500.00 | | $11,250.00 | | 85% |
|
||||
Account Representatives
|
||||
Representative Territory Region Start Date / End Date Commission %
|
||||
| Sarah Johnson | NYC Metro | NORTHEAST | 12/20/24 - 12/31/24 | | 100% | |
|
||||
| ------------- | --------- | --------- | ------------------- | --- | ---- | --- |
|
||||
Show Schedule Details
|
||||
Ln Screen Start End Movie Title Format Showtime Days Shows Rate Type Total
|
||||
1 SCR-1 12/20/24 12/25/24 Holiday Spectacular IMAX 3D 7:00 PM Daily 12 $250 PM $3,000
|
||||
(Runtime: 142 min); Holiday Season Premium
|
||||
2 SCR-2 12/20/24 12/31/24 Winter Wonderland Standard 4:30 PM Daily 24 $150 MT $3,600
|
||||
(Runtime: 98 min); Matinee Special
|
||||
3 SCR-1 12/26/24 12/31/24 New Year Mystery 4DX 9:30 PM Daily 12 $300 PM $3,600
|
||||
(Runtime: 116 min); Premium Experience
|
||||
Show Details
|
||||
| Show Screen | Date Range | Title | Showtime | Days Type | Rate | Revenue |
|
||||
| ----------- | ---------- | ----- | -------- | --------- | ---- | ------- |
|
||||
1 SCR-1 12/20-12/25 Holiday Spectacular 7:00 PM Daily PM $250 $3,000
|
||||
This booking order is subject to cinema availability and standard terms.
|
||||
2 SCR-2 12/20-12/31 Winter Wonderland 4:30 PM Daily MT $150 $3,600
|
||||
All showtimes are approximate and subject to change.
|
||||
3 SCR-1 12/26-12/31 New Year Mystery 9:30 PM Daily PM $300 $3,600
|
||||
| Total Revenue: | | | | | | $12,500.00 |
|
||||
| -------------- | --- | --- | --- | --- | --- | ---------- |
|
||||
@@ -0,0 +1,65 @@
|
||||
1
|
||||
|
||||
Introduction
|
||||
|
||||
Large language models (LLMs) are becoming a crucial building block in developing powerful agents
|
||||
that utilize LLMs for reasoning, tool usage, and adapting to new observations (Yao et al., 2022; Xi
|
||||
et al., 2023; Wang et al., 2023b) in many real-world tasks. Given the expanding tasks that could
|
||||
benefit from LLMs and the growing task complexity, an intuitive approach to scale up the power of
|
||||
agents is to use multiple agents that cooperate. Prior work suggests that multiple agents can help
|
||||
encourage divergent thinking (Liang et al., 2023), improve factuality and reasoning (Du et al., 2023),
|
||||
and provide validation (Wu et al., 2023). In light of the intuition and early evidence of promise, it is
|
||||
intriguing to ask the following question: how can we facilitate the development of LLM applications
|
||||
that could span a broad spectrum of domains and complexities based on the multi-agent approach?
|
||||
|
||||
Our insight is to use multi-agent conversations to achieve it. There are at least three reasons con-
|
||||
firming its general feasibility and utility thanks to recent advances in LLMs: First, because chat-
|
||||
optimized LLMs (e.g., GPT-4) show the ability to incorporate feedback, LLM agents can cooperate
|
||||
through conversations with each other or human(s), e.g., a dialog where agents provide and seek rea-
|
||||
soning, observations, critiques, and validation. Second, because a single LLM can exhibit a broad
|
||||
range of capabilities (especially when configured with the correct prompt and inference settings),
|
||||
conversations between differently configured agents can help combine these broad LLM capabilities
|
||||
in a modular and complementary manner. Third, LLMs have demonstrated ability to solve complex
|
||||
tasks when the tasks are broken into simpler subtasks. Multi-agent conversations can enable this
|
||||
partitioning and integration in an intuitive manner. How can we leverage the above insights and
|
||||
support different applications with the common requirement of coordinating multiple agents, poten-
|
||||
tially backed by LLMs, humans, or tools exhibiting different capacities? We desire a multi-agent
|
||||
conversation framework with generic abstraction and effective implementation that has the flexibil-
|
||||
ity to satisfy different application needs. Achieving this requires addressing two critical questions:
|
||||
(1) How can we design individual agents that are capable, reusable, customizable, and effective in
|
||||
multi-agent collaboration? (2) How can we develop a straightforward, unified interface that can
|
||||
accommodate a wide range of agent conversation patterns? In practice, applications of varying
|
||||
complexities may need distinct sets of agents with specific capabilities, and may require different
|
||||
conversation patterns, such as single- or multi-turn dialogs, different human involvement modes, and
|
||||
static vs. dynamic conversation. Moreover, developers may prefer the flexibility to program agent
|
||||
interactions in natural language or code. Failing to adequately address these two questions would
|
||||
limit the framework’s scope of applicability and generality.
|
||||
While there is contemporaneous exploration of multi-agent approaches,3 we present AutoGen, a
|
||||
generalized multi-agent conversation framework (Figure 1), based on the following new concepts.
|
||||
1 Customizable and conversable agents. AutoGen uses a generic design of agents that can lever-
|
||||
age LLMs, human inputs, tools, or a combination of them. The result is that developers can
|
||||
easily and quickly create agents with different roles (e.g., agents to write code, execute code,
|
||||
wire in human feedback, validate outputs, etc.) by selecting and configuring a subset of built-in
|
||||
capabilities. The agent’s backend can also be readily extended to allow more custom behaviors.
|
||||
To make these agents suitable for multi-agent conversation, every agent is made conversable –
|
||||
they can receive, react, and respond to messages. When configured properly, an agent can hold
|
||||
multiple turns of conversations with other agents autonomously or solicit human inputs at cer-
|
||||
tain rounds, enabling human agency and automation. The conversable agent design leverages the
|
||||
strong capability of the most advanced LLMs in taking feedback and making progress via chat
|
||||
and also allows combining capabilities of LLMs in a modular fashion. (Section 2.1)
|
||||
|
||||
2 Conversation programming. A fundamental insight of AutoGen is to simplify and unify com-
|
||||
plex LLM application workflows as multi-agent conversations. So AutoGen adopts a program-
|
||||
ming paradigm centered around these inter-agent conversations. We refer to this paradigm as
|
||||
conversation programming, which streamlines the development of intricate applications via two
|
||||
primary steps: (1) defining a set of conversable agents with specific capabilities and roles (as
|
||||
described above); (2) programming the interaction behavior between agents via conversation-
|
||||
centric computation and control. Both steps can be achieved via a fusion of natural and pro-
|
||||
gramming languages to build applications with a wide range of conversation patterns and agent
|
||||
behaviors. AutoGen provides ready-to-use implementations and also allows easy extension and
|
||||
experimentation for both steps. (Section 2.2)
|
||||
|
||||
3We refer to Appendix A for a detailed discussion.
|
||||
|
||||
2
|
||||
|
||||
@@ -0,0 +1,74 @@
|
||||
%PDF-1.3
|
||||
%“Œ‹ž ReportLab Generated PDF document http://www.reportlab.com
|
||||
1 0 obj
|
||||
<<
|
||||
/F1 2 0 R /F2 3 0 R
|
||||
>>
|
||||
endobj
|
||||
2 0 obj
|
||||
<<
|
||||
/BaseFont /Helvetica /Encoding /WinAnsiEncoding /Name /F1 /Subtype /Type1 /Type /Font
|
||||
>>
|
||||
endobj
|
||||
3 0 obj
|
||||
<<
|
||||
/BaseFont /Helvetica-Bold /Encoding /WinAnsiEncoding /Name /F2 /Subtype /Type1 /Type /Font
|
||||
>>
|
||||
endobj
|
||||
4 0 obj
|
||||
<<
|
||||
/Contents 8 0 R /MediaBox [ 0 0 612 792 ] /Parent 7 0 R /Resources <<
|
||||
/Font 1 0 R /ProcSet [ /PDF /Text /ImageB /ImageC /ImageI ]
|
||||
>> /Rotate 0 /Trans <<
|
||||
|
||||
>>
|
||||
/Type /Page
|
||||
>>
|
||||
endobj
|
||||
5 0 obj
|
||||
<<
|
||||
/PageMode /UseNone /Pages 7 0 R /Type /Catalog
|
||||
>>
|
||||
endobj
|
||||
6 0 obj
|
||||
<<
|
||||
/Author (anonymous) /CreationDate (D:20260108192537+01'00') /Creator (ReportLab PDF Library - www.reportlab.com) /Keywords () /ModDate (D:20260108192537+01'00') /Producer (ReportLab PDF Library - www.reportlab.com)
|
||||
/Subject (unspecified) /Title (untitled) /Trapped /False
|
||||
>>
|
||||
endobj
|
||||
7 0 obj
|
||||
<<
|
||||
/Count 1 /Kids [ 4 0 R ] /Type /Pages
|
||||
>>
|
||||
endobj
|
||||
8 0 obj
|
||||
<<
|
||||
/Filter [ /ASCII85Decode /FlateDecode ] /Length 670
|
||||
>>
|
||||
stream
|
||||
Gat$td;IYl'Rf-pcJpsZ/27V[H_WEoW#\5sVS2I3Jt]?;R+`$Ms*f.>6<=3APUNhTmQL<9F,pFup'KGk=TR,7^>/u!#kAE+l;?UQ8Fg(+-O>;^54HWJ*kXdl'VdsI]Y^$-G(GWPR)iGMeWbg3)F'+jfWpCb"rU?d?8?q_r!E2N'0sM)J>=XD.jgunBuga\Wi4MX$WV/b)1F@bC8Nj8(0*)"ZK06BSqlu1$[^37A;/aK=mfgqg$&i),2OH&%^\"B1%B\dd_V>$5OtPri4rcEe3LoBUeL6QAPnpQr+R-t0f]ZSYc?BTAKQ?A&+J#J*N*=6;'?@Cp*>auj0",hDS3bH4[hVs3O="&bk&U@>+8c1&c2iDg6R*%q%iEZq'-!FNSB8#C*'po69R8$S(:.=-$N6'!_[1/jV<$@V3Z_"gd!g!MJMT)mTUN4cWjUQQj]HT_m]0*R=YgTmcl@k>*b/SBce9?.m,bEi#?PI:=r_6G.auM&FtP,>O7T%Z<$f#=g6(2+d@;8?"$8cdI38ZZ>hq5b2_pQY:M\.Kod,pl)ZX7a7Gc'Mf_'SB1X3*L[-51a8`h4)KjJQjLfm/3TIeQY?2+?^.r^HNafjHp<5,1M=W'N>8sb=dB#FC5M`7L91"BC@CfEckPe`M5O:#!Fj$K]s(Gs8rW$>H7gK~>endstream
|
||||
endobj
|
||||
xref
|
||||
0 9
|
||||
0000000000 65535 f
|
||||
0000000073 00000 n
|
||||
0000000114 00000 n
|
||||
0000000221 00000 n
|
||||
0000000333 00000 n
|
||||
0000000526 00000 n
|
||||
0000000594 00000 n
|
||||
0000000890 00000 n
|
||||
0000000949 00000 n
|
||||
trailer
|
||||
<<
|
||||
/ID
|
||||
[<5467fcd5093f18002be6af3fb13ce6c3><5467fcd5093f18002be6af3fb13ce6c3>]
|
||||
% ReportLab generated PDF document -- digest (http://www.reportlab.com)
|
||||
|
||||
/Info 6 0 R
|
||||
/Root 5 0 R
|
||||
/Size 9
|
||||
>>
|
||||
startxref
|
||||
1709
|
||||
%%EOF
|
||||
@@ -0,0 +1,74 @@
|
||||
%PDF-1.3
|
||||
%“Œ‹ž ReportLab Generated PDF document (opensource)
|
||||
1 0 obj
|
||||
<<
|
||||
/F1 2 0 R /F2 3 0 R
|
||||
>>
|
||||
endobj
|
||||
2 0 obj
|
||||
<<
|
||||
/BaseFont /Helvetica /Encoding /WinAnsiEncoding /Name /F1 /Subtype /Type1 /Type /Font
|
||||
>>
|
||||
endobj
|
||||
3 0 obj
|
||||
<<
|
||||
/BaseFont /Helvetica-Bold /Encoding /WinAnsiEncoding /Name /F2 /Subtype /Type1 /Type /Font
|
||||
>>
|
||||
endobj
|
||||
4 0 obj
|
||||
<<
|
||||
/Contents 8 0 R /MediaBox [ 0 0 612 792 ] /Parent 7 0 R /Resources <<
|
||||
/Font 1 0 R /ProcSet [ /PDF /Text /ImageB /ImageC /ImageI ]
|
||||
>> /Rotate 0 /Trans <<
|
||||
|
||||
>>
|
||||
/Type /Page
|
||||
>>
|
||||
endobj
|
||||
5 0 obj
|
||||
<<
|
||||
/PageMode /UseNone /Pages 7 0 R /Type /Catalog
|
||||
>>
|
||||
endobj
|
||||
6 0 obj
|
||||
<<
|
||||
/Author (anonymous) /CreationDate (D:20260210121342+01'00') /Creator (anonymous) /Keywords () /ModDate (D:20260210121342+01'00') /Producer (ReportLab PDF Library - \(opensource\))
|
||||
/Subject (unspecified) /Title (untitled) /Trapped /False
|
||||
>>
|
||||
endobj
|
||||
7 0 obj
|
||||
<<
|
||||
/Count 1 /Kids [ 4 0 R ] /Type /Pages
|
||||
>>
|
||||
endobj
|
||||
8 0 obj
|
||||
<<
|
||||
/Filter [ /ASCII85Decode /FlateDecode ] /Length 2414
|
||||
>>
|
||||
stream
|
||||
Gat=m?#SK-&r,lL<tO"8con<J;5Cq;2s]18RrdR7Y[)>Ym#<31@rCsJ89.W.qa3u?82hU4/rD6bm_^2.o5\G6@H<5/\G85.&:2)\f,l]`/mA:-0HF*!^.%Yd0?rr<_LD*'1j8Q\=IJXu'N"=HL>KSX^]339h+)S%SB[D8U\2B8rL_pR7\MXONW%HeW99+,0hH$AU#^KYAoZ)6P-2'6m5cj7lZu'kGHQ:/\R1,Ma%hEl2eYq(:LZ"-`3OktM:dm<m,u<)W99/X#l.?0OO\Z_]Y4.9BoSuKGOrdaFbq^/)*_g%gm8s\<gU<a%e]re(gWm_H[^0bn(=;0X%(_^H%$;+Se<aM)L.FrW&=UUc*X3He'XMO]CgP3P*<]$#uOPN2Z#n\O]@7_]#$ZH.Gr&KZm_M+6]8I5lYVcZLH,)V@L:BCib%tuWd,p*A"0Gb=6gIkI+Y5.[<aH_D`iMaKNQpo.UHf=F]to-Ui6XS[Q;Qh\cD=LT#YQGpn9rsUVm:qR"1%pnrSZa/Mi*P+f4j?B6-uV5Em_Zqog)^@RtF[F-adqASQb%i[(eIIqZ)_CVEHpGDgIpX[[uQ4J6DNf5X^CB'JA+,d^#?/[fq)jH^+:rbdW>Y'H/a/1^A\lZD2qMb,5%-$pOaW5-%BjndGRZ<CV&?T^r@PWF)!H#gDKcZj?[/gATBZ;=XJ$_a;??F-qtH(HaQX?W#iIL#17<Y25AC[ePo/pO[]=c0(\#/j9R%W/]$do:5b%.e4%S0Z';YJm/!GO9jt-H8W>JTK5I,b-cnrpc!2H0BZZ`1%R*aB!ZE'JRRYNJ<J4B`!j/maqpD>*Rq$U:[Tq%Lr[m+DHGg*dP\Ee>\#VYo43^R>kA9W2b/WU:k/M#%^2;nC+,e'dAcEOp?t5Kk;4+.f4MU@-mf7iCT_29s_%g,%K_gB8!kWS28T%T6'u_$GK'qX*VP>7>5?dW_<?$QPg!n")cT(<-[c/-kEbS'`*BYR5SB9TPY<1jq1#Q/EWpCJrY=s;bQfH^=uT:DTR3.8/N>W)r8_SF*7+f;4415n3,ECi2P6&bjmn17t+qU8;D])\Qt.8QLi)?kJ`.t+lkW'Y4e876l-2di)Y?.3\K1<(0IrEfm1<:Oc^u?7B::q;On$J5_C7T<u%071ASb!ZD1u7Yd"g`I'`PJ>**>tRZrdD6q3W@5QfbW8242uIHro=(eV*P1KjY,oj4tW&obb>^q-Iur%F#A)mgu8+V*?E<bdEC6V0+Z7OS^l.$W4hmuq:sMdJ=Sk+94D3QtUBZ:AoIiBA%s3#GJdRDFCpZ)7\MZmitKhMID(%ic%oW#tD%ERrqpk,dD3ll!E6m)e):26BLNV!WiRV*d(+Ppl'p$%?J&MqeV<=uNJ_5,4P_NC:lWf`Iu3\u+^>Y]dUOk&c=m2^<YVV2cUoq[`<<W-]MTIC50Klu6rO5RUVZ"h`#"4adtt2qjs2b12hQi!@JBp4Jln>:1Dtc(*!NBU*DeAtLhuWu&JLWFQi:;ka#?AD6V.A_[>n$T,.]8d=tffJ,?'DbCKQ-BnKqTn_:1LGc865V]FFi=AAF`DGhW(F]2^o?>VbGN:;=!-s;ea7]Ll\f+eiZ8XZb0*mZp%8*K_pf+1"2fKuO1pNK%7f_(mPTD@0&ljSV?o$5BpUmleYs^Faq_SM'jX.o\d*6%j(EtY.N"m2B'E@[.Y_8Be+m(58m$\dcqm$?,0it)/=9@9kRfJB;N7D9t\'F<:#c$P82`UKqgN]$kU]5eLPZMR=0bO[rPk"\?hu>sT^KFg`B>!pml-a[ImSeWp!_l3s!E>gFKq4ng:"n=N:m57rHjN)GML<=a1ktQpUT8:?[D:c7+Gm@2q;uN1Q3)hpeThe-&[#`KYZ4e_=o]kk1KH/^jo:"<0_nRJingk\[1Jltc<,.Jq2\*]=AVcIiY#?iMASrc$Bp)4m=NdIOJ&,H=+<MC=^7]?Tb>M"H6ZdXTX2Ba;Gp=J-m]$,8ZCU/77rHJ,%1.[/DlnkH:pIIV$Oh.
;:t?5e3.cs^[G:H=e;i>c+>B=)C&l7T)S<Bld"_W)BtgI(/F`Le;ULQ,!FM!^<8Kk?L6b_>G8Jp-TG;!V1144#29r2%;n-RmNHrGdR!76&H"R_D-]`c"1FCgZl*",7SUVuqc0oapDQ=^`nj#FFk@2%[K[V45$!KQIH[=;SUpTE8T!QLliC=5-9]nkQpBVdHM6-g)tYBAPuOqr^qkn[Wh4C;6L89J;D>5@cYM$2Y/24scnNiWp4jWhfJAF^ck!@I(VPV*s,pdkPKn<Zg-T3I%d.sSl"^f-Gm=*riV,>(\770jbu^lf\h1+IH>c;Bo;Pdg;!fA)'kmg$"\P3oX=/N5/rUltb3K-BdRTR;-W)J1bDbE?g<MKG;cK`l?D4l>.,O@6id::q]JXBH\Ws#0[#'8-5JQL>/c~>endstream
|
||||
endobj
|
||||
xref
|
||||
0 9
|
||||
0000000000 65535 f
|
||||
0000000061 00000 n
|
||||
0000000102 00000 n
|
||||
0000000209 00000 n
|
||||
0000000321 00000 n
|
||||
0000000514 00000 n
|
||||
0000000582 00000 n
|
||||
0000000843 00000 n
|
||||
0000000902 00000 n
|
||||
trailer
|
||||
<<
|
||||
/ID
|
||||
[<e319d5c305edb8c0fb6be9e44c6178fa><e319d5c305edb8c0fb6be9e44c6178fa>]
|
||||
% ReportLab generated PDF document -- digest (opensource)
|
||||
|
||||
/Info 6 0 R
|
||||
/Root 5 0 R
|
||||
/Size 9
|
||||
>>
|
||||
startxref
|
||||
3407
|
||||
%%EOF
|
||||
BIN
Binary file not shown.
@@ -288,6 +288,47 @@ def test_input_as_strings() -> None:
|
||||
assert "# Test" in result.text_content
|
||||
|
||||
|
||||
def test_doc_rlink() -> None:
    """Regression test for CVE-2025-11849.

    Plants a text file with known content at ``<root>/tmp/test_rlink.txt``,
    converts the ``rlink.docx`` fixture with data URIs kept, and asserts that
    the base64 form of the planted content does not show up in the Markdown.
    The fixture presumably links to that path -- confirm against the .docx.

    The test is skipped when the ``tmp`` directory does not exist. The planted
    file is always removed afterwards, including when it was left over from an
    earlier run with the expected content.

    Raises:
        ValueError: if the target file already exists with different content,
            so an unrelated file is never overwritten or deleted.
    """
    # Test for: CVE-2025-11849
    markitdown = MarkItDown()

    # Document with rlink
    docx_file = os.path.join(TEST_FILES_DIR, "rlink.docx")

    # Directory containing the target rlink file
    rlink_tmp_dir = os.path.abspath(os.sep + "tmp")

    # Ensure the tmp directory exists (and really is a directory).
    # pytest.skip() raises, so no explicit return is needed after it.
    if not os.path.isdir(rlink_tmp_dir):
        pytest.skip(f"Skipping rlink test; {rlink_tmp_dir} directory does not exist.")

    rlink_file_path = os.path.join(rlink_tmp_dir, "test_rlink.txt")
    rlink_content = "de658225-569e-4e3d-9ed2-cfb6abf927fc"
    b64_prefix = (
        "ZGU2NTgyMjUtNTY5ZS00ZTNkLTllZDItY2ZiNmFiZjk"  # base64 prefix of rlink_content
    )

    if os.path.exists(rlink_file_path):
        # Only reuse a pre-existing file if it is our own marker file.
        with open(rlink_file_path, "r", encoding="utf-8") as f:
            existing_content = f.read()
        if existing_content != rlink_content:
            raise ValueError(
                f"Existing {rlink_file_path} content does not match expected content."
            )
    else:
        with open(rlink_file_path, "w", encoding="utf-8") as f:
            f.write(rlink_content)

    try:
        result = markitdown.convert(docx_file, keep_data_uris=True).text_content
        assert (
            b64_prefix not in result
        )  # Make sure the target file was NOT embedded in the output
    finally:
        os.remove(rlink_file_path)
|
||||
|
||||
|
||||
@pytest.mark.skipif(
|
||||
skip_remote,
|
||||
reason="do not run tests that query external urls",
|
||||
@@ -301,9 +342,9 @@ def test_markitdown_remote() -> None:
|
||||
assert test_string in result.text_content
|
||||
|
||||
# Youtube
|
||||
result = markitdown.convert(YOUTUBE_TEST_URL)
|
||||
for test_string in YOUTUBE_TEST_STRINGS:
|
||||
assert test_string in result.text_content
|
||||
# result = markitdown.convert(YOUTUBE_TEST_URL)
|
||||
# for test_string in YOUTUBE_TEST_STRINGS:
|
||||
# assert test_string in result.text_content
|
||||
|
||||
|
||||
@pytest.mark.skipif(
|
||||
@@ -452,6 +493,7 @@ if __name__ == "__main__":
|
||||
test_markitdown_remote,
|
||||
test_speech_transcription,
|
||||
test_exceptions,
|
||||
test_doc_rlink,
|
||||
test_markitdown_exiftool,
|
||||
test_markitdown_llm_parameters,
|
||||
test_markitdown_llm,
|
||||
|
||||
@@ -0,0 +1,171 @@
|
||||
#!/usr/bin/env python3 -m pytest
|
||||
"""Tests for MasterFormat-style partial numbering in PDF conversion."""
|
||||
|
||||
import os
|
||||
import re
|
||||
import pytest
|
||||
|
||||
from markitdown import MarkItDown
|
||||
from markitdown.converters._pdf_converter import PARTIAL_NUMBERING_PATTERN
|
||||
|
||||
TEST_FILES_DIR = os.path.join(os.path.dirname(__file__), "test_files")
|
||||
|
||||
|
||||
class TestMasterFormatPartialNumbering:
|
||||
"""Test handling of MasterFormat-style partial numbering (.1, .2, etc.)."""
|
||||
|
||||
def test_partial_numbering_pattern_regex(self):
    """Check which strings PARTIAL_NUMBERING_PATTERN accepts and rejects."""
    # A leading dot followed only by digits must be recognised.
    for accepted in (".1", ".2", ".10", ".99"):
        assert PARTIAL_NUMBERING_PATTERN.match(accepted) is not None

    # Trailing dots, full decimals, nested numbering, plain words,
    # non-digit suffixes and the empty string must all be rejected.
    for rejected in ("1.", "1.2", ".1.2", "text", ".a", ""):
        assert PARTIAL_NUMBERING_PATTERN.match(rejected) is None
|
||||
|
||||
def test_masterformat_partial_numbering_not_split(self):
    """Test that MasterFormat partial numbering stays with associated text.

    MasterFormat documents use partial numbering like:
        .1  The intent of this Request for Proposal...
        .2  Available information relative to...

    These should NOT be split into separate table columns, but kept
    as coherent text lines with the number followed by its description.
    """
    pdf_path = os.path.join(TEST_FILES_DIR, "masterformat_partial_numbering.pdf")

    markitdown = MarkItDown()
    result = markitdown.convert(pdf_path)
    text_content = result.text_content

    # Partial numberings should NOT appear isolated on their own lines.
    # If they're isolated, the parser incorrectly split them from their text.
    isolated_numberings = []
    for line in text_content.split("\n"):
        stripped = line.strip()
        # Ignore table pipes so a row such as "| .1 |" also counts as isolated.
        cleaned = stripped.replace("|", "").strip()
        # Match any ".<digits>" token rather than a fixed .1-.10 list, so
        # higher numberings are caught as well.
        if re.fullmatch(r"\.\d+", cleaned):
            isolated_numberings.append(stripped)

    assert not isolated_numberings, (
        "Partial numberings should not be isolated from their text. "
        f"Found isolated: {isolated_numberings}"
    )

    # Verify that partial numberings appear WITH following text on the same line.
    # "[^\S\n]+" is whitespace excluding newlines; plain "\s+" would also accept
    # ".1\n\nThe intent", which is exactly the unmerged case.
    partial_with_text = re.findall(r"\.\d+[^\S\n]+\w+", text_content)
    assert (
        partial_with_text
    ), "Expected to find partial numberings followed by text on the same line"
|
||||
|
||||
def test_masterformat_content_preserved(self):
    """Test that MasterFormat document content is fully preserved.

    Converts the ``masterformat_partial_numbering.pdf`` fixture and checks
    that a set of known phrases survive extraction, that ``.1`` and ``.2``
    are followed by their paragraph text on the same line, and that the
    output is not trivially short.
    """
    pdf_path = os.path.join(TEST_FILES_DIR, "masterformat_partial_numbering.pdf")

    markitdown = MarkItDown()
    result = markitdown.convert(pdf_path)
    text_content = result.text_content

    # Verify key content from the MasterFormat document is preserved
    expected_content = [
        "RFP for Construction Management Services",
        "Section 00 00 43",
        "Instructions to Respondents",
        "Ken Sargent House",
        "INTENT",
        "Request for Proposal",
        "KEN SARGENT HOUSE",
        "GRANDE PRAIRIE, ALBERTA",
        "Section 00 00 45",
    ]

    for content in expected_content:
        assert (
            content in text_content
        ), f"Expected content '{content}' not found in extracted text"

    # Verify partial numbering is followed by text on the same line.
    # "[^\S\n]+" is whitespace excluding newlines; "\s+" would also match a
    # numbering that is separated from its text by line breaks.
    assert re.search(
        r"\.1[^\S\n]+The intent", text_content
    ), "Partial numbering .1 should be followed by 'The intent' text"

    assert re.search(
        r"\.2[^\S\n]+Available information", text_content
    ), "Partial numbering .2 should be followed by 'Available information' text"

    # Ensure text content is not empty and has reasonable length
    assert (
        len(text_content.strip()) > 100
    ), "MasterFormat document should have substantial text content"
|
||||
|
||||
def test_merge_partial_numbering_with_empty_lines_between(self):
    """Test that partial numberings merge correctly even with empty lines between.

    When PDF extractors produce output like:
        .1

        The intent of this Request...

    The merge logic should still combine them properly.

    The test fails if any line consists solely of a partial numbering
    (a dot followed by digits, e.g. ``.1`` or ``.12``) and one of the next
    two lines contains text, since that text should have been merged onto
    the numbering's line.
    """
    pdf_path = os.path.join(TEST_FILES_DIR, "masterformat_partial_numbering.pdf")

    markitdown = MarkItDown()
    result = markitdown.convert(pdf_path)
    text_content = result.text_content

    # The merged result should have partial numberings followed by text.
    # Check that we don't have patterns like ".1\n\nThe intent" (unmerged).
    lines = text_content.split("\n")

    for i, line in enumerate(lines):
        stripped = line.strip()

        # Match any ".<digits>" numbering rather than a fixed list of
        # ".1" .. ".8", so ".9", ".10", ... are covered as well.
        if not re.fullmatch(r"\.\d+", stripped):
            continue

        # An isolated partial numbering means the merge failed if text
        # follows within the next two lines (slicing clamps at the end
        # of the list, so no explicit bounds check is needed).
        for following in lines[i + 1 : i + 3]:
            following_text = following.strip()
            if following_text:
                # pytest.fail raises, so the first non-empty line ends the test.
                pytest.fail(
                    f"Partial numbering '{stripped}' on line {i} was not "
                    f"merged with following text '{following_text[:30]}...'"
                )
|
||||
|
||||
def test_multiple_partial_numberings_all_merged(self):
    """Test that all partial numberings in a document are properly merged.

    Converts the ``masterformat_partial_numbering.pdf`` fixture, then
    requires at least two occurrences of ".<digits>" followed by whitespace
    and a letter, and zero lines that consist of ".<digits>" alone.
    """
    fixture_path = os.path.join(TEST_FILES_DIR, "masterformat_partial_numbering.pdf")
    extracted = MarkItDown().convert(fixture_path).text_content

    # Merged numberings: a partial number followed by text.
    # NOTE: `\s+` also spans line breaks, so this count on its own does not
    # prove the text is on the same line; the isolated-line count below is
    # what rules out unmerged numberings.
    merged_matches = re.findall(r"\.\d+\s+[A-Za-z]", extracted)
    merged_count = len(merged_matches)

    # Isolated numberings: a partial number that is the whole (stripped) line.
    isolated_count = sum(
        1
        for raw_line in extracted.split("\n")
        if re.match(r"^\.\d+$", raw_line.strip())
    )

    assert (
        merged_count >= 2
    ), f"Expected at least 2 merged partial numberings, found {merged_count}"
    assert (
        isolated_count == 0
    ), f"Found {isolated_count} isolated partial numberings that weren't merged"
|
||||
File diff suppressed because it is too large
Load Diff
Reference in New Issue
Block a user