[MS] Update PDF table extraction to support aligned Markdown (#1499)
* Added PDF table extraction feature with aligned Markdown (#1419) * Add PDF test files and enhance extraction tests - Added a medical report scan PDF for testing scanned PDF handling. - Included a retail purchase receipt PDF to validate receipt extraction functionality. - Introduced a multipage invoice PDF to test extraction of complex invoice structures. - Added a borderless table PDF for testing inventory reconciliation report extraction. - Implemented comprehensive tests for PDF table extraction, ensuring proper structure and data integrity. - Enhanced existing tests to validate the order and presence of extracted content across various PDF types. * fix: update dependencies for PDF processing and improve table extraction logic * Bumped version of pdfminer.six --------- Authored-by: Ashok <ashh010101@gmail.com>
This commit is contained in:
@@ -52,6 +52,7 @@ coverage.xml
|
||||
.hypothesis/
|
||||
.pytest_cache/
|
||||
cover/
|
||||
.test-logs/
|
||||
|
||||
# Translations
|
||||
*.mo
|
||||
|
||||
@@ -41,19 +41,20 @@ all = [
|
||||
"openpyxl",
|
||||
"xlrd",
|
||||
"lxml",
|
||||
"pdfminer.six>=20251107",
|
||||
"pdfminer.six>=20251230",
|
||||
"pdfplumber>=0.11.9",
|
||||
"olefile",
|
||||
"pydub",
|
||||
"SpeechRecognition",
|
||||
"youtube-transcript-api~=1.0.0",
|
||||
"azure-ai-documentintelligence",
|
||||
"azure-identity"
|
||||
"azure-identity",
|
||||
]
|
||||
pptx = ["python-pptx"]
|
||||
docx = ["mammoth~=1.11.0", "lxml"]
|
||||
xlsx = ["pandas", "openpyxl"]
|
||||
xls = ["pandas", "xlrd"]
|
||||
pdf = ["pdfminer.six"]
|
||||
pdf = ["pdfminer.six>=20251230", "pdfplumber>=0.11.9"]
|
||||
outlook = ["olefile"]
|
||||
audio-transcription = ["pydub", "SpeechRecognition"]
|
||||
youtube-transcription = ["youtube-transcript-api"]
|
||||
|
||||
@@ -1,22 +1,18 @@
|
||||
import sys
|
||||
import io
|
||||
|
||||
from typing import BinaryIO, Any
|
||||
|
||||
|
||||
from .._base_converter import DocumentConverter, DocumentConverterResult
|
||||
from .._stream_info import StreamInfo
|
||||
from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE
|
||||
|
||||
|
||||
# Try loading optional (but in this case, required) dependencies
|
||||
# Save reporting of any exceptions for later
|
||||
# Load dependencies
|
||||
_dependency_exc_info = None
|
||||
try:
|
||||
import pdfminer
|
||||
import pdfminer.high_level
|
||||
import pdfplumber
|
||||
except ImportError:
|
||||
# Preserve the error and stack trace for later
|
||||
_dependency_exc_info = sys.exc_info()
|
||||
|
||||
|
||||
@@ -28,16 +24,374 @@ ACCEPTED_MIME_TYPE_PREFIXES = [
|
||||
ACCEPTED_FILE_EXTENSIONS = [".pdf"]
|
||||
|
||||
|
||||
def _to_markdown_table(table: list[list[str]], include_separator: bool = True) -> str:
|
||||
"""Convert a 2D list (rows/columns) into a nicely aligned Markdown table.
|
||||
|
||||
Args:
|
||||
table: 2D list of cell values
|
||||
include_separator: If True, include header separator row (standard markdown).
|
||||
If False, output simple pipe-separated rows.
|
||||
"""
|
||||
if not table:
|
||||
return ""
|
||||
|
||||
# Normalize None → ""
|
||||
table = [[cell if cell is not None else "" for cell in row] for row in table]
|
||||
|
||||
# Filter out empty rows
|
||||
table = [row for row in table if any(cell.strip() for cell in row)]
|
||||
|
||||
if not table:
|
||||
return ""
|
||||
|
||||
# Column widths
|
||||
col_widths = [max(len(str(cell)) for cell in col) for col in zip(*table)]
|
||||
|
||||
def fmt_row(row: list[str]) -> str:
|
||||
return (
|
||||
"|"
|
||||
+ "|".join(str(cell).ljust(width) for cell, width in zip(row, col_widths))
|
||||
+ "|"
|
||||
)
|
||||
|
||||
if include_separator:
|
||||
header, *rows = table
|
||||
md = [fmt_row(header)]
|
||||
md.append("|" + "|".join("-" * w for w in col_widths) + "|")
|
||||
for row in rows:
|
||||
md.append(fmt_row(row))
|
||||
else:
|
||||
md = [fmt_row(row) for row in table]
|
||||
|
||||
return "\n".join(md)
|
||||
|
||||
|
||||
def _extract_form_content_from_words(page: Any) -> str | None:
    """
    Extract form-style content from a PDF page by analyzing word positions.
    This handles borderless forms/tables where words are aligned in columns.

    Returns markdown with proper table formatting:
      - Tables have pipe-separated columns with header separator rows
      - Non-table content is rendered as plain text

    Returns None if the page doesn't appear to be a form-style document,
    indicating that pdfminer should be used instead for better text spacing.

    Note: `page` is expected to be a pdfplumber Page (duck-typed: only
    `extract_words(...)` and `width` are used). Coordinates are in PDF
    points. The numeric thresholds below are empirical heuristics.
    """
    # keep_blank_chars keeps multi-word labels together as one "word".
    words = page.extract_words(keep_blank_chars=True, x_tolerance=3, y_tolerance=3)
    if not words:
        return None

    # Group words by their Y position (rows). Quantizing "top" to a 5pt grid
    # lets words on slightly different baselines fall into the same row.
    y_tolerance = 5
    rows_by_y: dict[float, list[dict]] = {}
    for word in words:
        y_key = round(word["top"] / y_tolerance) * y_tolerance
        if y_key not in rows_by_y:
            rows_by_y[y_key] = []
        rows_by_y[y_key].append(word)

    # Sort rows by Y position (top of page first).
    sorted_y_keys = sorted(rows_by_y.keys())
    # 612pt is the US-Letter width — fallback when the page lacks `width`.
    page_width = page.width if hasattr(page, "width") else 612

    # First pass: analyze each row
    row_info: list[dict] = []
    for y_key in sorted_y_keys:
        row_words = sorted(rows_by_y[y_key], key=lambda w: w["x0"])
        if not row_words:
            continue

        first_x0 = row_words[0]["x0"]
        last_x1 = row_words[-1]["x1"]
        line_width = last_x1 - first_x0
        combined_text = " ".join(w["text"] for w in row_words)

        # Count distinct x-position groups (columns): a horizontal gap of
        # more than 50pt between word starts is treated as a column break.
        x_positions = [w["x0"] for w in row_words]
        x_groups: list[float] = []
        for x in sorted(x_positions):
            if not x_groups or x - x_groups[-1] > 50:
                x_groups.append(x)

        # Determine row type: a line spanning most of the page (>55% width)
        # with substantial text (>60 chars) reads as a prose paragraph.
        is_paragraph = line_width > page_width * 0.55 and len(combined_text) > 60

        row_info.append(
            {
                "y_key": y_key,
                "words": row_words,
                "text": combined_text,
                "x_groups": x_groups,
                "is_paragraph": is_paragraph,
                "num_columns": len(x_groups),
            }
        )

    # Collect ALL x-positions from rows with 3+ columns (table-like rows)
    # This gives us the global column structure
    all_table_x_positions: list[float] = []
    for info in row_info:
        if info["num_columns"] >= 3 and not info["is_paragraph"]:
            all_table_x_positions.extend(info["x_groups"])

    # No multi-column rows at all -> not a form-style page.
    if not all_table_x_positions:
        return None

    # Compute global column boundaries: cluster x-positions, merging starts
    # that are within 30pt of each other into one column.
    all_table_x_positions.sort()
    global_columns: list[float] = []
    for x in all_table_x_positions:
        if not global_columns or x - global_columns[-1] > 30:
            global_columns.append(x)

    # Too many columns suggests dense text, not a form
    if len(global_columns) > 8:
        return None

    # Now classify each row as table row or not
    # A row is a table row if it has words that align with 2+ of the global columns
    for info in row_info:
        if info["is_paragraph"]:
            info["is_table_row"] = False
            continue

        # Count how many global columns this row's words align with
        # (a word "aligns" when its left edge is within 40pt of a column).
        aligned_columns: set[int] = set()
        for word in info["words"]:
            word_x = word["x0"]
            for col_idx, col_x in enumerate(global_columns):
                if abs(word_x - col_x) < 40:
                    aligned_columns.add(col_idx)
                    break

        # If row uses 2+ of the established columns, it's a table row
        info["is_table_row"] = len(aligned_columns) >= 2

    # Find table regions (maximal runs of consecutive table rows).
    table_regions: list[tuple[int, int]] = []  # (start_idx, end_idx)
    i = 0
    while i < len(row_info):
        if row_info[i]["is_table_row"]:
            start_idx = i
            while i < len(row_info) and row_info[i]["is_table_row"]:
                i += 1
            end_idx = i  # end is exclusive
            table_regions.append((start_idx, end_idx))
        else:
            i += 1

    # Check if enough rows are table rows (at least 20%); otherwise the page
    # is mostly prose and pdfminer will do a better job.
    total_table_rows = sum(end - start for start, end in table_regions)
    if len(row_info) > 0 and total_table_rows / len(row_info) < 0.2:
        return None

    # Build output - collect table data first, then format with proper column widths
    result_lines: list[str] = []
    num_cols = len(global_columns)

    # Helper function to extract cells from a row
    def extract_cells(info: dict) -> list[str]:
        """Assign each word of a row to a global column; multiple words in
        the same column are joined with single spaces."""
        cells: list[str] = ["" for _ in range(num_cols)]
        for word in info["words"]:
            word_x = word["x0"]
            # Find the correct column using boundary ranges: a word belongs
            # to column i when it starts at least 20pt before column i+1.
            assigned_col = num_cols - 1  # Default to last column
            for col_idx in range(num_cols - 1):
                col_end = global_columns[col_idx + 1]
                if word_x < col_end - 20:
                    assigned_col = col_idx
                    break
            if cells[assigned_col]:
                cells[assigned_col] += " " + word["text"]
            else:
                cells[assigned_col] = word["text"]
        return cells

    # Process rows, collecting table data for proper formatting
    idx = 0
    while idx < len(row_info):
        info = row_info[idx]

        # Check if this row starts a table region
        table_region = None
        for start, end in table_regions:
            if idx == start:
                table_region = (start, end)
                break

        if table_region:
            start, end = table_region
            # Collect all rows in this table
            table_data: list[list[str]] = []
            for table_idx in range(start, end):
                cells = extract_cells(row_info[table_idx])
                table_data.append(cells)

            # Calculate column widths for this table
            if table_data:
                col_widths = [
                    max(len(row[col]) for row in table_data) for col in range(num_cols)
                ]
                # Ensure minimum width of 3 for separator dashes
                col_widths = [max(w, 3) for w in col_widths]

                # Format header row (first row of the region is the header)
                header = table_data[0]
                header_str = (
                    "| "
                    + " | ".join(
                        cell.ljust(col_widths[i]) for i, cell in enumerate(header)
                    )
                    + " |"
                )
                result_lines.append(header_str)

                # Format separator row
                separator = (
                    "| "
                    + " | ".join("-" * col_widths[i] for i in range(num_cols))
                    + " |"
                )
                result_lines.append(separator)

                # Format data rows
                for row in table_data[1:]:
                    row_str = (
                        "| "
                        + " | ".join(
                            cell.ljust(col_widths[i]) for i, cell in enumerate(row)
                        )
                        + " |"
                    )
                    result_lines.append(row_str)

            idx = end  # Skip to end of table region
        else:
            # Check if we're inside a table region (not at start) — those
            # rows were already emitted above and must not be duplicated.
            in_table = False
            for start, end in table_regions:
                if start < idx < end:
                    in_table = True
                    break

            if not in_table:
                # Non-table content: emit the row as plain text.
                result_lines.append(info["text"])
            idx += 1

    return "\n".join(result_lines)
|
||||
|
||||
|
||||
def _extract_tables_from_words(page: Any) -> list[list[list[str]]]:
|
||||
"""
|
||||
Extract tables from a PDF page by analyzing word positions.
|
||||
This handles borderless tables where words are aligned in columns.
|
||||
|
||||
This function is designed for structured tabular data (like invoices),
|
||||
not for multi-column text layouts in scientific documents.
|
||||
"""
|
||||
words = page.extract_words(keep_blank_chars=True, x_tolerance=3, y_tolerance=3)
|
||||
if not words:
|
||||
return []
|
||||
|
||||
# Group words by their Y position (rows)
|
||||
y_tolerance = 5
|
||||
rows_by_y: dict[float, list[dict]] = {}
|
||||
for word in words:
|
||||
y_key = round(word["top"] / y_tolerance) * y_tolerance
|
||||
if y_key not in rows_by_y:
|
||||
rows_by_y[y_key] = []
|
||||
rows_by_y[y_key].append(word)
|
||||
|
||||
# Sort rows by Y position
|
||||
sorted_y_keys = sorted(rows_by_y.keys())
|
||||
|
||||
# Find potential column boundaries by analyzing x positions across all rows
|
||||
all_x_positions = []
|
||||
for words_in_row in rows_by_y.values():
|
||||
for word in words_in_row:
|
||||
all_x_positions.append(word["x0"])
|
||||
|
||||
if not all_x_positions:
|
||||
return []
|
||||
|
||||
# Cluster x positions to find column starts
|
||||
all_x_positions.sort()
|
||||
x_tolerance_col = 20
|
||||
column_starts: list[float] = []
|
||||
for x in all_x_positions:
|
||||
if not column_starts or x - column_starts[-1] > x_tolerance_col:
|
||||
column_starts.append(x)
|
||||
|
||||
# Need at least 3 columns but not too many (likely text layout, not table)
|
||||
if len(column_starts) < 3 or len(column_starts) > 10:
|
||||
return []
|
||||
|
||||
# Find rows that span multiple columns (potential table rows)
|
||||
table_rows = []
|
||||
for y_key in sorted_y_keys:
|
||||
words_in_row = sorted(rows_by_y[y_key], key=lambda w: w["x0"])
|
||||
|
||||
# Assign words to columns
|
||||
row_data = [""] * len(column_starts)
|
||||
for word in words_in_row:
|
||||
# Find the closest column
|
||||
best_col = 0
|
||||
min_dist = float("inf")
|
||||
for i, col_x in enumerate(column_starts):
|
||||
dist = abs(word["x0"] - col_x)
|
||||
if dist < min_dist:
|
||||
min_dist = dist
|
||||
best_col = i
|
||||
|
||||
if row_data[best_col]:
|
||||
row_data[best_col] += " " + word["text"]
|
||||
else:
|
||||
row_data[best_col] = word["text"]
|
||||
|
||||
# Only include rows that have content in multiple columns
|
||||
non_empty = sum(1 for cell in row_data if cell.strip())
|
||||
if non_empty >= 2:
|
||||
table_rows.append(row_data)
|
||||
|
||||
# Validate table quality - tables should have:
|
||||
# 1. Enough rows (at least 3 including header)
|
||||
# 2. Short cell content (tables have concise data, not paragraphs)
|
||||
# 3. Consistent structure across rows
|
||||
if len(table_rows) < 3:
|
||||
return []
|
||||
|
||||
# Check if cells contain short, structured data (not long text)
|
||||
long_cell_count = 0
|
||||
total_cell_count = 0
|
||||
for row in table_rows:
|
||||
for cell in row:
|
||||
if cell.strip():
|
||||
total_cell_count += 1
|
||||
# If cell has more than 30 chars, it's likely prose text
|
||||
if len(cell.strip()) > 30:
|
||||
long_cell_count += 1
|
||||
|
||||
# If more than 30% of cells are long, this is probably not a table
|
||||
if total_cell_count > 0 and long_cell_count / total_cell_count > 0.3:
|
||||
return []
|
||||
|
||||
return [table_rows]
|
||||
|
||||
|
||||
class PdfConverter(DocumentConverter):
|
||||
"""
|
||||
Converts PDFs to Markdown. Most style information is ignored, so the results are essentially plain-text.
|
||||
Converts PDFs to Markdown.
|
||||
Supports extracting tables into aligned Markdown format (via pdfplumber).
|
||||
Falls back to pdfminer if pdfplumber is missing or fails.
|
||||
"""
|
||||
|
||||
def accepts(
|
||||
self,
|
||||
file_stream: BinaryIO,
|
||||
stream_info: StreamInfo,
|
||||
**kwargs: Any, # Options to pass to the converter
|
||||
**kwargs: Any,
|
||||
) -> bool:
|
||||
mimetype = (stream_info.mimetype or "").lower()
|
||||
extension = (stream_info.extension or "").lower()
|
||||
@@ -55,9 +409,8 @@ class PdfConverter(DocumentConverter):
|
||||
self,
|
||||
file_stream: BinaryIO,
|
||||
stream_info: StreamInfo,
|
||||
**kwargs: Any, # Options to pass to the converter
|
||||
**kwargs: Any,
|
||||
) -> DocumentConverterResult:
|
||||
# Check the dependencies
|
||||
if _dependency_exc_info is not None:
|
||||
raise MissingDependencyException(
|
||||
MISSING_DEPENDENCY_MESSAGE.format(
|
||||
@@ -65,13 +418,55 @@ class PdfConverter(DocumentConverter):
|
||||
extension=".pdf",
|
||||
feature="pdf",
|
||||
)
|
||||
) from _dependency_exc_info[
|
||||
1
|
||||
].with_traceback( # type: ignore[union-attr]
|
||||
) from _dependency_exc_info[1].with_traceback(
|
||||
_dependency_exc_info[2]
|
||||
)
|
||||
) # type: ignore[union-attr]
|
||||
|
||||
assert isinstance(file_stream, io.IOBase) # for mypy
|
||||
return DocumentConverterResult(
|
||||
markdown=pdfminer.high_level.extract_text(file_stream),
|
||||
)
|
||||
assert isinstance(file_stream, io.IOBase)
|
||||
|
||||
markdown_chunks: list[str] = []
|
||||
|
||||
# Read file stream into BytesIO for compatibility with pdfplumber
|
||||
pdf_bytes = io.BytesIO(file_stream.read())
|
||||
|
||||
try:
|
||||
# Track how many pages are form-style vs plain text
|
||||
form_pages = 0
|
||||
plain_pages = 0
|
||||
|
||||
with pdfplumber.open(pdf_bytes) as pdf:
|
||||
for page in pdf.pages:
|
||||
# Try form-style word position extraction
|
||||
page_content = _extract_form_content_from_words(page)
|
||||
|
||||
# If extraction returns None, this page is not form-style
|
||||
if page_content is None:
|
||||
plain_pages += 1
|
||||
# Extract text using pdfplumber's basic extraction for this page
|
||||
text = page.extract_text()
|
||||
if text and text.strip():
|
||||
markdown_chunks.append(text.strip())
|
||||
else:
|
||||
form_pages += 1
|
||||
if page_content.strip():
|
||||
markdown_chunks.append(page_content)
|
||||
|
||||
# If most pages are plain text, use pdfminer for better text handling
|
||||
if plain_pages > form_pages and plain_pages > 0:
|
||||
pdf_bytes.seek(0)
|
||||
markdown = pdfminer.high_level.extract_text(pdf_bytes)
|
||||
else:
|
||||
# Build markdown from chunks
|
||||
markdown = "\n\n".join(markdown_chunks).strip()
|
||||
|
||||
except Exception:
|
||||
# Fallback if pdfplumber fails
|
||||
pdf_bytes.seek(0)
|
||||
markdown = pdfminer.high_level.extract_text(pdf_bytes)
|
||||
|
||||
# Fallback if still empty
|
||||
if not markdown:
|
||||
pdf_bytes.seek(0)
|
||||
markdown = pdfminer.high_level.extract_text(pdf_bytes)
|
||||
|
||||
return DocumentConverterResult(markdown=markdown)
|
||||
|
||||
BIN
Binary file not shown.
+97
@@ -0,0 +1,97 @@
|
||||
%PDF-1.4
|
||||
%“Œ‹ž ReportLab Generated PDF document http://www.reportlab.com
|
||||
1 0 obj
|
||||
<<
|
||||
/F1 2 0 R /F2 3 0 R /F3 4 0 R /F4 5 0 R
|
||||
>>
|
||||
endobj
|
||||
2 0 obj
|
||||
<<
|
||||
/BaseFont /Helvetica /Encoding /WinAnsiEncoding /Name /F1 /Subtype /Type1 /Type /Font
|
||||
>>
|
||||
endobj
|
||||
3 0 obj
|
||||
<<
|
||||
/BaseFont /Helvetica-Bold /Encoding /WinAnsiEncoding /Name /F2 /Subtype /Type1 /Type /Font
|
||||
>>
|
||||
endobj
|
||||
4 0 obj
|
||||
<<
|
||||
/BaseFont /Courier /Encoding /WinAnsiEncoding /Name /F3 /Subtype /Type1 /Type /Font
|
||||
>>
|
||||
endobj
|
||||
5 0 obj
|
||||
<<
|
||||
/BaseFont /Courier-Bold /Encoding /WinAnsiEncoding /Name /F4 /Subtype /Type1 /Type /Font
|
||||
>>
|
||||
endobj
|
||||
6 0 obj
|
||||
<<
|
||||
/BitsPerComponent 8 /ColorSpace /DeviceRGB /Filter [ /ASCII85Decode /FlateDecode ] /Height 70 /Length 4491 /Subtype /Image
|
||||
/Type /XObject /Width 200
|
||||
>>
|
||||
stream
|
||||
Gb"/lq,^Nc)M\9OkX:DBZ5YT>'!op&`0lHCEL`PXM2DFT$QuCdPsfSJ4#%$gW49i\1e&eZ\Acg:2bhaPc+^Q$/Shs#,1&Qu>83CBh729[%A$M]]Z8KL]Cu-OpO.1`\pCPboa2!3#sCC+4Yg#.W)"\K&i)doY5WCH.J-G0E$]%J"BRoZ88okcKEP@C7S%JEA:t(e6:OLb-"MZ3=$fAIE$]%J"BRoZ88okcKEP@C7S%JEA:t(e6:OKV=+u3F\<8ml>&.H@]EH:@TAeD]);ilGV3,!4G2kP0XRN"#r/]G!Kqo1c9Redk1d%Hh[t82=;lMkgMFr4J2#lTY[siCuNDGT]h^te4%1fj10r$&D--;(2UPbQ(Ze:VUL"2G=%qPZVOlc,tegG*BO,:$mI%mCAJ^8q2gWSn>)Ui.KQ!A5_(Z;l?_%(Xb28AGIU0SY?bVp%B6=$P2*I!1?W8WL>aAVWc$%-nk=D8ZjE$stW]LJ-LqIj9qZoFV/lj$Uf)=b`nfl*ANkt_qpb2t'P`;D#\h6o+g'M+"j4Sf:d#_jjZdTS>mnQJm^7>(S3Bq"m(kH5i3;0`YK<5e.0$k"4XA4UX)rfXH+2OamR360'cX$&"Dp"DdSkh^Q;?+H)fBc@YMp?\]UZuQ*lgt@kDS'dARs/]`Roa+]eXm%&TJ4e[RCKq6kQ:5Zpa[hLTEE,M/UR\C]SS.K'HJL7F)F5Ts63hWCKs+aTqi3TN!,P7#o$@'a?^`M.9&=d,$WJ*\b^*N1fR$JFV.s`.W*WfCgS\9h@C2/uC<b(1#bB73rR3UmcP"%)_DZ#=)TA1KkfUBT8F;=Yoc[BR6[ZkS\Y$n/.@mf!9WK1Hj]o3[f>DiFrdD&a`iRQ\df2(Xc53$=i@@upbP,MJ@.sDgm%`^.8+5u//"Hhn%^kIb$5o\2B7%r>DD\NZ:L;otY0]:)'6l[M<&ctoM"($Q_XW1'!4OB>g/3dF]mD],eF*&&'itQ;2e$/VWZ/QmdogQ0&d7ePkDGP[PZkk8TtUWkaJYa$Q)c6I+l<preqG)K\U>pY5H])D-lHdp52<d:Isd8)X0&b+pKUugDNb2NIW.aD_PLN/i(r9N&<3?,2br'%?gT'_i;n9VUeeM#>ko&@JS]_pP&PR0@L`i*pbXB"rgcI`#'#>-Njfe@8+ZC7hHU>Qm2oCj$^ATs<7[sZ@5*,@qslQ\p1m1#p6XrGL'F^?ok?+\fDe1,#<0n78&1'&KK/85E7IuiRklZ$<tM_`dQfdI@$2&Sj]k*=&V0n1RVEA-6,(U`EJ9674P)af%Z>l=<9-YKka`e!9&oUk]3u7Y);o;!X[W(:<D3M@"gDG)[;",CR0eT^r/fRnB+ob8JM\tp9\JoC\=uqFPX9nU`?:Y,eJf!6E95r@KI_iekuY/-+j6DIrXFQW>i@m+VqQff?4r\fn@@4QXN6[dWdtV8:B`3X2:bgH!rR8-r^sf)#EN%"`F/q0Heh_C7H6l'.@I3l<Jr.Q!as3DB-9V*+/'h,_<T8?^2u*t.p$h8d%"Dd\P<5M`MEg>7W_M8qB0Sd$'o&pWH!XFNS.JRZ%[WY$N:rl5tLIb;#&1u\'nOCIB]161$bC,Uuf;ZK6dl()epY<39X_LaOAXU:WAZiYqZk5Tq+hN`O"QdZ7[jLdf^cf`?9i4T#=]JO$'0fC4#Y=^M%VOouL.PZ6/V;r+XoF1Ls*YXu`6'4?,j47_u_U=.T*IX0ed;@5JN=Qlc\gS!W?r;#%jA)NSUh\`='l:HWsF<K@<`EqO<,Ht[H.@PGU,p6$s&YbEgb;cfG9YK,6Fh]@t(EUD@78Ob6ui6[#pIBoZn<UH)N"PrIeP3Y!qa-k8bP>_rQ_q'7l3]f,=As5FN;rm6/&IWa@[9HFr3YtU'N%=ZPr)s`!!&o!IbLLsBH7VnG&"_&hSn3;Nr^mpSZk^2i_aD8<g*:f)-)1j6@3KSHbb_c1PDAXpnkGE:H3Fs0m?uXff0>H]^Oi^Wq(2*ak3>^mA^!FkG4$-Vq(BH"
U+YSR;%(5j(bnT,&RrR1d]\O5_42^f/Xa:4msf,Oms&5F6()XE"p6mS/Yc\Ga&`hC/3XdsM;'cTMl(uV@DiFY5AA_VWS4T'&^D<.7.S,`B?:^&!Q[ZVaCi0^$[E#=Xt_;^\;l;M#]`$4;sLf^6$u5)gpB'7TO-@*HXbXF]H[ID>=n%&8-T:f)&:]?hm\RN5/B5sdW)PM[X@>2Jq/jQ%m%0Pk%J`<],L8cn3_`B)dE8ng`*C-";2ro.7o.B2:3d$4r#LEqr#8((eIunkG+V@25V=%+_!:Z/,UQca-*F<WEc]8T!rP(2A>g9*GW$LPXTF:ER(d"o8oX31"!B8VtcoXnY$9m&'30Um_8lI0aO.LY_m,l[3VfKlBY*[!$I#3=:"_\lGs6U"<,-kF4HFN$6[_SQpG)7_H3mnKRB'C9#q8EY(Vaqi(D&r$*Jr?OPiaP#RRYeN0)sia9W*TKT)#N9#q8EY(Vaqi(D&r$*Jr?OPiaP#k^2Zeu*-9hp_.0@)f6Em^OKjQe!N6R'K"tCBm#IB,_F0s/@WUCJX)`/6>DF6M_)Kk.s>$UZPX\r/#8I8q/RiHL2liUAA0Yf`%Ld=068s:8FSq=\;#qPO.M6gNJ<fcJ_B7NfVHbGb(Z#%=9b&C:^$<Xe!_fU:_#gd*5iB@GeEYKud7*YgG3W,jj8\q'/4@l'Y-g\)rX-m;H77/R]Zb%A!YN(tcDfkrJ&o7($5o=b[,R0&h>j\UuBER`krd-YJajg!X[eDSKG#rM875C*#IQWm+O@aOf*TU2U('gKn4DA1c#r\17+;`V`#Kgpm8RC[Ee$Ac&[r[O5Rd>4]?HeBRku2;R,jekQW/J-QCRW&^t>C\3e,:[W(TT!T:2JEND71MtHndISe*l)E$(\gO^@6Z"5Q<4O)uM0mY8/2q_>,e#m"SaP0HaXDih_2UgJZH0.io.<EHam+)Ba'CJ3$,Ve\2W\TOCDf>!]XNj5RmJj5Qb:1EKuS*^?b+;s)^?6l>JE0QrGSUpouIlZac9kX41>cja=/SDr<cA`ZGg.-dcfEr]UahTX\[1!?g.Q+Bc7gR:Jn+!PFgH?<j_Cphp%2cK2o3EmuaRlLL,1SFd.EYSt<=je7H>"[/Y4/S?T:Ij;Z5Vm!N#O9oGbTmC/.8"?)WA+NO#l!d__>E^^P^^5SPK3f#a-$jDf&=>7p3h\GbV.%e"aa`Nr8:-((uUWjP?5h<QpA5fPW)4^9]4ZVP3n=aOl2n\TH8,2g(<%d(S%%7D#]P?GegI?jC3uo^tm=sN6Bp.;hI!R:5>*[4qX1g^KT=J\eG*=-r!s:Xj;^b_=6X9pT)tmOPN+SP\Y9d?!CS[k5Z"1@+N1#RUIImf:>$etSOp!A$%9Npa7^LW;"'>&DLN*LT#=""p0WOpjXfI:C##@j@lDSU=Xe\&,@B>QE*Z4EfD\=1"h4F8$&QPkBC:BBC"p$N?^/:S9o*P:eF4j>`WT<EG,f\ln6T<>&V*-UR?1+=iZ&Y:YJ)Y_O1Q!(3MOc:&,lEK0KT>gBrM!Oo7g$R0Z=@n</A>l[op[I#4)k.C3f6pb.hq#9_d636^F1(A%Us@pB(WNXIQ#TKD-`)0%k[fj0?XEDjbhd[m6#LpLW.'sd9_sqo4)9,(HjMXDMbKZE5`!!P4XRB@/4TISu+m2&%RiFj^;=JGE6IZZZ3Iq9u;tP9Ze)spB!TH+!k0kjm?9TaEqM'"N]I#K68.sENFpG-:BpL"k5<Kf;Cll]p\0.VBgJXYV7bkGELTah(>RWGsjL14<<sE:X3JW]):L"_a3kcBH&.(2Ui6d<sTVSq6Hb2oGX:N_j+E^^?#]Brd-YC/q4&j48+1kK<)&.;T[(aqqDC]Z:"7NhLIO5&BeU-SXc>m&CBpaKs!LDjG%ZThfU"?o'^kNBIDSK`iDtti"$XI@8UFlc6'SU-*bNc0g0qZU.!3k5:g&J5%_GC<>>?gMnQALpYLh_[CK2YgZMfbcG=f9!FE+GUto#YS.Ms0oqhppBVFIgKikS7
3+[l77Q]oKP@6W$g?K-&@CV\IU2l9TjknpF,mi!B#31b$WjNK[_SbrBad@)rY(?tsa<O6[4Wd_rrCQDK9Fn>?noi;W9O[OS+IIT/JfXc1#&UNmnp:7_c6/aW-".*cBVjc)"DgbY/sJI7I:YTo4n^?6D*J='=Z.q\rms\C-WjI*igXS;Vfo_`Y\`_c8K6r=SE5*WPmG+p6'k&&0eDn]e<kl$//^-6n?[M0Wg:@`F!W*m'jM%_,JfY,&JA=T)'Qh]O:`+1#oOo&Q&lRj>R;8k_3L)o&mP_\+Z6EUKR,;fQ&lT(!/=AJY_cg)ZFd_iV[)$d`W(]Q<7PapAWM]!Y4qoZ_Z9e(Tts:W\F3=&Fc,Mp(`=cJUG]7+gkp\4)_AlnCa6gYn$_U<4&\@+ERR^ZIFpqbN%;:=3k=P0?fC_:0&Ug9TYUE[kI#Bq;+mc:C\/7H:]h-hq^[P:`jZ\paVNd3BCYF4eruZ)J"F0tYQ"^`hf0;~>endstream
|
||||
endobj
|
||||
7 0 obj
|
||||
<<
|
||||
/Contents 11 0 R /MediaBox [ 0 0 216 792 ] /Parent 10 0 R /Resources <<
|
||||
/Font 1 0 R /ProcSet [ /PDF /Text /ImageB /ImageC /ImageI ] /XObject <<
|
||||
/FormXob.2a351979d8c75d073b2ea4bfb74718f9 6 0 R
|
||||
>>
|
||||
>> /Rotate 0 /Trans <<
|
||||
|
||||
>>
|
||||
/Type /Page
|
||||
>>
|
||||
endobj
|
||||
8 0 obj
|
||||
<<
|
||||
/PageMode /UseNone /Pages 10 0 R /Type /Catalog
|
||||
>>
|
||||
endobj
|
||||
9 0 obj
|
||||
<<
|
||||
/Author (\(anonymous\)) /CreationDate (D:20251205104951+01'00') /Creator (\(unspecified\)) /Keywords () /ModDate (D:20251205104951+01'00') /Producer (ReportLab PDF Library - www.reportlab.com)
|
||||
/Subject (\(unspecified\)) /Title (\(anonymous\)) /Trapped /False
|
||||
>>
|
||||
endobj
|
||||
10 0 obj
|
||||
<<
|
||||
/Count 1 /Kids [ 7 0 R ] /Type /Pages
|
||||
>>
|
||||
endobj
|
||||
11 0 obj
|
||||
<<
|
||||
/Filter [ /ASCII85Decode /FlateDecode ] /Length 1981
|
||||
>>
|
||||
stream
|
||||
GauI8D/\/e&:hOi=55KJ8;^%$X%4*F0ZR4[TQ-.JlIl=8,25emeWsH3DOUP#SK,Tt>!%\QTf-@>5)IUB4PgI,!l6hOHqp".7kqY?VEc$;f0G$C+?kE+IduY+D6B[cY<`?1&+Bf$SSWa.DI28'F?:CpG_mY"TO]hkXiFku&",h"4G/8GlCSK3UDT`3W#q'Qc;4>t6\tbspa(/l?]"D>nboQo,(,[\*-A;J=Ru^j=[Nu\:iMk7q<+PGo*gWpT4_C)j)t7Oc[5MlffWrhj!99;,0?]r3R(ns^B^I*KQ#f[([aS1g);Q.rrBep&6)sVJs=\.1^pkCa(tBfECI75_;C0LC.)*n<3;@eFZT<Brd%CYO%*fRCl_%R2PLtnF>0lg_SFKN$O.X\o%U7_58YJ,X[`p,%PUL^1]EgT]T\4*B3hOrEZA:[=ui88pZGht%klE^OC$2=@$GiMoTO<eR\C2;10r\O.%_7=c.`)*@0N>,CLh>2ZDq?"(LrLS)ajJ<DG(:N]5MPuT)E6J8.)!Ud7D>Je7M&(V1i'>Z@qg3/WJ@PpL4nr1qU$V_#jqpM+M<[H(LE:pW6uTQK`^P%Q'T&[*Y1\T_7.O:+n^Y)3+d56\hGsIrnqB85q[4L1WG#"Uo.d]Zr"jG`qiT=AU8b5]p3(N2IfnWHVO&$rnNe6[_$[o(m=2Sq-C[bbNOS,qIb?:TGYHhQjjcCe!9%*cuscgU*Eea^B?#^HtoE9p(jd_GR#E1\LTm:MVS5e]+<LZRQ]0^iZNnTs\Iq5l6H+?80%j?IX^UR28jY=Vr!:#Jf=D$QdR#4X6Q%Z^E6hq4[p#rHu/mN!PgeQCn_hEI9M3(_b.pAid<e0?KUakkhL+Sq1A+!S(V0h+OF8nn#&[1+6p+D5^<OP@s\HS\itN&+apX>;a8<=<fVm-cM(u=Q31UTuXZiRNk/X^o=e`8?ha=(l,J:AYGq&k'81mIs68U9).dPb@tBY#5s6I1(;=p-FNV8JLO6-b%6]BUEO#*P:YXQ,+2T8!GeA%OFD*^H'I<qo6Q\KGcc$9<-q;oCHo2eF[F/t'nG3p8bj]<0qUd^A*Un<D3^J]5-EeYaHZL0]dZ(aldZ?U]EL]o1@j;L=l_$._u&B5KRKtY900d3EO'mY6&5WB,D7o]o+7,#h.[N58L+)*ks!_/dIq7L<$Q/>:Ym/3(NJmP3]c2J81f'[9A229?.>nW.Y"uioK$/X(RLnTFa0nhiu#_V(M%6pL-[3&IEZO^iW'pcgSC4%cs*UfWL8=h@<SF-6Ml.SK#\%/6pL;XKVP08]+YR4.1^h^3g+iL6p2jNFKi##9N\7TS:EE')be-k57a"IMZoUV#>]Eoq_3uS@i.]*ai31P3"'$;S,Q%'"=$Vq"-_!pX<>bA]?nd=dOpZa\$"k!cpI9L2SO3gBd7]TKi)s)3F/ADnb^3N)iF')M[\Fq1\^PAl):!YpJp!B\s/2LkZr*`(o%fTO.qa?N[7\P_Dj!-3=OAO3]DtNKn-R1Hc#$?$h;RW[;B(k%DrOQ4lA(kZ`8[,.5E\H/%&9RE1k-pKk^'?Wseh?':/9RD3&&Xf\j=9;Sdd#l!b5li$Q.?@#FtJ9r"D*THt>o=+h,ei<3VCI\e<F.YjMklHmQ252@7%.$dl?FX[.5Ru_<cOnWObGU$sud3sn0Nm?VSip=&P_8=3<b5l"NFchqcT66k!jof;<?"kHRj[>Q+FB(V85-;*H\(QoM*>m2@9WKA`dV0$2F]lQ!^cKY?-F<<RYBD9:P`&#<:DJpSA0L]L_Q`8=5'6'r`p_54_;lcH+H=)4\l8B7YE#pX>K&Mf4jEn:L@C'pmu(T(NAo?onFtPTH*Mah:OJII8OF6<oMipM;1-5S+stnT,o"n]+UmpI>_OX,SeHS'86i`]nN=oW_Hjm1lb%agT!)1^3rJWom\/,?BYNjThVR,cQ'opa8Q#<G9<qeSN'GRRO*(AC(K$'<9uMACIm=MV?Mk2Q*3P"~>endstream
|
||||
endobj
|
||||
xref
|
||||
0 12
|
||||
0000000000 65535 f
|
||||
0000000073 00000 n
|
||||
0000000134 00000 n
|
||||
0000000241 00000 n
|
||||
0000000353 00000 n
|
||||
0000000458 00000 n
|
||||
0000000568 00000 n
|
||||
0000005249 00000 n
|
||||
0000005507 00000 n
|
||||
0000005576 00000 n
|
||||
0000005859 00000 n
|
||||
0000005919 00000 n
|
||||
trailer
|
||||
<<
|
||||
/ID
|
||||
[<4800d64fefba4dd902e51197c7da4e88><4800d64fefba4dd902e51197c7da4e88>]
|
||||
% ReportLab generated PDF document -- digest (http://www.reportlab.com)
|
||||
|
||||
/Info 9 0 R
|
||||
/Root 8 0 R
|
||||
/Size 12
|
||||
>>
|
||||
startxref
|
||||
7992
|
||||
%%EOF
|
||||
Binary file not shown.
+115
File diff suppressed because one or more lines are too long
@@ -0,0 +1,871 @@
|
||||
#!/usr/bin/env python3 -m pytest
|
||||
"""Tests for PDF table extraction functionality."""
|
||||
import os
|
||||
import re
|
||||
import pytest
|
||||
|
||||
from markitdown import MarkItDown
|
||||
|
||||
TEST_FILES_DIR = os.path.join(os.path.dirname(__file__), "test_files")
|
||||
|
||||
|
||||
# --- Helper Functions ---
|
||||
def validate_strings(result, expected_strings, exclude_strings=None):
    """Assert that every expected string appears in the conversion result
    (and every excluded string does not), after stripping backslashes."""
    haystack = result.text_content.replace("\\", "")
    for needle in expected_strings:
        assert needle in haystack, f"Expected string not found: {needle}"
    for needle in exclude_strings or ():
        assert needle not in haystack, f"Excluded string found: {needle}"
|
||||
|
||||
|
||||
def validate_markdown_table(result, expected_headers, expected_data_samples):
    """Assert the result contains a markdown table with the given headers
    and a sample of expected data values."""
    content = result.text_content

    # A pipe character is the minimal evidence of a markdown table.
    assert "|" in content, "No markdown table markers found"

    for expected in expected_headers:
        assert expected in content, f"Expected table header not found: {expected}"

    for expected in expected_data_samples:
        assert expected in content, f"Expected table data not found: {expected}"
|
||||
|
||||
|
||||
def extract_markdown_tables(text_content):
    """
    Extract all markdown tables from text content.

    Returns a list of tables, where each table is a list of rows and each
    row is a list of stripped cell values. Separator rows (dashes/pipes)
    are dropped; any non-table line terminates the current table.
    """
    separator_pattern = re.compile(r"^\|[\s\-|]+\|$")
    tables = []
    pending = []

    for raw_line in text_content.split("\n"):
        stripped = raw_line.strip()
        if stripped.startswith("|") and stripped.endswith("|"):
            # Header/body divider rows carry no data.
            if separator_pattern.match(stripped):
                continue
            pending.append([cell.strip() for cell in stripped.split("|")[1:-1]])
        elif pending:
            # A non-table line closes the table being accumulated.
            tables.append(pending)
            pending = []

    # Flush a table that runs to the end of the text.
    if pending:
        tables.append(pending)

    return tables
|
||||
|
||||
|
||||
def validate_table_structure(table):
    """
    Check that a parsed table is structurally consistent.

    A valid table has a header row plus at least one data row, at least two
    columns, and the same number of columns in every row.

    Returns:
        Tuple of (is_valid, message).
    """
    if not table:
        return False, "Table is empty"

    if len(table) < 2:
        return False, "Table should have at least header and one data row"

    expected = len(table[0])
    if expected < 2:
        return False, f"Table should have at least 2 columns, found {expected}"

    # First row whose width disagrees with the header, if any.
    mismatch = next(
        (idx for idx, row in enumerate(table) if len(row) != expected), None
    )
    if mismatch is not None:
        row = table[mismatch]
        return False, f"Row {mismatch} has {len(row)} columns, expected {expected}"

    return True, "Table structure is valid"
|
||||
|
||||
|
||||
class TestPdfTableExtraction:
    """Test PDF table extraction with various PDF types.

    NOTE: relies on ``TEST_FILES_DIR``, ``MarkItDown`` and
    ``validate_strings`` defined elsewhere in this test module.
    """

    @pytest.fixture
    def markitdown(self):
        """Create MarkItDown instance."""
        return MarkItDown()

    def test_borderless_table_extraction(self, markitdown):
        """Test extraction of borderless tables from SPARSE inventory PDF.

        Expected output structure:
        - Header: INVENTORY RECONCILIATION REPORT with Report ID, Warehouse, Date, Prepared By
        - Pipe-separated rows with inventory data
        - Text section: Variance Analysis with Summary Statistics
        - More pipe-separated rows with extended inventory review
        - Footer: Recommendations section
        """
        pdf_path = os.path.join(
            TEST_FILES_DIR, "SPARSE-2024-INV-1234_borderless_table.pdf"
        )

        # Fixture PDFs may be absent in trimmed checkouts; skip rather than fail.
        if not os.path.exists(pdf_path):
            pytest.skip(f"Test file not found: {pdf_path}")

        result = markitdown.convert(pdf_path)
        text_content = result.text_content

        # Validate document header content
        expected_strings = [
            "INVENTORY RECONCILIATION REPORT",
            "Report ID: SPARSE-2024-INV-1234",
            "Warehouse: Distribution Center East",
            "Report Date: 2024-11-15",
            "Prepared By: Sarah Martinez",
        ]
        validate_strings(result, expected_strings)

        # Validate pipe-separated format is used
        assert "|" in text_content, "Should have pipe separators for form-style data"

        # --- Validate First Table Data (Inventory Variance) ---
        # Validate table headers are present
        first_table_headers = [
            "Product Code",
            "Location",
            "Expected",
            "Actual",
            "Variance",
            "Status",
        ]
        for header in first_table_headers:
            assert header in text_content, f"Should contain header '{header}'"

        # Validate first table has all expected SKUs
        first_table_skus = ["SKU-8847", "SKU-9201", "SKU-4563", "SKU-7728"]
        for sku in first_table_skus:
            assert sku in text_content, f"Should contain {sku}"

        # Validate first table has correct status values
        # NOTE(review): "OK" is a substring of many words (e.g. "BOOK"), so this
        # check is weaker than it looks -- confirm against actual fixture content.
        expected_statuses = ["OK", "CRITICAL"]
        for status in expected_statuses:
            assert status in text_content, f"Should contain status '{status}'"

        # Validate first table has location codes
        expected_locations = ["A-12", "B-07", "C-15", "D-22", "A-08"]
        for loc in expected_locations:
            assert loc in text_content, f"Should contain location '{loc}'"

        # --- Validate Second Table Data (Extended Inventory Review) ---
        # Validate second table headers
        second_table_headers = [
            "Category",
            "Unit Cost",
            "Total Value",
            "Last Audit",
            "Notes",
        ]
        for header in second_table_headers:
            assert header in text_content, f"Should contain header '{header}'"

        # Validate second table has all expected SKUs (10 products)
        second_table_skus = [
            "SKU-8847",
            "SKU-9201",
            "SKU-4563",
            "SKU-7728",
            "SKU-3345",
            "SKU-5512",
            "SKU-6678",
            "SKU-7789",
            "SKU-2234",
            "SKU-1123",
        ]
        for sku in second_table_skus:
            assert sku in text_content, f"Should contain {sku}"

        # Validate second table has categories
        expected_categories = ["Electronics", "Hardware", "Software", "Accessories"]
        for category in expected_categories:
            assert category in text_content, f"Should contain category '{category}'"

        # Validate second table has cost values (spot check)
        expected_costs = ["$45.00", "$32.50", "$120.00", "$15.75"]
        for cost in expected_costs:
            assert cost in text_content, f"Should contain cost '{cost}'"

        # Validate second table has note values
        expected_notes = ["Verified", "Critical", "Pending"]
        for note in expected_notes:
            assert note in text_content, f"Should contain note '{note}'"

        # --- Validate Analysis Text Section ---
        analysis_strings = [
            "Variance Analysis:",
            "Summary Statistics:",
            "Total Variance Cost: $4,287.50",
            "Critical Items: 1",
            "Overall Accuracy: 97.2%",
            "Recommendations:",
        ]
        validate_strings(result, analysis_strings)

        # --- Validate Document Structure Order ---
        # Verify sections appear in correct order
        # Note: Using flexible patterns since column merging may occur based on gap detection
        import re

        header_pos = text_content.find("INVENTORY RECONCILIATION REPORT")
        # Look for Product Code header - may be in same column as Location or separate
        first_table_match = re.search(r"\|\s*Product Code", text_content)
        variance_pos = text_content.find("Variance Analysis:")
        extended_review_pos = text_content.find("Extended Inventory Review:")
        # Second table - look for SKU entries after extended review section
        # The table may not have pipes on every row due to paragraph detection
        second_table_pos = -1
        if extended_review_pos != -1:
            # Look for either "| Product Code" or "Product Code" as table header
            second_table_match = re.search(
                r"Product Code.*Category", text_content[extended_review_pos:]
            )
            if second_table_match:
                # Adjust position to be relative to full text
                second_table_pos = extended_review_pos + second_table_match.start()
        recommendations_pos = text_content.find("Recommendations:")

        positions = {
            "header": header_pos,
            "first_table": first_table_match.start() if first_table_match else -1,
            "variance_analysis": variance_pos,
            "extended_review": extended_review_pos,
            "second_table": second_table_pos,
            "recommendations": recommendations_pos,
        }

        # All sections should be found
        for name, pos in positions.items():
            assert pos != -1, f"Section '{name}' not found in output"

        # Verify correct order
        assert (
            positions["header"] < positions["first_table"]
        ), "Header should come before first table"
        assert (
            positions["first_table"] < positions["variance_analysis"]
        ), "First table should come before Variance Analysis"
        assert (
            positions["variance_analysis"] < positions["extended_review"]
        ), "Variance Analysis should come before Extended Review"
        assert (
            positions["extended_review"] < positions["second_table"]
        ), "Extended Review should come before second table"
        assert (
            positions["second_table"] < positions["recommendations"]
        ), "Second table should come before Recommendations"

    def test_borderless_table_no_duplication(self, markitdown):
        """Test that borderless table content is not duplicated excessively."""
        pdf_path = os.path.join(
            TEST_FILES_DIR, "SPARSE-2024-INV-1234_borderless_table.pdf"
        )

        if not os.path.exists(pdf_path):
            pytest.skip(f"Test file not found: {pdf_path}")

        result = markitdown.convert(pdf_path)
        text_content = result.text_content

        # Count occurrences of unique table data - should not be excessively duplicated
        # SKU-8847 appears in both tables, plus possibly once in summary text
        sku_count = text_content.count("SKU-8847")
        # Should appear at most 4 times (2 tables + minor text references), not more
        assert (
            sku_count <= 4
        ), f"SKU-8847 appears too many times ({sku_count}), suggests duplication issue"

    def test_borderless_table_correct_position(self, markitdown):
        """Test that tables appear in correct positions relative to text."""
        pdf_path = os.path.join(
            TEST_FILES_DIR, "SPARSE-2024-INV-1234_borderless_table.pdf"
        )

        if not os.path.exists(pdf_path):
            pytest.skip(f"Test file not found: {pdf_path}")

        result = markitdown.convert(pdf_path)
        text_content = result.text_content

        # Verify content order - header should come before table content, which should come before analysis
        header_pos = text_content.find("Prepared By: Sarah Martinez")
        # Look for Product Code in any pipe-separated format
        product_code_pos = text_content.find("Product Code")
        variance_pos = text_content.find("Variance Analysis:")

        assert header_pos != -1, "Header should be found"
        assert product_code_pos != -1, "Product Code should be found"
        assert variance_pos != -1, "Variance Analysis should be found"

        assert (
            header_pos < product_code_pos < variance_pos
        ), "Product data should appear between header and Variance Analysis"

        # Second table content should appear after "Extended Inventory Review"
        extended_review_pos = text_content.find("Extended Inventory Review:")
        # Look for Category header which is in second table
        category_pos = text_content.find("Category")
        recommendations_pos = text_content.find("Recommendations:")

        # Deliberately best-effort: order is only checked when all three
        # anchors are present, so missing sections don't fail this test.
        if (
            extended_review_pos != -1
            and category_pos != -1
            and recommendations_pos != -1
        ):
            # Find Category position after Extended Inventory Review
            category_after_review = text_content.find("Category", extended_review_pos)
            if category_after_review != -1:
                assert (
                    extended_review_pos < category_after_review < recommendations_pos
                ), "Extended review table should appear between Extended Inventory Review and Recommendations"

    def test_receipt_pdf_extraction(self, markitdown):
        """Test extraction of receipt PDF (no tables, formatted text).

        Expected output structure:
        - Store header: TECHMART ELECTRONICS with address
        - Transaction info: Store #, date, TXN, Cashier, Register
        - Line items: 6 products with prices and member discounts
        - Totals: Subtotal, Member Discount, Sales Tax, Rewards, TOTAL
        - Payment info: Visa Card, Auth, Ref
        - Rewards member info: Name, ID, Points
        - Return policy and footer
        """
        pdf_path = os.path.join(
            TEST_FILES_DIR, "RECEIPT-2024-TXN-98765_retail_purchase.pdf"
        )

        if not os.path.exists(pdf_path):
            pytest.skip(f"Test file not found: {pdf_path}")

        result = markitdown.convert(pdf_path)
        text_content = result.text_content

        # --- Validate Store Header ---
        store_header = [
            "TECHMART ELECTRONICS",
            "4567 Innovation Blvd",
            "San Francisco, CA 94103",
            "(415) 555-0199",
        ]
        validate_strings(result, store_header)

        # --- Validate Transaction Info ---
        transaction_info = [
            "Store #0342 - Downtown SF",
            "11/23/2024",
            "TXN: TXN-98765-2024",
            "Cashier: Emily Rodriguez",
            "Register: POS-07",
        ]
        validate_strings(result, transaction_info)

        # --- Validate Line Items (6 products) ---
        line_items = [
            # Product 1: Headphones
            "Wireless Noise-Cancelling",
            "Headphones - Premium Black",
            "AUDIO-5521",
            "$349.99",
            "$299.99",
            # Product 2: USB-C Hub
            "USB-C Hub 7-in-1 Adapter",
            "ACC-8834",
            "$79.99",
            "$159.98",
            # Product 3: Portable SSD
            "Portable SSD 2TB",
            "STOR-2241",
            "$289.00",
            "$260.00",
            # Product 4: Wireless Mouse
            "Ergonomic Wireless Mouse",
            "ACC-9012",
            "$59.99",
            # Product 5: Screen Cleaning Kit
            "Screen Cleaning Kit",
            "CARE-1156",
            "$12.99",
            "$38.97",
            # Product 6: HDMI Cable
            "HDMI 2.1 Cable 6ft",
            "CABLE-7789",
            "$24.99",
            "$44.98",
        ]
        validate_strings(result, line_items)

        # --- Validate Totals ---
        totals = [
            "SUBTOTAL",
            "$863.91",
            "Member Discount",
            "Sales Tax (8.5%)",
            "$66.23",
            "Rewards Applied",
            "-$25.00",
            "TOTAL",
            "$821.14",
        ]
        validate_strings(result, totals)

        # --- Validate Payment Info ---
        payment_info = [
            "PAYMENT METHOD",
            "Visa Card ending in 4782",
            "Auth: 847392",
            "REF-20241123-98765",
        ]
        validate_strings(result, payment_info)

        # --- Validate Rewards Member Info ---
        rewards_info = [
            "REWARDS MEMBER",
            "Sarah Mitchell",
            "ID: TM-447821",
            "Points Earned: 821",
            "Total Points: 3,247",
        ]
        validate_strings(result, rewards_info)

        # --- Validate Return Policy & Footer ---
        footer_info = [
            "RETURN POLICY",
            "Returns within 30 days",
            "Receipt required",
            "Thank you for shopping!",
            "www.techmart.example.com",
        ]
        validate_strings(result, footer_info)

        # --- Validate Document Structure Order ---
        # NOTE(review): find("TOTAL") matches inside "SUBTOTAL" first, so
        # positions["total"] is really subtotal_pos + 3; the subtotal < total
        # ordering assertion below passes trivially -- consider searching for
        # "TOTAL" starting after the SUBTOTAL match.
        positions = {
            "store_header": text_content.find("TECHMART ELECTRONICS"),
            "transaction": text_content.find("TXN: TXN-98765-2024"),
            "first_item": text_content.find("Wireless Noise-Cancelling"),
            "subtotal": text_content.find("SUBTOTAL"),
            "total": text_content.find("TOTAL"),
            "payment": text_content.find("PAYMENT METHOD"),
            "rewards": text_content.find("REWARDS MEMBER"),
            "return_policy": text_content.find("RETURN POLICY"),
        }

        # All sections should be found
        for name, pos in positions.items():
            assert pos != -1, f"Section '{name}' not found in output"

        # Verify correct order
        assert (
            positions["store_header"] < positions["transaction"]
        ), "Store header should come before transaction"
        assert (
            positions["transaction"] < positions["first_item"]
        ), "Transaction should come before items"
        assert (
            positions["first_item"] < positions["subtotal"]
        ), "Items should come before subtotal"
        assert (
            positions["subtotal"] < positions["total"]
        ), "Subtotal should come before total"
        assert (
            positions["total"] < positions["payment"]
        ), "Total should come before payment"
        assert (
            positions["payment"] < positions["rewards"]
        ), "Payment should come before rewards"
        assert (
            positions["rewards"] < positions["return_policy"]
        ), "Rewards should come before return policy"

    def test_multipage_invoice_extraction(self, markitdown):
        """Test extraction of multipage invoice PDF with form-style layout.

        Expected output: Pipe-separated format with clear cell boundaries.
        Form data should be extracted with pipes indicating column separations.
        """
        pdf_path = os.path.join(TEST_FILES_DIR, "REPAIR-2022-INV-001_multipage.pdf")

        if not os.path.exists(pdf_path):
            pytest.skip(f"Test file not found: {pdf_path}")

        result = markitdown.convert(pdf_path)
        text_content = result.text_content

        # Validate basic content is extracted
        expected_strings = [
            "ZAVA AUTO REPAIR",
            "Collision Repair",
            "Redmond, WA",
            "Gabriel Diaz",
            "Jeep",
            "Grand Cherokee",
            "Parts",
            "Body Labor",
            "Paint Labor",
            "GRAND TOTAL",
            # Second page content
            "Bruce Wayne",
            "Batmobile",
        ]
        validate_strings(result, expected_strings)

        # Validate pipe-separated table format
        # Form-style documents should use pipes to separate cells
        assert "|" in text_content, "Form-style PDF should contain pipe separators"

        # Validate key form fields are properly separated
        # These patterns check that label and value are in separate cells
        # Note: cells may have padding spaces for column alignment
        import re

        assert re.search(
            r"\| Insured name\s*\|", text_content
        ), "Insured name should be in its own cell"
        assert re.search(
            r"\| Gabriel Diaz\s*\|", text_content
        ), "Gabriel Diaz should be in its own cell"
        assert re.search(
            r"\| Year\s*\|", text_content
        ), "Year label should be in its own cell"
        assert re.search(
            r"\| 2022\s*\|", text_content
        ), "Year value should be in its own cell"

        # Validate table structure for estimate totals
        assert (
            re.search(r"\| Hours\s*\|", text_content) or "Hours |" in text_content
        ), "Hours column header should be present"
        assert (
            re.search(r"\| Rate\s*\|", text_content) or "Rate |" in text_content
        ), "Rate column header should be present"
        assert (
            re.search(r"\| Cost\s*\|", text_content) or "Cost |" in text_content
        ), "Cost column header should be present"

        # Validate numeric values are extracted
        assert "2,100" in text_content, "Parts cost should be extracted"
        assert "300" in text_content, "Body labor cost should be extracted"
        assert "225" in text_content, "Paint labor cost should be extracted"
        assert "5,738" in text_content, "Grand total should be extracted"

        # Validate second page content (Bruce Wayne invoice)
        assert "Bruce Wayne" in text_content, "Second page customer name"
        assert "Batmobile" in text_content, "Second page vehicle model"
        assert "211,522" in text_content, "Second page grand total"

        # Validate disclaimer text is NOT in table format (long paragraph)
        # The disclaimer should be extracted as plain text, not pipe-separated
        assert (
            "preliminary estimate" in text_content.lower()
        ), "Disclaimer text should be present"

    def test_academic_pdf_extraction(self, markitdown):
        """Test extraction of academic paper PDF (scientific document).

        Expected output: Plain text without tables or pipe characters.
        Scientific documents should be extracted as flowing text with proper spacing,
        not misinterpreted as tables.
        """
        pdf_path = os.path.join(TEST_FILES_DIR, "test.pdf")

        if not os.path.exists(pdf_path):
            pytest.skip(f"Test file not found: {pdf_path}")

        result = markitdown.convert(pdf_path)
        text_content = result.text_content

        # Validate academic paper content with proper spacing
        expected_strings = [
            "Introduction",
            "Large language models",  # Should have proper spacing, not "Largelanguagemodels"
            "agents",
            "multi-agent",  # Should be properly hyphenated
        ]
        validate_strings(result, expected_strings)

        # Validate proper text formatting (words separated by spaces)
        assert "LLMs" in text_content, "Should contain 'LLMs' acronym"
        assert "reasoning" in text_content, "Should contain 'reasoning'"
        assert "observations" in text_content, "Should contain 'observations'"

        # Ensure content is not empty and has proper length
        assert len(text_content) > 1000, "Academic PDF should have substantial content"

        # Scientific documents should NOT have tables or pipe characters
        assert (
            "|" not in text_content
        ), "Scientific document should not contain pipe characters (no tables)"

        # Verify no markdown tables were extracted
        tables = extract_markdown_tables(text_content)
        assert (
            len(tables) == 0
        ), f"Scientific document should have no tables, found {len(tables)}"

        # Verify text is properly formatted with spaces between words
        # Check that common phrases are NOT joined together (which would indicate bad extraction)
        assert (
            "Largelanguagemodels" not in text_content
        ), "Text should have proper spacing, not joined words"
        assert (
            "multiagentconversations" not in text_content.lower()
        ), "Text should have proper spacing between words"

    def test_scanned_pdf_handling(self, markitdown):
        """Test handling of scanned/image-based PDF (no text layer).

        Expected output: Empty - scanned PDFs without OCR have no text layer.
        """
        pdf_path = os.path.join(
            TEST_FILES_DIR, "MEDRPT-2024-PAT-3847_medical_report_scan.pdf"
        )

        if not os.path.exists(pdf_path):
            pytest.skip(f"Test file not found: {pdf_path}")

        result = markitdown.convert(pdf_path)

        # Scanned PDFs without OCR have no text layer, so extraction should be empty
        assert (
            result is not None
        ), "Converter should return a result even for scanned PDFs"
        assert result.text_content is not None, "text_content should not be None"

        # Verify extraction is empty (no text layer in scanned PDF)
        assert (
            result.text_content.strip() == ""
        ), f"Scanned PDF should have empty extraction, got: '{result.text_content[:100]}...'"
|
||||
|
||||
|
||||
class TestPdfTableMarkdownFormat:
    """Verify that tables extracted from PDFs are rendered as pipe-delimited markdown."""

    @pytest.fixture
    def markitdown(self):
        """Provide a fresh MarkItDown converter for each test."""
        return MarkItDown()

    @staticmethod
    def _pipe_delimited_lines(text):
        """Return every line of *text* that both starts and ends with a pipe."""
        return [
            candidate
            for candidate in text.split("\n")
            if candidate.startswith("|") and candidate.endswith("|")
        ]

    def test_markdown_table_has_pipe_format(self, markitdown):
        """Form-style PDFs should be rendered with pipe-separated rows."""
        pdf_path = os.path.join(
            TEST_FILES_DIR, "SPARSE-2024-INV-1234_borderless_table.pdf"
        )
        if not os.path.exists(pdf_path):
            pytest.skip(f"Test file not found: {pdf_path}")

        converted = markitdown.convert(pdf_path)
        pipe_rows = self._pipe_delimited_lines(converted.text_content)

        assert len(pipe_rows) > 0, "Should have pipe-separated rows"

        # The first table's header cell must appear inside a pipe-delimited row.
        assert any(
            "Product Code" in row for row in pipe_rows
        ), "Product Code should be in pipe-separated format"

    def test_markdown_table_columns_have_pipes(self, markitdown):
        """Columns in form-style PDFs should be delimited by pipes."""
        pdf_path = os.path.join(
            TEST_FILES_DIR, "SPARSE-2024-INV-1234_borderless_table.pdf"
        )
        if not os.path.exists(pdf_path):
            pytest.skip(f"Test file not found: {pdf_path}")

        converted = markitdown.convert(pdf_path)
        table_rows = self._pipe_delimited_lines(converted.text_content)

        assert len(table_rows) > 0, "Should have markdown table rows"

        # A row containing at least three pipes encloses two or more cells.
        multi_col_rows = [row for row in table_rows if row.count("|") >= 3]
        assert (
            len(multi_col_rows) > 5
        ), f"Should have rows with multiple columns, found {len(multi_col_rows)}"
|
||||
|
||||
|
||||
class TestPdfTableStructureConsistency:
    """Test that extracted tables have consistent structure across all PDF types.

    NOTE: relies on ``TEST_FILES_DIR``, ``MarkItDown`` and
    ``extract_markdown_tables`` defined elsewhere in this test module.
    """

    @pytest.fixture
    def markitdown(self):
        """Create MarkItDown instance."""
        return MarkItDown()

    def test_borderless_table_structure(self, markitdown):
        """Test that borderless table PDF has pipe-separated structure."""
        pdf_path = os.path.join(
            TEST_FILES_DIR, "SPARSE-2024-INV-1234_borderless_table.pdf"
        )

        if not os.path.exists(pdf_path):
            pytest.skip(f"Test file not found: {pdf_path}")

        result = markitdown.convert(pdf_path)
        text_content = result.text_content

        # Should have pipe-separated content
        assert "|" in text_content, "Borderless table PDF should have pipe separators"

        # Check that key content is present
        assert "Product Code" in text_content, "Should contain Product Code"
        assert "SKU-8847" in text_content, "Should contain first SKU"
        assert "SKU-9201" in text_content, "Should contain second SKU"

    def test_multipage_invoice_table_structure(self, markitdown):
        """Test that multipage invoice PDF has pipe-separated format."""
        pdf_path = os.path.join(TEST_FILES_DIR, "REPAIR-2022-INV-001_multipage.pdf")

        if not os.path.exists(pdf_path):
            pytest.skip(f"Test file not found: {pdf_path}")

        result = markitdown.convert(pdf_path)
        text_content = result.text_content

        # Should have pipe-separated content
        assert "|" in text_content, "Invoice PDF should have pipe separators"

        # Find rows with pipes
        lines = text_content.split("\n")
        pipe_rows = [
            line for line in lines if line.startswith("|") and line.endswith("|")
        ]

        assert (
            len(pipe_rows) > 10
        ), f"Should have multiple pipe-separated rows, found {len(pipe_rows)}"

        # Check that some rows have multiple columns
        # (>= 4 pipes means at least three enclosed cells)
        multi_col_rows = [row for row in pipe_rows if row.count("|") >= 4]
        assert len(multi_col_rows) > 5, "Should have rows with 3+ columns"

    def test_receipt_has_no_tables(self, markitdown):
        """Test that receipt PDF doesn't incorrectly extract tables from formatted text."""
        pdf_path = os.path.join(
            TEST_FILES_DIR, "RECEIPT-2024-TXN-98765_retail_purchase.pdf"
        )

        if not os.path.exists(pdf_path):
            pytest.skip(f"Test file not found: {pdf_path}")

        result = markitdown.convert(pdf_path)
        tables = extract_markdown_tables(result.text_content)

        # Receipt should not have markdown tables extracted
        # (it's formatted text, not tabular data)
        # If tables are extracted, they should be minimal/empty
        total_table_rows = sum(len(t) for t in tables)
        assert (
            total_table_rows < 5
        ), f"Receipt should not have significant tables, found {total_table_rows} rows"

    def test_scanned_pdf_no_tables(self, markitdown):
        """Test that scanned PDF has empty extraction and no tables."""
        pdf_path = os.path.join(
            TEST_FILES_DIR, "MEDRPT-2024-PAT-3847_medical_report_scan.pdf"
        )

        if not os.path.exists(pdf_path):
            pytest.skip(f"Test file not found: {pdf_path}")

        result = markitdown.convert(pdf_path)

        # Scanned PDF with no text layer should have empty extraction
        assert (
            result.text_content.strip() == ""
        ), "Scanned PDF should have empty extraction"

        tables = extract_markdown_tables(result.text_content)

        # Scanned PDF with no text layer should have no tables
        assert len(tables) == 0, "Scanned PDF should have no extracted tables"

    def test_all_pdfs_table_rows_consistent(self, markitdown):
        """Test that all PDF tables have rows with pipe-separated content.

        Note: With gap-based column detection, rows may have different column counts
        depending on how content is spaced in the PDF. What's important is that each
        row has pipe separators and the content is readable.
        """
        pdf_files = [
            "SPARSE-2024-INV-1234_borderless_table.pdf",
            "REPAIR-2022-INV-001_multipage.pdf",
            "RECEIPT-2024-TXN-98765_retail_purchase.pdf",
            "test.pdf",
        ]

        for pdf_file in pdf_files:
            pdf_path = os.path.join(TEST_FILES_DIR, pdf_file)
            # Missing fixtures are silently skipped (unlike pytest.skip above,
            # this keeps the remaining files in the loop checked).
            if not os.path.exists(pdf_path):
                continue

            result = markitdown.convert(pdf_path)
            tables = extract_markdown_tables(result.text_content)

            for table_idx, table in enumerate(tables):
                if not table:
                    continue

                # Verify each row has at least one column (pipe-separated content)
                for row_idx, row in enumerate(table):
                    assert (
                        len(row) >= 1
                    ), f"{pdf_file}: Table {table_idx}, row {row_idx} has no columns"

                    # Verify the row has non-empty content
                    row_content = " ".join(cell.strip() for cell in row)
                    assert (
                        len(row_content.strip()) > 0
                    ), f"{pdf_file}: Table {table_idx}, row {row_idx} is empty"

    def test_borderless_table_data_integrity(self, markitdown):
        """Test that borderless table extraction preserves data integrity."""
        pdf_path = os.path.join(
            TEST_FILES_DIR, "SPARSE-2024-INV-1234_borderless_table.pdf"
        )

        if not os.path.exists(pdf_path):
            pytest.skip(f"Test file not found: {pdf_path}")

        result = markitdown.convert(pdf_path)
        tables = extract_markdown_tables(result.text_content)

        assert len(tables) >= 2, "Should have at least 2 tables"

        # Check first table has expected SKU data
        # (str() of the nested list is a cheap way to substring-search all cells)
        first_table = tables[0]
        table_text = str(first_table)
        assert "SKU-8847" in table_text, "First table should contain SKU-8847"
        assert "SKU-9201" in table_text, "First table should contain SKU-9201"

        # Check second table has expected category data
        second_table = tables[1]
        table_text = str(second_table)
        assert "Electronics" in table_text, "Second table should contain Electronics"
        assert "Hardware" in table_text, "Second table should contain Hardware"
|
||||
Reference in New Issue
Block a user