[MS] Update PDF table extraction to support aligned Markdown (#1499)

* Added PDF table extraction feature with aligned Markdown (#1419)

* Add PDF test files and enhance extraction tests

- Added a medical report scan PDF for testing scanned PDF handling.
- Included a retail purchase receipt PDF to validate receipt extraction functionality.
- Introduced a multipage invoice PDF to test extraction of complex invoice structures.
- Added a borderless table PDF for testing inventory reconciliation report extraction.
- Implemented comprehensive tests for PDF table extraction, ensuring proper structure and data integrity.
- Enhanced existing tests to validate the order and presence of extracted content across various PDF types.

* fix: update dependencies for PDF processing and improve table extraction logic

* Bumped version of pdfminer.six
---------

Authored-by: Ashok <ashh010101@gmail.com>
This commit is contained in:
lesyk
2026-01-08 01:38:45 +01:00
committed by GitHub
parent dde250a456
commit 251dddcf0c
8 changed files with 1501 additions and 21 deletions
+1
View File
@@ -52,6 +52,7 @@ coverage.xml
.hypothesis/
.pytest_cache/
cover/
.test-logs/
# Translations
*.mo
+4 -3
View File
@@ -41,19 +41,20 @@ all = [
"openpyxl",
"xlrd",
"lxml",
"pdfminer.six>=20251107",
"pdfminer.six>=20251230",
"pdfplumber>=0.11.9",
"olefile",
"pydub",
"SpeechRecognition",
"youtube-transcript-api~=1.0.0",
"azure-ai-documentintelligence",
"azure-identity"
"azure-identity",
]
pptx = ["python-pptx"]
docx = ["mammoth~=1.11.0", "lxml"]
xlsx = ["pandas", "openpyxl"]
xls = ["pandas", "xlrd"]
pdf = ["pdfminer.six"]
pdf = ["pdfminer.six>=20251230", "pdfplumber>=0.11.9"]
outlook = ["olefile"]
audio-transcription = ["pydub", "SpeechRecognition"]
youtube-transcription = ["youtube-transcript-api"]
@@ -1,22 +1,18 @@
import sys
import io
from typing import BinaryIO, Any
from .._base_converter import DocumentConverter, DocumentConverterResult
from .._stream_info import StreamInfo
from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE
# Try loading optional (but in this case, required) dependencies
# Save reporting of any exceptions for later
# Load dependencies
_dependency_exc_info = None
try:
import pdfminer
import pdfminer.high_level
import pdfplumber
except ImportError:
# Preserve the error and stack trace for later
_dependency_exc_info = sys.exc_info()
@@ -28,16 +24,374 @@ ACCEPTED_MIME_TYPE_PREFIXES = [
ACCEPTED_FILE_EXTENSIONS = [".pdf"]
def _to_markdown_table(table: list[list[str]], include_separator: bool = True) -> str:
    """Convert a 2D list (rows/columns) into a nicely aligned Markdown table.

    Args:
        table: 2D list of cell values. Rows may be ragged; shorter rows are
            padded with empty cells so no data is silently dropped. Cells may
            be None (rendered empty) or non-str (coerced with str()).
        include_separator: If True, include header separator row (standard markdown).
                          If False, output simple pipe-separated rows.

    Returns:
        The Markdown table as a string, or "" when there is no visible content.
    """
    if not table:
        return ""

    # Normalize: None -> "", and coerce every cell to str so that .strip()
    # and len() below are always safe.
    table = [["" if cell is None else str(cell) for cell in row] for row in table]

    # Filter out rows with no visible content
    table = [row for row in table if any(cell.strip() for cell in row)]
    if not table:
        return ""

    # Pad ragged rows to a rectangular shape; zip(*table) would otherwise
    # silently truncate every row to the shortest one and lose cells.
    num_cols = max(len(row) for row in table)
    table = [row + [""] * (num_cols - len(row)) for row in table]

    # Column width = widest cell in that column.
    col_widths = [max(len(cell) for cell in col) for col in zip(*table)]

    def fmt_row(row: list[str]) -> str:
        # Left-justify each cell to its column width for visual alignment.
        return (
            "|"
            + "|".join(cell.ljust(width) for cell, width in zip(row, col_widths))
            + "|"
        )

    if include_separator:
        header, *rows = table
        md = [fmt_row(header)]
        # Separator dashes match the column width so the table stays aligned.
        md.append("|" + "|".join("-" * w for w in col_widths) + "|")
        md.extend(fmt_row(row) for row in rows)
    else:
        md = [fmt_row(row) for row in table]

    return "\n".join(md)
def _extract_form_content_from_words(page: Any) -> str | None:
    """
    Extract form-style content from a PDF page by analyzing word positions.

    This handles borderless forms/tables where words are aligned in columns.

    Returns markdown with proper table formatting:
    - Tables have pipe-separated columns with header separator rows
    - Non-table content is rendered as plain text

    Returns None if the page doesn't appear to be a form-style document,
    indicating that pdfminer should be used instead for better text spacing.
    """
    # pdfplumber word boxes: dicts carrying "text", "x0"/"x1" (horizontal
    # extent) and "top" (vertical position).
    words = page.extract_words(keep_blank_chars=True, x_tolerance=3, y_tolerance=3)
    if not words:
        return None

    # Group words by their Y position (rows): snap each word's top edge to a
    # 5pt grid so words on the same visual line land in the same bucket.
    y_tolerance = 5
    rows_by_y: dict[float, list[dict]] = {}
    for word in words:
        y_key = round(word["top"] / y_tolerance) * y_tolerance
        if y_key not in rows_by_y:
            rows_by_y[y_key] = []
        rows_by_y[y_key].append(word)

    # Sort rows by Y position (top of page first)
    sorted_y_keys = sorted(rows_by_y.keys())

    # 612pt is US Letter width — fallback when the page object has no width.
    page_width = page.width if hasattr(page, "width") else 612

    # First pass: analyze each row
    row_info: list[dict] = []
    for y_key in sorted_y_keys:
        row_words = sorted(rows_by_y[y_key], key=lambda w: w["x0"])
        if not row_words:
            continue
        first_x0 = row_words[0]["x0"]
        last_x1 = row_words[-1]["x1"]
        line_width = last_x1 - first_x0
        combined_text = " ".join(w["text"] for w in row_words)

        # Count distinct x-position groups (columns); a gap of more than
        # 50pt between word starts begins a new group.
        x_positions = [w["x0"] for w in row_words]
        x_groups: list[float] = []
        for x in sorted(x_positions):
            if not x_groups or x - x_groups[-1] > 50:
                x_groups.append(x)

        # Determine row type: long lines spanning most of the page width are
        # treated as prose paragraphs, never as table rows.
        is_paragraph = line_width > page_width * 0.55 and len(combined_text) > 60

        row_info.append(
            {
                "y_key": y_key,
                "words": row_words,
                "text": combined_text,
                "x_groups": x_groups,
                "is_paragraph": is_paragraph,
                "num_columns": len(x_groups),
            }
        )

    # Collect ALL x-positions from rows with 3+ columns (table-like rows)
    # This gives us the global column structure
    all_table_x_positions: list[float] = []
    for info in row_info:
        if info["num_columns"] >= 3 and not info["is_paragraph"]:
            all_table_x_positions.extend(info["x_groups"])

    if not all_table_x_positions:
        return None

    # Compute global column boundaries (cluster starts with a 30pt gap)
    all_table_x_positions.sort()
    global_columns: list[float] = []
    for x in all_table_x_positions:
        if not global_columns or x - global_columns[-1] > 30:
            global_columns.append(x)

    # Too many columns suggests dense text, not a form
    if len(global_columns) > 8:
        return None

    # Now classify each row as table row or not
    # A row is a table row if it has words that align with 2+ of the global columns
    for info in row_info:
        if info["is_paragraph"]:
            info["is_table_row"] = False
            continue
        # Count how many global columns this row's words align with
        # (within a 40pt horizontal tolerance; first match wins).
        aligned_columns: set[int] = set()
        for word in info["words"]:
            word_x = word["x0"]
            for col_idx, col_x in enumerate(global_columns):
                if abs(word_x - col_x) < 40:
                    aligned_columns.add(col_idx)
                    break
        # If row uses 2+ of the established columns, it's a table row
        info["is_table_row"] = len(aligned_columns) >= 2

    # Find table regions (consecutive table rows)
    table_regions: list[tuple[int, int]] = []  # (start_idx, end_idx)
    i = 0
    while i < len(row_info):
        if row_info[i]["is_table_row"]:
            start_idx = i
            # Consume the whole run of consecutive table rows.
            while i < len(row_info) and row_info[i]["is_table_row"]:
                i += 1
            end_idx = i
            table_regions.append((start_idx, end_idx))
        else:
            i += 1

    # Check if enough rows are table rows (at least 20%);
    # otherwise the page is mostly prose and pdfminer handles it better.
    total_table_rows = sum(end - start for start, end in table_regions)
    if len(row_info) > 0 and total_table_rows / len(row_info) < 0.2:
        return None

    # Build output - collect table data first, then format with proper column widths
    result_lines: list[str] = []
    num_cols = len(global_columns)

    # Helper function to extract cells from a row
    def extract_cells(info: dict) -> list[str]:
        # Distribute the row's words into num_cols cells by comparing each
        # word's x0 against the NEXT column's start (minus a 20pt margin).
        cells: list[str] = ["" for _ in range(num_cols)]
        for word in info["words"]:
            word_x = word["x0"]
            # Find the correct column using boundary ranges
            assigned_col = num_cols - 1  # Default to last column
            for col_idx in range(num_cols - 1):
                col_end = global_columns[col_idx + 1]
                if word_x < col_end - 20:
                    assigned_col = col_idx
                    break
            if cells[assigned_col]:
                cells[assigned_col] += " " + word["text"]
            else:
                cells[assigned_col] = word["text"]
        return cells

    # Process rows, collecting table data for proper formatting
    idx = 0
    while idx < len(row_info):
        info = row_info[idx]

        # Check if this row starts a table region
        table_region = None
        for start, end in table_regions:
            if idx == start:
                table_region = (start, end)
                break

        if table_region:
            start, end = table_region
            # Collect all rows in this table
            table_data: list[list[str]] = []
            for table_idx in range(start, end):
                cells = extract_cells(row_info[table_idx])
                table_data.append(cells)

            # Calculate column widths for this table
            if table_data:
                col_widths = [
                    max(len(row[col]) for row in table_data) for col in range(num_cols)
                ]
                # Ensure minimum width of 3 for separator dashes
                col_widths = [max(w, 3) for w in col_widths]

                # Format header row (first row of the region is the header)
                header = table_data[0]
                header_str = (
                    "| "
                    + " | ".join(
                        cell.ljust(col_widths[i]) for i, cell in enumerate(header)
                    )
                    + " |"
                )
                result_lines.append(header_str)

                # Format separator row
                separator = (
                    "| "
                    + " | ".join("-" * col_widths[i] for i in range(num_cols))
                    + " |"
                )
                result_lines.append(separator)

                # Format data rows
                for row in table_data[1:]:
                    row_str = (
                        "| "
                        + " | ".join(
                            cell.ljust(col_widths[i]) for i, cell in enumerate(row)
                        )
                        + " |"
                    )
                    result_lines.append(row_str)

            idx = end  # Skip to end of table region
        else:
            # Check if we're inside a table region (not at start); such rows
            # were already emitted as part of the region above.
            in_table = False
            for start, end in table_regions:
                if start < idx < end:
                    in_table = True
                    break
            if not in_table:
                # Non-table content
                result_lines.append(info["text"])
            idx += 1

    return "\n".join(result_lines)
def _extract_tables_from_words(page: Any) -> list[list[list[str]]]:
    """
    Detect a borderless table on a PDF page from word coordinates alone.

    Words are bucketed into visual rows by vertical position and into columns
    by clustered horizontal start positions. Designed for structured tabular
    data (like invoices), not for multi-column text layouts in scientific
    documents. Returns a list containing a single table (rows of cell
    strings), or an empty list when the page does not look like a table.
    """
    extracted = page.extract_words(keep_blank_chars=True, x_tolerance=3, y_tolerance=3)
    if not extracted:
        return []

    # Bucket words into rows: snap each word's top edge to a 5pt grid.
    row_step = 5
    buckets: dict[float, list[dict]] = {}
    for w in extracted:
        key = round(w["top"] / row_step) * row_step
        buckets.setdefault(key, []).append(w)

    # Gather every word's left edge to discover candidate column starts.
    left_edges = sorted(w["x0"] for bucket in buckets.values() for w in bucket)
    if not left_edges:
        return []

    # Cluster the left edges: a gap wider than 20pt opens a new column.
    gap = 20
    columns: list[float] = []
    for x in left_edges:
        if not columns or x - columns[-1] > gap:
            columns.append(x)

    # Fewer than 3 clusters is not a table; more than 10 is likely prose.
    if not (3 <= len(columns) <= 10):
        return []

    rows: list[list[str]] = []
    for key in sorted(buckets):
        cells = [""] * len(columns)
        for w in sorted(buckets[key], key=lambda item: item["x0"]):
            # Assign the word to the nearest column start (ties -> leftmost).
            col = min(range(len(columns)), key=lambda c: abs(w["x0"] - columns[c]))
            cells[col] = f"{cells[col]} {w['text']}" if cells[col] else w["text"]
        # Keep only rows whose content spans at least two columns.
        if sum(1 for cell in cells if cell.strip()) >= 2:
            rows.append(cells)

    # A plausible table needs a header plus at least two data rows.
    if len(rows) < 3:
        return []

    # Tables hold concise values; if over 30% of the filled cells exceed
    # 30 characters, this is probably prose laid out in columns.
    filled = [cell.strip() for row in rows for cell in row if cell.strip()]
    long_cells = sum(1 for cell in filled if len(cell) > 30)
    if filled and long_cells / len(filled) > 0.3:
        return []

    return [rows]
class PdfConverter(DocumentConverter):
"""
Converts PDFs to Markdown. Most style information is ignored, so the results are essentially plain-text.
Converts PDFs to Markdown.
Supports extracting tables into aligned Markdown format (via pdfplumber).
Falls back to pdfminer if pdfplumber is missing or fails.
"""
def accepts(
self,
file_stream: BinaryIO,
stream_info: StreamInfo,
**kwargs: Any, # Options to pass to the converter
**kwargs: Any,
) -> bool:
mimetype = (stream_info.mimetype or "").lower()
extension = (stream_info.extension or "").lower()
@@ -55,9 +409,8 @@ class PdfConverter(DocumentConverter):
self,
file_stream: BinaryIO,
stream_info: StreamInfo,
**kwargs: Any, # Options to pass to the converter
**kwargs: Any,
) -> DocumentConverterResult:
# Check the dependencies
if _dependency_exc_info is not None:
raise MissingDependencyException(
MISSING_DEPENDENCY_MESSAGE.format(
@@ -65,13 +418,55 @@ class PdfConverter(DocumentConverter):
extension=".pdf",
feature="pdf",
)
) from _dependency_exc_info[
1
].with_traceback( # type: ignore[union-attr]
) from _dependency_exc_info[1].with_traceback(
_dependency_exc_info[2]
)
) # type: ignore[union-attr]
assert isinstance(file_stream, io.IOBase) # for mypy
return DocumentConverterResult(
markdown=pdfminer.high_level.extract_text(file_stream),
)
assert isinstance(file_stream, io.IOBase)
markdown_chunks: list[str] = []
# Read file stream into BytesIO for compatibility with pdfplumber
pdf_bytes = io.BytesIO(file_stream.read())
try:
# Track how many pages are form-style vs plain text
form_pages = 0
plain_pages = 0
with pdfplumber.open(pdf_bytes) as pdf:
for page in pdf.pages:
# Try form-style word position extraction
page_content = _extract_form_content_from_words(page)
# If extraction returns None, this page is not form-style
if page_content is None:
plain_pages += 1
# Extract text using pdfplumber's basic extraction for this page
text = page.extract_text()
if text and text.strip():
markdown_chunks.append(text.strip())
else:
form_pages += 1
if page_content.strip():
markdown_chunks.append(page_content)
# If most pages are plain text, use pdfminer for better text handling
if plain_pages > form_pages and plain_pages > 0:
pdf_bytes.seek(0)
markdown = pdfminer.high_level.extract_text(pdf_bytes)
else:
# Build markdown from chunks
markdown = "\n\n".join(markdown_chunks).strip()
except Exception:
# Fallback if pdfplumber fails
pdf_bytes.seek(0)
markdown = pdfminer.high_level.extract_text(pdf_bytes)
# Fallback if still empty
if not markdown:
pdf_bytes.seek(0)
markdown = pdfminer.high_level.extract_text(pdf_bytes)
return DocumentConverterResult(markdown=markdown)
@@ -0,0 +1,97 @@
%PDF-1.4
%“Œ‹ž ReportLab Generated PDF document http://www.reportlab.com
1 0 obj
<<
/F1 2 0 R /F2 3 0 R /F3 4 0 R /F4 5 0 R
>>
endobj
2 0 obj
<<
/BaseFont /Helvetica /Encoding /WinAnsiEncoding /Name /F1 /Subtype /Type1 /Type /Font
>>
endobj
3 0 obj
<<
/BaseFont /Helvetica-Bold /Encoding /WinAnsiEncoding /Name /F2 /Subtype /Type1 /Type /Font
>>
endobj
4 0 obj
<<
/BaseFont /Courier /Encoding /WinAnsiEncoding /Name /F3 /Subtype /Type1 /Type /Font
>>
endobj
5 0 obj
<<
/BaseFont /Courier-Bold /Encoding /WinAnsiEncoding /Name /F4 /Subtype /Type1 /Type /Font
>>
endobj
6 0 obj
<<
/BitsPerComponent 8 /ColorSpace /DeviceRGB /Filter [ /ASCII85Decode /FlateDecode ] /Height 70 /Length 4491 /Subtype /Image
/Type /XObject /Width 200
>>
stream
Gb"/lq,^Nc)M\9OkX:DBZ5YT>'!op&`0lHCEL`PXM2DFT$QuCdPsfSJ4#%$gW49i\1e&eZ\Acg:2bhaPc+^Q$/Shs#,1&Qu>83CBh729[%A$M]]Z8KL]Cu-OpO.1`\pCPboa2!3#sCC+4Yg#.W)"\K&i)doY5WCH.J-G0E$]%J"BRoZ88okcKEP@C7S%JEA:t(e6:OLb-"MZ3=$fAIE$]%J"BRoZ88okcKEP@C7S%JEA:t(e6:OKV=+u3F\<8ml>&.H@]EH:@TAeD]);ilGV3,!4G2kP0XRN"#r/]G!Kqo1c9Redk1d%Hh[t82=;lMkgMFr4J2#lTY[siCuNDGT]h^te4%1fj10r$&D--;(2UPbQ(Ze:VUL"2G=%qPZVOlc,tegG*BO,:$mI%mCAJ^8q2gWSn>)Ui.KQ!A5_(Z;l?_%(Xb28AGIU0SY?bVp%B6=$P2*I!1?W8WL>aAVWc$%-nk=D8ZjE$stW]LJ-LqIj9qZoFV/lj$Uf)=b`nfl*ANkt_qpb2t'P`;D#\h6o+g'M+"j4Sf:d#_jjZdTS>mnQJm^7>(S3Bq"m(kH5i3;0`YK<5e.0$k"4XA4UX)rfXH+2OamR360'cX$&"Dp"DdSkh^Q;?+H)fBc@YMp?\]UZuQ*lgt@kDS'dARs/]`Roa+]eXm%&TJ4e[RCKq6kQ:5Zpa[hLTEE,M/UR\C]SS.K'HJL7F)F5Ts63hWCKs+aTqi3TN!,P7#o$@'a?^`M.9&=d,$WJ*\b^*N1fR$JFV.s`.W*WfCgS\9h@C2/uC<b(1#bB73rR3UmcP"%)_DZ#=)TA1KkfUBT8F;=Yoc[BR6[ZkS\Y$n/.@mf!9WK1Hj]o3[f>DiFrdD&a`iRQ\df2(Xc53$=i@@upbP,MJ@.sDgm%`^.8+5u//"Hhn%^kIb$5o\2B7%r>DD\NZ:L;otY0]:)'6l[M<&ctoM"($Q_XW1'!4OB>g/3dF]mD],eF*&&'itQ;2e$/VWZ/QmdogQ0&d7ePkDGP[PZkk8TtUWkaJYa$Q)c6I+l<preqG)K\U>pY5H])D-lHdp52<d:Isd8)X0&b+pKUugDNb2NIW.aD_PLN/i(r9N&<3?,2br'%?gT'_i;n9VUeeM#>ko&@JS]_pP&PR0@L`i*pbXB"rgcI`#'#>-Njfe@8+ZC7hHU>Qm2oCj$^ATs<7[sZ@5*,@qslQ\p1m1#p6XrGL'F^?ok?+\fDe1,#<0n78&1'&KK/85E7IuiRklZ$<tM_`dQfdI@$2&Sj]k*=&V0n1RVEA-6,(U`EJ9674P)af%Z>l=<9-YKka`e!9&oUk]3u7Y);o;!X[W(:<D3M@"gDG)[;",CR0eT^r/fRnB+ob8JM\tp9\JoC\=uqFPX9nU`?:Y,eJf!6E95r@KI_iekuY/-+j6DIrXFQW>i@m+VqQff?4r\fn@@4QXN6[dWdtV8:B`3X2:bgH!rR8-r^sf)#EN%"`F/q0Heh_C7H6l'.@I3l<Jr.Q!as3DB-9V*+/'h,_<T8?^2u*t.p$h8d%"Dd\P<5M`MEg>7W_M8qB0Sd$'o&pWH!XFNS.JRZ%[WY$N:rl5tLIb;#&1u\'nOCIB]161$bC,Uuf;ZK6dl()epY<39X_LaOAXU:WAZiYqZk5Tq+hN`O"QdZ7[jLdf^cf`?9i4T#=]JO$'0fC4#Y=^M%VOouL.PZ6/V;r+XoF1Ls*YXu`6'4?,j47_u_U=.T*IX0ed;@5JN=Qlc\gS!W?r;#%jA)NSUh\`='l:HWsF<K@<`EqO<,Ht[H.@PGU,p6$s&YbEgb;cfG9YK,6Fh]@t(EUD@78Ob6ui6[#pIBoZn<UH)N"PrIeP3Y!qa-k8bP>_rQ_q'7l3]f,=As5FN;rm6/&IWa@[9HFr3YtU'N%=ZPr)s`!!&o!IbLLsBH7VnG&"_&hSn3;Nr^mpSZk^2i_aD8<g*:f)-)1j6@3KSHbb_c1PDAXpnkGE:H3Fs0m?uXff0>H]^Oi^Wq(2*ak3>^mA^!FkG4$-Vq(BH"
U+YSR;%(5j(bnT,&RrR1d]\O5_42^f/Xa:4msf,Oms&5F6()XE"p6mS/Yc\Ga&`hC/3XdsM;'cTMl(uV@DiFY5AA_VWS4T'&^D<.7.S,`B?:^&!Q[ZVaCi0^$[E#=Xt_;^\;l;M#]`$4;sLf^6$u5)gpB'7TO-@*HXbXF]H[ID>=n%&8-T:f)&:]?hm\RN5/B5sdW)PM[X@>2Jq/jQ%m%0Pk%J`<],L8cn3_`B)dE8ng`*C-";2ro.7o.B2:3d$4r#LEqr#8((eIunkG+V@25V=%+_!:Z/,UQca-*F<WEc]8T!rP(2A>g9*GW$LPXTF:ER(d"o8oX31"!B8VtcoXnY$9m&'30Um_8lI0aO.LY_m,l[3VfKlBY*[!$I#3=:"_\lGs6U"<,-kF4HFN$6[_SQpG)7_H3mnKRB'C9#q8EY(Vaqi(D&r$*Jr?OPiaP#RRYeN0)sia9W*TKT)#N9#q8EY(Vaqi(D&r$*Jr?OPiaP#k^2Zeu*-9hp_.0@)f6Em^OKjQe!N6R'K"tCBm#IB,_F0s/@WUCJX)`/6>DF6M_)Kk.s>$UZPX\r/#8I8q/RiHL2liUAA0Yf`%Ld=068s:8FSq=\;#qPO.M6gNJ<fcJ_B7NfVHbGb(Z#%=9b&C:^$<Xe!_fU:_#gd*5iB@GeEYKud7*YgG3W,jj8\q'/4@l'Y-g\)rX-m;H77/R]Zb%A!YN(tcDfkrJ&o7($5o=b[,R0&h>j\UuBER`krd-YJajg!X[eDSKG#rM875C*#IQWm+O@aOf*TU2U('gKn4DA1c#r\17+;`V`#Kgpm8RC[Ee$Ac&[r[O5Rd>4]?HeBRku2;R,jekQW/J-QCRW&^t>C\3e,:[W(TT!T:2JEND71MtHndISe*l)E$(\gO^@6Z"5Q<4O)uM0mY8/2q_>,e#m"SaP0HaXDih_2UgJZH0.io.<EHam+)Ba'CJ3$,Ve\2W\TOCDf>!]XNj5RmJj5Qb:1EKuS*^?b+;s)^?6l>JE0QrGSUpouIlZac9kX41>cja=/SDr<cA`ZGg.-dcfEr]UahTX\[1!?g.Q+Bc7gR:Jn+!PFgH?<j_Cphp%2cK2o3EmuaRlLL,1SFd.EYSt<=je7H>"[/Y4/S?T:Ij;Z5Vm!N#O9oGbTmC/.8"?)WA+NO#l!d__>E^^P^^5SPK3f#a-$jDf&=>7p3h\GbV.%e"aa`Nr8:-((uUWjP?5h<QpA5fPW)4^9]4ZVP3n=aOl2n\TH8,2g(<%d(S%%7D#]P?GegI?jC3uo^tm=sN6Bp.;hI!R:5>*[4qX1g^KT=J\eG*=-r!s:Xj;^b_=6X9pT)tmOPN+SP\Y9d?!CS[k5Z"1@+N1#RUIImf:>$etSOp!A$%9Npa7^LW;"'>&DLN*LT#=""p0WOpjXfI:C##@j@lDSU=Xe\&,@B>QE*Z4EfD\=1"h4F8$&QPkBC:BBC"p$N?^/:S9o*P:eF4j>`WT<EG,f\ln6T<>&V*-UR?1+=iZ&Y:YJ)Y_O1Q!(3MOc:&,lEK0KT>gBrM!Oo7g$R0Z=@n</A>l[op[I#4)k.C3f6pb.hq#9_d636^F1(A%Us@pB(WNXIQ#TKD-`)0%k[fj0?XEDjbhd[m6#LpLW.'sd9_sqo4)9,(HjMXDMbKZE5`!!P4XRB@/4TISu+m2&%RiFj^;=JGE6IZZZ3Iq9u;tP9Ze)spB!TH+!k0kjm?9TaEqM'"N]I#K68.sENFpG-:BpL"k5<Kf;Cll]p\0.VBgJXYV7bkGELTah(>RWGsjL14<<sE:X3JW]):L"_a3kcBH&.(2Ui6d<sTVSq6Hb2oGX:N_j+E^^?#]Brd-YC/q4&j48+1kK<)&.;T[(aqqDC]Z:"7NhLIO5&BeU-SXc>m&CBpaKs!LDjG%ZThfU"?o'^kNBIDSK`iDtti"$XI@8UFlc6'SU-*bNc0g0qZU.!3k5:g&J5%_GC<>>?gMnQALpYLh_[CK2YgZMfbcG=f9!FE+GUto#YS.Ms0oqhppBVFIgKikS7
3+[l77Q]oKP@6W$g?K-&@CV\IU2l9TjknpF,mi!B#31b$WjNK[_SbrBad@)rY(?tsa<O6[4Wd_rrCQDK9Fn>?noi;W9O[OS+IIT/JfXc1#&UNmnp:7_c6/aW-".*cBVjc)"DgbY/sJI7I:YTo4n^?6D*J='=Z.q\rms\C-WjI*igXS;Vfo_`Y\`_c8K6r=SE5*WPmG+p6'k&&0eDn]e<kl$//^-6n?[M0Wg:@`F!W*m'jM%_,JfY,&JA=T)'Qh]O:`+1#oOo&Q&lRj>R;8k_3L)o&mP_\+Z6EUKR,;fQ&lT(!/=AJY_cg)ZFd_iV[)$d`W(]Q<7PapAWM]!Y4qoZ_Z9e(Tts:W\F3=&Fc,Mp(`=cJUG]7+gkp\4)_AlnCa6gYn$_U<4&\@+ERR^ZIFpqbN%;:=3k=P0?fC_:0&Ug9TYUE[kI#Bq;+mc:C\/7H:]h-hq^[P:`jZ\paVNd3BCYF4eruZ)J"F0tYQ"^`hf0;~>endstream
endobj
7 0 obj
<<
/Contents 11 0 R /MediaBox [ 0 0 216 792 ] /Parent 10 0 R /Resources <<
/Font 1 0 R /ProcSet [ /PDF /Text /ImageB /ImageC /ImageI ] /XObject <<
/FormXob.2a351979d8c75d073b2ea4bfb74718f9 6 0 R
>>
>> /Rotate 0 /Trans <<
>>
/Type /Page
>>
endobj
8 0 obj
<<
/PageMode /UseNone /Pages 10 0 R /Type /Catalog
>>
endobj
9 0 obj
<<
/Author (\(anonymous\)) /CreationDate (D:20251205104951+01'00') /Creator (\(unspecified\)) /Keywords () /ModDate (D:20251205104951+01'00') /Producer (ReportLab PDF Library - www.reportlab.com)
/Subject (\(unspecified\)) /Title (\(anonymous\)) /Trapped /False
>>
endobj
10 0 obj
<<
/Count 1 /Kids [ 7 0 R ] /Type /Pages
>>
endobj
11 0 obj
<<
/Filter [ /ASCII85Decode /FlateDecode ] /Length 1981
>>
stream
GauI8D/\/e&:hOi=55KJ8;^%$X%4*F0ZR4[TQ-.JlIl=8,25emeWsH3DOUP#SK,Tt>!%\QTf-@>5)IUB4PgI,!l6hOHqp".7kqY?VEc$;f0G$C+?kE+IduY+D6B[cY<`?1&+Bf$SSWa.DI28'F?:CpG_mY"TO]hkXiFku&",h"4G/8GlCSK3UDT`3W#q'Qc;4>t6\tbspa(/l?]"D>nboQo,(,[\*-A;J=Ru^j=[Nu\:iMk7q<+PGo*gWpT4_C)j)t7Oc[5MlffWrhj!99;,0?]r3R(ns^B^I*KQ#f[([aS1g);Q.rrBep&6)sVJs=\.1^pkCa(tBfECI75_;C0LC.)*n<3;@eFZT<Brd%CYO%*fRCl_%R2PLtnF>0lg_SFKN$O.X\o%U7_58YJ,X[`p,%PUL^1]EgT]T\4*B3hOrEZA:[=ui88pZGht%klE^OC$2=@$GiMoTO<eR\C2;10r\O.%_7=c.`)*@0N>,CLh>2ZDq?"(LrLS)ajJ<DG(:N]5MPuT)E6J8.)!Ud7D>Je7M&(V1i'>Z@qg3/WJ@PpL4nr1qU$V_#jqpM+M<[H(LE:pW6uTQK`^P%Q'T&[*Y1\T_7.O:+n^Y)3+d56\hGsIrnqB85q[4L1WG#"Uo.d]Zr"jG`qiT=AU8b5]p3(N2IfnWHVO&$rnNe6[_$[o(m=2Sq-C[bbNOS,qIb?:TGYHhQjjcCe!9%*cuscgU*Eea^B?#^HtoE9p(jd_GR#E1\LTm:MVS5e]+<LZRQ]0^iZNnTs\Iq5l6H+?80%j?IX^UR28jY=Vr!:#Jf=D$QdR#4X6Q%Z^E6hq4[p#rHu/mN!PgeQCn_hEI9M3(_b.pAid<e0?KUakkhL+Sq1A+!S(V0h+OF8nn#&[1+6p+D5^<OP@s\HS\itN&+apX>;a8<=<fVm-cM(u=Q31UTuXZiRNk/X^o=e`8?ha=(l,J:AYGq&k'81mIs68U9).dPb@tBY#5s6I1(;=p-FNV8JLO6-b%6]BUEO#*P:YXQ,+2T8!GeA%OFD*^H'I<qo6Q\KGcc$9<-q;oCHo2eF[F/t'nG3p8bj]<0qUd^A*Un<D3^J]5-EeYaHZL0]dZ(aldZ?U]EL]o1@j;L=l_$._u&B5KRKtY900d3EO'mY6&5WB,D7o]o+7,#h.[N58L+)*ks!_/dIq7L<$Q/>:Ym/3(NJmP3]c2J81f'[9A229?.>nW.Y"uioK$/X(RLnTFa0nhiu#_V(M%6pL-[3&IEZO^iW'pcgSC4%cs*UfWL8=h@<SF-6Ml.SK#\%/6pL;XKVP08]+YR4.1^h^3g+iL6p2jNFKi##9N\7TS:EE')be-k57a"IMZoUV#>]Eoq_3uS@i.]*ai31P3"'$;S,Q%'"=$Vq"-_!pX<>bA]?nd=dOpZa\$"k!cpI9L2SO3gBd7]TKi)s)3F/ADnb^3N)iF')M[\Fq1\^PAl):!YpJp!B\s/2LkZr*`(o%fTO.qa?N[7\P_Dj!-3=OAO3]DtNKn-R1Hc#$?$h;RW[;B(k%DrOQ4lA(kZ`8[,.5E\H/%&9RE1k-pKk^'?Wseh?':/9RD3&&Xf\j=9;Sdd#l!b5li$Q.?@#FtJ9r"D*THt>o=+h,ei<3VCI\e<F.YjMklHmQ252@7%.$dl?FX[.5Ru_<cOnWObGU$sud3sn0Nm?VSip=&P_8=3<b5l"NFchqcT66k!jof;<?"kHRj[>Q+FB(V85-;*H\(QoM*>m2@9WKA`dV0$2F]lQ!^cKY?-F<<RYBD9:P`&#<:DJpSA0L]L_Q`8=5'6'r`p_54_;lcH+H=)4\l8B7YE#pX>K&Mf4jEn:L@C'pmu(T(NAo?onFtPTH*Mah:OJII8OF6<oMipM;1-5S+stnT,o"n]+UmpI>_OX,SeHS'86i`]nN=oW_Hjm1lb%agT!)1^3rJWom\/,?BYNjThVR,cQ'opa8Q#<G9<qeSN'GRRO*(AC(K$'<9uMACIm=MV?Mk2Q*3P"~>endstream
endobj
xref
0 12
0000000000 65535 f
0000000073 00000 n
0000000134 00000 n
0000000241 00000 n
0000000353 00000 n
0000000458 00000 n
0000000568 00000 n
0000005249 00000 n
0000005507 00000 n
0000005576 00000 n
0000005859 00000 n
0000005919 00000 n
trailer
<<
/ID
[<4800d64fefba4dd902e51197c7da4e88><4800d64fefba4dd902e51197c7da4e88>]
% ReportLab generated PDF document -- digest (http://www.reportlab.com)
/Info 9 0 R
/Root 8 0 R
/Size 12
>>
startxref
7992
%%EOF
File diff suppressed because one or more lines are too long
@@ -0,0 +1,871 @@
#!/usr/bin/env python3 -m pytest
"""Tests for PDF table extraction functionality."""
import os
import re
import pytest
from markitdown import MarkItDown
TEST_FILES_DIR = os.path.join(os.path.dirname(__file__), "test_files")
# --- Helper Functions ---
def validate_strings(result, expected_strings, exclude_strings=None):
    """Assert that every expected string appears in the conversion result and
    that no excluded string does, after stripping backslash escapes."""
    content = result.text_content.replace("\\", "")
    for expected in expected_strings:
        assert expected in content, f"Expected string not found: {expected}"
    for excluded in exclude_strings or []:
        assert excluded not in content, f"Excluded string found: {excluded}"
def validate_markdown_table(result, expected_headers, expected_data_samples):
    """Assert the result contains a markdown table carrying the expected
    headers and a sample of expected cell values."""
    content = result.text_content
    # A pipe character is the minimal evidence of markdown table markup.
    assert "|" in content, "No markdown table markers found"
    for expected in expected_headers:
        assert expected in content, f"Expected table header not found: {expected}"
    for sample in expected_data_samples:
        assert sample in content, f"Expected table data not found: {sample}"
def extract_markdown_tables(text_content):
    """
    Extract all markdown tables from text content.

    A table line starts and ends with "|"; header-separator rows (pipes,
    dashes, whitespace only) are dropped. Any other line terminates the
    table in progress. Returns a list of tables, where each table is a list
    of rows, and each row is a list of cell values.
    """
    separator_row = re.compile(r"^\|[\s\-|]+\|$")
    tables = []
    pending = []
    for raw_line in text_content.split("\n"):
        stripped = raw_line.strip()
        if stripped.startswith("|") and stripped.endswith("|"):
            # Separator rows carry no data — skip them.
            if separator_row.match(stripped):
                continue
            pending.append([cell.strip() for cell in stripped.split("|")[1:-1]])
        elif pending:
            # A non-table line closes the table collected so far.
            tables.append(pending)
            pending = []
    # Flush a table that runs to the end of the text.
    if pending:
        tables.append(pending)
    return tables
def validate_table_structure(table):
    """
    Check that a parsed table is well-formed.

    Requires a header row plus at least one data row, at least two columns,
    and an identical column count on every row. Returns a (bool, message)
    pair describing the first problem found, or success.
    """
    if not table:
        return False, "Table is empty"
    if len(table) < 2:
        return False, "Table should have at least header and one data row"
    expected_cols = len(table[0])
    if expected_cols < 2:
        return False, f"Table should have at least 2 columns, found {expected_cols}"
    for row_idx, row in enumerate(table):
        if len(row) != expected_cols:
            return False, f"Row {row_idx} has {len(row)} columns, expected {expected_cols}"
    return True, "Table structure is valid"
class TestPdfTableExtraction:
"""Test PDF table extraction with various PDF types."""
    @pytest.fixture
    def markitdown(self):
        """Create MarkItDown instance."""
        # A fresh converter per test so no state leaks between conversions.
        return MarkItDown()
    def test_borderless_table_extraction(self, markitdown):
        """Test extraction of borderless tables from SPARSE inventory PDF.

        Expected output structure:
        - Header: INVENTORY RECONCILIATION REPORT with Report ID, Warehouse, Date, Prepared By
        - Pipe-separated rows with inventory data
        - Text section: Variance Analysis with Summary Statistics
        - More pipe-separated rows with extended inventory review
        - Footer: Recommendations section
        """
        pdf_path = os.path.join(
            TEST_FILES_DIR, "SPARSE-2024-INV-1234_borderless_table.pdf"
        )
        if not os.path.exists(pdf_path):
            pytest.skip(f"Test file not found: {pdf_path}")

        result = markitdown.convert(pdf_path)
        text_content = result.text_content

        # Validate document header content
        expected_strings = [
            "INVENTORY RECONCILIATION REPORT",
            "Report ID: SPARSE-2024-INV-1234",
            "Warehouse: Distribution Center East",
            "Report Date: 2024-11-15",
            "Prepared By: Sarah Martinez",
        ]
        validate_strings(result, expected_strings)

        # Validate pipe-separated format is used
        assert "|" in text_content, "Should have pipe separators for form-style data"

        # --- Validate First Table Data (Inventory Variance) ---
        # Validate table headers are present
        first_table_headers = [
            "Product Code",
            "Location",
            "Expected",
            "Actual",
            "Variance",
            "Status",
        ]
        for header in first_table_headers:
            assert header in text_content, f"Should contain header '{header}'"

        # Validate first table has all expected SKUs
        first_table_skus = ["SKU-8847", "SKU-9201", "SKU-4563", "SKU-7728"]
        for sku in first_table_skus:
            assert sku in text_content, f"Should contain {sku}"

        # Validate first table has correct status values
        expected_statuses = ["OK", "CRITICAL"]
        for status in expected_statuses:
            assert status in text_content, f"Should contain status '{status}'"

        # Validate first table has location codes
        expected_locations = ["A-12", "B-07", "C-15", "D-22", "A-08"]
        for loc in expected_locations:
            assert loc in text_content, f"Should contain location '{loc}'"

        # --- Validate Second Table Data (Extended Inventory Review) ---
        # Validate second table headers
        second_table_headers = [
            "Category",
            "Unit Cost",
            "Total Value",
            "Last Audit",
            "Notes",
        ]
        for header in second_table_headers:
            assert header in text_content, f"Should contain header '{header}'"

        # Validate second table has all expected SKUs (10 products)
        second_table_skus = [
            "SKU-8847",
            "SKU-9201",
            "SKU-4563",
            "SKU-7728",
            "SKU-3345",
            "SKU-5512",
            "SKU-6678",
            "SKU-7789",
            "SKU-2234",
            "SKU-1123",
        ]
        for sku in second_table_skus:
            assert sku in text_content, f"Should contain {sku}"

        # Validate second table has categories
        expected_categories = ["Electronics", "Hardware", "Software", "Accessories"]
        for category in expected_categories:
            assert category in text_content, f"Should contain category '{category}'"

        # Validate second table has cost values (spot check)
        expected_costs = ["$45.00", "$32.50", "$120.00", "$15.75"]
        for cost in expected_costs:
            assert cost in text_content, f"Should contain cost '{cost}'"

        # Validate second table has note values
        expected_notes = ["Verified", "Critical", "Pending"]
        for note in expected_notes:
            assert note in text_content, f"Should contain note '{note}'"

        # --- Validate Analysis Text Section ---
        analysis_strings = [
            "Variance Analysis:",
            "Summary Statistics:",
            "Total Variance Cost: $4,287.50",
            "Critical Items: 1",
            "Overall Accuracy: 97.2%",
            "Recommendations:",
        ]
        validate_strings(result, analysis_strings)

        # --- Validate Document Structure Order ---
        # Verify sections appear in correct order
        # Note: Using flexible patterns since column merging may occur based on gap detection
        import re

        header_pos = text_content.find("INVENTORY RECONCILIATION REPORT")
        # Look for Product Code header - may be in same column as Location or separate
        first_table_match = re.search(r"\|\s*Product Code", text_content)
        variance_pos = text_content.find("Variance Analysis:")
        extended_review_pos = text_content.find("Extended Inventory Review:")

        # Second table - look for SKU entries after extended review section
        # The table may not have pipes on every row due to paragraph detection
        second_table_pos = -1
        if extended_review_pos != -1:
            # Look for either "| Product Code" or "Product Code" as table header
            second_table_match = re.search(
                r"Product Code.*Category", text_content[extended_review_pos:]
            )
            if second_table_match:
                # Adjust position to be relative to full text
                second_table_pos = extended_review_pos + second_table_match.start()

        recommendations_pos = text_content.find("Recommendations:")

        # str.find returns -1 when absent, so -1 doubles as "not found".
        positions = {
            "header": header_pos,
            "first_table": first_table_match.start() if first_table_match else -1,
            "variance_analysis": variance_pos,
            "extended_review": extended_review_pos,
            "second_table": second_table_pos,
            "recommendations": recommendations_pos,
        }

        # All sections should be found
        for name, pos in positions.items():
            assert pos != -1, f"Section '{name}' not found in output"

        # Verify correct order
        assert (
            positions["header"] < positions["first_table"]
        ), "Header should come before first table"
        assert (
            positions["first_table"] < positions["variance_analysis"]
        ), "First table should come before Variance Analysis"
        assert (
            positions["variance_analysis"] < positions["extended_review"]
        ), "Variance Analysis should come before Extended Review"
        assert (
            positions["extended_review"] < positions["second_table"]
        ), "Extended Review should come before second table"
        assert (
            positions["second_table"] < positions["recommendations"]
        ), "Second table should come before Recommendations"
    def test_borderless_table_no_duplication(self, markitdown):
        """Test that borderless table content is not duplicated excessively."""
        pdf_path = os.path.join(
            TEST_FILES_DIR, "SPARSE-2024-INV-1234_borderless_table.pdf"
        )
        if not os.path.exists(pdf_path):
            pytest.skip(f"Test file not found: {pdf_path}")

        result = markitdown.convert(pdf_path)
        text_content = result.text_content

        # Count occurrences of unique table data - should not be excessively duplicated
        # SKU-8847 appears in both tables, plus possibly once in summary text
        sku_count = text_content.count("SKU-8847")
        # Should appear at most 4 times (2 tables + minor text references), not more
        assert (
            sku_count <= 4
        ), f"SKU-8847 appears too many times ({sku_count}), suggests duplication issue"
def test_borderless_table_correct_position(self, markitdown):
"""Test that tables appear in correct positions relative to text."""
pdf_path = os.path.join(
TEST_FILES_DIR, "SPARSE-2024-INV-1234_borderless_table.pdf"
)
if not os.path.exists(pdf_path):
pytest.skip(f"Test file not found: {pdf_path}")
result = markitdown.convert(pdf_path)
text_content = result.text_content
# Verify content order - header should come before table content, which should come before analysis
header_pos = text_content.find("Prepared By: Sarah Martinez")
# Look for Product Code in any pipe-separated format
product_code_pos = text_content.find("Product Code")
variance_pos = text_content.find("Variance Analysis:")
assert header_pos != -1, "Header should be found"
assert product_code_pos != -1, "Product Code should be found"
assert variance_pos != -1, "Variance Analysis should be found"
assert (
header_pos < product_code_pos < variance_pos
), "Product data should appear between header and Variance Analysis"
# Second table content should appear after "Extended Inventory Review"
extended_review_pos = text_content.find("Extended Inventory Review:")
# Look for Category header which is in second table
category_pos = text_content.find("Category")
recommendations_pos = text_content.find("Recommendations:")
if (
extended_review_pos != -1
and category_pos != -1
and recommendations_pos != -1
):
# Find Category position after Extended Inventory Review
category_after_review = text_content.find("Category", extended_review_pos)
if category_after_review != -1:
assert (
extended_review_pos < category_after_review < recommendations_pos
), "Extended review table should appear between Extended Inventory Review and Recommendations"
def test_receipt_pdf_extraction(self, markitdown):
"""Test extraction of receipt PDF (no tables, formatted text).
Expected output structure:
- Store header: TECHMART ELECTRONICS with address
- Transaction info: Store #, date, TXN, Cashier, Register
- Line items: 6 products with prices and member discounts
- Totals: Subtotal, Member Discount, Sales Tax, Rewards, TOTAL
- Payment info: Visa Card, Auth, Ref
- Rewards member info: Name, ID, Points
- Return policy and footer
"""
pdf_path = os.path.join(
TEST_FILES_DIR, "RECEIPT-2024-TXN-98765_retail_purchase.pdf"
)
if not os.path.exists(pdf_path):
pytest.skip(f"Test file not found: {pdf_path}")
result = markitdown.convert(pdf_path)
text_content = result.text_content
# --- Validate Store Header ---
store_header = [
"TECHMART ELECTRONICS",
"4567 Innovation Blvd",
"San Francisco, CA 94103",
"(415) 555-0199",
]
validate_strings(result, store_header)
# --- Validate Transaction Info ---
transaction_info = [
"Store #0342 - Downtown SF",
"11/23/2024",
"TXN: TXN-98765-2024",
"Cashier: Emily Rodriguez",
"Register: POS-07",
]
validate_strings(result, transaction_info)
# --- Validate Line Items (6 products) ---
line_items = [
# Product 1: Headphones
"Wireless Noise-Cancelling",
"Headphones - Premium Black",
"AUDIO-5521",
"$349.99",
"$299.99",
# Product 2: USB-C Hub
"USB-C Hub 7-in-1 Adapter",
"ACC-8834",
"$79.99",
"$159.98",
# Product 3: Portable SSD
"Portable SSD 2TB",
"STOR-2241",
"$289.00",
"$260.00",
# Product 4: Wireless Mouse
"Ergonomic Wireless Mouse",
"ACC-9012",
"$59.99",
# Product 5: Screen Cleaning Kit
"Screen Cleaning Kit",
"CARE-1156",
"$12.99",
"$38.97",
# Product 6: HDMI Cable
"HDMI 2.1 Cable 6ft",
"CABLE-7789",
"$24.99",
"$44.98",
]
validate_strings(result, line_items)
# --- Validate Totals ---
totals = [
"SUBTOTAL",
"$863.91",
"Member Discount",
"Sales Tax (8.5%)",
"$66.23",
"Rewards Applied",
"-$25.00",
"TOTAL",
"$821.14",
]
validate_strings(result, totals)
# --- Validate Payment Info ---
payment_info = [
"PAYMENT METHOD",
"Visa Card ending in 4782",
"Auth: 847392",
"REF-20241123-98765",
]
validate_strings(result, payment_info)
# --- Validate Rewards Member Info ---
rewards_info = [
"REWARDS MEMBER",
"Sarah Mitchell",
"ID: TM-447821",
"Points Earned: 821",
"Total Points: 3,247",
]
validate_strings(result, rewards_info)
# --- Validate Return Policy & Footer ---
footer_info = [
"RETURN POLICY",
"Returns within 30 days",
"Receipt required",
"Thank you for shopping!",
"www.techmart.example.com",
]
validate_strings(result, footer_info)
# --- Validate Document Structure Order ---
positions = {
"store_header": text_content.find("TECHMART ELECTRONICS"),
"transaction": text_content.find("TXN: TXN-98765-2024"),
"first_item": text_content.find("Wireless Noise-Cancelling"),
"subtotal": text_content.find("SUBTOTAL"),
"total": text_content.find("TOTAL"),
"payment": text_content.find("PAYMENT METHOD"),
"rewards": text_content.find("REWARDS MEMBER"),
"return_policy": text_content.find("RETURN POLICY"),
}
# All sections should be found
for name, pos in positions.items():
assert pos != -1, f"Section '{name}' not found in output"
# Verify correct order
assert (
positions["store_header"] < positions["transaction"]
), "Store header should come before transaction"
assert (
positions["transaction"] < positions["first_item"]
), "Transaction should come before items"
assert (
positions["first_item"] < positions["subtotal"]
), "Items should come before subtotal"
assert (
positions["subtotal"] < positions["total"]
), "Subtotal should come before total"
assert (
positions["total"] < positions["payment"]
), "Total should come before payment"
assert (
positions["payment"] < positions["rewards"]
), "Payment should come before rewards"
assert (
positions["rewards"] < positions["return_policy"]
), "Rewards should come before return policy"
def test_multipage_invoice_extraction(self, markitdown):
"""Test extraction of multipage invoice PDF with form-style layout.
Expected output: Pipe-separated format with clear cell boundaries.
Form data should be extracted with pipes indicating column separations.
"""
pdf_path = os.path.join(TEST_FILES_DIR, "REPAIR-2022-INV-001_multipage.pdf")
if not os.path.exists(pdf_path):
pytest.skip(f"Test file not found: {pdf_path}")
result = markitdown.convert(pdf_path)
text_content = result.text_content
# Validate basic content is extracted
expected_strings = [
"ZAVA AUTO REPAIR",
"Collision Repair",
"Redmond, WA",
"Gabriel Diaz",
"Jeep",
"Grand Cherokee",
"Parts",
"Body Labor",
"Paint Labor",
"GRAND TOTAL",
# Second page content
"Bruce Wayne",
"Batmobile",
]
validate_strings(result, expected_strings)
# Validate pipe-separated table format
# Form-style documents should use pipes to separate cells
assert "|" in text_content, "Form-style PDF should contain pipe separators"
# Validate key form fields are properly separated
# These patterns check that label and value are in separate cells
# Note: cells may have padding spaces for column alignment
import re
assert re.search(
r"\| Insured name\s*\|", text_content
), "Insured name should be in its own cell"
assert re.search(
r"\| Gabriel Diaz\s*\|", text_content
), "Gabriel Diaz should be in its own cell"
assert re.search(
r"\| Year\s*\|", text_content
), "Year label should be in its own cell"
assert re.search(
r"\| 2022\s*\|", text_content
), "Year value should be in its own cell"
# Validate table structure for estimate totals
assert (
re.search(r"\| Hours\s*\|", text_content) or "Hours |" in text_content
), "Hours column header should be present"
assert (
re.search(r"\| Rate\s*\|", text_content) or "Rate |" in text_content
), "Rate column header should be present"
assert (
re.search(r"\| Cost\s*\|", text_content) or "Cost |" in text_content
), "Cost column header should be present"
# Validate numeric values are extracted
assert "2,100" in text_content, "Parts cost should be extracted"
assert "300" in text_content, "Body labor cost should be extracted"
assert "225" in text_content, "Paint labor cost should be extracted"
assert "5,738" in text_content, "Grand total should be extracted"
# Validate second page content (Bruce Wayne invoice)
assert "Bruce Wayne" in text_content, "Second page customer name"
assert "Batmobile" in text_content, "Second page vehicle model"
assert "211,522" in text_content, "Second page grand total"
# Validate disclaimer text is NOT in table format (long paragraph)
# The disclaimer should be extracted as plain text, not pipe-separated
assert (
"preliminary estimate" in text_content.lower()
), "Disclaimer text should be present"
def test_academic_pdf_extraction(self, markitdown):
"""Test extraction of academic paper PDF (scientific document).
Expected output: Plain text without tables or pipe characters.
Scientific documents should be extracted as flowing text with proper spacing,
not misinterpreted as tables.
"""
pdf_path = os.path.join(TEST_FILES_DIR, "test.pdf")
if not os.path.exists(pdf_path):
pytest.skip(f"Test file not found: {pdf_path}")
result = markitdown.convert(pdf_path)
text_content = result.text_content
# Validate academic paper content with proper spacing
expected_strings = [
"Introduction",
"Large language models", # Should have proper spacing, not "Largelanguagemodels"
"agents",
"multi-agent", # Should be properly hyphenated
]
validate_strings(result, expected_strings)
# Validate proper text formatting (words separated by spaces)
assert "LLMs" in text_content, "Should contain 'LLMs' acronym"
assert "reasoning" in text_content, "Should contain 'reasoning'"
assert "observations" in text_content, "Should contain 'observations'"
# Ensure content is not empty and has proper length
assert len(text_content) > 1000, "Academic PDF should have substantial content"
# Scientific documents should NOT have tables or pipe characters
assert (
"|" not in text_content
), "Scientific document should not contain pipe characters (no tables)"
# Verify no markdown tables were extracted
tables = extract_markdown_tables(text_content)
assert (
len(tables) == 0
), f"Scientific document should have no tables, found {len(tables)}"
# Verify text is properly formatted with spaces between words
# Check that common phrases are NOT joined together (which would indicate bad extraction)
assert (
"Largelanguagemodels" not in text_content
), "Text should have proper spacing, not joined words"
assert (
"multiagentconversations" not in text_content.lower()
), "Text should have proper spacing between words"
def test_scanned_pdf_handling(self, markitdown):
"""Test handling of scanned/image-based PDF (no text layer).
Expected output: Empty - scanned PDFs without OCR have no text layer.
"""
pdf_path = os.path.join(
TEST_FILES_DIR, "MEDRPT-2024-PAT-3847_medical_report_scan.pdf"
)
if not os.path.exists(pdf_path):
pytest.skip(f"Test file not found: {pdf_path}")
result = markitdown.convert(pdf_path)
# Scanned PDFs without OCR have no text layer, so extraction should be empty
assert (
result is not None
), "Converter should return a result even for scanned PDFs"
assert result.text_content is not None, "text_content should not be None"
# Verify extraction is empty (no text layer in scanned PDF)
assert (
result.text_content.strip() == ""
), f"Scanned PDF should have empty extraction, got: '{result.text_content[:100]}...'"
class TestPdfTableMarkdownFormat:
    """Test that extracted tables have proper markdown formatting."""

    @pytest.fixture
    def markitdown(self):
        """Provide a MarkItDown converter for each test."""
        return MarkItDown()

    def test_markdown_table_has_pipe_format(self, markitdown):
        """Test that form-style PDFs have pipe-separated format."""
        pdf_path = os.path.join(
            TEST_FILES_DIR, "SPARSE-2024-INV-1234_borderless_table.pdf"
        )
        if not os.path.exists(pdf_path):
            pytest.skip(f"Test file not found: {pdf_path}")
        text_content = markitdown.convert(pdf_path).text_content
        # Lines fully enclosed in pipes are markdown table rows.
        pipe_rows = [
            row
            for row in text_content.split("\n")
            if row.startswith("|") and row.endswith("|")
        ]
        assert len(pipe_rows) > 0, "Should have pipe-separated rows"
        # The table header must land inside one of those rows.
        assert any(
            "Product Code" in row for row in pipe_rows
        ), "Product Code should be in pipe-separated format"

    def test_markdown_table_columns_have_pipes(self, markitdown):
        """Test that form-style PDF columns are separated with pipes."""
        pdf_path = os.path.join(
            TEST_FILES_DIR, "SPARSE-2024-INV-1234_borderless_table.pdf"
        )
        if not os.path.exists(pdf_path):
            pytest.skip(f"Test file not found: {pdf_path}")
        text_content = markitdown.convert(pdf_path).text_content
        table_rows = [
            row
            for row in text_content.split("\n")
            if row.startswith("|") and row.endswith("|")
        ]
        assert len(table_rows) > 0, "Should have markdown table rows"
        # Three or more pipes means the row holds at least two columns.
        multi_col_rows = [row for row in table_rows if row.count("|") >= 3]
        assert (
            len(multi_col_rows) > 5
        ), f"Should have rows with multiple columns, found {len(multi_col_rows)}"
class TestPdfTableStructureConsistency:
"""Test that extracted tables have consistent structure across all PDF types."""
@pytest.fixture
def markitdown(self):
"""Create MarkItDown instance."""
return MarkItDown()
def test_borderless_table_structure(self, markitdown):
"""Test that borderless table PDF has pipe-separated structure."""
pdf_path = os.path.join(
TEST_FILES_DIR, "SPARSE-2024-INV-1234_borderless_table.pdf"
)
if not os.path.exists(pdf_path):
pytest.skip(f"Test file not found: {pdf_path}")
result = markitdown.convert(pdf_path)
text_content = result.text_content
# Should have pipe-separated content
assert "|" in text_content, "Borderless table PDF should have pipe separators"
# Check that key content is present
assert "Product Code" in text_content, "Should contain Product Code"
assert "SKU-8847" in text_content, "Should contain first SKU"
assert "SKU-9201" in text_content, "Should contain second SKU"
def test_multipage_invoice_table_structure(self, markitdown):
"""Test that multipage invoice PDF has pipe-separated format."""
pdf_path = os.path.join(TEST_FILES_DIR, "REPAIR-2022-INV-001_multipage.pdf")
if not os.path.exists(pdf_path):
pytest.skip(f"Test file not found: {pdf_path}")
result = markitdown.convert(pdf_path)
text_content = result.text_content
# Should have pipe-separated content
assert "|" in text_content, "Invoice PDF should have pipe separators"
# Find rows with pipes
lines = text_content.split("\n")
pipe_rows = [
line for line in lines if line.startswith("|") and line.endswith("|")
]
assert (
len(pipe_rows) > 10
), f"Should have multiple pipe-separated rows, found {len(pipe_rows)}"
# Check that some rows have multiple columns
multi_col_rows = [row for row in pipe_rows if row.count("|") >= 4]
assert len(multi_col_rows) > 5, "Should have rows with 3+ columns"
def test_receipt_has_no_tables(self, markitdown):
"""Test that receipt PDF doesn't incorrectly extract tables from formatted text."""
pdf_path = os.path.join(
TEST_FILES_DIR, "RECEIPT-2024-TXN-98765_retail_purchase.pdf"
)
if not os.path.exists(pdf_path):
pytest.skip(f"Test file not found: {pdf_path}")
result = markitdown.convert(pdf_path)
tables = extract_markdown_tables(result.text_content)
# Receipt should not have markdown tables extracted
# (it's formatted text, not tabular data)
# If tables are extracted, they should be minimal/empty
total_table_rows = sum(len(t) for t in tables)
assert (
total_table_rows < 5
), f"Receipt should not have significant tables, found {total_table_rows} rows"
def test_scanned_pdf_no_tables(self, markitdown):
"""Test that scanned PDF has empty extraction and no tables."""
pdf_path = os.path.join(
TEST_FILES_DIR, "MEDRPT-2024-PAT-3847_medical_report_scan.pdf"
)
if not os.path.exists(pdf_path):
pytest.skip(f"Test file not found: {pdf_path}")
result = markitdown.convert(pdf_path)
# Scanned PDF with no text layer should have empty extraction
assert (
result.text_content.strip() == ""
), "Scanned PDF should have empty extraction"
tables = extract_markdown_tables(result.text_content)
# Scanned PDF with no text layer should have no tables
assert len(tables) == 0, "Scanned PDF should have no extracted tables"
def test_all_pdfs_table_rows_consistent(self, markitdown):
"""Test that all PDF tables have rows with pipe-separated content.
Note: With gap-based column detection, rows may have different column counts
depending on how content is spaced in the PDF. What's important is that each
row has pipe separators and the content is readable.
"""
pdf_files = [
"SPARSE-2024-INV-1234_borderless_table.pdf",
"REPAIR-2022-INV-001_multipage.pdf",
"RECEIPT-2024-TXN-98765_retail_purchase.pdf",
"test.pdf",
]
for pdf_file in pdf_files:
pdf_path = os.path.join(TEST_FILES_DIR, pdf_file)
if not os.path.exists(pdf_path):
continue
result = markitdown.convert(pdf_path)
tables = extract_markdown_tables(result.text_content)
for table_idx, table in enumerate(tables):
if not table:
continue
# Verify each row has at least one column (pipe-separated content)
for row_idx, row in enumerate(table):
assert (
len(row) >= 1
), f"{pdf_file}: Table {table_idx}, row {row_idx} has no columns"
# Verify the row has non-empty content
row_content = " ".join(cell.strip() for cell in row)
assert (
len(row_content.strip()) > 0
), f"{pdf_file}: Table {table_idx}, row {row_idx} is empty"
def test_borderless_table_data_integrity(self, markitdown):
"""Test that borderless table extraction preserves data integrity."""
pdf_path = os.path.join(
TEST_FILES_DIR, "SPARSE-2024-INV-1234_borderless_table.pdf"
)
if not os.path.exists(pdf_path):
pytest.skip(f"Test file not found: {pdf_path}")
result = markitdown.convert(pdf_path)
tables = extract_markdown_tables(result.text_content)
assert len(tables) >= 2, "Should have at least 2 tables"
# Check first table has expected SKU data
first_table = tables[0]
table_text = str(first_table)
assert "SKU-8847" in table_text, "First table should contain SKU-8847"
assert "SKU-9201" in table_text, "First table should contain SKU-9201"
# Check second table has expected category data
second_table = tables[1]
table_text = str(second_table)
assert "Electronics" in table_text, "Second table should contain Electronics"
assert "Hardware" in table_text, "Second table should contain Hardware"