diff --git a/.gitignore b/.gitignore index aa4abd3..15613ea 100644 --- a/.gitignore +++ b/.gitignore @@ -52,6 +52,7 @@ coverage.xml .hypothesis/ .pytest_cache/ cover/ +.test-logs/ # Translations *.mo diff --git a/packages/markitdown/pyproject.toml b/packages/markitdown/pyproject.toml index 8ca9b11..701a461 100644 --- a/packages/markitdown/pyproject.toml +++ b/packages/markitdown/pyproject.toml @@ -41,19 +41,20 @@ all = [ "openpyxl", "xlrd", "lxml", - "pdfminer.six>=20251107", + "pdfminer.six>=20251230", + "pdfplumber>=0.11.9", "olefile", "pydub", "SpeechRecognition", "youtube-transcript-api~=1.0.0", "azure-ai-documentintelligence", - "azure-identity" + "azure-identity", ] pptx = ["python-pptx"] docx = ["mammoth~=1.11.0", "lxml"] xlsx = ["pandas", "openpyxl"] xls = ["pandas", "xlrd"] -pdf = ["pdfminer.six"] +pdf = ["pdfminer.six>=20251230", "pdfplumber>=0.11.9"] outlook = ["olefile"] audio-transcription = ["pydub", "SpeechRecognition"] youtube-transcription = ["youtube-transcript-api"] diff --git a/packages/markitdown/src/markitdown/converters/_pdf_converter.py b/packages/markitdown/src/markitdown/converters/_pdf_converter.py index 63162d5..b692f16 100644 --- a/packages/markitdown/src/markitdown/converters/_pdf_converter.py +++ b/packages/markitdown/src/markitdown/converters/_pdf_converter.py @@ -1,22 +1,18 @@ import sys import io - from typing import BinaryIO, Any - from .._base_converter import DocumentConverter, DocumentConverterResult from .._stream_info import StreamInfo from .._exceptions import MissingDependencyException, MISSING_DEPENDENCY_MESSAGE - -# Try loading optional (but in this case, required) dependencies -# Save reporting of any exceptions for later +# Load dependencies _dependency_exc_info = None try: import pdfminer import pdfminer.high_level + import pdfplumber except ImportError: - # Preserve the error and stack trace for later _dependency_exc_info = sys.exc_info() @@ -28,16 +24,374 @@ ACCEPTED_MIME_TYPE_PREFIXES = [ ACCEPTED_FILE_EXTENSIONS = [".pdf"] +def _to_markdown_table(table: list[list[str]], include_separator: bool = True) -> str: + """Convert a 2D list (rows/columns) into a nicely aligned Markdown table. + + Args: + table: 2D list of cell values + include_separator: If True, include header separator row (standard markdown). + If False, output simple pipe-separated rows. + """ + if not table: + return "" + + # Normalize None → "" + table = [[cell if cell is not None else "" for cell in row] for row in table] + + # Filter out empty rows + table = [row for row in table if any(cell.strip() for cell in row)] + + if not table: + return "" + + # Column widths + col_widths = [max(len(str(cell)) for cell in col) for col in zip(*table)] + + def fmt_row(row: list[str]) -> str: + return ( + "|" + + "|".join(str(cell).ljust(width) for cell, width in zip(row, col_widths)) + + "|" + ) + + if include_separator: + header, *rows = table + md = [fmt_row(header)] + md.append("|" + "|".join("-" * w for w in col_widths) + "|") + for row in rows: + md.append(fmt_row(row)) + else: + md = [fmt_row(row) for row in table] + + return "\n".join(md) + + +def _extract_form_content_from_words(page: Any) -> str | None: + """ + Extract form-style content from a PDF page by analyzing word positions. + This handles borderless forms/tables where words are aligned in columns. + + Returns markdown with proper table formatting: + - Tables have pipe-separated columns with header separator rows + - Non-table content is rendered as plain text + + Returns None if the page doesn't appear to be a form-style document, + indicating that pdfminer should be used instead for better text spacing. + """ + words = page.extract_words(keep_blank_chars=True, x_tolerance=3, y_tolerance=3) + if not words: + return None + + # Group words by their Y position (rows) + y_tolerance = 5 + rows_by_y: dict[float, list[dict]] = {} + for word in words: + y_key = round(word["top"] / y_tolerance) * y_tolerance + if y_key not in rows_by_y: + rows_by_y[y_key] = [] + rows_by_y[y_key].append(word) + + # Sort rows by Y position + sorted_y_keys = sorted(rows_by_y.keys()) + page_width = page.width if hasattr(page, "width") else 612 + + # First pass: analyze each row + row_info: list[dict] = [] + for y_key in sorted_y_keys: + row_words = sorted(rows_by_y[y_key], key=lambda w: w["x0"]) + if not row_words: + continue + + first_x0 = row_words[0]["x0"] + last_x1 = row_words[-1]["x1"] + line_width = last_x1 - first_x0 + combined_text = " ".join(w["text"] for w in row_words) + + # Count distinct x-position groups (columns) + x_positions = [w["x0"] for w in row_words] + x_groups: list[float] = [] + for x in sorted(x_positions): + if not x_groups or x - x_groups[-1] > 50: + x_groups.append(x) + + # Determine row type + is_paragraph = line_width > page_width * 0.55 and len(combined_text) > 60 + + row_info.append( + { + "y_key": y_key, + "words": row_words, + "text": combined_text, + "x_groups": x_groups, + "is_paragraph": is_paragraph, + "num_columns": len(x_groups), + } + ) + + # Collect ALL x-positions from rows with 3+ columns (table-like rows) + # This gives us the global column structure + all_table_x_positions: list[float] = [] + for info in row_info: + if info["num_columns"] >= 3 and not info["is_paragraph"]: + all_table_x_positions.extend(info["x_groups"]) + + if not all_table_x_positions: + return None + + # Compute global column boundaries + all_table_x_positions.sort() + global_columns: list[float] = [] + for x in all_table_x_positions: + if not global_columns or x - global_columns[-1] > 30: + global_columns.append(x) + + # Too many columns suggests dense text, not a form + if len(global_columns) > 8: + return None + + # Now classify each row as table row or not + # A row is a table row if it has words that align with 2+ of the global columns + for info in row_info: + if info["is_paragraph"]: + info["is_table_row"] = False + continue + + # Count how many global columns this row's words align with + aligned_columns: set[int] = set() + for word in info["words"]: + word_x = word["x0"] + for col_idx, col_x in enumerate(global_columns): + if abs(word_x - col_x) < 40: + aligned_columns.add(col_idx) + break + + # If row uses 2+ of the established columns, it's a table row + info["is_table_row"] = len(aligned_columns) >= 2 + + # Find table regions (consecutive table rows) + table_regions: list[tuple[int, int]] = [] # (start_idx, end_idx) + i = 0 + while i < len(row_info): + if row_info[i]["is_table_row"]: + start_idx = i + while i < len(row_info) and row_info[i]["is_table_row"]: + i += 1 + end_idx = i + table_regions.append((start_idx, end_idx)) + else: + i += 1 + + # Check if enough rows are table rows (at least 20%) + total_table_rows = sum(end - start for start, end in table_regions) + if len(row_info) > 0 and total_table_rows / len(row_info) < 0.2: + return None + + # Build output - collect table data first, then format with proper column widths + result_lines: list[str] = [] + num_cols = len(global_columns) + + # Helper function to extract cells from a row + def extract_cells(info: dict) -> list[str]: + cells: list[str] = ["" for _ in range(num_cols)] + for word in info["words"]: + word_x = word["x0"] + # Find the correct column using boundary ranges + assigned_col = num_cols - 1 # Default to last column + for col_idx in range(num_cols - 1): + col_end = global_columns[col_idx + 1] + if word_x < col_end - 20: + assigned_col = col_idx + break + if cells[assigned_col]: + cells[assigned_col] += " " + word["text"] + else: + cells[assigned_col] = word["text"] + return cells + + # Process rows, collecting table data for proper formatting + idx = 0 + while idx < len(row_info): + info = row_info[idx] + + # Check if this row starts a table region + table_region = None + for start, end in table_regions: + if idx == start: + table_region = (start, end) + break + + if table_region: + start, end = table_region + # Collect all rows in this table + table_data: list[list[str]] = [] + for table_idx in range(start, end): + cells = extract_cells(row_info[table_idx]) + table_data.append(cells) + + # Calculate column widths for this table + if table_data: + col_widths = [ + max(len(row[col]) for row in table_data) for col in range(num_cols) + ] + # Ensure minimum width of 3 for separator dashes + col_widths = [max(w, 3) for w in col_widths] + + # Format header row + header = table_data[0] + header_str = ( + "| " + + " | ".join( + cell.ljust(col_widths[i]) for i, cell in enumerate(header) + ) + + " |" + ) + result_lines.append(header_str) + + # Format separator row + separator = ( + "| " + + " | ".join("-" * col_widths[i] for i in range(num_cols)) + + " |" + ) + result_lines.append(separator) + + # Format data rows + for row in table_data[1:]: + row_str = ( + "| " + + " | ".join( + cell.ljust(col_widths[i]) for i, cell in enumerate(row) + ) + + " |" + ) + result_lines.append(row_str) + + idx = end # Skip to end of table region + else: + # Check if we're inside a table region (not at start) + in_table = False + for start, end in table_regions: + if start < idx < end: + in_table = True + break + + if not in_table: + # Non-table content + result_lines.append(info["text"]) + idx += 1 + + return "\n".join(result_lines) + + +def _extract_tables_from_words(page: Any) -> list[list[list[str]]]: + """ + Extract tables from a PDF page by analyzing word positions. + This handles borderless tables where words are aligned in columns. + + This function is designed for structured tabular data (like invoices), + not for multi-column text layouts in scientific documents. + """ + words = page.extract_words(keep_blank_chars=True, x_tolerance=3, y_tolerance=3) + if not words: + return [] + + # Group words by their Y position (rows) + y_tolerance = 5 + rows_by_y: dict[float, list[dict]] = {} + for word in words: + y_key = round(word["top"] / y_tolerance) * y_tolerance + if y_key not in rows_by_y: + rows_by_y[y_key] = [] + rows_by_y[y_key].append(word) + + # Sort rows by Y position + sorted_y_keys = sorted(rows_by_y.keys()) + + # Find potential column boundaries by analyzing x positions across all rows + all_x_positions = [] + for words_in_row in rows_by_y.values(): + for word in words_in_row: + all_x_positions.append(word["x0"]) + + if not all_x_positions: + return [] + + # Cluster x positions to find column starts + all_x_positions.sort() + x_tolerance_col = 20 + column_starts: list[float] = [] + for x in all_x_positions: + if not column_starts or x - column_starts[-1] > x_tolerance_col: + column_starts.append(x) + + # Need at least 3 columns but not too many (likely text layout, not table) + if len(column_starts) < 3 or len(column_starts) > 10: + return [] + + # Find rows that span multiple columns (potential table rows) + table_rows = [] + for y_key in sorted_y_keys: + words_in_row = sorted(rows_by_y[y_key], key=lambda w: w["x0"]) + + # Assign words to columns + row_data = [""] * len(column_starts) + for word in words_in_row: + # Find the closest column + best_col = 0 + min_dist = float("inf") + for i, col_x in enumerate(column_starts): + dist = abs(word["x0"] - col_x) + if dist < min_dist: + min_dist = dist + best_col = i + + if row_data[best_col]: + row_data[best_col] += " " + word["text"] + else: + row_data[best_col] = word["text"] + + # Only include rows that have content in multiple columns + non_empty = sum(1 for cell in row_data if cell.strip()) + if non_empty >= 2: + table_rows.append(row_data) + + # Validate table quality - tables should have: + # 1. Enough rows (at least 3 including header) + # 2. Short cell content (tables have concise data, not paragraphs) + # 3. Consistent structure across rows + if len(table_rows) < 3: + return [] + + # Check if cells contain short, structured data (not long text) + long_cell_count = 0 + total_cell_count = 0 + for row in table_rows: + for cell in row: + if cell.strip(): + total_cell_count += 1 + # If cell has more than 30 chars, it's likely prose text + if len(cell.strip()) > 30: + long_cell_count += 1 + + # If more than 30% of cells are long, this is probably not a table + if total_cell_count > 0 and long_cell_count / total_cell_count > 0.3: + return [] + + return [table_rows] + + class PdfConverter(DocumentConverter): """ - Converts PDFs to Markdown. Most style information is ignored, so the results are essentially plain-text. + Converts PDFs to Markdown. + Supports extracting tables into aligned Markdown format (via pdfplumber). + Falls back to pdfminer if pdfplumber is missing or fails. """ def accepts( self, file_stream: BinaryIO, stream_info: StreamInfo, - **kwargs: Any, # Options to pass to the converter + **kwargs: Any, ) -> bool: mimetype = (stream_info.mimetype or "").lower() extension = (stream_info.extension or "").lower() @@ -55,9 +409,8 @@ class PdfConverter(DocumentConverter): self, file_stream: BinaryIO, stream_info: StreamInfo, - **kwargs: Any, # Options to pass to the converter + **kwargs: Any, ) -> DocumentConverterResult: - # Check the dependencies if _dependency_exc_info is not None: raise MissingDependencyException( MISSING_DEPENDENCY_MESSAGE.format( @@ -65,13 +418,55 @@ class PdfConverter(DocumentConverter): extension=".pdf", feature="pdf", ) - ) from _dependency_exc_info[ - 1 - ].with_traceback( # type: ignore[union-attr] + ) from _dependency_exc_info[1].with_traceback( _dependency_exc_info[2] - ) + ) # type: ignore[union-attr] - assert isinstance(file_stream, io.IOBase) # for mypy - return DocumentConverterResult( - markdown=pdfminer.high_level.extract_text(file_stream), - ) + assert isinstance(file_stream, io.IOBase) + + markdown_chunks: list[str] = [] + + # Read file stream into BytesIO for compatibility with pdfplumber + pdf_bytes = io.BytesIO(file_stream.read()) + + try: + # Track how many pages are form-style vs plain text + form_pages = 0 + plain_pages = 0 + + with pdfplumber.open(pdf_bytes) as pdf: + for page in pdf.pages: + # Try form-style word position extraction + page_content = _extract_form_content_from_words(page) + + # If extraction returns None, this page is not form-style + if page_content is None: + plain_pages += 1 + # Extract text using pdfplumber's basic extraction for this page + text = page.extract_text() + if text and text.strip(): + markdown_chunks.append(text.strip()) + else: + form_pages += 1 + if page_content.strip(): + markdown_chunks.append(page_content) + + # If most pages are plain text, use pdfminer for better text handling + if plain_pages > form_pages and plain_pages > 0: + pdf_bytes.seek(0) + markdown = pdfminer.high_level.extract_text(pdf_bytes) + else: + # Build markdown from chunks + markdown = "\n\n".join(markdown_chunks).strip() + + except Exception: + # Fallback if pdfplumber fails + pdf_bytes.seek(0) + markdown = pdfminer.high_level.extract_text(pdf_bytes) + + # Fallback if still empty + if not markdown: + pdf_bytes.seek(0) + markdown = pdfminer.high_level.extract_text(pdf_bytes) + + return DocumentConverterResult(markdown=markdown) diff --git a/packages/markitdown/tests/test_files/MEDRPT-2024-PAT-3847_medical_report_scan.pdf b/packages/markitdown/tests/test_files/MEDRPT-2024-PAT-3847_medical_report_scan.pdf new file mode 100644 index 0000000..30e1960 Binary files /dev/null and b/packages/markitdown/tests/test_files/MEDRPT-2024-PAT-3847_medical_report_scan.pdf differ diff --git a/packages/markitdown/tests/test_files/RECEIPT-2024-TXN-98765_retail_purchase.pdf b/packages/markitdown/tests/test_files/RECEIPT-2024-TXN-98765_retail_purchase.pdf new file mode 100644 index 0000000..34842dc --- /dev/null +++ b/packages/markitdown/tests/test_files/RECEIPT-2024-TXN-98765_retail_purchase.pdf @@ -0,0 +1,97 @@ +%PDF-1.4 +%“Œ‹ž ReportLab Generated PDF document http://www.reportlab.com +1 0 obj +<< +/F1 2 0 R /F2 3 0 R /F3 4 0 R /F4 5 0 R +>> +endobj +2 0 obj +<< +/BaseFont /Helvetica /Encoding /WinAnsiEncoding /Name /F1 /Subtype /Type1 /Type /Font +>> +endobj +3 0 obj +<< +/BaseFont /Helvetica-Bold /Encoding /WinAnsiEncoding /Name /F2 /Subtype /Type1 /Type /Font +>> +endobj +4 0 obj +<< +/BaseFont /Courier /Encoding /WinAnsiEncoding /Name /F3 /Subtype /Type1 /Type /Font +>> +endobj +5 0 obj +<< +/BaseFont /Courier-Bold /Encoding /WinAnsiEncoding /Name /F4 /Subtype /Type1 /Type /Font +>> +endobj +6 0 obj +<< +/BitsPerComponent 8 /ColorSpace /DeviceRGB /Filter [ /ASCII85Decode /FlateDecode ] /Height 70 /Length 4491 /Subtype /Image + /Type /XObject /Width 200 +>> +stream +Gb"/lq,^Nc)M\9OkX:DBZ5YT>'!op&`0lHCEL`PXM2DFT$QuCdPsfSJ4#%$gW49i\1e&eZ\Acg:2bhaPc+^Q$/Shs#,1&Qu>83CBh729[%A$M]]Z8KL]Cu-OpO.1`\pCPboa2!3#sCC+4Yg#.W)"\K&i)doY5WCH.J-G0E$]%J"BRoZ88okcKEP@C7S%JEA:t(e6:OLb-"MZ3=$fAIE$]%J"BRoZ88okcKEP@C7S%JEA:t(e6:OKV=+u3F\<8ml>&.H@]EH:@TAeD]);ilGV3,!4G2kP0XRN"#r/]G!Kqo1c9Redk1d%Hh[t82=;lMkgMFr4J2#lTY[siCuNDGT]h^te4%1fj10r$&D--;(2UPbQ(Ze:VUL"2G=%qPZVOlc,tegG*BO,:$mI%mCAJ^8q2gWSn>)Ui.KQ!A5_(Z;l?_%(Xb28AGIU0SY?bVp%B6=$P2*I!1?W8WL>aAVWc$%-nk=D8ZjE$stW]LJ-LqIj9qZoFV/lj$Uf)=b`nfl*ANkt_qpb2t'P`;D#\h6o+g'M+"j4Sf:d#_jjZdTS>mnQJm^7>(S3Bq"m(kH5i3;0`YK<5e.0$k"4XA4UX)rfXH+2OamR360'cX$&"Dp"DdSkh^Q;?+H)fBc@YMp?\]UZuQ*lgt@kDS'dARs/]`Roa+]eXm%&TJ4e[RCKq6kQ:5Zpa[hLTEE,M/UR\C]SS.K'HJL7F)F5Ts63hWCKs+aTqi3TN!,P7#o$@'a?^`M.9&=d,$WJ*\b^*N1fR$JFV.s`.W*WfCgS\9h@C2/uCDiFrdD&a`iRQ\df2(Xc53$=i@@upbP,MJ@.sDgm%`^.8+5u//"Hhn%^kIb$5o\2B7%r>DD\NZ:L;otY0]:)'6l[M<&ctoM"($Q_XW1'!4OB>g/3dF]mD],eF*&&'itQ;2e$/VWZ/QmdogQ0&d7ePkDGP[PZkk8TtUWkaJYa$Q)c6I+lpY5H])D-lHdp52ko&@JS]_pP&PR0@L`i*pbXB"rgcI`#'#>-Njfe@8+ZC7hHU>Qm2oCj$^ATs<7[sZ@5*,@qslQ\p1m1#p6XrGL'F^?ok?+\fDe1,#<0n78&1'&KK/85E7IuiRklZ$l=<9-YKka`e!9&oUk]3u7Y);o;!X[W(:i@m+VqQff?4r\fn@@4QXN6[dWdtV8:B`3X2:bgH!rR8-r^sf)#EN%"`F/q0Heh_C7H6l'.@I3l7W_M8qB0Sd$'o&pWH!XFNS.JRZ%[WY$N:rl5tLIb;#&1u\'nOCIB]161$bC,Uuf;ZK6dl()epY<39X_LaOAXU:WAZiYqZk5Tq+hN`O"QdZ7[jLdf^cf`?9i4T#=]JO$'0fC4#Y=^M%VOouL.PZ6/V;r+XoF1Ls*YXu`6'4?,j47_u_U=.T*IX0ed;@5JN=Qlc\gS!W?r;#%jA)NSUh\`='l:HWsF_rQ_q'7l3]f,=As5FN;rm6/&IWa@[9HFr3YtU'N%=ZPr)s`!!&o!IbLLsBH7VnG&"_&hSn3;Nr^mpSZk^2i_aD8H]^Oi^Wq(2*ak3>^mA^!FkG4$-Vq(BH"U+YSR;%(5j(bnT,&RrR1d]\O5_42^f/Xa:4msf,Oms&5F6()XE"p6mS/Yc\Ga&`hC/3XdsM;'cTMl(uV@DiFY5AA_VWS4T'&^D<.7.S,`B?:^&!Q[ZVaCi0^$[E#=Xt_;^\;l;M#]`$4;sLf^6$u5)gpB'7TO-@*HXbXF]H[ID>=n%&8-T:f)&:]?hm\RN5/B5sdW)PM[X@>2Jq/jQ%m%0Pk%J`<],L8cn3_`B)dE8ng`*C-";2ro.7o.B2:3d$4r#LEqr#8((eIunkG+V@25V=%+_!:Z/,UQca-*Fg9*GW$LPXTF:ER(d"o8oX31"!B8VtcoXnY$9m&'30Um_8lI0aO.LY_m,l[3VfKlBY*[!$I#3=:"_\lGs6U"<,-kF4HFN$6[_SQpG)7_H3mnKRB'C9#q8EY(Vaqi(D&r$*Jr?OPiaP#RRYeN0)sia9W*TKT)#N9#q8EY(Vaqi(D&r$*Jr?OPiaP#k^2Zeu*-9hp_.0@)f6Em^OKjQe!N6R'K"tCBm#IB,_F0s/@WUCJX)`/6>DF6M_)Kk.s>$UZPX\r/#8I8q/RiHL2liUAA0Yf`%Ld=068s:8FSq=\;#qPO.M6gNJj\UuBER`krd-YJajg!X[eDSKG#rM875C*#IQWm+O@aOf*TU2U('gKn4DA1c#r\17+;`V`#Kgpm8RC[Ee$Ac&[r[O5Rd>4]?HeBRku2;R,jekQW/J-QCRW&^t>C\3e,:[W(TT!T:2JEND71MtHndISe*l)E$(\gO^@6Z"5Q<4O)uM0mY8/2q_>,e#m"SaP0HaXDih_2UgJZH0.io.!]XNj5RmJj5Qb:1EKuS*^?b+;s)^?6l>JE0QrGSUpouIlZac9kX41>cja=/SDr"[/Y4/S?T:Ij;Z5Vm!N#O9oGbTmC/.8"?)WA+NO#l!d__>E^^P^^5SPK3f#a-$jDf&=>7p3h\GbV.%e"aa`Nr8:-((uUWjP?5h*[4qX1g^KT=J\eG*=-r!s:Xj;^b_=6X9pT)tmOPN+SP\Y9d?!CS[k5Z"1@+N1#RUIImf:>$etSOp!A$%9Npa7^LW;"'>&DLN*LT#=""p0WOpjXfI:C##@j@lDSU=Xe\&,@B>QE*Z4EfD\=1"h4F8$&QPkBC:BBC"p$N?^/:S9o*P:eF4j>`WT&V*-UR?1+=iZ&Y:YJ)Y_O1Q!(3MOc:&,lEK0KT>gBrM!Oo7g$R0Z=@nl[op[I#4)k.C3f6pb.hq#9_d636^F1(A%Us@pB(WNXIQ#TKD-`)0%k[fj0?XEDjbhd[m6#LpLW.'sd9_sqo4)9,(HjMXDMbKZE5`!!P4XRB@/4TISu+m2&%RiFj^;=JGE6IZZZ3Iq9u;tP9Ze)spB!TH+!k0kjm?9TaEqM'"N]I#K68.sENFpG-:BpL"k5RWGsjL14<m&CBpaKs!LDjG%ZThfU"?o'^kNBIDSK`iDtti"$XI@8UFlc6'SU-*bNc0g0qZU.!3k5:g&J5%_GC<>>?gMnQALpYLh_[CK2YgZMfbcG=f9!FE+GUto#YS.Ms0oqhppBVFIgKikS73+[l77Q]oKP@6W$g?K-&@CV\IU2l9TjknpF,mi!B#31b$WjNK[_SbrBad@)rY(?tsa?noi;W9O[OS+IIT/JfXc1#&UNmnp:7_c6/aW-".*cBVjc)"DgbY/sJI7I:YTo4n^?6D*J='=Z.q\rms\C-WjI*igXS;Vfo_`Y\`_c8K6r=SE5*WPmG+p6'k&&0eDn]eR;8k_3L)o&mP_\+Z6EUKR,;fQ&lT(!/=AJY_cg)ZFd_iV[)$d`W(]Q<7PapAWM]!Y4qoZ_Z9e(Tts:W\F3=&Fc,Mp(`=cJUG]7+gkp\4)_AlnCa6gYn$_U<4&\@+ERR^ZIFpqbN%;:=3k=P0?fC_:0&Ug9TYUE[kI#Bq;+mc:C\/7H:]h-hq^[P:`jZ\paVNd3BCYF4eruZ)J"F0tYQ"^`hf0;~>endstream +endobj +7 0 obj +<< +/Contents 11 0 R /MediaBox [ 0 0 216 792 ] /Parent 10 0 R /Resources << +/Font 1 0 R /ProcSet [ /PDF /Text /ImageB /ImageC /ImageI ] /XObject << +/FormXob.2a351979d8c75d073b2ea4bfb74718f9 6 0 R +>> +>> /Rotate 0 /Trans << + +>> + /Type /Page +>> +endobj +8 0 obj +<< +/PageMode /UseNone /Pages 10 0 R /Type /Catalog +>> +endobj +9 0 obj +<< +/Author (\(anonymous\)) /CreationDate (D:20251205104951+01'00') /Creator (\(unspecified\)) /Keywords () /ModDate (D:20251205104951+01'00') /Producer (ReportLab PDF Library - www.reportlab.com) + /Subject (\(unspecified\)) /Title (\(anonymous\)) /Trapped /False +>> +endobj +10 0 obj +<< +/Count 1 /Kids [ 7 0 R ] /Type /Pages +>> +endobj +11 0 obj +<< +/Filter [ /ASCII85Decode /FlateDecode ] /Length 1981 +>> +stream +GauI8D/\/e&:hOi=55KJ8;^%$X%4*F0ZR4[TQ-.JlIl=8,25emeWsH3DOUP#SK,Tt>!%\QTf-@>5)IUB4PgI,!l6hOHqp".7kqY?VEc$;f0G$C+?kE+IduY+D6B[cY<`?1&+Bf$SSWa.DI28'F?:CpG_mY"TO]hkXiFku&",h"4G/8GlCSK3UDT`3W#q'Qc;4>t6\tbspa(/l?]"D>nboQo,(,[\*-A;J=Ru^j=[Nu\:iMk7q<+PGo*gWpT4_C)j)t7Oc[5MlffWrhj!99;,0?]r3R(ns^B^I*KQ#f[([aS1g);Q.rrBep&6)sVJs=\.1^pkCa(tBfECI75_;C0LC.)*n<3;@eFZT0lg_SFKN$O.X\o%U7_58YJ,X[`p,%PUL^1]EgT]T\4*B3hOrEZA:[=ui88pZGht%klE^OC$2=@$GiMoTO,CLh>2ZDq?"(LrLS)ajJJe7M&(V1i'>Z@qg3/WJ@PpL4nr1qU$V_#jqpM+M<[H(LE:pW6uTQK`^P%Q'T&[*Y1\T_7.O:+n^Y)3+d56\hGsIrnqB85q[4L1WG#"Uo.d]Zr"jG`qiT=AU8b5]p3(N2IfnWHVO&$rnNe6[_$[o(m=2Sq-C[bbNOS,qIb?:TGYHhQjjcCe!9%*cuscgU*Eea^B?#^HtoE9p(jd_GR#E1\LTm:MVS5e]+;a8<=:Ym/3(NJmP3]c2J81f'[9A229?.>nW.Y"uioK$/X(RLnTFa0nhiu#_V(M%6pL-[3&IEZO^iW'pcgSC4%cs*UfWL8=h@]Eoq_3uS@i.]*ai31P3"'$;S,Q%'"=$Vq"-_!pX<>bA]?nd=dOpZa\$"k!cpI9L2SO3gBd7]TKi)s)3F/ADnb^3N)iF')M[\Fq1\^PAl):!YpJp!B\s/2LkZr*`(o%fTO.qa?N[7\P_Dj!-3=OAO3]DtNKn-R1Hc#$?$h;RW[;B(k%DrOQ4lA(kZ`8[,.5E\H/%&9RE1k-pKk^'?Wseh?':/9RD3&&Xf\j=9;Sdd#l!b5li$Q.?@#FtJ9r"D*THt>o=+h,ei<3VCI\eQ+FB(V85-;*H\(QoM*>m2@9WKA`dV0$2F]lQ!^cKY?-F<K&Mf4jEn:L@C'pmu(T(NAo?onFtPTH*Mah:OJII8OF6_OX,SeHS'86i`]nN=oW_Hjm1lb%agT!)1^3rJWom\/,?BYNjThVR,cQ'opa8Q#endstream +endobj +xref +0 12 +0000000000 65535 f +0000000073 00000 n +0000000134 00000 n +0000000241 00000 n +0000000353 00000 n +0000000458 00000 n +0000000568 00000 n +0000005249 00000 n +0000005507 00000 n +0000005576 00000 n +0000005859 00000 n +0000005919 00000 n +trailer +<< +/ID +[<4800d64fefba4dd902e51197c7da4e88><4800d64fefba4dd902e51197c7da4e88>] +% ReportLab generated PDF document -- digest (http://www.reportlab.com) + +/Info 9 0 R +/Root 8 0 R +/Size 12 +>> +startxref +7992 +%%EOF diff --git a/packages/markitdown/tests/test_files/REPAIR-2022-INV-001_multipage.pdf b/packages/markitdown/tests/test_files/REPAIR-2022-INV-001_multipage.pdf new file mode 100644 index 0000000..c795d9e Binary files /dev/null and b/packages/markitdown/tests/test_files/REPAIR-2022-INV-001_multipage.pdf differ diff --git a/packages/markitdown/tests/test_files/SPARSE-2024-INV-1234_borderless_table.pdf b/packages/markitdown/tests/test_files/SPARSE-2024-INV-1234_borderless_table.pdf new file mode 100644 index 0000000..e8ba29f --- /dev/null +++ b/packages/markitdown/tests/test_files/SPARSE-2024-INV-1234_borderless_table.pdf @@ -0,0 +1,115 @@ +%PDF-1.4 +%“Œ‹ž ReportLab Generated PDF document http://www.reportlab.com +1 0 obj +<< +/F1 2 0 R /F2 3 0 R +>> +endobj +2 0 obj +<< +/BaseFont /Helvetica /Encoding /WinAnsiEncoding /Name /F1 /Subtype /Type1 /Type /Font +>> +endobj +3 0 obj +<< +/BaseFont /Helvetica-Bold /Encoding /WinAnsiEncoding /Name /F2 /Subtype /Type1 /Type /Font +>> +endobj +4 0 obj +<< +/BitsPerComponent 8 /ColorSpace /DeviceRGB /Filter [ /ASCII85Decode /FlateDecode ] /Height 250 /Length 7324 /Subtype /Image + /Type /XObject /Width 500 +>> +stream +Gb"0T$]J'Hh8f&`$E10oYm@(8\X0op.8VZ0?U3+-`PUA-kXC)$ZF(FS+cnSt_+UFmc3fWAAe?iKV_=cYIbmbE?q#A3MC5(3;*ju:&SG4>@oYJ9b]>H8cll(s_!]^Wdj#7mENY_N]/ZF]UIf(e(iIV3S9+%Xs-]`8!.2T!b144!mPH)qfI]jf[oX'bfT]RAeXM;mN(9DE.lI5\^bZHi0H?+ZtHqt&,uDj/khIt-b"Rr?#dgph@W9eT(j"qA0*QRl=]$mBsBh^?V`!Ie^f&L=olbQVoVhn(tad)_WTfB\B\&;,P^gT6t*&(g]2WIf&N31U4Far9$:/Dr/.jL^e^fH0;m?qWi=Z:.*WY[G;!G4"!hE5;^qL(1KRAX]M;r8<2<7ZSVHUe-b'Idr>?hgP7hj2Ys5<8W&k^%9Q7ho`EpY?\Un4BTdReMt&%1Rp.-K)JTA_12=1T+2r,ghtatY![sSp(h$2b#dqMVXhrZHhsK;FJNRHd%NhApV$$L+_fnYo]X]Y]6@K'>#o(!e_G4b7\s6F%lH=^*b8(3L(0%9F0]X>A;`\q&%lN]ZL5qV]8o@Nd-j6]k$[r8a.P9s#(nPcjj^$`Jtm)Je7s8D.sV+g?CBl;=2\8'Qo"q;K8Pa$j]o]X]?0!$sKgt8uJe$n4`gJ0fU4nkJ+@)%;lhARK\W^-W0.5fij+$OKACE):K.oqu=9&'UO5"O+[+bgsT11F$Q"b-`B4qCUC=VU+-_@PpG+u775'#_?NFV5OYCHL$osIZ12*M@48*^,N>%mQpfe\T`0I/SqhAe^rk006%i.NXclqn/"LE:d*b?t6hFPL2,HaYS4\?KJ`Np5$6PjQHlnF!pW[N"3!eKu'V_;``m)$UTmCm!JU?bc5IVe1:8cO]X@+92++1k47TB:>_]lI+aIEm9pFu^5-5,*@=Hk_*ko^?aR0OTF%FGc!7n%8&&e812C*G&NRkD+=%D=$YbYd7Q2H-W$H^N8S`)*c7nUIUBJQN%`h2ZHJ>Q"b-8fl!,bG)c"=7iu#bp!phWrU7W9lpBhfJ,.:[l-m7dl-ckKbAH"BV4PlY+8s]W"8L)"bA:P"I!P"$194K[56(QP91hgnC]Di2dd[%macjUNhm[N2lM`g?5M#>9NqDdh]63G*.GHGX0"g\j5Q'INp9/=i_h/a"(7;N\%e!/]1atQ+ABOU>cd.ib29n"Z](S:k%;7E%LlRM.pYL8*IpTEk8Z)7)1q1tN\"LJ5o#k[iR*h^$PN^JUfSQB-YgCChnN3]5d9Zm;IL/Nn[>%_$;&EHLtm1P8lN!ga/:l@[l+#!#I4jFO\T7-FdG4""#g9etN"g>W5R[s6)X]lUl>N;JkeIV^E'@&Af.tdf`br=Nr.Fb@tX.T=N-dhr;DVr0VaH.R2*Z"PFgiH9`[uP7TXKg6j"W(E_h0%>GrXGZWQ&T;"\$mDCIf,9MigDR-I5tc-C(JE?uh^d12Wb)'^%3+[%d\TUS]6rn+@oVG*D@n$rEYF<@1:c9(K2fW\dm@H$kCe+5^=,V@nnS#C^afDq"pI)g)u"A8rl7PO$/9*V%Y0^&X4#,"F>kQFN_TLLX!"A8rl7PO$/9*V%Y0^&X4#,"F>kQFN_TLLX!"A8rl7PO$/9*V%Y0^&X4#,"F>kQFN_TLLX!"A8rl7PO$/9*V%Y0^&X4#,"F>kQFN_TLLX!"A8rl7PO$/9*V%Y0^&X4#,"F>kQFN_TLLX!"A8rl7PO$/9*V%Y0^&X4#,"F>kQFN_TLLX!"A8rl7PO$/9*V%Y0^&X4#,"F>kQFN_TLLX!"A8rl7PO$/9*V%Y0^&X4#,"F>kQFN_TLLX!"A8rl7PO$/9*V%Y0^&X4#,"F>kQFN_TLLX!"A8rl7PO$/9*V%Y0^&X4#,"F>kQFN_TLLX!"A8rl7PO$/9*V%Y0^&X4#,"F>kQFN_TLLX!"A8rl7PO$/9*V%Y0^&X4#,"F>kQFN_TLLX!"A8rl7PO$/9*V%Y0^&X4#,"F>kQFN_TLLX!"A8rl7PO$/9*V%Y0^&X4#,"F>kQFN_TLLX!"A8rl7PO$/9*V%Y0^&X4#,"F>kQFN_TLLX!"A8rl7PO$/9*V%Y0^&X4#*?7#CY+R`0d=3`X&gqjkh3:T*bY/F"G9IRA,p7V`f(d.Q7Rm\hLGS(4aQb1n\uNc]-gCG]]+f!nhdbc&I^_:Oo_)LC>!L62f77_Ac(W7S%Ee,EnP#j'c%Rf=Ks&GF$(t)5pobiJ5>k%UQ;u#YJ0jJG^Si.X\lM[?G(O0TL">C*I@>%2dkQFN_TLLX!"A8qU/bW*o[0m20@**pGb=QoMoBV0Jc^J7`J0qQbQD&[L=>HP+Cl`-U?O2r+BHl2@)9NbI,m0UWp/m(?Kj`#Q-F,e#TmS"c18'J?O(B_H#aPm4,26TFaqoXU".I=)=GHC#@**pGZ9=MIHm)TITS[KGALLmkQ4.FNgLD,D(P3q8R`P2)L@U%+aab_D4dMdh+]pLmM/85a-sm7G$tpj>@iHP+Cl`-U?O2r+BHl2@)9NbI,m0UWp/m(?Kj`#Q-F,e#TmS"c18'J?O(B_H#aPm4,26TFaqoXU".I=)=GHC#@**pGZ9=MIHm)TITS[KGALLmkQ4.FNgLD,D(P3q8R`P2)L@U%+aab_D4dMdh+]pLmM/85a-sm7G$tpj>@iHP+Cl`-U?O2r+BHl2@)9NbI,m0UWp/m(?Kj`#Q-F,e#TmS"c18'J?O(B_H#aPm4,26TFaqoXU".I=)=GHC#@**pGZ9=MIHm)TITS[KGALLmkQ4.FNgLD,D(P3q8R`P2)L@U%+aab_D4dMdh+]pLmM/85a-sm7G$tpj>@iHP+Cl`-U?O2r+BHl2@)9NbI,m0UWp/m(?Kj`#Q-F,e#TmS"c18'J?O(B_H#aPm4,26TFaqoXU".I=)=GHC#@**pGZ9=MIHm)TITS[KGALLmkQ4.FNgLD,D(P3r_#*bI@Nu^YGacc/eP>=A&If/uhdaIgSf=,YR3kl>\0?EihL@U%+adu,Lp;i@@8%Nm2R[&InBdjrdfU\j&_S5K:qdJlfi-Gpm9@#8l.<,rZ=V5M3$n?RCl`-U?O2taWRPbA]mI3lqmL@rniJ/mKkuf]-sm7Gr#&=1<9-rAQ'LHX7dpE2*m7Bo&1e^G7(1,kPV3/4"uMJ0Ymf_%_3+gnA:#jpq*M;r5SMflb>>na//cf&[`:.f0*FqN%i5$kh]Ef_a^VP]M."HuJat9X^*n2r@*f4G>TYAt1?=@6f*MYP@*/(mng[#@.!M4icj?"C+"4TYAt1?=@6f*MYP!rrf+NnZt`qniVgK]G44'6Ppo.f[1eZB*20+21kH*F4FO#_"-g[_jb?*O7r=f*SJdaeS0`"gYg:=Ck#E?&'tmm<-8q6h9G%A:\R#'36!68(SmlA:'lOb)e`s%Tsq/l\&8mjO&+k!ngm-X>2&^Y!WuGp,ZWI+aJ/MZ9YkM$*+KVU[+!FZFs"8AOmkJLFePSFi#Uqo&r,FJS_M':1J6ZTL+aJ/MODCpQ,`f)1Eae+SR2nA4Ct>]@TL"4mbaDsh]@sfXGIf5?O6_[XVRl5-_r;"arMMd:J'.5\Tg9c(mhJ^1-[N-eM0-<9!CdC?QqtBED04,baD=Zc0G3$drYM?Ep9D$rGCY+R`0d;0@@i&4ugYpdb3-VQR54_#i:)goc'jX`LY[EsJ4^_7:L?*3l-rc@*sJc7(7ur)r7=AD^Ej[?%IdT0AMhX3=JnqVN@%gU45jkFR%G*\Afi.5!7@eZ5#WH1K')a,_$>YoCu)\1*VoM$W>J*Xug??kF`)fE`.Tf?*qI.2]Xj]AQfFo;%)b,VN&E0]+^]$e,PG9^aiA4YX6\1qC9qteS@^?)V6:/b9R!Y\\[K>:<@KRPFm`]$mSWA_d*A1Dm:ro-XK7^4Ppq,pHh@CA#$?d9?+SP7$UuSR'::UjP&13%^%U0PrTn1U.dQ%u[pV:>0_6buOE1(pY^J89K9t27A@?PT"ZNGmqZQ'tA\5nO\B:h"FLKsSK2lO(7oK6Re>ZARqq?OdFi/qCm7hY!*HGTcp?g+o2a7qVZ=_C!Ec.aq_lK)UmDB1+o_-5]P'1Bd[rZVBh,.nUZjsp;o%^SsmEYWQI=Errd9oamda)`*rTNUBe[@EfnBICZZgID/R59U/++F(Yl,mDC1N;9fkfFJc::*-mNE5g7fEQ0YfWe/>1pJ:=HM%X>@p?BXl.<,rZ=V4rQe9%Nn])^^DdRthjX3iq$uc$4m?)JS\f*X[qWS[[s6Xpt8C`jj6-)PlYgYlWkDi,WS[HhlUIX(ID&`/28O$+T&*M6/>?"R$>$>ZnGNdBsp"BH>NLe5Zf=b\7Nq2a]$]7S0\`@GH2kLi1%Ok-LeV@b@nnZY%IfgU75gZtNm7Z,_kFHfprD0"po*N>jfGj1P$49:"tR4*p;%f>?u1GOOC*$YnG)D:*E@97PJl=7+n[htC$X>DnMgRdG*;?$]#cF)Fqgi&4c:@&RPtqDu)Ep_pnWE6=Oa>Uge6DFUNB.U=.N>ksBOn:iEF2o;FggTZ]H1PVecpBq*e`%R=3,rEMPUZfj,WDANO>T16Xlqlqq]rG_7h-Gbl^AN3Rt[Qb=.=o4TReNUS6%gc/6@U\0.I>*B/.p&m\gVq.Nh7IlLe>SMsM]R63Z.*d[3d8f80o[,2*\^te\;kB@Pe%kE^\jTaH*Zh^C_0gKd;(nip:.[d]q``?)rk0hn38mN[s$BH#t'gCTK9:O0/S*NE(_od<.^WZ-&.j\dle=:1J6ZTL+aJ/MODCpQ,`f)1EfF(RjO&*8Y-H-24Lu$D2-(TX8(RdP`[X@HkW0j<=Ck#E>mKF/Wsm+Fl[W5MPp(Ep*KY'<1*hp`"*0B$L?-a)5d%B7O?C)aa>I!9P"4+)\]-*do&r+Wf2uTT*aK$3)QO:gU[*G8A!J3_F<-HYX>2&^Xoi^(&X~>endstream +endobj +5 0 obj +<< +/Contents 11 0 R /MediaBox [ 0 0 612 792 ] /Parent 10 0 R /Resources << +/Font 1 0 R /ProcSet [ /PDF /Text /ImageB /ImageC /ImageI ] /XObject << +/FormXob.18f55a53584ec2293b2ede279d934b78 4 0 R +>> +>> /Rotate 0 /Trans << + +>> + /Type /Page +>> +endobj +6 0 obj +<< +/BitsPerComponent 8 /ColorSpace /DeviceRGB /Filter [ /ASCII85Decode /FlateDecode ] /Height 100 /Length 6947 /Subtype /Image + /Type /XObject /Width 350 +>> +stream +Gb"/lpK&'N(B.;c?WRG1F(im2;q"^YVTd5nF/b&GiVMSOO.n!D"(6Je/NZ@GG-)Jfr&F;?h^q6qA1$Je0):+:(;eP^sq_&Q/gb+@Ja<#_EU2AKV7FKTq$1#SBr=JfmHP)$kg*+_;c%6A1SS+:&QW#(dsc8/V\b&=I4SOPb'h[G1Q'H1U3'`EU)_59p3SnL8@A1V@A&NH0k-Msp[=d\#^X9m=Q@Thk9a[QhOb5jI=NSgm1*?!k0qMo&L`Y!k?/5-o5@p)p*iF!o;<,B5YWBpMrA^9i=RR-&:72!os2V4?q0]2T_fBN;jb6W]=4Ck$!4@d!1`C6`'&W^_J.L2*,"h3X833K3I:7,fB8gZbEXOCjf,?=nFGSVBLSD[V&#leY/q>3g(PthDiNY*43fC`s2q%JB6V@2)G1Z=)31Qn]HClB)N\-nVE+dlDYjVQL4I'k@*dRk:1;F=oY'Ro57\\b)0K)90CCW:TA9u,%)k35osJR]pRKi;or84U_(`Zd\Cm6rU[]*,6([n'MorWR=e9D)1/V4cM/iRrN#]KQ\+q)Ke32EA2,$b[9tG*"II?sL)V/6'Qttro)-Recb&A]jS65@*O)=_T1*k$OotHBdbo)R'b2rpibjh0mhZqG),!%0_4)]"5\6L?#P"i]XS`Lg6o=Gn(Tr&GuU%01/dkT?J]Kq)4KDFK=G'1bW)/72R>RdM^.k2A*atA6XA_0'.9#"FFNY4-#o\#6^P@%Cf2:)XWLll'4:_.)O^,pT)ZOQR>6b;g^ZdKCl\,'N:dHDFA[jX,%kCme)CodW-\?=%_R2d[20n0lH"*WGLf0m/.?Rcs"9_&cJ$kJbq4.2?sdV]NcAhJu]t$,J>#dDGT@VK:e:*drgJ6R3K%3+0[][C@mVV3r8U1E?//K?QQLa9b+&,@Ve's.HW4hl^B-dAm"j)"p<`[KaFjB/gm]D2UT+`A.@h\a69sg_)d'[<)7CmYKL6"]bA:P*R#.RHeODE8]B-k80#+V/_5@-"-q5?AbHq32+[og1KYsZOO#U<>bIs3.$p4[c/H,5`DieUMb!E/\*_ZMN);5:b>Ju@8X8H7JNsGatnFDh&-j)`AJiHa&;l5gg80-)f)7.R/".R11L[*%,X+_V$@odK-gW#fklq#ok?l&`uipie84K?PSMpfnVWG^0D`GK+&N'\CuII1!$d;`gY."b$]2sPh@;8ups(LP,(Ul#kX61GARitSr7Ts%=C*nBH(d-[AqHOp7_4>$#+Oa&)1PZDSGhK4rO&@#g:GknSWKY>%QD"r`2ZS%a12ts6p?V2+5ZoB0?=3KSPhX2-/>J!Km>SY*NuVIShqQPeTKs5apI`'\RZU7"i4FjV/T8Ca#>I5_0/%^dO-/0cqD2FkkfgalGQPQF![Q\1IE].,"^,N/[_K>BubqX^(N#)iZfK2u;7+dED,JUK+#[[H.elo:f?1f@!YA]T.C;bPk<4!.,$&Z@a13-t.O*bK_8##aCJao+Rd](,heJTJc0\`XdM;rHuqq:VMGprO?TDRl@9/(NO%%)j+@ZW:C;U\ZFGcPcA!7&OTCK$=mDcLIOQIkMrMc6o>'-B*8pXpW%$V;Wq:qUk&Y@mV%u8Ok5ZCeO^"kP`YGA_7j3"eu6`'[N3Aljf@8qgaBa>T")Y=rnN9[?F/"^ghXZ:DosZDi">ZT'%PY,qTO$^[I"*+DS,7Hs,AZ:Lq5f_JN#L:9tMG=;d,Sghb3B0L#J6rKLGrd,,G+QC/":%kIuE6j0V\MJTt5qrS,9JnVB\)43:.lh_LfmSnZon4V@$R*-mS:EIWJ1'!PH@Jl!`Xpg.%QfXi;YU7M(1+NG2ePGU6CSP,Kl-NsF?TTZNf9^AhPa)8L5bHF45/jbu0+Mu*)!'?%f-J\q/@Ar$_r@Nf-6;CgDB]u!PnX&c_odKU`Z]R5N5-M*MjRKKr>j:"R>G/eC/4_@2-E9XX)b7ik.ch=^.:Jg`F0rk]gQ^kpY:N`.E1&1JOl_6>TnPBoP)8FOOE@/"I>UYV#NQIGe`4+Yni2tY;G\ND-J_sV/1_5[`hpa4aFiKNY\sD*8Qi$,m5X@)iO*La9idI%"93qmYMP\@PK^pD=8lYt"^_RqZ^lHCq8D8&s!f"b'-^e/o0rk`Z4/1Uf'>`gb0CE=:fq2W%80?M_CXRDQm6t_Zdn]C>(?mRNUQ.)4CK"IIY!.NX00]GKB0Ur"n#g3_kfI=8)'jjto*@O7`Sg3pTmMSEE0j@2S4]iqG,I>b^-'L`AeVI9,k&,T%Wt*t#STKAQA)MmL#o!D#AXPqM4dc0#TV,<"oGJCZ5EMO'Wbbg#p0/"O?G-%]cYNA4=Y#jB6\0+j_;W&*3iBnj?MS`9QR[L&@D47qTl"/AL>!`7i?DlA4W#S#WB`mnAt2fY`EK+Z`fGCQr92!J,#\Wm#oaNK6fLIX?nqr4KRM;S]Nn4SlB[Qk0]q4d_K=iq"5#J&`UPQGs=V='FoVM6&:UX&-O.Cb!6Mk#UojA&0dnY"@7j*1(XR36HVP)KEs$EKVeWGa-4>$eJ#lg6-:m5Ut2>GW#\uP(49Y@bXT^!((X#FK!6_Fm?Qt;,MU(gng)1$F^rPMP_1?drNcOkXen4&eobB0E7R-5s*:0-]9j2_QAIi1s>s2rg4(l!o4Vh1DDI`!fQPRSh'Q&50T?@!>.GGM[RGM^bXphj8GoL8&bjqlH"Si1765M3IEjc*jfgb(.'Lrqf"2d50'[d&ctd_4Z?+-tRidMSeYlPK#1\mEbegrA*^Qjs=eq\83BQZHj'N$?_hdgMHU$)n`/;a$nJE0(%iEi[[PHWO]J\0YN\@dW\gEXbX8]^.\?:G7,*_!XV4/pOmH8]d[o4fG\540%W?_pH.phA,-l7>g7W;3A46#5ub@3!*R=$6bZEEaIDO-ZfO>tHk`R3!=Ia7_BaU?U+pZ6h5#Up,8PIqtR.93>tN8+2cb+MqDQ<@<+n.Cs-=V;b#H-bnPF1:T4gaO+@!\7bG6*R/e%,h^La:Ht?KJfG3R$):>1:k6m+:mTZ;[0bqqV*H##*b`RPU'`I)=PVuXZNb);",=.fcabbih615!l"l[okf%_*4<6+(5TZ`l&]%F?O$#fGg(VT7rK8/pa:$j5m/NTYaO91@T3T%E_LpO^3$>Lb4O/$sf`6IfEaf64^iCkEgi_g\KOo@pV/$(9k*RI;FZWYZgn<9='$/(:TPgL+Ho^B2kKZLa?i)7F;9\ffH]iQ+P<:/HqMHm@QF].RVj@Lm+7AZKnHP(^LU(iZ9I4]D,_#B6)9PV:)+IX9#=R!=,X70ZXj6Li8toJH`m\0.DpYSUe2b*]ln2arm"OAJS0u&V60Tpd@?XAEQ1<*K%eg)SZA\3``g\+\lpa4C8'X#"."nG7IOO"R/ZicG*XLK)c&=:^Ku:Y_ZdeA)sEGA`5.7AGl%t%FMaDn-%1@11\mG@8X0b9XIibGrk+Q85+MrJXKWoq%sQna+@+n'3,*"[J+E1\jN\H+qaIMYdJ/UVp3-Q7q-gMDgB-'X;Qt]Wmo^Np,eK;5207?#1\mEbX0H+,*4PgT+r/C]?,N)_nac3j0PDd#Qr`-A&dZ@D=VTsTCt,cG/bZFOS6?S,:^UmaqQ6.q\Bf>aR77K0ULttcc>T+s41)N%'&2g6CkdXpu-kV?FC);O0fi:DLf38p4b)^$D.'E.u+`96QGm]44D=p):'PIVbeL:D)CVJ\DT"Hc2^fF*YcI#b]d'.Z,?@B)fg01;h2rP^OICR.YCdDWb*Dc(_K:V#_EU2AKV7FKTq$1#SBr=JfmHP)$kg*+_;c%6A1SS+:&QW#(dsc8/V\b&0YU-#XX3/JKP41;MJ&&p4"FgV3!G\(pj15S#)E,3Th.\/Y;BEitUVqH9[#MOu0!"7^f49cS()3m_-W^W6:Xp2P"."p9SDL@u@=`[?iH`E9$17Xn(e))tkU#R-A*nIt#5uS?q$YH<>LVkH6%Da(KYC6=""1,0]34S@e%"-%@bZ[$3idk5L*.S,(p%S)Ul]s"LR';n"0nM/Q7f>--Ch9HH_BjG`C'cZ1^E'go.0V[A]pD:V[q7+SA5db`+!`)b"6]TqT_X-KHj:M6_tK7cgo\(&ACq(;CF8ZV?'Gu0]g7]7K\d5`$sEH#CTh8J0GM^>[Ih-!ZR]`CcRBWF\f$\=;?f.V\,We)kP`ZH:tSsI8FQR28_Q6pCK2`rm,\'H4dCc)%-T&7o!kaUFEL.ia\XQo>]7,\lYFPi0b[Eo(e4&n$q(+upVVnBRT0RI/Bh]$r`\%hG!]g"iRGPXJ:eP5*3$\T75H8*U^Wbjh>^VQiJ[TSd/)^tC,<6*7+SA5CFA9ekc$C\AO(Uh(\)/F3G:O^/uUHl1KB":o:H8/ktqYp9T#C8;2'G668kjQ-L>9WKUuWfc'W=/12o^*s(=?ZbK-?!4Gq^UaBI,.mH"0I4U(OoaoD[9?#(mAI"NaLYPrd+ctQk062'LQE[9OJQ751jELSE-r!<5Z25O?O8]FK$9Ig5;&>GUsPKm+\5BT6W]PdW+VUY>u^puLhG$IhJMIoOK8/XC^])T+,(EZsYEOF#/.8ZgM80],$g(IPh3YOS8&%/<[CItJr`J7=]bUHPk[_?h8[AXN#fW"H09(uRD?s(.*J,E_oH1Z'?-t0@hM,N^_lJ;T,#n]j4GJ0NI=O&=@D&%Xr`je?b34SM6'^JekXLTcr8%7GL-fo$"u+E](RB_(#^gh-NDX0oBT:gS<_]"8=endstream +endobj +7 0 obj +<< +/Contents 12 0 R /MediaBox [ 0 0 612 792 ] /Parent 10 0 R /Resources << +/Font 1 0 R /ProcSet [ /PDF /Text /ImageB /ImageC /ImageI ] /XObject << +/FormXob.bd0eb65b2aba982ca2209699f3ec766d 6 0 R +>> +>> /Rotate 0 /Trans << + +>> + /Type /Page +>> +endobj +8 0 obj +<< +/PageMode /UseNone /Pages 10 0 R /Type /Catalog +>> +endobj +9 0 obj +<< +/Author (\(anonymous\)) /CreationDate (D:20251205104950+01'00') /Creator (\(unspecified\)) /Keywords () /ModDate (D:20251205104950+01'00') /Producer (ReportLab PDF Library - www.reportlab.com) + /Subject (\(unspecified\)) /Title (\(anonymous\)) /Trapped /False +>> +endobj +10 0 obj +<< +/Count 2 /Kids [ 5 0 R 7 0 R ] /Type /Pages +>> +endobj +11 0 obj +<< +/Filter [ /ASCII85Decode /FlateDecode ] /Length 1477 +>> +stream +GatU3;01_T&:WeDlpEj%YD:GeAR!G!!H*Y&>Y#17[ZAEDTGHm%Rp>3]mol#;#aTd.j-_UpIf=N*h3GAli8R2Ts'u@R]7:D6#nRH%'bKL]=PU`'dG+DO7YVmLaq9TY*(#K(7,ZFR1QR(HHhAHplQ-\$'0!5`OI$%-2(td`CO^[1ATGPQc2W=iA2Ln?:%+ss`H69a[h#h@i[7k+$&a#8pL!F$N5-`7:ri(bH,-3;q>N,L=pI8;lsLZ,86=@7QjT(8V-l1d_.<>eVp5!2";kDtf-FX\8;^]QndArA2G_m`&'>?ltYION0X"CkS8Kj5ZKR:e2d=X=EB`+,4PV&*l`3M&VisZ+Tp_.4o=)?4?jP>#ucr&mBNDjE=m-IF'M["'E;1JHh=/btFdVE!&_0bH/K5Em'GD>CdbZVshfPdSR)/P9p4X:c$nl2.0rX9`Cr(aZh]0(r9V:#f!!B\1ordGJO-.6Z^on#!d0W>QP[N"JAr!`BI/nCIuL1!14Sm`_A\2ok@aD\3j%Br7!HldEX:DH1:hVA3g2fr&^#']q/TR2TjjFLahFM__PN0da4BTtUA?Eh^O?=9@FKWB/66$q43S1ib^Of%&6b9R8N%%SF1`QoH!ARU\1H/uGE&SZ+fe@=]j^-=`gEZ1a$fd7bO>#0h^kHWt&T*!(T([ndGo:&S8=!0d\DoRACVK2/b26AnhHoVVZ6;l$F1:a'.B$Ja13Fj[DA-Y=EGjHd20\`NK2LiqkY"21+"jF+ds;Di^_GDf#$8(C&186q7<9fq==5hkQ]#"rsD,@nc@/a1U('ShD,$=2#dRT$Nfq.#kkc'rdj7PXZ]uU4[H7m?Ed\i>mfTlT>%;MB!5H#^4C&5USWa4$-ZB-Tb`6$I0)!bD"`7mD*SPB;t#M;1]\k2Pd7Lpn[p67!$=&k?'G/="o"FJ+*UWK=+\TA/L!$].tN61#`g^/`Y^\rk)=b"_lCKNLt:LJ1a1atfrWBN\4@Gq/A7:\$OSG_k4RYS8b5"200bJ[3G*~>endstream +endobj +12 0 obj +<< +/Filter [ /ASCII85Decode /FlateDecode ] /Length 1580 +>> +stream +Gat=,hfG8H&:X@\Yn+C5Zt!#[KL#9tbGBErleJk(gUW*R@$"kZ(gMM8c$t3n5Ti-2-T@V2])&e7Dmd2d7DQkaC&a1e!5GYph0T3V_KlTC3=OI\iF]duYol1b#U,i\_'jA/=a@ctBk5RYE72I!j0i^$;K>p_LK$I["R6$up`GpUY\i[&P`V\@>foEua/Q-[=RRLL&$h/"%&,_!as_^j19+3_"VE`d(YDA`=&:_"e5:KElSl'I%dUTC=mnscm>RSmOK^IO![;Jo&i:c-q[<6'j4J57BY4LO4(\\5:@"T?:f.:ghp%q=<9[eg'i;g1]EEJi[T/G?V]ZecqZdf0Ab),Qq3[BNo$%KW`G#%3'O`NQ2W#b,mi,M6J"L022'.hFBS=jF_enHO@S/Ce9E&q&4GPS3hOs[?j\k&V/;[,/P0P!$<)ADT]_f-MdI,D_ga]#%FWi*Z__DH@?e7l'JI%+2`o^cLebt$K>]#7(ML]/0Va^\;qlrhZFa'O#?nSOdRNquBeT&InS3E'\SgnVX,1>Sf_RDts'f]W=tF3itK%5#$]=U-XoRFJY5q>^qjJIO+6V6cD52q#:10ZUIUWt%#B==e;mEX-gXK\o44IQnY$7P3X_Yf/JLGpX%:b-`V&`\EA$gWoK!OBILNY.Elrc?<5D?tNGp_SVnX.]IQA$E]RuU\prJZd\7dT<7JC.NX&^N-$NAR-c]lGcJ29tg>me9s\lG$J6.?4fCTsg0Ljb^$^aj*V9I'E7h:cj\JD[+l,SNW'7^[`3?I:$B<&]+"DpZ-Vs#7eT1d%p%*cX?F;_-JQ5V8k@C5j@gB7AXIC&MM.)_fFq(d40eBZh-afp(f-\J_*qRR$J%>WjSFG9Yd`d3L#Ki>VB"omTaj6D.&_%g*3_;hOBS6.$<#I*seGGbg3%/MS7eOA$Jh>endstream +endobj +xref +0 13 +0000000000 65535 f +0000000073 00000 n +0000000114 00000 n +0000000221 00000 n +0000000333 00000 n +0000007848 00000 n +0000008106 00000 n +0000015244 00000 n +0000015502 00000 n +0000015571 00000 n +0000015854 00000 n +0000015920 00000 n +0000017489 00000 n +trailer +<< +/ID +[<43e4692dcfea9606b90854ef4511b7c6><43e4692dcfea9606b90854ef4511b7c6>] +% ReportLab generated PDF document -- digest (http://www.reportlab.com) + +/Info 9 0 R +/Root 8 0 R +/Size 13 +>> +startxref +19161 +%%EOF diff --git a/packages/markitdown/tests/test_pdf_tables.py b/packages/markitdown/tests/test_pdf_tables.py new file mode 100644 index 0000000..92a79dd --- /dev/null +++ b/packages/markitdown/tests/test_pdf_tables.py @@ -0,0 +1,871 @@ +#!/usr/bin/env python3 -m pytest +"""Tests for PDF table extraction functionality.""" +import os +import re +import pytest + +from markitdown import MarkItDown + +TEST_FILES_DIR = os.path.join(os.path.dirname(__file__), "test_files") + + +# --- Helper Functions --- +def validate_strings(result, expected_strings, exclude_strings=None): + """Validate presence or absence of specific strings.""" + text_content = result.text_content.replace("\\", "") + for string in expected_strings: + assert string in text_content, f"Expected string not found: {string}" + if exclude_strings: + for string in exclude_strings: + assert string not in text_content, f"Excluded string found: {string}" + + +def validate_markdown_table(result, expected_headers, expected_data_samples): + """Validate that a markdown table exists with expected headers and data.""" + text_content = result.text_content + + # Check for markdown table structure (| header | header |) + assert "|" in text_content, "No markdown table markers found" + + # Check headers are present + for header in expected_headers: + assert header in text_content, f"Expected table header not found: {header}" + + # Check some data values are present + for data in expected_data_samples: + assert data in text_content, f"Expected table data not found: {data}" + + +def extract_markdown_tables(text_content): + """ + Extract all markdown tables from text content. + Returns a list of tables, where each table is a list of rows, + and each row is a list of cell values. + """ + tables = [] + lines = text_content.split("\n") + current_table = [] + in_table = False + + for line in lines: + line = line.strip() + if line.startswith("|") and line.endswith("|"): + # Skip separator rows (contain only dashes and pipes) + if re.match(r"^\|[\s\-|]+\|$", line): + continue + # Parse cells from the row + cells = [cell.strip() for cell in line.split("|")[1:-1]] + current_table.append(cells) + in_table = True + else: + if in_table and current_table: + tables.append(current_table) + current_table = [] + in_table = False + + # Don't forget the last table + if current_table: + tables.append(current_table) + + return tables + + +def validate_table_structure(table): + """ + Validate that a table has consistent structure: + - All rows have the same number of columns + - Has at least a header row and one data row + """ + if not table: + return False, "Table is empty" + + if len(table) < 2: + return False, "Table should have at least header and one data row" + + num_cols = len(table[0]) + if num_cols < 2: + return False, f"Table should have at least 2 columns, found {num_cols}" + + for i, row in enumerate(table): + if len(row) != num_cols: + return False, f"Row {i} has {len(row)} columns, expected {num_cols}" + + return True, "Table structure is valid" + + +class TestPdfTableExtraction: + """Test PDF table extraction with various PDF types.""" + + @pytest.fixture + def markitdown(self): + """Create MarkItDown instance.""" + return MarkItDown() + + def test_borderless_table_extraction(self, markitdown): + """Test extraction of borderless tables from SPARSE inventory PDF. + + Expected output structure: + - Header: INVENTORY RECONCILIATION REPORT with Report ID, Warehouse, Date, Prepared By + - Pipe-separated rows with inventory data + - Text section: Variance Analysis with Summary Statistics + - More pipe-separated rows with extended inventory review + - Footer: Recommendations section + """ + pdf_path = os.path.join( + TEST_FILES_DIR, "SPARSE-2024-INV-1234_borderless_table.pdf" + ) + + if not os.path.exists(pdf_path): + pytest.skip(f"Test file not found: {pdf_path}") + + result = markitdown.convert(pdf_path) + text_content = result.text_content + + # Validate document header content + expected_strings = [ + "INVENTORY RECONCILIATION REPORT", + "Report ID: SPARSE-2024-INV-1234", + "Warehouse: Distribution Center East", + "Report Date: 2024-11-15", + "Prepared By: Sarah Martinez", + ] + validate_strings(result, expected_strings) + + # Validate pipe-separated format is used + assert "|" in text_content, "Should have pipe separators for form-style data" + + # --- Validate First Table Data (Inventory Variance) --- + # Validate table headers are present + first_table_headers = [ + "Product Code", + "Location", + "Expected", + "Actual", + "Variance", + "Status", + ] + for header in first_table_headers: + assert header in text_content, f"Should contain header '{header}'" + + # Validate first table has all expected SKUs + first_table_skus = ["SKU-8847", "SKU-9201", "SKU-4563", "SKU-7728"] + for sku in first_table_skus: + assert sku in text_content, f"Should contain {sku}" + + # Validate first table has correct status values + expected_statuses = ["OK", "CRITICAL"] + for status in expected_statuses: + assert status in text_content, f"Should contain status '{status}'" + + # Validate first table has location codes + expected_locations = ["A-12", "B-07", "C-15", "D-22", "A-08"] + for loc in expected_locations: + assert loc in text_content, f"Should contain location '{loc}'" + + # --- Validate Second Table Data (Extended Inventory Review) --- + # Validate second table headers + second_table_headers = [ + "Category", + "Unit Cost", + "Total Value", + "Last Audit", + "Notes", + ] + for header in second_table_headers: + assert header in text_content, f"Should contain header '{header}'" + + # Validate second table has all expected SKUs (10 products) + second_table_skus = [ + "SKU-8847", + "SKU-9201", + "SKU-4563", + "SKU-7728", + "SKU-3345", + "SKU-5512", + "SKU-6678", + "SKU-7789", + "SKU-2234", + "SKU-1123", + ] + for sku in second_table_skus: + assert sku in text_content, f"Should contain {sku}" + + # Validate second table has categories + expected_categories = ["Electronics", "Hardware", "Software", "Accessories"] + for category in expected_categories: + assert category in text_content, f"Should contain category '{category}'" + + # Validate second table has cost values (spot check) + expected_costs = ["$45.00", "$32.50", "$120.00", "$15.75"] + for cost in expected_costs: + assert cost in text_content, f"Should contain cost '{cost}'" + + # Validate second table has note values + expected_notes = ["Verified", "Critical", "Pending"] + for note in expected_notes: + assert note in text_content, f"Should contain note '{note}'" + + # --- Validate Analysis Text Section --- + analysis_strings = [ + "Variance Analysis:", + "Summary Statistics:", + "Total Variance Cost: $4,287.50", + "Critical Items: 1", + "Overall Accuracy: 97.2%", + "Recommendations:", + ] + validate_strings(result, analysis_strings) + + # --- Validate Document Structure Order --- + # Verify sections appear in correct order + # Note: Using flexible patterns since column merging may occur based on gap detection + import re + + header_pos = text_content.find("INVENTORY RECONCILIATION REPORT") + # Look for Product Code header - may be in same column as Location or separate + first_table_match = re.search(r"\|\s*Product Code", text_content) + variance_pos = text_content.find("Variance Analysis:") + extended_review_pos = text_content.find("Extended Inventory Review:") + # Second table - look for SKU entries after extended review section + # The table may not have pipes on every row due to paragraph detection + second_table_pos = -1 + if extended_review_pos != -1: + # Look for either "| Product Code" or "Product Code" as table header + second_table_match = re.search( + r"Product Code.*Category", text_content[extended_review_pos:] + ) + if second_table_match: + # Adjust position to be relative to full text + second_table_pos = extended_review_pos + second_table_match.start() + recommendations_pos = text_content.find("Recommendations:") + + positions = { + "header": header_pos, + "first_table": first_table_match.start() if first_table_match else -1, + "variance_analysis": variance_pos, + "extended_review": extended_review_pos, + "second_table": second_table_pos, + "recommendations": recommendations_pos, + } + + # All sections should be found + for name, pos in positions.items(): + assert pos != -1, f"Section '{name}' not found in output" + + # Verify correct order + assert ( + positions["header"] < positions["first_table"] + ), "Header should come before first table" + assert ( + positions["first_table"] < positions["variance_analysis"] + ), "First table should come before Variance Analysis" + assert ( + positions["variance_analysis"] < positions["extended_review"] + ), "Variance Analysis should come before Extended Review" + assert ( + positions["extended_review"] < positions["second_table"] + ), "Extended Review should come before second table" + assert ( + positions["second_table"] < positions["recommendations"] + ), "Second table should come before Recommendations" + + def test_borderless_table_no_duplication(self, markitdown): + """Test that borderless table content is not duplicated excessively.""" + pdf_path = os.path.join( + TEST_FILES_DIR, "SPARSE-2024-INV-1234_borderless_table.pdf" + ) + + if not os.path.exists(pdf_path): + pytest.skip(f"Test file not found: {pdf_path}") + + result = markitdown.convert(pdf_path) + text_content = result.text_content + + # Count occurrences of unique table data - should not be excessively duplicated + # SKU-8847 appears in both tables, plus possibly once in summary text + sku_count = text_content.count("SKU-8847") + # Should appear at most 4 times (2 tables + minor text references), not more + assert ( + sku_count <= 4 + ), f"SKU-8847 appears too many times ({sku_count}), suggests duplication issue" + + def test_borderless_table_correct_position(self, markitdown): + """Test that tables appear in correct positions relative to text.""" + pdf_path = os.path.join( + TEST_FILES_DIR, "SPARSE-2024-INV-1234_borderless_table.pdf" + ) + + if not os.path.exists(pdf_path): + pytest.skip(f"Test file not found: {pdf_path}") + + result = markitdown.convert(pdf_path) + text_content = result.text_content + + # Verify content order - header should come before table content, which should come before analysis + header_pos = text_content.find("Prepared By: Sarah Martinez") + # Look for Product Code in any pipe-separated format + product_code_pos = text_content.find("Product Code") + variance_pos = text_content.find("Variance Analysis:") + + assert header_pos != -1, "Header should be found" + assert product_code_pos != -1, "Product Code should be found" + assert variance_pos != -1, "Variance Analysis should be found" + + assert ( + header_pos < product_code_pos < variance_pos + ), "Product data should appear between header and Variance Analysis" + + # Second table content should appear after "Extended Inventory Review" + extended_review_pos = text_content.find("Extended Inventory Review:") + # Look for Category header which is in second table + category_pos = text_content.find("Category") + recommendations_pos = text_content.find("Recommendations:") + + if ( + extended_review_pos != -1 + and category_pos != -1 + and recommendations_pos != -1 + ): + # Find Category position after Extended Inventory Review + category_after_review = text_content.find("Category", extended_review_pos) + if category_after_review != -1: + assert ( + extended_review_pos < category_after_review < recommendations_pos + ), "Extended review table should appear between Extended Inventory Review and Recommendations" + + def test_receipt_pdf_extraction(self, markitdown): + """Test extraction of receipt PDF (no tables, formatted text). + + Expected output structure: + - Store header: TECHMART ELECTRONICS with address + - Transaction info: Store #, date, TXN, Cashier, Register + - Line items: 6 products with prices and member discounts + - Totals: Subtotal, Member Discount, Sales Tax, Rewards, TOTAL + - Payment info: Visa Card, Auth, Ref + - Rewards member info: Name, ID, Points + - Return policy and footer + """ + pdf_path = os.path.join( + TEST_FILES_DIR, "RECEIPT-2024-TXN-98765_retail_purchase.pdf" + ) + + if not os.path.exists(pdf_path): + pytest.skip(f"Test file not found: {pdf_path}") + + result = markitdown.convert(pdf_path) + text_content = result.text_content + + # --- Validate Store Header --- + store_header = [ + "TECHMART ELECTRONICS", + "4567 Innovation Blvd", + "San Francisco, CA 94103", + "(415) 555-0199", + ] + validate_strings(result, store_header) + + # --- Validate Transaction Info --- + transaction_info = [ + "Store #0342 - Downtown SF", + "11/23/2024", + "TXN: TXN-98765-2024", + "Cashier: Emily Rodriguez", + "Register: POS-07", + ] + validate_strings(result, transaction_info) + + # --- Validate Line Items (6 products) --- + line_items = [ + # Product 1: Headphones + "Wireless Noise-Cancelling", + "Headphones - Premium Black", + "AUDIO-5521", + "$349.99", + "$299.99", + # Product 2: USB-C Hub + "USB-C Hub 7-in-1 Adapter", + "ACC-8834", + "$79.99", + "$159.98", + # Product 3: Portable SSD + "Portable SSD 2TB", + "STOR-2241", + "$289.00", + "$260.00", + # Product 4: Wireless Mouse + "Ergonomic Wireless Mouse", + "ACC-9012", + "$59.99", + # Product 5: Screen Cleaning Kit + "Screen Cleaning Kit", + "CARE-1156", + "$12.99", + "$38.97", + # Product 6: HDMI Cable + "HDMI 2.1 Cable 6ft", + "CABLE-7789", + "$24.99", + "$44.98", + ] + validate_strings(result, line_items) + + # --- Validate Totals --- + totals = [ + "SUBTOTAL", + "$863.91", + "Member Discount", + "Sales Tax (8.5%)", + "$66.23", + "Rewards Applied", + "-$25.00", + "TOTAL", + "$821.14", + ] + validate_strings(result, totals) + + # --- Validate Payment Info --- + payment_info = [ + "PAYMENT METHOD", + "Visa Card ending in 4782", + "Auth: 847392", + "REF-20241123-98765", + ] + validate_strings(result, payment_info) + + # --- Validate Rewards Member Info --- + rewards_info = [ + "REWARDS MEMBER", + "Sarah Mitchell", + "ID: TM-447821", + "Points Earned: 821", + "Total Points: 3,247", + ] + validate_strings(result, rewards_info) + + # --- Validate Return Policy & Footer --- + footer_info = [ + "RETURN POLICY", + "Returns within 30 days", + "Receipt required", + "Thank you for shopping!", + "www.techmart.example.com", + ] + validate_strings(result, footer_info) + + # --- Validate Document Structure Order --- + positions = { + "store_header": text_content.find("TECHMART ELECTRONICS"), + "transaction": text_content.find("TXN: TXN-98765-2024"), + "first_item": text_content.find("Wireless Noise-Cancelling"), + "subtotal": text_content.find("SUBTOTAL"), + "total": text_content.find("TOTAL"), + "payment": text_content.find("PAYMENT METHOD"), + "rewards": text_content.find("REWARDS MEMBER"), + "return_policy": text_content.find("RETURN POLICY"), + } + + # All sections should be found + for name, pos in positions.items(): + assert pos != -1, f"Section '{name}' not found in output" + + # Verify correct order + assert ( + positions["store_header"] < positions["transaction"] + ), "Store header should come before transaction" + assert ( + positions["transaction"] < positions["first_item"] + ), "Transaction should come before items" + assert ( + positions["first_item"] < positions["subtotal"] + ), "Items should come before subtotal" + assert ( + positions["subtotal"] < positions["total"] + ), "Subtotal should come before total" + assert ( + positions["total"] < positions["payment"] + ), "Total should come before payment" + assert ( + positions["payment"] < positions["rewards"] + ), "Payment should come before rewards" + assert ( + positions["rewards"] < positions["return_policy"] + ), "Rewards should come before return policy" + + def test_multipage_invoice_extraction(self, markitdown): + """Test extraction of multipage invoice PDF with form-style layout. + + Expected output: Pipe-separated format with clear cell boundaries. + Form data should be extracted with pipes indicating column separations. + """ + pdf_path = os.path.join(TEST_FILES_DIR, "REPAIR-2022-INV-001_multipage.pdf") + + if not os.path.exists(pdf_path): + pytest.skip(f"Test file not found: {pdf_path}") + + result = markitdown.convert(pdf_path) + text_content = result.text_content + + # Validate basic content is extracted + expected_strings = [ + "ZAVA AUTO REPAIR", + "Collision Repair", + "Redmond, WA", + "Gabriel Diaz", + "Jeep", + "Grand Cherokee", + "Parts", + "Body Labor", + "Paint Labor", + "GRAND TOTAL", + # Second page content + "Bruce Wayne", + "Batmobile", + ] + validate_strings(result, expected_strings) + + # Validate pipe-separated table format + # Form-style documents should use pipes to separate cells + assert "|" in text_content, "Form-style PDF should contain pipe separators" + + # Validate key form fields are properly separated + # These patterns check that label and value are in separate cells + # Note: cells may have padding spaces for column alignment + import re + + assert re.search( + r"\| Insured name\s*\|", text_content + ), "Insured name should be in its own cell" + assert re.search( + r"\| Gabriel Diaz\s*\|", text_content + ), "Gabriel Diaz should be in its own cell" + assert re.search( + r"\| Year\s*\|", text_content + ), "Year label should be in its own cell" + assert re.search( + r"\| 2022\s*\|", text_content + ), "Year value should be in its own cell" + + # Validate table structure for estimate totals + assert ( + re.search(r"\| Hours\s*\|", text_content) or "Hours |" in text_content + ), "Hours column header should be present" + assert ( + re.search(r"\| Rate\s*\|", text_content) or "Rate |" in text_content + ), "Rate column header should be present" + assert ( + re.search(r"\| Cost\s*\|", text_content) or "Cost |" in text_content + ), "Cost column header should be present" + + # Validate numeric values are extracted + assert "2,100" in text_content, "Parts cost should be extracted" + assert "300" in text_content, "Body labor cost should be extracted" + assert "225" in text_content, "Paint labor cost should be extracted" + assert "5,738" in text_content, "Grand total should be extracted" + + # Validate second page content (Bruce Wayne invoice) + assert "Bruce Wayne" in text_content, "Second page customer name" + assert "Batmobile" in text_content, "Second page vehicle model" + assert "211,522" in text_content, "Second page grand total" + + # Validate disclaimer text is NOT in table format (long paragraph) + # The disclaimer should be extracted as plain text, not pipe-separated + assert ( + "preliminary estimate" in text_content.lower() + ), "Disclaimer text should be present" + + def test_academic_pdf_extraction(self, markitdown): + """Test extraction of academic paper PDF (scientific document). + + Expected output: Plain text without tables or pipe characters. + Scientific documents should be extracted as flowing text with proper spacing, + not misinterpreted as tables. + """ + pdf_path = os.path.join(TEST_FILES_DIR, "test.pdf") + + if not os.path.exists(pdf_path): + pytest.skip(f"Test file not found: {pdf_path}") + + result = markitdown.convert(pdf_path) + text_content = result.text_content + + # Validate academic paper content with proper spacing + expected_strings = [ + "Introduction", + "Large language models", # Should have proper spacing, not "Largelanguagemodels" + "agents", + "multi-agent", # Should be properly hyphenated + ] + validate_strings(result, expected_strings) + + # Validate proper text formatting (words separated by spaces) + assert "LLMs" in text_content, "Should contain 'LLMs' acronym" + assert "reasoning" in text_content, "Should contain 'reasoning'" + assert "observations" in text_content, "Should contain 'observations'" + + # Ensure content is not empty and has proper length + assert len(text_content) > 1000, "Academic PDF should have substantial content" + + # Scientific documents should NOT have tables or pipe characters + assert ( + "|" not in text_content + ), "Scientific document should not contain pipe characters (no tables)" + + # Verify no markdown tables were extracted + tables = extract_markdown_tables(text_content) + assert ( + len(tables) == 0 + ), f"Scientific document should have no tables, found {len(tables)}" + + # Verify text is properly formatted with spaces between words + # Check that common phrases are NOT joined together (which would indicate bad extraction) + assert ( + "Largelanguagemodels" not in text_content + ), "Text should have proper spacing, not joined words" + assert ( + "multiagentconversations" not in text_content.lower() + ), "Text should have proper spacing between words" + + def test_scanned_pdf_handling(self, markitdown): + """Test handling of scanned/image-based PDF (no text layer). + + Expected output: Empty - scanned PDFs without OCR have no text layer. + """ + pdf_path = os.path.join( + TEST_FILES_DIR, "MEDRPT-2024-PAT-3847_medical_report_scan.pdf" + ) + + if not os.path.exists(pdf_path): + pytest.skip(f"Test file not found: {pdf_path}") + + result = markitdown.convert(pdf_path) + + # Scanned PDFs without OCR have no text layer, so extraction should be empty + assert ( + result is not None + ), "Converter should return a result even for scanned PDFs" + assert result.text_content is not None, "text_content should not be None" + + # Verify extraction is empty (no text layer in scanned PDF) + assert ( + result.text_content.strip() == "" + ), f"Scanned PDF should have empty extraction, got: '{result.text_content[:100]}...'" + + +class TestPdfTableMarkdownFormat: + """Test that extracted tables have proper markdown formatting.""" + + @pytest.fixture + def markitdown(self): + """Create MarkItDown instance.""" + return MarkItDown() + + def test_markdown_table_has_pipe_format(self, markitdown): + """Test that form-style PDFs have pipe-separated format.""" + pdf_path = os.path.join( + TEST_FILES_DIR, "SPARSE-2024-INV-1234_borderless_table.pdf" + ) + + if not os.path.exists(pdf_path): + pytest.skip(f"Test file not found: {pdf_path}") + + result = markitdown.convert(pdf_path) + text_content = result.text_content + + # Find rows with pipes + lines = text_content.split("\n") + pipe_rows = [ + line for line in lines if line.startswith("|") and line.endswith("|") + ] + + assert len(pipe_rows) > 0, "Should have pipe-separated rows" + + # Check that Product Code appears in a pipe-separated row + product_code_found = any("Product Code" in row for row in pipe_rows) + assert product_code_found, "Product Code should be in pipe-separated format" + + def test_markdown_table_columns_have_pipes(self, markitdown): + """Test that form-style PDF columns are separated with pipes.""" + pdf_path = os.path.join( + TEST_FILES_DIR, "SPARSE-2024-INV-1234_borderless_table.pdf" + ) + + if not os.path.exists(pdf_path): + pytest.skip(f"Test file not found: {pdf_path}") + + result = markitdown.convert(pdf_path) + text_content = result.text_content + + # Find table rows and verify column structure + lines = text_content.split("\n") + table_rows = [ + line for line in lines if line.startswith("|") and line.endswith("|") + ] + + assert len(table_rows) > 0, "Should have markdown table rows" + + # Check that at least some rows have multiple columns (pipes) + multi_col_rows = [row for row in table_rows if row.count("|") >= 3] + assert ( + len(multi_col_rows) > 5 + ), f"Should have rows with multiple columns, found {len(multi_col_rows)}" + + +class TestPdfTableStructureConsistency: + """Test that extracted tables have consistent structure across all PDF types.""" + + @pytest.fixture + def markitdown(self): + """Create MarkItDown instance.""" + return MarkItDown() + + def test_borderless_table_structure(self, markitdown): + """Test that borderless table PDF has pipe-separated structure.""" + pdf_path = os.path.join( + TEST_FILES_DIR, "SPARSE-2024-INV-1234_borderless_table.pdf" + ) + + if not os.path.exists(pdf_path): + pytest.skip(f"Test file not found: {pdf_path}") + + result = markitdown.convert(pdf_path) + text_content = result.text_content + + # Should have pipe-separated content + assert "|" in text_content, "Borderless table PDF should have pipe separators" + + # Check that key content is present + assert "Product Code" in text_content, "Should contain Product Code" + assert "SKU-8847" in text_content, "Should contain first SKU" + assert "SKU-9201" in text_content, "Should contain second SKU" + + def test_multipage_invoice_table_structure(self, markitdown): + """Test that multipage invoice PDF has pipe-separated format.""" + pdf_path = os.path.join(TEST_FILES_DIR, "REPAIR-2022-INV-001_multipage.pdf") + + if not os.path.exists(pdf_path): + pytest.skip(f"Test file not found: {pdf_path}") + + result = markitdown.convert(pdf_path) + text_content = result.text_content + + # Should have pipe-separated content + assert "|" in text_content, "Invoice PDF should have pipe separators" + + # Find rows with pipes + lines = text_content.split("\n") + pipe_rows = [ + line for line in lines if line.startswith("|") and line.endswith("|") + ] + + assert ( + len(pipe_rows) > 10 + ), f"Should have multiple pipe-separated rows, found {len(pipe_rows)}" + + # Check that some rows have multiple columns + multi_col_rows = [row for row in pipe_rows if row.count("|") >= 4] + assert len(multi_col_rows) > 5, "Should have rows with 3+ columns" + + def test_receipt_has_no_tables(self, markitdown): + """Test that receipt PDF doesn't incorrectly extract tables from formatted text.""" + pdf_path = os.path.join( + TEST_FILES_DIR, "RECEIPT-2024-TXN-98765_retail_purchase.pdf" + ) + + if not os.path.exists(pdf_path): + pytest.skip(f"Test file not found: {pdf_path}") + + result = markitdown.convert(pdf_path) + tables = extract_markdown_tables(result.text_content) + + # Receipt should not have markdown tables extracted + # (it's formatted text, not tabular data) + # If tables are extracted, they should be minimal/empty + total_table_rows = sum(len(t) for t in tables) + assert ( + total_table_rows < 5 + ), f"Receipt should not have significant tables, found {total_table_rows} rows" + + def test_scanned_pdf_no_tables(self, markitdown): + """Test that scanned PDF has empty extraction and no tables.""" + pdf_path = os.path.join( + TEST_FILES_DIR, "MEDRPT-2024-PAT-3847_medical_report_scan.pdf" + ) + + if not os.path.exists(pdf_path): + pytest.skip(f"Test file not found: {pdf_path}") + + result = markitdown.convert(pdf_path) + + # Scanned PDF with no text layer should have empty extraction + assert ( + result.text_content.strip() == "" + ), "Scanned PDF should have empty extraction" + + tables = extract_markdown_tables(result.text_content) + + # Scanned PDF with no text layer should have no tables + assert len(tables) == 0, "Scanned PDF should have no extracted tables" + + def test_all_pdfs_table_rows_consistent(self, markitdown): + """Test that all PDF tables have rows with pipe-separated content. + + Note: With gap-based column detection, rows may have different column counts + depending on how content is spaced in the PDF. What's important is that each + row has pipe separators and the content is readable. + """ + pdf_files = [ + "SPARSE-2024-INV-1234_borderless_table.pdf", + "REPAIR-2022-INV-001_multipage.pdf", + "RECEIPT-2024-TXN-98765_retail_purchase.pdf", + "test.pdf", + ] + + for pdf_file in pdf_files: + pdf_path = os.path.join(TEST_FILES_DIR, pdf_file) + if not os.path.exists(pdf_path): + continue + + result = markitdown.convert(pdf_path) + tables = extract_markdown_tables(result.text_content) + + for table_idx, table in enumerate(tables): + if not table: + continue + + # Verify each row has at least one column (pipe-separated content) + for row_idx, row in enumerate(table): + assert ( + len(row) >= 1 + ), f"{pdf_file}: Table {table_idx}, row {row_idx} has no columns" + + # Verify the row has non-empty content + row_content = " ".join(cell.strip() for cell in row) + assert ( + len(row_content.strip()) > 0 + ), f"{pdf_file}: Table {table_idx}, row {row_idx} is empty" + + def test_borderless_table_data_integrity(self, markitdown): + """Test that borderless table extraction preserves data integrity.""" + pdf_path = os.path.join( + TEST_FILES_DIR, "SPARSE-2024-INV-1234_borderless_table.pdf" + ) + + if not os.path.exists(pdf_path): + pytest.skip(f"Test file not found: {pdf_path}") + + result = markitdown.convert(pdf_path) + tables = extract_markdown_tables(result.text_content) + + assert len(tables) >= 2, "Should have at least 2 tables" + + # Check first table has expected SKU data + first_table = tables[0] + table_text = str(first_table) + assert "SKU-8847" in table_text, "First table should contain SKU-8847" + assert "SKU-9201" in table_text, "First table should contain SKU-9201" + + # Check second table has expected category data + second_table = tables[1] + table_text = str(second_table) + assert "Electronics" in table_text, "Second table should contain Electronics" + assert "Hardware" in table_text, "Second table should contain Hardware"