[MS] Extend table support for wide tables (#1552)

* feat: enhance PDF table extraction to support complex forms and add new test cases
* feat: enhance PDF table extraction with adaptive column clustering and add comprehensive test cases
* fix: correct formatting and improve assertions in PDF table tests
This commit is contained in:
lesyk
2026-02-13 19:45:39 +01:00
committed by GitHub
parent 7fdaefb724
commit c83de14a9c
11 changed files with 784 additions and 5 deletions
+3
View File
@@ -1,2 +1,5 @@
packages/markitdown/tests/test_files/** linguist-vendored
packages/markitdown-sample-plugin/tests/test_files/** linguist-vendored
# Treat PDF files as binary to prevent line ending conversion
*.pdf binary
@@ -1,4 +1,4 @@
# SPDX-FileCopyrightText: 2024-present Adam Fourney <adamfo@microsoft.com>
#
# SPDX-License-Identifier: MIT
__version__ = "0.1.5b1"
__version__ = "0.1.5b2"
@@ -198,15 +198,62 @@ def _extract_form_content_from_words(page: Any) -> str | None:
if not all_table_x_positions:
return None
# Compute global column boundaries
# Compute adaptive column clustering tolerance based on gap analysis
all_table_x_positions.sort()
# Calculate gaps between consecutive x-positions
gaps = []
for i in range(len(all_table_x_positions) - 1):
gap = all_table_x_positions[i + 1] - all_table_x_positions[i]
if gap > 5: # Only significant gaps
gaps.append(gap)
# Determine optimal tolerance using statistical analysis
if gaps and len(gaps) >= 3:
# Use 70th percentile of gaps as threshold (balances precision/recall)
sorted_gaps = sorted(gaps)
percentile_70_idx = int(len(sorted_gaps) * 0.70)
adaptive_tolerance = sorted_gaps[percentile_70_idx]
# Clamp tolerance to reasonable range [25, 50]
adaptive_tolerance = max(25, min(50, adaptive_tolerance))
else:
# Fallback to conservative value
adaptive_tolerance = 35
# Compute global column boundaries using adaptive tolerance
global_columns: list[float] = []
for x in all_table_x_positions:
if not global_columns or x - global_columns[-1] > 30:
if not global_columns or x - global_columns[-1] > adaptive_tolerance:
global_columns.append(x)
# Too many columns suggests dense text, not a form
if len(global_columns) > 8:
# Adaptive max column check based on page characteristics
# Calculate average column width
if len(global_columns) > 1:
content_width = global_columns[-1] - global_columns[0]
avg_col_width = content_width / len(global_columns)
# Forms with very narrow columns (< 30px) are likely dense text
if avg_col_width < 30:
return None
# Compute adaptive max based on columns per inch
# Typical forms have 3-8 columns per inch
columns_per_inch = len(global_columns) / (content_width / 72)
# If density is too high (> 10 cols/inch), likely not a form
if columns_per_inch > 10:
return None
# Adaptive max: allow more columns for wider pages
# Standard letter is 612pt wide, so scale accordingly
adaptive_max_columns = int(20 * (page_width / 612))
adaptive_max_columns = max(15, adaptive_max_columns) # At least 15
if len(global_columns) > adaptive_max_columns:
return None
else:
# Single column, not a form
return None
# Now classify each row as table row or not
@@ -0,0 +1,81 @@
TECHMART ELECTRONICS
4567 Innovation Blvd
San Francisco, CA 94103
(415) 555-0199
===================================
Store #0342 - Downtown SF
11/23/2024 14:32:18 PST
TXN: TXN-98765-2024
Cashier: Emily Rodriguez
Register: POS-07
-----------------------------------
Wireless Noise-Cancelling
Headphones - Premium Black
AUDIO-5521 1 @ $349.99
Member Discount $-50.00
$299.99
USB-C Hub 7-in-1 Adapter
with HDMI & Ethernet
ACC-8834 2 @ $79.99
$159.98
Portable SSD 2TB
Thunderbolt 3 Compatible
STOR-2241 1 @ $289.00
Member Discount $-29.00
$260.00
Ergonomic Wireless Mouse
Rechargeable Battery
ACC-9012 1 @ $59.99
$59.99
Screen Cleaning Kit
Professional Grade
CARE-1156 3 @ $12.99
$38.97
HDMI 2.1 Cable 6ft
8K Resolution Support
CABLE-7789 2 @ $24.99
Member Discount $-5.00
$44.98
-----------------------------------
SUBTOTAL $863.91
Member Discount (15%)-$84.00
Sales Tax (8.5%) $66.23
Rewards Applied -$25.00
===================================
TOTAL $821.14
===================================
PAYMENT METHOD
Visa Card ending in 4782
Auth: 847392
Ref: REF-20241123-98765
-----------------------------------
REWARDS MEMBER
Sarah Mitchell
ID: TM-447821
Points Earned: 821
Total Points: 3,247
Next Reward: $50 gift card
at 5,000 pts (1,753 to go)
-----------------------------------
RETURN POLICY
Returns within 30 days
Receipt required
Electronics must be unopened
*TXN98765202411231432*
Thank you for shopping!
www.techmart.example.com
===================================
@@ -0,0 +1,76 @@
ZAVA AUTO REPAIR
Certified Collision Repair
123 Main Street, Redmond, WA 98052
Phone: (425) 000-0000
Preliminary Estimate (ID: EST-1008)
| Customer Information | | | Vehicle Information | |
| -------------------- | ------------------- | --- | ------------------- | ----------------- |
| Insured name | Gabriel Diaz | | Year | 2022 |
| Claim # | SF-1008 | | Make | Jeep |
| Policy # | POL-2022-555 | | Model | Grand Cherokee |
| Phone | (425) 111-1111 | | Trim | Limited |
| Email | gabriel@contoso.com | | VIN | 1C4RJFBG2NC123456 |
| | | | Color | White |
| | | | Odometer | 9,800 |
| Repair Order # | RO-20221108 | | Estimator | Ellis Turner |
Estimate Totals
| | | Hours | Rate | Cost |
| ---------------- | --- | ----- | ---- | ----- |
| Parts | | | | 2,100 |
| Body Labor | | 2 | 150 | 300 |
| Paint Labor | | 1.5 | 150 | 225 |
| Mechanical Labor | | - | - | - |
Supplies
| | Paint Supplies | | | 60 |
| ------------- | ------------------------ | --- | ------ | ------ |
| | Body Supplies | | | 30 |
| Other Charges | | | | 15 |
| Subtotal | | | | 2,730 |
| Sales Tax | | | 10.20% | 278.46 |
| GRAND TOTAL | | | | 5,738 |
| Note | Minor rear bumper repair | | | |
This is a preliminary estimate for the visible damage of the vehicle. Additional damage / repairs / parts may be found
after the vehicle has been disassembled and damaged parts have been removed. Suspension damages may be
present, but can not be determined until an alignment on the vehicle has been done. Parts Prices may vary due to
models and vehicle maker price updates. Please be advised if vehicle owner elects to have vehicle sent to service for
any mechanical concerns, ALL service departments charge a vehicle diagnostic charge. If the mechanical concern is
deemed not related to an insurance claim, vehicle owner will be reponsible for charges.
ZAVA AUTO REPAIR
Certified Collision Repair
123 Main Street, Redmond, WA 98052
Phone: (425) 000-0000
Preliminary Estimate (ID: EST-1008)
Customer Information Vehicle Information
| Insured name | Bruce Wayne | | Year | 2025 |
| -------------- | -------------------------- | --- | --------- | ------------ |
| Claim # | | 999 | Make | Batman |
| Policy # | IM-BATMAN | | Model | Batmobile |
| Phone | (416) 555-1234 | | Trim | Limited |
| Email | batman@wayneindustries.com | | VIN | XXX |
| | | | Color | Black |
| | | | Odometer | 1 |
| Repair Order # | RO-20221108 | | Estimator | Ellis Turner |
Estimate Totals
| | | Hours | Rate | Cost |
| ---------------- | --- | ----- | ---- | ------ |
| Parts | | | | 99,999 |
| Body Labor | | 2 | 150 | 300 |
| Paint Labor | | 1.5 | 150 | 225 |
| Mechanical Labor | | - | - | - |
Supplies
| | Paint Supplies | | | 60 |
| ------------- | ------------------------ | --- | ------ | --------- |
| | Body Supplies | | | 30 |
| Other Charges | | | | 15 |
| Subtotal | | | | 100,629 |
| Sales Tax | | | 10.20% | 10264.158 |
| GRAND TOTAL | | | | 211,522 |
| Note | Minor rear bumper repair | | | |
This is a preliminary estimate for the visible damage of the vehicle. Additional damage / repairs / parts may be found
after the vehicle has been disassembled and damaged parts have been removed. Suspension damages may be
present, but can not be determined until an alignment on the vehicle has been done. Parts Prices may vary due to
models and vehicle maker price updates. Please be advised if vehicle owner elects to have vehicle sent to service for
any mechanical concerns, ALL service departments charge a vehicle diagnostic charge. If the mechanical concern is
deemed not related to an insurance claim, vehicle owner will be reponsible for charges.
@@ -0,0 +1,44 @@
INVENTORY RECONCILIATION REPORT
Report ID: SPARSE-2024-INV-1234
Warehouse: Distribution Center East
Report Date: 2024-11-15
Prepared By: Sarah Martinez
| Product Code | Location | Expected | Actual | Variance | Status |
| ------------ | -------- | -------- | ------ | -------- | -------- |
| SKU-8847 | A-12 | 450 | | | |
| | B-07 | | 289 | -23 | |
| SKU-9201 | | 780 | 778 | | OK |
| | C-15 | | | +15 | |
| SKU-4563 | D-22 | | 156 | | CRITICAL |
| | | 180 | | -24 | |
| SKU-7728 | A-08 | 920 | | | |
| | | | 935 | +15 | OK |
Variance Analysis:
Summary Statistics:
Total Variance Cost: $4,287.50
Critical Items: 1
Overall Accuracy: 97.2%
Detailed Analysis by Category:
The inventory reconciliation reveals several key findings. The primary variance driver is SKU-4563,
which shows a -24 unit discrepancy requiring immediate investigation. Location B-07 handling of
SKU-8847 also demonstrates significant variance. Cross-location verification protocols should be
reviewed to prevent future discrepancies. The overall accuracy rate of 97.2% meets our target
threshold, but critical items require expedited resolution to maintain operational efficiency.
Extended Inventory Review:
| Product Code | Category | Unit Cost | Total Value | Last Audit | Notes |
| ------------ | ----------- | --------- | ----------- | ---------- | ---------- |
| SKU-8847 | Electronics | $45.00 | $13,005.00 | 2024-10-15 | |
| SKU-9201 | Hardware | $32.50 | $25,285.00 | 2024-10-22 | Verified |
| SKU-4563 | Software | $120.00 | $18,720.00 | | Critical |
| SKU-7728 | Accessories | $15.75 | $14,726.25 | 2024-11-01 | |
| SKU-3345 | Electronics | $67.00 | $22,445.00 | 2024-10-18 | |
| SKU-5512 | Hardware | $89.00 | $31,150.00 | | Pending |
| SKU-6678 | Software | $200.00 | $42,000.00 | 2024-10-25 | High Value |
| SKU-7789 | Accessories | $8.50 | $5,950.00 | 2024-11-05 | |
| SKU-2234 | Electronics | $125.00 | $35,000.00 | | |
| SKU-1123 | Hardware | $55.00 | $27,500.00 | 2024-10-30 | Verified |
Recommendations:
1. Immediate review of SKU-4563 handling procedures. 2. Implement additional verification for critical
items. 3. Schedule follow-up audit for high-value products (SKU-6678, SKU-2234).
Approval:
@@ -0,0 +1,62 @@
BOOKING ORDER
Print Date 12/15/2024 14:30:22
Page 1 of 1
STARLIGHT CINEMAS
Orders
| Order / Rev: | 2024-12-5678 | | | Cinema: | | Downtown Multiplex |
| ------------ | -------------- | --- | --- | ---------------- | --- | ------------------ |
| Alt Order #: | SC-WINTER-2024 | | | Primary Contact: | | Sarah Johnson |
Product Desc: Holiday Movie Marathon Package Location: NYC-01
| Estimate: | EST-456 | | | Region: | | NORTHEAST |
| -------------------- | ----------------------- | --- | --- | ------- | --- | --------- |
| Booking Dates: | 12/20/2024 - 12/31/2024 | | | | | |
| Original Date / Rev: | 12/01/24 / 12/10/24 | | | | | |
| Order Type: | Premium Package | | | | | |
Booking Agency
| Name: | Premier Entertainment Group | | | | | |
| ---------------- | --------------------------- | --- | --- | -------------- | --- | --------- |
| | | | | Billing Type: | | Net 30 |
| Contact: | Michael Chen | | | | | |
| | | | | Payment Terms: | | Corporate |
| Billing Contact: | accounting@premierent.com | | | | | |
| | | | | Commission: | | 10% |
555 Broadway Suite 1200
New York, NY 10012
Customer
| Name: | Universal Studios Distribution | | | | | |
| -------------- | ------------------------------ | --- | --- | --- | --- | --- |
| Category: | Film Distributor | | | | | |
| Contact Email: | bookings@universalstudios.com | | | | | |
| Customer ID: | CUST-98765 | | | | | |
| Revenue Code: | FILM-PREMIUM | | | | | |
Booking Summary
| Start Date | End Date | # Shows | Gross Amount | Net Amount | | |
| ---------- | -------- | ------- | ------------ | ---------- | --- | --- |
| 12/20/24 | 12/31/24 | 48 | $12,500.00 | $11,250.00 | | |
Totals
| Month | # Shows | Gross Amount | | Net Amount | | Occupancy |
| ------------- | ------- | ------------ | --- | ---------- | --- | --------- |
| December 2024 | 48 | $12,500.00 | | $11,250.00 | | 85% |
| Totals | 48 | $12,500.00 | | $11,250.00 | | 85% |
Account Representatives
Representative Territory Region Start Date / End Date Commission %
| Sarah Johnson | NYC Metro | NORTHEAST | 12/20/24 - 12/31/24 | | 100% | |
| ------------- | --------- | --------- | ------------------- | --- | ---- | --- |
Show Schedule Details
Ln Screen Start End Movie Title Format Showtime Days Shows Rate Type Total
1 SCR-1 12/20/24 12/25/24 Holiday Spectacular IMAX 3D 7:00 PM Daily 12 $250 PM $3,000
(Runtime: 142 min); Holiday Season Premium
2 SCR-2 12/20/24 12/31/24 Winter Wonderland Standard 4:30 PM Daily 24 $150 MT $3,600
(Runtime: 98 min); Matinee Special
3 SCR-1 12/26/24 12/31/24 New Year Mystery 4DX 9:30 PM Daily 12 $300 PM $3,600
(Runtime: 116 min); Premium Experience
Show Details
| Show Screen | Date Range | Title | Showtime | Days Type | Rate | Revenue |
| ----------- | ---------- | ----- | -------- | --------- | ---- | ------- |
1 SCR-1 12/20-12/25 Holiday Spectacular 7:00 PM Daily PM $250 $3,000
This booking order is subject to cinema availability and standard terms.
2 SCR-2 12/20-12/31 Winter Wonderland 4:30 PM Daily MT $150 $3,600
All showtimes are approximate and subject to change.
3 SCR-1 12/26-12/31 New Year Mystery 9:30 PM Daily PM $300 $3,600
| Total Revenue: | | | | | | $12,500.00 |
| -------------- | --- | --- | --- | --- | --- | ---------- |
@@ -0,0 +1,65 @@
1
Introduction
Large language models (LLMs) are becoming a crucial building block in developing powerful agents
that utilize LLMs for reasoning, tool usage, and adapting to new observations (Yao et al., 2022; Xi
et al., 2023; Wang et al., 2023b) in many real-world tasks. Given the expanding tasks that could
benefit from LLMs and the growing task complexity, an intuitive approach to scale up the power of
agents is to use multiple agents that cooperate. Prior work suggests that multiple agents can help
encourage divergent thinking (Liang et al., 2023), improve factuality and reasoning (Du et al., 2023),
and provide validation (Wu et al., 2023). In light of the intuition and early evidence of promise, it is
intriguing to ask the following question: how can we facilitate the development of LLM applications
that could span a broad spectrum of domains and complexities based on the multi-agent approach?
Our insight is to use multi-agent conversations to achieve it. There are at least three reasons con-
firming its general feasibility and utility thanks to recent advances in LLMs: First, because chat-
optimized LLMs (e.g., GPT-4) show the ability to incorporate feedback, LLM agents can cooperate
through conversations with each other or human(s), e.g., a dialog where agents provide and seek rea-
soning, observations, critiques, and validation. Second, because a single LLM can exhibit a broad
range of capabilities (especially when configured with the correct prompt and inference settings),
conversations between differently configured agents can help combine these broad LLM capabilities
in a modular and complementary manner. Third, LLMs have demonstrated ability to solve complex
tasks when the tasks are broken into simpler subtasks. Multi-agent conversations can enable this
partitioning and integration in an intuitive manner. How can we leverage the above insights and
support different applications with the common requirement of coordinating multiple agents, poten-
tially backed by LLMs, humans, or tools exhibiting different capacities? We desire a multi-agent
conversation framework with generic abstraction and effective implementation that has the flexibil-
ity to satisfy different application needs. Achieving this requires addressing two critical questions:
(1) How can we design individual agents that are capable, reusable, customizable, and effective in
multi-agent collaboration? (2) How can we develop a straightforward, unified interface that can
accommodate a wide range of agent conversation patterns? In practice, applications of varying
complexities may need distinct sets of agents with specific capabilities, and may require different
conversation patterns, such as single- or multi-turn dialogs, different human involvement modes, and
static vs. dynamic conversation. Moreover, developers may prefer the flexibility to program agent
interactions in natural language or code. Failing to adequately address these two questions would
limit the frameworks scope of applicability and generality.
While there is contemporaneous exploration of multi-agent approaches,3 we present AutoGen, a
generalized multi-agent conversation framework (Figure 1), based on the following new concepts.
1 Customizable and conversable agents. AutoGen uses a generic design of agents that can lever-
age LLMs, human inputs, tools, or a combination of them. The result is that developers can
easily and quickly create agents with different roles (e.g., agents to write code, execute code,
wire in human feedback, validate outputs, etc.) by selecting and configuring a subset of built-in
capabilities. The agents backend can also be readily extended to allow more custom behaviors.
To make these agents suitable for multi-agent conversation, every agent is made conversable
they can receive, react, and respond to messages. When configured properly, an agent can hold
multiple turns of conversations with other agents autonomously or solicit human inputs at cer-
tain rounds, enabling human agency and automation. The conversable agent design leverages the
strong capability of the most advanced LLMs in taking feedback and making progress via chat
and also allows combining capabilities of LLMs in a modular fashion. (Section 2.1)
2 Conversation programming. A fundamental insight of AutoGen is to simplify and unify com-
plex LLM application workflows as multi-agent conversations. So AutoGen adopts a program-
ming paradigm centered around these inter-agent conversations. We refer to this paradigm as
conversation programming, which streamlines the development of intricate applications via two
primary steps: (1) defining a set of conversable agents with specific capabilities and roles (as
described above); (2) programming the interaction behavior between agents via conversation-
centric computation and control. Both steps can be achieved via a fusion of natural and pro-
gramming languages to build applications with a wide range of conversation patterns and agent
behaviors. AutoGen provides ready-to-use implementations and also allows easy extension and
experimentation for both steps. (Section 2.2)
3We refer to Appendix A for a detailed discussion.
2
@@ -0,0 +1,74 @@
%PDF-1.3
%“Œ‹ž ReportLab Generated PDF document (opensource)
1 0 obj
<<
/F1 2 0 R /F2 3 0 R
>>
endobj
2 0 obj
<<
/BaseFont /Helvetica /Encoding /WinAnsiEncoding /Name /F1 /Subtype /Type1 /Type /Font
>>
endobj
3 0 obj
<<
/BaseFont /Helvetica-Bold /Encoding /WinAnsiEncoding /Name /F2 /Subtype /Type1 /Type /Font
>>
endobj
4 0 obj
<<
/Contents 8 0 R /MediaBox [ 0 0 612 792 ] /Parent 7 0 R /Resources <<
/Font 1 0 R /ProcSet [ /PDF /Text /ImageB /ImageC /ImageI ]
>> /Rotate 0 /Trans <<
>>
/Type /Page
>>
endobj
5 0 obj
<<
/PageMode /UseNone /Pages 7 0 R /Type /Catalog
>>
endobj
6 0 obj
<<
/Author (anonymous) /CreationDate (D:20260210121342+01'00') /Creator (anonymous) /Keywords () /ModDate (D:20260210121342+01'00') /Producer (ReportLab PDF Library - \(opensource\))
/Subject (unspecified) /Title (untitled) /Trapped /False
>>
endobj
7 0 obj
<<
/Count 1 /Kids [ 4 0 R ] /Type /Pages
>>
endobj
8 0 obj
<<
/Filter [ /ASCII85Decode /FlateDecode ] /Length 2414
>>
stream
Gat=m?#SK-&r,lL<tO"8con<J;5Cq;2s]18RrdR7Y[)>Ym#<31@rCsJ89.W.qa3u?82hU4/rD6bm_^2.o5\G6@H<5/\G85.&:2)\f,l]`/mA:-0HF*!^.%Yd0?rr<_LD*'1j8Q\=IJXu'N"=HL>KSX^]339h+)S%SB[D8U\2B8rL_pR7\MXONW%HeW99+,0hH$AU#^KYAoZ)6P-2'6m5cj7lZu'kGHQ:/\R1,Ma%hEl2eYq(:LZ"-`3OktM:dm<m,u<)W99/X#l.?0OO\Z_]Y4.9BoSuKGOrdaFbq^/)*_g%gm8s\<gU<a%e]re(gWm_H[^0bn(=;0X%(_^H%$;+Se<aM)L.FrW&=UUc*X3He'XMO]CgP3P*<]$#uOPN2Z#n\O]@7_]#$ZH.Gr&KZm_M+6]8I5lYVcZLH,)V@L:BCib%tuWd,p*A"0Gb=6gIkI+Y5.[<aH_D`iMaKNQpo.UHf=F]to-Ui6XS[Q;Qh\cD=LT#YQGpn9rsUVm:qR"1%pnrSZa/Mi*P+f4j?B6-uV5Em_Zqog)^@RtF[F-adqASQb%i[(eIIqZ)_CVEHpGDgIpX[[uQ4J6DNf5X^CB'JA+,d^#?/[fq)jH^+:rbdW>Y'H/a/1^A\lZD2qMb,5%-$pOaW5-%BjndGRZ<CV&?T^r@PWF)!H#gDKcZj?[/gATBZ;=XJ$_a;??F-qtH(HaQX?W#iIL#17<Y25AC[ePo/pO[]=c0(\#/j9R%W/]$do:5b%.e4%S0Z';YJm/!GO9jt-H8W>JTK5I,b-cnrpc!2H0BZZ`1%R*aB!ZE'JRRYNJ<J4B`!j/maqpD>*Rq$U:[Tq%Lr[m+DHGg*dP\Ee>\#VYo43^R>kA9W2b/WU:k/M#%^2;nC+,e'dAcEOp?t5Kk;4+.f4MU@-mf7iCT_29s_%g,%K_gB8!kWS28T%T6'u_$GK'qX*VP>7>5?dW_<?$QPg!n")cT(<-[c/-kEbS'`*BYR5SB9TPY<1jq1#Q/EWpCJrY=s;bQfH^=uT:DTR3.8/N>W)r8_SF*7+f;4415n3,ECi2P6&bjmn17t+qU8;D])\Qt.8QLi)?kJ`.t+lkW'Y4e876l-2di)Y?.3\K1<(0IrEfm1<:Oc^u?7B::q;On$J5_C7T<u%071ASb!ZD1u7Yd"g`I'`PJ>**>tRZrdD6q3W@5QfbW8242uIHro=(eV*P1KjY,oj4tW&obb>^q-Iur%F#A)mgu8+V*?E<bdEC6V0+Z7OS^l.$W4hmuq:sMdJ=Sk+94D3QtUBZ:AoIiBA%s3#GJdRDFCpZ)7\MZmitKhMID(%ic%oW#tD%ERrqpk,dD3ll!E6m)e):26BLNV!WiRV*d(+Ppl'p$%?J&MqeV<=uNJ_5,4P_NC:lWf`Iu3\u+^>Y]dUOk&c=m2^<YVV2cUoq[`<<W-]MTIC50Klu6rO5RUVZ"h`#"4adtt2qjs2b12hQi!@JBp4Jln>:1Dtc(*!NBU*DeAtLhuWu&JLWFQi:;ka#?AD6V.A_[>n$T,.]8d=tffJ,?'DbCKQ-BnKqTn_:1LGc865V]FFi=AAF`DGhW(F]2^o?>VbGN:;=!-s;ea7]Ll\f+eiZ8XZb0*mZp%8*K_pf+1"2fKuO1pNK%7f_(mPTD@0&ljSV?o$5BpUmleYs^Faq_SM'jX.o\d*6%j(EtY.N"m2B'E@[.Y_8Be+m(58m$\dcqm$?,0it)/=9@9kRfJB;N7D9t\'F<:#c$P82`UKqgN]$kU]5eLPZMR=0bO[rPk"\?hu>sT^KFg`B>!pml-a[ImSeWp!_l3s!E>gFKq4ng:"n=N:m57rHjN)GML<=a1ktQpUT8:?[D:c7+Gm@2q;uN1Q3)hpeThe-&[#`KYZ4e_=o]kk1KH/^jo:"<0_nRJingk\[1Jltc<,.Jq2\*]=AVcIiY#?iMASrc$Bp)4m=NdIOJ&,H=+<MC=^7]?Tb>M"H6ZdXTX2Ba;Gp=J-m]$,8ZCU/77rHJ,%1.[/DlnkH:pIIV$Oh.
;:t?5e3.cs^[G:H=e;i>c+>B=)C&l7T)S<Bld"_W)BtgI(/F`Le;ULQ,!FM!^<8Kk?L6b_>G8Jp-TG;!V1144#29r2%;n-RmNHrGdR!76&H"R_D-]`c"1FCgZl*",7SUVuqc0oapDQ=^`nj#FFk@2%[K[V45$!KQIH[=;SUpTE8T!QLliC=5-9]nkQpBVdHM6-g)tYBAPuOqr^qkn[Wh4C;6L89J;D>5@cYM$2Y/24scnNiWp4jWhfJAF^ck!@I(VPV*s,pdkPKn<Zg-T3I%d.sSl"^f-Gm=*riV,>(\770jbu^lf\h1+IH>c;Bo;Pdg;!fA)'kmg$"\P3oX=/N5/rUltb3K-BdRTR;-W)J1bDbE?g<MKG;cK`l?D4l>.,O@6id::q]JXBH\Ws#0[#'8-5JQL>/c~>endstream
endobj
xref
0 9
0000000000 65535 f
0000000061 00000 n
0000000102 00000 n
0000000209 00000 n
0000000321 00000 n
0000000514 00000 n
0000000582 00000 n
0000000843 00000 n
0000000902 00000 n
trailer
<<
/ID
[<e319d5c305edb8c0fb6be9e44c6178fa><e319d5c305edb8c0fb6be9e44c6178fa>]
% ReportLab generated PDF document -- digest (opensource)
/Info 6 0 R
/Root 5 0 R
/Size 9
>>
startxref
3407
%%EOF
@@ -1,5 +1,6 @@
#!/usr/bin/env python3 -m pytest
"""Tests for PDF table extraction functionality."""
import os
import re
import pytest
@@ -650,6 +651,332 @@ class TestPdfTableExtraction:
result.text_content.strip() == ""
), f"Scanned PDF should have empty extraction, got: '{result.text_content[:100]}...'"
def test_movie_theater_booking_pdf_extraction(self, markitdown):
    """Check that the movie theater booking PDF converts to pipe tables.

    Converts ``movie-theater-booking-2024.pdf`` (skipping when the fixture
    is absent), asserts that the converted text contains pipe separators,
    and then passes each group of expected strings (booking details,
    agency info, customer details, summary totals, show schedule) to
    ``validate_strings``.
    """
    pdf_path = os.path.join(TEST_FILES_DIR, "movie-theater-booking-2024.pdf")
    if not os.path.exists(pdf_path):
        pytest.skip(f"Test file not found: {pdf_path}")

    result = markitdown.convert(pdf_path)

    # The output is expected to use the pipe-separated table format.
    assert "|" in result.text_content, "Booking order should contain pipe separators"

    # Each group is validated with its own call, in this order.
    expected_groups = (
        # Key booking information
        [
            "BOOKING ORDER",
            "2024-12-5678",  # Order number
            "Holiday Movie Marathon Package",  # Product description
            "12/20/2024 - 12/31/2024",  # Booking dates
            "SC-WINTER-2024",  # Alt order number
            "STARLIGHT CINEMAS",  # Cinema brand
        ],
        # Agency information
        [
            "Premier Entertainment Group",  # Agency name
            "Michael Chen",  # Contact
            "Sarah Johnson",  # Primary contact
            "Downtown Multiplex",  # Cinema name
        ],
        # Customer information
        [
            "Universal Studios Distribution",  # Customer name
            "Film Distributor",  # Category
            "CUST-98765",  # Customer ID
        ],
        # Booking summary totals
        [
            "$12,500.00",  # Gross amount
            "$11,250.00",  # Net amount
            "December 2024",  # Month
            "48",  # Number of shows
        ],
        # Show schedule details
        [
            "Holiday Spectacular",  # Movie title
            "Winter Wonderland",  # Movie title
            "New Year Mystery",  # Movie title
            "IMAX 3D",  # Format
            "$250",  # Rate
            "$300",  # Rate
            "$3,000",  # Revenue
            "$3,600",  # Revenue
        ],
    )
    for expected_group in expected_groups:
        validate_strings(result, expected_group)
class TestPdfFullOutputComparison:
    """Compare PDF conversion output against stored expected-output files.

    Each test converts one fixture PDF and checks it against the matching
    file under ``expected_outputs``. The comparison is deliberately loose:
    line counts must agree within a small tolerance, and selected
    structural markers and key strings must be present. A test is skipped
    when either the PDF or its expected-output file is missing.
    """

    @pytest.fixture
    def markitdown(self):
        """Create MarkItDown instance."""
        return MarkItDown()

    @staticmethod
    def _convert_and_load_expected(markitdown, pdf_name, expected_name):
        """Convert a fixture PDF and read its expected-output file.

        Args:
            markitdown: Converter object whose ``convert`` method is called
                with the PDF path.
            pdf_name: File name of the PDF inside ``TEST_FILES_DIR``.
            expected_name: File name inside ``TEST_FILES_DIR/expected_outputs``.

        Returns:
            A ``(actual_output, expected_output)`` tuple of strings.

        The calling test is skipped (via ``pytest.skip``) when the PDF is
        missing, or, checked second, when the expected file is missing.
        """
        pdf_path = os.path.join(TEST_FILES_DIR, pdf_name)
        expected_path = os.path.join(TEST_FILES_DIR, "expected_outputs", expected_name)

        if not os.path.exists(pdf_path):
            pytest.skip(f"Test file not found: {pdf_path}")
        if not os.path.exists(expected_path):
            pytest.skip(f"Expected output not found: {expected_path}")

        result = markitdown.convert(pdf_path)
        actual_output = result.text_content

        with open(expected_path, "r", encoding="utf-8") as f:
            expected_output = f.read()

        return actual_output, expected_output

    @staticmethod
    def _assert_line_counts_close(actual_output, expected_output, max_delta=2):
        """Assert both texts have nearly the same number of lines.

        Args:
            actual_output: Converted text.
            expected_output: Reference text.
            max_delta: Largest allowed absolute difference in line count.

        Returns:
            The right-stripped lines of ``actual_output`` so callers can
            run further structural checks on them.
        """
        actual_lines = [line.rstrip() for line in actual_output.split("\n")]
        expected_lines = [line.rstrip() for line in expected_output.split("\n")]

        assert abs(len(actual_lines) - len(expected_lines)) <= max_delta, (
            f"Line count mismatch: actual={len(actual_lines)}, "
            f"expected={len(expected_lines)}"
        )
        return actual_lines

    @staticmethod
    def _assert_sections_present(actual_output, sections):
        """Assert every string in ``sections`` occurs in ``actual_output``."""
        for section in sections:
            assert section in actual_output, f"Missing section: {section}"

    def test_movie_theater_full_output(self, markitdown):
        """Test complete output for movie theater booking PDF."""
        actual_output, expected_output = self._convert_and_load_expected(
            markitdown,
            "movie-theater-booking-2024.pdf",
            "movie-theater-booking-2024.md",
        )
        actual_lines = self._assert_line_counts_close(actual_output, expected_output)

        # Check structural elements
        assert actual_output.count("|") > 80, "Should have many pipe separators"
        assert actual_output.count("---") > 8, "Should have table separators"

        # Validate critical sections
        self._assert_sections_present(
            actual_output,
            [
                "BOOKING ORDER",
                "STARLIGHT CINEMAS",
                "2024-12-5678",
                "Holiday Spectacular",
                "$12,500.00",
            ],
        )

        # Check table structure
        table_rows = [line for line in actual_lines if line.startswith("|")]
        assert (
            len(table_rows) > 15
        ), f"Should have >15 table rows, got {len(table_rows)}"

    def test_sparse_borderless_table_full_output(self, markitdown):
        """Test complete output for SPARSE borderless table PDF."""
        actual_output, expected_output = self._convert_and_load_expected(
            markitdown,
            "SPARSE-2024-INV-1234_borderless_table.pdf",
            "SPARSE-2024-INV-1234_borderless_table.md",
        )
        self._assert_line_counts_close(actual_output, expected_output)

        # Check structural elements
        assert actual_output.count("|") > 50, "Should have many pipe separators"

        # Validate critical sections
        self._assert_sections_present(
            actual_output,
            [
                "INVENTORY RECONCILIATION REPORT",
                "SPARSE-2024-INV-1234",
                "SKU-8847",
                "SKU-9201",
                "Variance Analysis",
            ],
        )

    def test_repair_multipage_full_output(self, markitdown):
        """Test complete output for REPAIR multipage invoice PDF."""
        actual_output, expected_output = self._convert_and_load_expected(
            markitdown,
            "REPAIR-2022-INV-001_multipage.pdf",
            "REPAIR-2022-INV-001_multipage.md",
        )
        self._assert_line_counts_close(actual_output, expected_output)

        # Check structural elements
        assert actual_output.count("|") > 40, "Should have many pipe separators"

        # Validate critical sections
        self._assert_sections_present(
            actual_output,
            [
                "ZAVA AUTO REPAIR",
                "Gabriel Diaz",
                "Jeep",
                "Grand Cherokee",
                "GRAND TOTAL",
            ],
        )

    def test_receipt_full_output(self, markitdown):
        """Test complete output for RECEIPT retail purchase PDF."""
        actual_output, expected_output = self._convert_and_load_expected(
            markitdown,
            "RECEIPT-2024-TXN-98765_retail_purchase.pdf",
            "RECEIPT-2024-TXN-98765_retail_purchase.md",
        )
        self._assert_line_counts_close(actual_output, expected_output)

        # Validate critical sections
        self._assert_sections_present(
            actual_output,
            [
                "TECHMART ELECTRONICS",
                "TXN-98765-2024",
                "Sarah Mitchell",
                "$821.14",
                "RETURN POLICY",
            ],
        )

    def test_academic_paper_full_output(self, markitdown):
        """Test complete output for academic paper PDF."""
        actual_output, expected_output = self._convert_and_load_expected(
            markitdown, "test.pdf", "test.md"
        )
        self._assert_line_counts_close(actual_output, expected_output)

        # Academic paper should not have pipe separators
        assert (
            actual_output.count("|") == 0
        ), "Academic paper should not have pipe separators"

        # Validate critical sections
        self._assert_sections_present(
            actual_output,
            [
                "Introduction",
                "Large language models",
                "agents",
                "multi-agent",
            ],
        )

    def test_medical_scan_full_output(self, markitdown):
        """Test complete output for medical report scan PDF (empty, no text layer)."""
        actual_output, expected_output = self._convert_and_load_expected(
            markitdown,
            "MEDRPT-2024-PAT-3847_medical_report_scan.pdf",
            "MEDRPT-2024-PAT-3847_medical_report_scan.md",
        )

        # Both should be empty (scanned PDF with no text layer)
        assert actual_output.strip() == "", "Scanned PDF should produce empty output"
        assert (
            expected_output.strip() == ""
        ), "Expected output should be empty for scanned PDF"
class TestPdfTableMarkdownFormat:
"""Test that extracted tables have proper markdown formatting."""