[MS] Extend table support for wide tables (#1552)

* feat: enhance PDF table extraction to support complex forms and add new test cases
* feat: enhance PDF table extraction with adaptive column clustering and add comprehensive test cases
* fix: correct formatting and improve assertions in PDF table tests
2026-02-13 19:45:39 +01:00
parent 7fdaefb724
commit c83de14a9c
11 changed files with 784 additions and 5 deletions
@@ -1,2 +1,5 @@
 packages/markitdown/tests/test_files/** linguist-vendored
 packages/markitdown-sample-plugin/tests/test_files/** linguist-vendored
+
+# Treat PDF files as binary to prevent line ending conversion
+*.pdf binary
@@ -1,4 +1,4 @@
 # SPDX-FileCopyrightText: 2024-present Adam Fourney <adamfo@microsoft.com>
 #
 # SPDX-License-Identifier: MIT
-__version__ = "0.1.5b1"
+__version__ = "0.1.5b2"
@@ -198,15 +198,62 @@ def _extract_form_content_from_words(page: Any) -> str | None:
    if not all_table_x_positions:
        return None

-    # Compute global column boundaries
+    # Compute adaptive column clustering tolerance based on gap analysis
    all_table_x_positions.sort()
+
+    # Calculate gaps between consecutive x-positions
+    gaps = []
+    for i in range(len(all_table_x_positions) - 1):
+        gap = all_table_x_positions[i + 1] - all_table_x_positions[i]
+        if gap > 5:  # Only significant gaps
+            gaps.append(gap)
+
+    # Determine optimal tolerance using statistical analysis
+    if gaps and len(gaps) >= 3:
+        # Use 70th percentile of gaps as threshold (balances precision/recall)
+        sorted_gaps = sorted(gaps)
+        percentile_70_idx = int(len(sorted_gaps) * 0.70)
+        adaptive_tolerance = sorted_gaps[percentile_70_idx]
+
+        # Clamp tolerance to reasonable range [25, 50]
+        adaptive_tolerance = max(25, min(50, adaptive_tolerance))
+    else:
+        # Fallback to conservative value
+        adaptive_tolerance = 35
+
+    # Compute global column boundaries using adaptive tolerance
    global_columns: list[float] = []
    for x in all_table_x_positions:
-        if not global_columns or x - global_columns[-1] > 30:
+        if not global_columns or x - global_columns[-1] > adaptive_tolerance:
            global_columns.append(x)

-    # Too many columns suggests dense text, not a form
-    if len(global_columns) > 8:
+    # Adaptive max column check based on page characteristics
+    # Calculate average column width
+    if len(global_columns) > 1:
+        content_width = global_columns[-1] - global_columns[0]
+        avg_col_width = content_width / len(global_columns)
+
+        # Forms with very narrow columns (< 30pt) are likely dense text
+        if avg_col_width < 30:
+            return None
+
+        # Compute adaptive max based on columns per inch
+        # Typical forms have 3-8 columns per inch
+        columns_per_inch = len(global_columns) / (content_width / 72)
+
+        # If density is too high (> 10 cols/inch), likely not a form
+        if columns_per_inch > 10:
+            return None
+
+        # Adaptive max: allow more columns for wider pages
+        # Standard letter is 612pt wide, so scale accordingly
+        adaptive_max_columns = int(20 * (page_width / 612))
+        adaptive_max_columns = max(15, adaptive_max_columns)  # At least 15
+
+        if len(global_columns) > adaptive_max_columns:
+            return None
+    else:
+        # Single column, not a form
        return None

    # Now classify each row as table row or not
@@ -0,0 +1,81 @@
+TECHMART ELECTRONICS
+4567 Innovation Blvd
+San Francisco, CA 94103
+(415) 555-0199
+
+===================================
+
+Store #0342 - Downtown SF
+11/23/2024 14:32:18 PST
+TXN: TXN-98765-2024
+Cashier: Emily Rodriguez
+Register: POS-07
+
+-----------------------------------
+
+Wireless Noise-Cancelling
+Headphones - Premium Black
+AUDIO-5521 1 @ $349.99
+Member Discount $-50.00
+$299.99
+USB-C Hub 7-in-1 Adapter
+with HDMI & Ethernet
+ACC-8834 2 @ $79.99
+$159.98
+Portable SSD 2TB
+Thunderbolt 3 Compatible
+STOR-2241 1 @ $289.00
+Member Discount $-29.00
+$260.00
+Ergonomic Wireless Mouse
+Rechargeable Battery
+ACC-9012 1 @ $59.99
+$59.99
+Screen Cleaning Kit
+Professional Grade
+CARE-1156 3 @ $12.99
+$38.97
+HDMI 2.1 Cable 6ft
+8K Resolution Support
+CABLE-7789 2 @ $24.99
+Member Discount $-5.00
+$44.98
+-----------------------------------
+
+SUBTOTAL $863.91
+Member Discount (15%)-$84.00
+Sales Tax (8.5%) $66.23
+Rewards Applied -$25.00
+===================================
+TOTAL $821.14
+===================================
+
+PAYMENT METHOD
+Visa Card ending in 4782
+Auth: 847392
+Ref: REF-20241123-98765
+
+-----------------------------------
+
+REWARDS MEMBER
+Sarah Mitchell
+ID: TM-447821
+Points Earned: 821
+Total Points: 3,247
+Next Reward: $50 gift card
+at 5,000 pts (1,753 to go)
+
+-----------------------------------
+
+RETURN POLICY
+Returns within 30 days
+Receipt required
+Electronics must be unopened
+
+*TXN98765202411231432*
+
+Thank you for shopping!
+www.techmart.example.com
+
+===================================
+
@@ -0,0 +1,76 @@
+ZAVA AUTO REPAIR
+Certified Collision Repair
+123 Main Street, Redmond, WA 98052
+Phone: (425) 000-0000
+Preliminary Estimate (ID: EST-1008)
+| Customer Information |                     |     | Vehicle Information |                   |
+| -------------------- | ------------------- | --- | ------------------- | ----------------- |
+| Insured name         | Gabriel Diaz        |     | Year                | 2022              |
+| Claim #              | SF-1008             |     | Make                | Jeep              |
+| Policy #             | POL-2022-555        |     | Model               | Grand Cherokee    |
+| Phone                | (425) 111-1111      |     | Trim                | Limited           |
+| Email                | gabriel@contoso.com |     | VIN                 | 1C4RJFBG2NC123456 |
+|                      |                     |     | Color               | White             |
+|                      |                     |     | Odometer            | 9,800             |
+| Repair Order #       | RO-20221108         |     | Estimator           | Ellis Turner      |
+Estimate Totals
+|                  |     | Hours | Rate | Cost  |
+| ---------------- | --- | ----- | ---- | ----- |
+| Parts            |     |       |      | 2,100 |
+| Body Labor       |     | 2     | 150  | 300   |
+| Paint Labor      |     | 1.5   | 150  | 225   |
+| Mechanical Labor |     | -     | -    | -     |
+Supplies
+|               | Paint Supplies           |     |        | 60     |
+| ------------- | ------------------------ | --- | ------ | ------ |
+|               | Body Supplies            |     |        | 30     |
+| Other Charges |                          |     |        | 15     |
+| Subtotal      |                          |     |        | 2,730  |
+| Sales Tax     |                          |     | 10.20% | 278.46 |
+| GRAND TOTAL   |                          |     |        | 5,738  |
+| Note          | Minor rear bumper repair |     |        |        |
+This is a preliminary estimate for the visible damage of the vehicle. Additional damage / repairs / parts may be found
+after the vehicle has been disassembled and damaged parts have been removed. Suspension damages may be
+present, but can not be determined until an alignment on the vehicle has been done. Parts Prices may vary due to
+models and vehicle maker price updates. Please be advised if vehicle owner elects to have vehicle sent to service for
+any mechanical concerns, ALL service departments charge a vehicle diagnostic charge. If the mechanical concern is
+deemed not related to an insurance claim, vehicle owner will be reponsible for charges.
+
+ZAVA AUTO REPAIR
+Certified Collision Repair
+123 Main Street, Redmond, WA 98052
+Phone: (425) 000-0000
+Preliminary Estimate (ID: EST-1008)
+Customer Information Vehicle Information
+| Insured name   | Bruce Wayne                |     | Year      | 2025         |
+| -------------- | -------------------------- | --- | --------- | ------------ |
+| Claim #        |                            | 999 | Make      | Batman       |
+| Policy #       | IM-BATMAN                  |     | Model     | Batmobile    |
+| Phone          | (416) 555-1234             |     | Trim      | Limited      |
+| Email          | batman@wayneindustries.com |     | VIN       | XXX          |
+|                |                            |     | Color     | Black        |
+|                |                            |     | Odometer  | 1            |
+| Repair Order # | RO-20221108                |     | Estimator | Ellis Turner |
+Estimate Totals
+|                  |     | Hours | Rate | Cost   |
+| ---------------- | --- | ----- | ---- | ------ |
+| Parts            |     |       |      | 99,999 |
+| Body Labor       |     | 2     | 150  | 300    |
+| Paint Labor      |     | 1.5   | 150  | 225    |
+| Mechanical Labor |     | -     | -    | -      |
+Supplies
+|               | Paint Supplies           |     |        | 60        |
+| ------------- | ------------------------ | --- | ------ | --------- |
+|               | Body Supplies            |     |        | 30        |
+| Other Charges |                          |     |        | 15        |
+| Subtotal      |                          |     |        | 100,629   |
+| Sales Tax     |                          |     | 10.20% | 10264.158 |
+| GRAND TOTAL   |                          |     |        | 211,522   |
+| Note          | Minor rear bumper repair |     |        |           |
+
+This is a preliminary estimate for the visible damage of the vehicle. Additional damage / repairs / parts may be found
+after the vehicle has been disassembled and damaged parts have been removed. Suspension damages may be
+present, but can not be determined until an alignment on the vehicle has been done. Parts Prices may vary due to
+models and vehicle maker price updates. Please be advised if vehicle owner elects to have vehicle sent to service for
+any mechanical concerns, ALL service departments charge a vehicle diagnostic charge. If the mechanical concern is
+deemed not related to an insurance claim, vehicle owner will be reponsible for charges.
@@ -0,0 +1,44 @@
+INVENTORY RECONCILIATION REPORT
+Report ID: SPARSE-2024-INV-1234
+Warehouse: Distribution Center East
+Report Date: 2024-11-15
+Prepared By: Sarah Martinez
+| Product Code | Location | Expected | Actual | Variance | Status   |
+| ------------ | -------- | -------- | ------ | -------- | -------- |
+| SKU-8847     | A-12     | 450      |        |          |          |
+|              | B-07     |          | 289    | -23      |          |
+| SKU-9201     |          | 780      | 778    |          | OK       |
+|              | C-15     |          |        | +15      |          |
+| SKU-4563     | D-22     |          | 156    |          | CRITICAL |
+|              |          | 180      |        | -24      |          |
+| SKU-7728     | A-08     | 920      |        |          |          |
+|              |          |          | 935    | +15      | OK       |
+Variance Analysis:
+Summary Statistics:
+Total Variance Cost: $4,287.50
+Critical Items: 1
+Overall Accuracy: 97.2%
+Detailed Analysis by Category:
+The inventory reconciliation reveals several key findings. The primary variance driver is SKU-4563,
+which shows a -24 unit discrepancy requiring immediate investigation. Location B-07 handling of
+SKU-8847 also demonstrates significant variance. Cross-location verification protocols should be
+
+reviewed to prevent future discrepancies. The overall accuracy rate of 97.2% meets our target
+threshold, but critical items require expedited resolution to maintain operational efficiency.
+Extended Inventory Review:
+| Product Code | Category    | Unit Cost | Total Value | Last Audit | Notes      |
+| ------------ | ----------- | --------- | ----------- | ---------- | ---------- |
+| SKU-8847     | Electronics | $45.00    | $13,005.00  | 2024-10-15 |            |
+| SKU-9201     | Hardware    | $32.50    | $25,285.00  | 2024-10-22 | Verified   |
+| SKU-4563     | Software    | $120.00   | $18,720.00  |            | Critical   |
+| SKU-7728     | Accessories | $15.75    | $14,726.25  | 2024-11-01 |            |
+| SKU-3345     | Electronics | $67.00    | $22,445.00  | 2024-10-18 |            |
+| SKU-5512     | Hardware    | $89.00    | $31,150.00  |            | Pending    |
+| SKU-6678     | Software    | $200.00   | $42,000.00  | 2024-10-25 | High Value |
+| SKU-7789     | Accessories | $8.50     | $5,950.00   | 2024-11-05 |            |
+| SKU-2234     | Electronics | $125.00   | $35,000.00  |            |            |
+| SKU-1123     | Hardware    | $55.00    | $27,500.00  | 2024-10-30 | Verified   |
+Recommendations:
+1. Immediate review of SKU-4563 handling procedures. 2. Implement additional verification for critical
+items. 3. Schedule follow-up audit for high-value products (SKU-6678, SKU-2234).
+Approval:
@@ -0,0 +1,62 @@
+BOOKING ORDER
+Print Date 12/15/2024 14:30:22
+Page 1 of 1
+STARLIGHT CINEMAS
+Orders
+| Order / Rev: | 2024-12-5678   |     |     | Cinema:          |     | Downtown Multiplex |
+| ------------ | -------------- | --- | --- | ---------------- | --- | ------------------ |
+| Alt Order #: | SC-WINTER-2024 |     |     | Primary Contact: |     | Sarah Johnson      |
+Product Desc: Holiday Movie Marathon Package Location: NYC-01
+| Estimate:            | EST-456                 |     |     | Region: |     | NORTHEAST |
+| -------------------- | ----------------------- | --- | --- | ------- | --- | --------- |
+| Booking Dates:       | 12/20/2024 - 12/31/2024 |     |     |         |     |           |
+| Original Date / Rev: | 12/01/24 / 12/10/24     |     |     |         |     |           |
+| Order Type:          | Premium Package         |     |     |         |     |           |
+Booking Agency
+| Name:            | Premier Entertainment Group |     |     |                |     |           |
+| ---------------- | --------------------------- | --- | --- | -------------- | --- | --------- |
+|                  |                             |     |     | Billing Type:  |     | Net 30    |
+| Contact:         | Michael Chen                |     |     |                |     |           |
+|                  |                             |     |     | Payment Terms: |     | Corporate |
+| Billing Contact: | accounting@premierent.com   |     |     |                |     |           |
+|                  |                             |     |     | Commission:    |     | 10%       |
+555 Broadway Suite 1200
+New York, NY 10012
+Customer
+| Name:          | Universal Studios Distribution |     |     |     |     |     |
+| -------------- | ------------------------------ | --- | --- | --- | --- | --- |
+| Category:      | Film Distributor               |     |     |     |     |     |
+| Contact Email: | bookings@universalstudios.com  |     |     |     |     |     |
+| Customer ID:   | CUST-98765                     |     |     |     |     |     |
+| Revenue Code:  | FILM-PREMIUM                   |     |     |     |     |     |
+Booking Summary
+| Start Date | End Date | # Shows | Gross Amount | Net Amount |     |     |
+| ---------- | -------- | ------- | ------------ | ---------- | --- | --- |
+| 12/20/24   | 12/31/24 | 48      | $12,500.00   | $11,250.00 |     |     |
+Totals
+| Month         | # Shows | Gross Amount |     | Net Amount |     | Occupancy |
+| ------------- | ------- | ------------ | --- | ---------- | --- | --------- |
+| December 2024 | 48      | $12,500.00   |     | $11,250.00 |     | 85%       |
+| Totals        | 48      | $12,500.00   |     | $11,250.00 |     | 85%       |
+Account Representatives
+Representative Territory Region Start Date / End Date Commission %
+| Sarah Johnson | NYC Metro | NORTHEAST | 12/20/24 - 12/31/24 |     | 100% |     |
+| ------------- | --------- | --------- | ------------------- | --- | ---- | --- |
+Show Schedule Details
+Ln Screen Start End Movie Title Format Showtime Days Shows Rate Type Total
+1 SCR-1 12/20/24 12/25/24 Holiday Spectacular IMAX 3D 7:00 PM Daily 12 $250 PM $3,000
+(Runtime: 142 min); Holiday Season Premium
+2 SCR-2 12/20/24 12/31/24 Winter Wonderland Standard 4:30 PM Daily 24 $150 MT $3,600
+(Runtime: 98 min); Matinee Special
+3 SCR-1 12/26/24 12/31/24 New Year Mystery 4DX 9:30 PM Daily 12 $300 PM $3,600
+(Runtime: 116 min); Premium Experience
+Show Details
+| Show Screen | Date Range | Title | Showtime | Days Type | Rate | Revenue |
+| ----------- | ---------- | ----- | -------- | --------- | ---- | ------- |
+1 SCR-1 12/20-12/25 Holiday Spectacular 7:00 PM Daily PM $250 $3,000
+This booking order is subject to cinema availability and standard terms.
+2 SCR-2 12/20-12/31 Winter Wonderland 4:30 PM Daily MT $150 $3,600
+All showtimes are approximate and subject to change.
+3 SCR-1 12/26-12/31 New Year Mystery 9:30 PM Daily PM $300 $3,600
+| Total Revenue: |     |     |     |     |     | $12,500.00 |
+| -------------- | --- | --- | --- | --- | --- | ---------- |
@@ -0,0 +1,65 @@
+1
+
+Introduction
+
+Large language models (LLMs) are becoming a crucial building block in developing powerful agents
+that utilize LLMs for reasoning, tool usage, and adapting to new observations (Yao et al., 2022; Xi
+et al., 2023; Wang et al., 2023b) in many real-world tasks. Given the expanding tasks that could
+benefit from LLMs and the growing task complexity, an intuitive approach to scale up the power of
+agents is to use multiple agents that cooperate. Prior work suggests that multiple agents can help
+encourage divergent thinking (Liang et al., 2023), improve factuality and reasoning (Du et al., 2023),
+and provide validation (Wu et al., 2023). In light of the intuition and early evidence of promise, it is
+intriguing to ask the following question: how can we facilitate the development of LLM applications
+that could span a broad spectrum of domains and complexities based on the multi-agent approach?
+
+Our insight is to use multi-agent conversations to achieve it. There are at least three reasons con-
+firming its general feasibility and utility thanks to recent advances in LLMs: First, because chat-
+optimized LLMs (e.g., GPT-4) show the ability to incorporate feedback, LLM agents can cooperate
+through conversations with each other or human(s), e.g., a dialog where agents provide and seek rea-
+soning, observations, critiques, and validation. Second, because a single LLM can exhibit a broad
+range of capabilities (especially when configured with the correct prompt and inference settings),
+conversations between differently configured agents can help combine these broad LLM capabilities
+in a modular and complementary manner. Third, LLMs have demonstrated ability to solve complex
+tasks when the tasks are broken into simpler subtasks. Multi-agent conversations can enable this
+partitioning and integration in an intuitive manner. How can we leverage the above insights and
+support different applications with the common requirement of coordinating multiple agents, poten-
+tially backed by LLMs, humans, or tools exhibiting different capacities? We desire a multi-agent
+conversation framework with generic abstraction and effective implementation that has the flexibil-
+ity to satisfy different application needs. Achieving this requires addressing two critical questions:
+(1) How can we design individual agents that are capable, reusable, customizable, and effective in
+multi-agent collaboration? (2) How can we develop a straightforward, unified interface that can
+accommodate a wide range of agent conversation patterns? In practice, applications of varying
+complexities may need distinct sets of agents with specific capabilities, and may require different
+conversation patterns, such as single- or multi-turn dialogs, different human involvement modes, and
+static vs. dynamic conversation. Moreover, developers may prefer the flexibility to program agent
+interactions in natural language or code. Failing to adequately address these two questions would
+limit the framework’s scope of applicability and generality.
+While there is contemporaneous exploration of multi-agent approaches,3 we present AutoGen, a
+generalized multi-agent conversation framework (Figure 1), based on the following new concepts.
+1 Customizable and conversable agents. AutoGen uses a generic design of agents that can lever-
+age LLMs, human inputs, tools, or a combination of them. The result is that developers can
+easily and quickly create agents with different roles (e.g., agents to write code, execute code,
+wire in human feedback, validate outputs, etc.) by selecting and configuring a subset of built-in
+capabilities. The agent’s backend can also be readily extended to allow more custom behaviors.
+To make these agents suitable for multi-agent conversation, every agent is made conversable –
+they can receive, react, and respond to messages. When configured properly, an agent can hold
+multiple turns of conversations with other agents autonomously or solicit human inputs at cer-
+tain rounds, enabling human agency and automation. The conversable agent design leverages the
+strong capability of the most advanced LLMs in taking feedback and making progress via chat
+and also allows combining capabilities of LLMs in a modular fashion. (Section 2.1)
+
+2 Conversation programming. A fundamental insight of AutoGen is to simplify and unify com-
+plex LLM application workflows as multi-agent conversations. So AutoGen adopts a program-
+ming paradigm centered around these inter-agent conversations. We refer to this paradigm as
+conversation programming, which streamlines the development of intricate applications via two
+primary steps: (1) defining a set of conversable agents with specific capabilities and roles (as
+described above); (2) programming the interaction behavior between agents via conversation-
+centric computation and control. Both steps can be achieved via a fusion of natural and pro-
+gramming languages to build applications with a wide range of conversation patterns and agent
+behaviors. AutoGen provides ready-to-use implementations and also allows easy extension and
+experimentation for both steps. (Section 2.2)
+
+3We refer to Appendix A for a detailed discussion.
+
+2
+
@@ -0,0 +1,74 @@
+%PDF-1.3
+%“Œ‹ž ReportLab Generated PDF document (opensource)
+1 0 obj
+<<
+/F1 2 0 R /F2 3 0 R
+>>
+endobj
+2 0 obj
+<<
+/BaseFont /Helvetica /Encoding /WinAnsiEncoding /Name /F1 /Subtype /Type1 /Type /Font
+>>
+endobj
+3 0 obj
+<<
+/BaseFont /Helvetica-Bold /Encoding /WinAnsiEncoding /Name /F2 /Subtype /Type1 /Type /Font
+>>
+endobj
+4 0 obj
+<<
+/Contents 8 0 R /MediaBox [ 0 0 612 792 ] /Parent 7 0 R /Resources <<
+/Font 1 0 R /ProcSet [ /PDF /Text /ImageB /ImageC /ImageI ]
+>> /Rotate 0 /Trans <<
+
+>> 
+  /Type /Page
+>>
+endobj
+5 0 obj
+<<
+/PageMode /UseNone /Pages 7 0 R /Type /Catalog
+>>
+endobj
+6 0 obj
+<<
+/Author (anonymous) /CreationDate (D:20260210121342+01'00') /Creator (anonymous) /Keywords () /ModDate (D:20260210121342+01'00') /Producer (ReportLab PDF Library - \(opensource\)) 
+  /Subject (unspecified) /Title (untitled) /Trapped /False
+>>
+endobj
+7 0 obj
+<<
+/Count 1 /Kids [ 4 0 R ] /Type /Pages
+>>
+endobj
+8 0 obj
+<<
+/Filter [ /ASCII85Decode /FlateDecode ] /Length 2414
+>>
+stream
+Gat=m?#SK-&r,lL<tO"8con<J;5Cq;2s]18RrdR7Y[)>Ym#<31@rCsJ89.W.qa3u?82hU4/rD6bm_^2.o5\G6@H<5/\G85.&:2)\f,l]`/mA:-0HF*!^.%Yd0?rr<_LD*'1j8Q\=IJXu'N"=HL>KSX^]339h+)S%SB[D8U\2B8rL_pR7\MXONW%HeW99+,0hH$AU#^KYAoZ)6P-2'6m5cj7lZu'kGHQ:/\R1,Ma%hEl2eYq(:LZ"-`3OktM:dm<m,u<)W99/X#l.?0OO\Z_]Y4.9BoSuKGOrdaFbq^/)*_g%gm8s\<gU<a%e]re(gWm_H[^0bn(=;0X%(_^H%$;+Se<aM)L.FrW&=UUc*X3He'XMO]CgP3P*<]$#uOPN2Z#n\O]@7_]#$ZH.Gr&KZm_M+6]8I5lYVcZLH,)V@L:BCib%tuWd,p*A"0Gb=6gIkI+Y5.[<aH_D`iMaKNQpo.UHf=F]to-Ui6XS[Q;Qh\cD=LT#YQGpn9rsUVm:qR"1%pnrSZa/Mi*P+f4j?B6-uV5Em_Zqog)^@RtF[F-adqASQb%i[(eIIqZ)_CVEHpGDgIpX[[uQ4J6DNf5X^CB'JA+,d^#?/[fq)jH^+:rbdW>Y'H/a/1^A\lZD2qMb,5%-$pOaW5-%BjndGRZ<CV&?T^r@PWF)!H#gDKcZj?[/gATBZ;=XJ$_a;??F-qtH(HaQX?W#iIL#17<Y25AC[ePo/pO[]=c0(\#/j9R%W/]$do:5b%.e4%S0Z';YJm/!GO9jt-H8W>JTK5I,b-cnrpc!2H0BZZ`1%R*aB!ZE'JRRYNJ<J4B`!j/maqpD>*Rq$U:[Tq%Lr[m+DHGg*dP\Ee>\#VYo43^R>kA9W2b/WU:k/M#%^2;nC+,e'dAcEOp?t5Kk;4+.f4MU@-mf7iCT_29s_%g,%K_gB8!kWS28T%T6'u_$GK'qX*VP>7>5?dW_<?$QPg!n")cT(<-[c/-kEbS'`*BYR5SB9TPY<1jq1#Q/EWpCJrY=s;bQfH^=uT:DTR3.8/N>W)r8_SF*7+f;4415n3,ECi2P6&bjmn17t+qU8;D])\Qt.8QLi)?kJ`.t+lkW'Y4e876l-2di)Y?.3\K1<(0IrEfm1<:Oc^u?7B::q;On$J5_C7T<u%071ASb!ZD1u7Yd"g`I'`PJ>**>tRZrdD6q3W@5QfbW8242uIHro=(eV*P1KjY,oj4tW&obb>^q-Iur%F#A)mgu8+V*?E<bdEC6V0+Z7OS^l.$W4hmuq:sMdJ=Sk+94D3QtUBZ:AoIiBA%s3#GJdRDFCpZ)7\MZmitKhMID(%ic%oW#tD%ERrqpk,dD3ll!E6m)e):26BLNV!WiRV*d(+Ppl'p$%?J&MqeV<=uNJ_5,4P_NC:lWf`Iu3\u+^>Y]dUOk&c=m2^<YVV2cUoq[`<<W-]MTIC50Klu6rO5RUVZ"h`#"4adtt2qjs2b12hQi!@JBp4Jln>:1Dtc(*!NBU*DeAtLhuWu&JLWFQi:;ka#?AD6V.A_[>n$T,.]8d=tffJ,?'DbCKQ-BnKqTn_:1LGc865V]FFi=AAF`DGhW(F]2^o?>VbGN:;=!-s;ea7]Ll\f+eiZ8XZb0*mZp%8*K_pf+1"2fKuO1pNK%7f_(mPTD@0&ljSV?o$5BpUmleYs^Faq_SM'jX.o\d*6%j(EtY.N"m2B'E@[.Y_8Be+m(58m$\dcqm$?,0it)/=9@9kRfJB;N7D9t\'F<:#c$P82`UKqgN]$kU]5eLPZMR=0bO[rPk"\?hu>sT^KFg`B>!pml-a[ImSeWp!_l3s!E>gFKq4ng:"n=N:m57rHjN)GML<=a1ktQpUT8:?[D:c7+Gm@2q;uN1Q3)hpeThe-&[#`KYZ4e_=o]kk1KH/^jo:"<0_nRJingk\[1Jltc<,.Jq2\*]=AVcIiY#?iMASrc$Bp)4m=NdIOJ&,H=+<MC=^7]?Tb>M"H6ZdXTX2Ba;Gp=J-m]$,8ZCU/77rHJ,%1.[/DlnkH:pIIV$Oh
.;:t?5e3.cs^[G:H=e;i>c+>B=)C&l7T)S<Bld"_W)BtgI(/F`Le;ULQ,!FM!^<8Kk?L6b_>G8Jp-TG;!V1144#29r2%;n-RmNHrGdR!76&H"R_D-]`c"1FCgZl*",7SUVuqc0oapDQ=^`nj#FFk@2%[K[V45$!KQIH[=;SUpTE8T!QLliC=5-9]nkQpBVdHM6-g)tYBAPuOqr^qkn[Wh4C;6L89J;D>5@cYM$2Y/24scnNiWp4jWhfJAF^ck!@I(VPV*s,pdkPKn<Zg-T3I%d.sSl"^f-Gm=*riV,>(\770jbu^lf\h1+IH>c;Bo;Pdg;!fA)'kmg$"\P3oX=/N5/rUltb3K-BdRTR;-W)J1bDbE?g<MKG;cK`l?D4l>.,O@6id::q]JXBH\Ws#0[#'8-5JQL>/c~>endstream
+endobj
+xref
+0 9
+0000000000 65535 f 
+0000000061 00000 n 
+0000000102 00000 n 
+0000000209 00000 n 
+0000000321 00000 n 
+0000000514 00000 n 
+0000000582 00000 n 
+0000000843 00000 n 
+0000000902 00000 n 
+trailer
+<<
+/ID 
+[<e319d5c305edb8c0fb6be9e44c6178fa><e319d5c305edb8c0fb6be9e44c6178fa>]
+% ReportLab generated PDF document -- digest (opensource)
+
+/Info 6 0 R
+/Root 5 0 R
+/Size 9
+>>
+startxref
+3407
+%%EOF
@@ -1,5 +1,6 @@
 #!/usr/bin/env python3 -m pytest
 """Tests for PDF table extraction functionality."""
+
 import os
 import re
 import pytest
@@ -650,6 +651,332 @@ class TestPdfTableExtraction:
            result.text_content.strip() == ""
        ), f"Scanned PDF should have empty extraction, got: '{result.text_content[:100]}...'"

+    def test_movie_theater_booking_pdf_extraction(self, markitdown):
+        """Test extraction of movie theater booking PDF with complex tables.
+
+        Expected output: Pipe-separated format with booking details, agency info,
+        customer details, and show schedules in structured tables.
+        """
+        pdf_path = os.path.join(TEST_FILES_DIR, "movie-theater-booking-2024.pdf")
+
+        if not os.path.exists(pdf_path):
+            pytest.skip(f"Test file not found: {pdf_path}")
+
+        result = markitdown.convert(pdf_path)
+        text_content = result.text_content
+
+        # Validate pipe-separated table format
+        assert "|" in text_content, "Booking order should contain pipe separators"
+
+        # Validate key booking information
+        expected_strings = [
+            "BOOKING ORDER",
+            "2024-12-5678",  # Order number
+            "Holiday Movie Marathon Package",  # Product description
+            "12/20/2024 - 12/31/2024",  # Booking dates
+            "SC-WINTER-2024",  # Alt order number
+            "STARLIGHT CINEMAS",  # Cinema brand
+        ]
+        validate_strings(result, expected_strings)
+
+        # Validate agency information
+        agency_strings = [
+            "Premier Entertainment Group",  # Agency name
+            "Michael Chen",  # Contact
+            "Sarah Johnson",  # Primary contact
+            "Downtown Multiplex",  # Cinema name
+        ]
+        validate_strings(result, agency_strings)
+
+        # Validate customer information
+        customer_strings = [
+            "Universal Studios Distribution",  # Customer name
+            "Film Distributor",  # Category
+            "CUST-98765",  # Customer ID
+        ]
+        validate_strings(result, customer_strings)
+
+        # Validate booking summary totals
+        booking_strings = [
+            "$12,500.00",  # Gross amount
+            "$11,250.00",  # Net amount
+            "December 2024",  # Month
+            "48",  # Number of shows
+        ]
+        validate_strings(result, booking_strings)
+
+        # Validate show schedule details
+        show_strings = [
+            "Holiday Spectacular",  # Movie title
+            "Winter Wonderland",  # Movie title
+            "New Year Mystery",  # Movie title
+            "IMAX 3D",  # Format
+            "$250",  # Rate
+            "$300",  # Rate
+            "$3,000",  # Revenue
+            "$3,600",  # Revenue
+        ]
+        validate_strings(result, show_strings)
+
+
+class TestPdfFullOutputComparison:
+    """Test that PDF extraction produces expected complete outputs."""
+
+    @pytest.fixture
+    def markitdown(self):
+        """Create MarkItDown instance."""
+        return MarkItDown()
+
+    def test_movie_theater_full_output(self, markitdown):
+        """Test complete output for movie theater booking PDF."""
+        pdf_path = os.path.join(TEST_FILES_DIR, "movie-theater-booking-2024.pdf")
+        expected_path = os.path.join(
+            TEST_FILES_DIR, "expected_outputs", "movie-theater-booking-2024.md"
+        )
+
+        if not os.path.exists(pdf_path):
+            pytest.skip(f"Test file not found: {pdf_path}")
+
+        if not os.path.exists(expected_path):
+            pytest.skip(f"Expected output not found: {expected_path}")
+
+        result = markitdown.convert(pdf_path)
+        actual_output = result.text_content
+
+        with open(expected_path, "r", encoding="utf-8") as f:
+            expected_output = f.read()
+
+        # Compare outputs
+        actual_lines = [line.rstrip() for line in actual_output.split("\n")]
+        expected_lines = [line.rstrip() for line in expected_output.split("\n")]
+
+        # Check line count
+        assert abs(len(actual_lines) - len(expected_lines)) <= 2, (
+            f"Line count mismatch: actual={len(actual_lines)}, "
+            f"expected={len(expected_lines)}"
+        )
+
+        # Check structural elements
+        assert actual_output.count("|") > 80, "Should have many pipe separators"
+        assert actual_output.count("---") > 8, "Should have table separators"
+
+        # Validate critical sections
+        for section in [
+            "BOOKING ORDER",
+            "STARLIGHT CINEMAS",
+            "2024-12-5678",
+            "Holiday Spectacular",
+            "$12,500.00",
+        ]:
+            assert section in actual_output, f"Missing section: {section}"
+
+        # Check table structure
+        table_rows = [line for line in actual_lines if line.startswith("|")]
+        assert (
+            len(table_rows) > 15
+        ), f"Should have >15 table rows, got {len(table_rows)}"
+
+    def test_sparse_borderless_table_full_output(self, markitdown):
+        """Test complete output for SPARSE borderless table PDF."""
+        pdf_path = os.path.join(
+            TEST_FILES_DIR, "SPARSE-2024-INV-1234_borderless_table.pdf"
+        )
+        expected_path = os.path.join(
+            TEST_FILES_DIR,
+            "expected_outputs",
+            "SPARSE-2024-INV-1234_borderless_table.md",
+        )
+
+        if not os.path.exists(pdf_path):
+            pytest.skip(f"Test file not found: {pdf_path}")
+
+        if not os.path.exists(expected_path):
+            pytest.skip(f"Expected output not found: {expected_path}")
+
+        result = markitdown.convert(pdf_path)
+        actual_output = result.text_content
+
+        with open(expected_path, "r", encoding="utf-8") as f:
+            expected_output = f.read()
+
+        # Compare outputs
+        actual_lines = [line.rstrip() for line in actual_output.split("\n")]
+        expected_lines = [line.rstrip() for line in expected_output.split("\n")]
+
+        # Check line count is close
+        assert abs(len(actual_lines) - len(expected_lines)) <= 2, (
+            f"Line count mismatch: actual={len(actual_lines)}, "
+            f"expected={len(expected_lines)}"
+        )
+
+        # Check structural elements
+        assert actual_output.count("|") > 50, "Should have many pipe separators"
+
+        # Validate critical sections
+        for section in [
+            "INVENTORY RECONCILIATION REPORT",
+            "SPARSE-2024-INV-1234",
+            "SKU-8847",
+            "SKU-9201",
+            "Variance Analysis",
+        ]:
+            assert section in actual_output, f"Missing section: {section}"
+
+    def test_repair_multipage_full_output(self, markitdown):
+        """Test complete output for REPAIR multipage invoice PDF."""
+        pdf_path = os.path.join(TEST_FILES_DIR, "REPAIR-2022-INV-001_multipage.pdf")
+        expected_path = os.path.join(
+            TEST_FILES_DIR, "expected_outputs", "REPAIR-2022-INV-001_multipage.md"
+        )
+
+        if not os.path.exists(pdf_path):
+            pytest.skip(f"Test file not found: {pdf_path}")
+
+        if not os.path.exists(expected_path):
+            pytest.skip(f"Expected output not found: {expected_path}")
+
+        result = markitdown.convert(pdf_path)
+        actual_output = result.text_content
+
+        with open(expected_path, "r", encoding="utf-8") as f:
+            expected_output = f.read()
+
+        # Compare outputs
+        actual_lines = [line.rstrip() for line in actual_output.split("\n")]
+        expected_lines = [line.rstrip() for line in expected_output.split("\n")]
+
+        # Check line count is close
+        assert abs(len(actual_lines) - len(expected_lines)) <= 2, (
+            f"Line count mismatch: actual={len(actual_lines)}, "
+            f"expected={len(expected_lines)}"
+        )
+
+        # Check structural elements
+        assert actual_output.count("|") > 40, "Should have many pipe separators"
+
+        # Validate critical sections
+        for section in [
+            "ZAVA AUTO REPAIR",
+            "Gabriel Diaz",
+            "Jeep",
+            "Grand Cherokee",
+            "GRAND TOTAL",
+        ]:
+            assert section in actual_output, f"Missing section: {section}"
+
+    def test_receipt_full_output(self, markitdown):
+        """Test complete output for RECEIPT retail purchase PDF."""
+        pdf_path = os.path.join(
+            TEST_FILES_DIR, "RECEIPT-2024-TXN-98765_retail_purchase.pdf"
+        )
+        expected_path = os.path.join(
+            TEST_FILES_DIR,
+            "expected_outputs",
+            "RECEIPT-2024-TXN-98765_retail_purchase.md",
+        )
+
+        if not os.path.exists(pdf_path):
+            pytest.skip(f"Test file not found: {pdf_path}")
+
+        if not os.path.exists(expected_path):
+            pytest.skip(f"Expected output not found: {expected_path}")
+
+        result = markitdown.convert(pdf_path)
+        actual_output = result.text_content
+
+        with open(expected_path, "r", encoding="utf-8") as f:
+            expected_output = f.read()
+
+        # Compare outputs
+        actual_lines = [line.rstrip() for line in actual_output.split("\n")]
+        expected_lines = [line.rstrip() for line in expected_output.split("\n")]
+
+        # Check line count is close
+        assert abs(len(actual_lines) - len(expected_lines)) <= 2, (
+            f"Line count mismatch: actual={len(actual_lines)}, "
+            f"expected={len(expected_lines)}"
+        )
+
+        # Validate critical sections
+        for section in [
+            "TECHMART ELECTRONICS",
+            "TXN-98765-2024",
+            "Sarah Mitchell",
+            "$821.14",
+            "RETURN POLICY",
+        ]:
+            assert section in actual_output, f"Missing section: {section}"
+
+    def test_academic_paper_full_output(self, markitdown):
+        """Test complete output for academic paper PDF."""
+        pdf_path = os.path.join(TEST_FILES_DIR, "test.pdf")
+        expected_path = os.path.join(TEST_FILES_DIR, "expected_outputs", "test.md")
+
+        if not os.path.exists(pdf_path):
+            pytest.skip(f"Test file not found: {pdf_path}")
+
+        if not os.path.exists(expected_path):
+            pytest.skip(f"Expected output not found: {expected_path}")
+
+        result = markitdown.convert(pdf_path)
+        actual_output = result.text_content
+
+        with open(expected_path, "r", encoding="utf-8") as f:
+            expected_output = f.read()
+
+        # Compare outputs
+        actual_lines = [line.rstrip() for line in actual_output.split("\n")]
+        expected_lines = [line.rstrip() for line in expected_output.split("\n")]
+
+        # Check line count is close
+        assert abs(len(actual_lines) - len(expected_lines)) <= 2, (
+            f"Line count mismatch: actual={len(actual_lines)}, "
+            f"expected={len(expected_lines)}"
+        )
+
+        # Academic paper should not have pipe separators
+        assert (
+            actual_output.count("|") == 0
+        ), "Academic paper should not have pipe separators"
+
+        # Validate critical sections
+        for section in [
+            "Introduction",
+            "Large language models",
+            "agents",
+            "multi-agent",
+        ]:
+            assert section in actual_output, f"Missing section: {section}"
+
+    def test_medical_scan_full_output(self, markitdown):
+        """Test complete output for medical report scan PDF (empty, no text layer)."""
+        pdf_path = os.path.join(
+            TEST_FILES_DIR, "MEDRPT-2024-PAT-3847_medical_report_scan.pdf"
+        )
+        expected_path = os.path.join(
+            TEST_FILES_DIR,
+            "expected_outputs",
+            "MEDRPT-2024-PAT-3847_medical_report_scan.md",
+        )
+
+        if not os.path.exists(pdf_path):
+            pytest.skip(f"Test file not found: {pdf_path}")
+
+        if not os.path.exists(expected_path):
+            pytest.skip(f"Expected output not found: {expected_path}")
+
+        result = markitdown.convert(pdf_path)
+        actual_output = result.text_content
+
+        with open(expected_path, "r", encoding="utf-8") as f:
+            expected_output = f.read()
+
+        # Both should be empty (scanned PDF with no text layer)
+        assert actual_output.strip() == "", "Scanned PDF should produce empty output"
+        assert (
+            expected_output.strip() == ""
+        ), "Expected output should be empty for scanned PDF"
+

 class TestPdfTableMarkdownFormat:
    """Test that extracted tables have proper markdown formatting."""