[MS] Extend table support for wide tables (#1552)
* feat: enhance PDF table extraction to support complex forms and add new test cases
* feat: enhance PDF table extraction with adaptive column clustering and add comprehensive test cases
* fix: correct formatting and improve assertions in PDF table tests
This commit is contained in:
@@ -1,2 +1,5 @@
|
||||
packages/markitdown/tests/test_files/** linguist-vendored
|
||||
packages/markitdown-sample-plugin/tests/test_files/** linguist-vendored
|
||||
|
||||
# Treat PDF files as binary to prevent line ending conversion
|
||||
*.pdf binary
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
# SPDX-FileCopyrightText: 2024-present Adam Fourney <adamfo@microsoft.com>
|
||||
#
|
||||
# SPDX-License-Identifier: MIT
|
||||
__version__ = "0.1.5b1"
|
||||
__version__ = "0.1.5b2"
|
||||
|
||||
@@ -198,15 +198,62 @@ def _extract_form_content_from_words(page: Any) -> str | None:
|
||||
if not all_table_x_positions:
|
||||
return None
|
||||
|
||||
# Compute global column boundaries
|
||||
# Compute adaptive column clustering tolerance based on gap analysis
|
||||
all_table_x_positions.sort()
|
||||
|
||||
# Calculate gaps between consecutive x-positions
|
||||
gaps = []
|
||||
for i in range(len(all_table_x_positions) - 1):
|
||||
gap = all_table_x_positions[i + 1] - all_table_x_positions[i]
|
||||
if gap > 5: # Only significant gaps
|
||||
gaps.append(gap)
|
||||
|
||||
# Determine optimal tolerance using statistical analysis
|
||||
if gaps and len(gaps) >= 3:
|
||||
# Use 70th percentile of gaps as threshold (balances precision/recall)
|
||||
sorted_gaps = sorted(gaps)
|
||||
percentile_70_idx = int(len(sorted_gaps) * 0.70)
|
||||
adaptive_tolerance = sorted_gaps[percentile_70_idx]
|
||||
|
||||
# Clamp tolerance to reasonable range [25, 50]
|
||||
adaptive_tolerance = max(25, min(50, adaptive_tolerance))
|
||||
else:
|
||||
# Fallback to conservative value
|
||||
adaptive_tolerance = 35
|
||||
|
||||
# Compute global column boundaries using adaptive tolerance
|
||||
global_columns: list[float] = []
|
||||
for x in all_table_x_positions:
|
||||
if not global_columns or x - global_columns[-1] > 30:
|
||||
if not global_columns or x - global_columns[-1] > adaptive_tolerance:
|
||||
global_columns.append(x)
|
||||
|
||||
# Too many columns suggests dense text, not a form
|
||||
if len(global_columns) > 8:
|
||||
# Adaptive max column check based on page characteristics
|
||||
# Calculate average column width
|
||||
if len(global_columns) > 1:
|
||||
content_width = global_columns[-1] - global_columns[0]
|
||||
avg_col_width = content_width / len(global_columns)
|
||||
|
||||
# Forms with very narrow columns (< 30px) are likely dense text
|
||||
if avg_col_width < 30:
|
||||
return None
|
||||
|
||||
# Compute adaptive max based on columns per inch
|
||||
# Typical forms have 3-8 columns per inch
|
||||
columns_per_inch = len(global_columns) / (content_width / 72)
|
||||
|
||||
# If density is too high (> 10 cols/inch), likely not a form
|
||||
if columns_per_inch > 10:
|
||||
return None
|
||||
|
||||
# Adaptive max: allow more columns for wider pages
|
||||
# Standard letter is 612pt wide, so scale accordingly
|
||||
adaptive_max_columns = int(20 * (page_width / 612))
|
||||
adaptive_max_columns = max(15, adaptive_max_columns) # At least 15
|
||||
|
||||
if len(global_columns) > adaptive_max_columns:
|
||||
return None
|
||||
else:
|
||||
# Single column, not a form
|
||||
return None
|
||||
|
||||
# Now classify each row as table row or not
|
||||
|
||||
Vendored
Vendored
+81
@@ -0,0 +1,81 @@
|
||||
TECHMART ELECTRONICS
|
||||
4567 Innovation Blvd
|
||||
San Francisco, CA 94103
|
||||
(415) 555-0199
|
||||
|
||||
===================================
|
||||
|
||||
Store #0342 - Downtown SF
|
||||
11/23/2024 14:32:18 PST
|
||||
TXN: TXN-98765-2024
|
||||
Cashier: Emily Rodriguez
|
||||
Register: POS-07
|
||||
|
||||
-----------------------------------
|
||||
|
||||
Wireless Noise-Cancelling
|
||||
Headphones - Premium Black
|
||||
AUDIO-5521 1 @ $349.99
|
||||
Member Discount $-50.00
|
||||
$299.99
|
||||
USB-C Hub 7-in-1 Adapter
|
||||
with HDMI & Ethernet
|
||||
ACC-8834 2 @ $79.99
|
||||
$159.98
|
||||
Portable SSD 2TB
|
||||
Thunderbolt 3 Compatible
|
||||
STOR-2241 1 @ $289.00
|
||||
Member Discount $-29.00
|
||||
$260.00
|
||||
Ergonomic Wireless Mouse
|
||||
Rechargeable Battery
|
||||
ACC-9012 1 @ $59.99
|
||||
$59.99
|
||||
Screen Cleaning Kit
|
||||
Professional Grade
|
||||
CARE-1156 3 @ $12.99
|
||||
$38.97
|
||||
HDMI 2.1 Cable 6ft
|
||||
8K Resolution Support
|
||||
CABLE-7789 2 @ $24.99
|
||||
Member Discount $-5.00
|
||||
$44.98
|
||||
-----------------------------------
|
||||
|
||||
SUBTOTAL $863.91
|
||||
Member Discount (15%)-$84.00
|
||||
Sales Tax (8.5%) $66.23
|
||||
Rewards Applied -$25.00
|
||||
===================================
|
||||
TOTAL $821.14
|
||||
===================================
|
||||
|
||||
PAYMENT METHOD
|
||||
Visa Card ending in 4782
|
||||
Auth: 847392
|
||||
Ref: REF-20241123-98765
|
||||
|
||||
-----------------------------------
|
||||
|
||||
REWARDS MEMBER
|
||||
Sarah Mitchell
|
||||
ID: TM-447821
|
||||
Points Earned: 821
|
||||
Total Points: 3,247
|
||||
Next Reward: $50 gift card
|
||||
at 5,000 pts (1,753 to go)
|
||||
|
||||
-----------------------------------
|
||||
|
||||
RETURN POLICY
|
||||
Returns within 30 days
|
||||
Receipt required
|
||||
Electronics must be unopened
|
||||
|
||||
*TXN98765202411231432*
|
||||
|
||||
Thank you for shopping!
|
||||
www.techmart.example.com
|
||||
|
||||
===================================
|
||||
|
||||
+76
@@ -0,0 +1,76 @@
|
||||
ZAVA AUTO REPAIR
|
||||
Certified Collision Repair
|
||||
123 Main Street, Redmond, WA 98052
|
||||
Phone: (425) 000-0000
|
||||
Preliminary Estimate (ID: EST-1008)
|
||||
| Customer Information | | | Vehicle Information | |
|
||||
| -------------------- | ------------------- | --- | ------------------- | ----------------- |
|
||||
| Insured name | Gabriel Diaz | | Year | 2022 |
|
||||
| Claim # | SF-1008 | | Make | Jeep |
|
||||
| Policy # | POL-2022-555 | | Model | Grand Cherokee |
|
||||
| Phone | (425) 111-1111 | | Trim | Limited |
|
||||
| Email | gabriel@contoso.com | | VIN | 1C4RJFBG2NC123456 |
|
||||
| | | | Color | White |
|
||||
| | | | Odometer | 9,800 |
|
||||
| Repair Order # | RO-20221108 | | Estimator | Ellis Turner |
|
||||
Estimate Totals
|
||||
| | | Hours | Rate | Cost |
|
||||
| ---------------- | --- | ----- | ---- | ----- |
|
||||
| Parts | | | | 2,100 |
|
||||
| Body Labor | | 2 | 150 | 300 |
|
||||
| Paint Labor | | 1.5 | 150 | 225 |
|
||||
| Mechanical Labor | | - | - | - |
|
||||
Supplies
|
||||
| | Paint Supplies | | | 60 |
|
||||
| ------------- | ------------------------ | --- | ------ | ------ |
|
||||
| | Body Supplies | | | 30 |
|
||||
| Other Charges | | | | 15 |
|
||||
| Subtotal | | | | 2,730 |
|
||||
| Sales Tax | | | 10.20% | 278.46 |
|
||||
| GRAND TOTAL | | | | 5,738 |
|
||||
| Note | Minor rear bumper repair | | | |
|
||||
This is a preliminary estimate for the visible damage of the vehicle. Additional damage / repairs / parts may be found
|
||||
after the vehicle has been disassembled and damaged parts have been removed. Suspension damages may be
|
||||
present, but can not be determined until an alignment on the vehicle has been done. Parts Prices may vary due to
|
||||
models and vehicle maker price updates. Please be advised if vehicle owner elects to have vehicle sent to service for
|
||||
any mechanical concerns, ALL service departments charge a vehicle diagnostic charge. If the mechanical concern is
|
||||
deemed not related to an insurance claim, vehicle owner will be reponsible for charges.
|
||||
|
||||
ZAVA AUTO REPAIR
|
||||
Certified Collision Repair
|
||||
123 Main Street, Redmond, WA 98052
|
||||
Phone: (425) 000-0000
|
||||
Preliminary Estimate (ID: EST-1008)
|
||||
Customer Information Vehicle Information
|
||||
| Insured name | Bruce Wayne | | Year | 2025 |
|
||||
| -------------- | -------------------------- | --- | --------- | ------------ |
|
||||
| Claim # | | 999 | Make | Batman |
|
||||
| Policy # | IM-BATMAN | | Model | Batmobile |
|
||||
| Phone | (416) 555-1234 | | Trim | Limited |
|
||||
| Email | batman@wayneindustries.com | | VIN | XXX |
|
||||
| | | | Color | Black |
|
||||
| | | | Odometer | 1 |
|
||||
| Repair Order # | RO-20221108 | | Estimator | Ellis Turner |
|
||||
Estimate Totals
|
||||
| | | Hours | Rate | Cost |
|
||||
| ---------------- | --- | ----- | ---- | ------ |
|
||||
| Parts | | | | 99,999 |
|
||||
| Body Labor | | 2 | 150 | 300 |
|
||||
| Paint Labor | | 1.5 | 150 | 225 |
|
||||
| Mechanical Labor | | - | - | - |
|
||||
Supplies
|
||||
| | Paint Supplies | | | 60 |
|
||||
| ------------- | ------------------------ | --- | ------ | --------- |
|
||||
| | Body Supplies | | | 30 |
|
||||
| Other Charges | | | | 15 |
|
||||
| Subtotal | | | | 100,629 |
|
||||
| Sales Tax | | | 10.20% | 10264.158 |
|
||||
| GRAND TOTAL | | | | 211,522 |
|
||||
| Note | Minor rear bumper repair | | | |
|
||||
|
||||
This is a preliminary estimate for the visible damage of the vehicle. Additional damage / repairs / parts may be found
|
||||
after the vehicle has been disassembled and damaged parts have been removed. Suspension damages may be
|
||||
present, but can not be determined until an alignment on the vehicle has been done. Parts Prices may vary due to
|
||||
models and vehicle maker price updates. Please be advised if vehicle owner elects to have vehicle sent to service for
|
||||
any mechanical concerns, ALL service departments charge a vehicle diagnostic charge. If the mechanical concern is
|
||||
deemed not related to an insurance claim, vehicle owner will be reponsible for charges.
|
||||
Vendored
+44
@@ -0,0 +1,44 @@
|
||||
INVENTORY RECONCILIATION REPORT
|
||||
Report ID: SPARSE-2024-INV-1234
|
||||
Warehouse: Distribution Center East
|
||||
Report Date: 2024-11-15
|
||||
Prepared By: Sarah Martinez
|
||||
| Product Code | Location | Expected | Actual | Variance | Status |
|
||||
| ------------ | -------- | -------- | ------ | -------- | -------- |
|
||||
| SKU-8847 | A-12 | 450 | | | |
|
||||
| | B-07 | | 289 | -23 | |
|
||||
| SKU-9201 | | 780 | 778 | | OK |
|
||||
| | C-15 | | | +15 | |
|
||||
| SKU-4563 | D-22 | | 156 | | CRITICAL |
|
||||
| | | 180 | | -24 | |
|
||||
| SKU-7728 | A-08 | 920 | | | |
|
||||
| | | | 935 | +15 | OK |
|
||||
Variance Analysis:
|
||||
Summary Statistics:
|
||||
Total Variance Cost: $4,287.50
|
||||
Critical Items: 1
|
||||
Overall Accuracy: 97.2%
|
||||
Detailed Analysis by Category:
|
||||
The inventory reconciliation reveals several key findings. The primary variance driver is SKU-4563,
|
||||
which shows a -24 unit discrepancy requiring immediate investigation. Location B-07 handling of
|
||||
SKU-8847 also demonstrates significant variance. Cross-location verification protocols should be
|
||||
|
||||
reviewed to prevent future discrepancies. The overall accuracy rate of 97.2% meets our target
|
||||
threshold, but critical items require expedited resolution to maintain operational efficiency.
|
||||
Extended Inventory Review:
|
||||
| Product Code | Category | Unit Cost | Total Value | Last Audit | Notes |
|
||||
| ------------ | ----------- | --------- | ----------- | ---------- | ---------- |
|
||||
| SKU-8847 | Electronics | $45.00 | $13,005.00 | 2024-10-15 | |
|
||||
| SKU-9201 | Hardware | $32.50 | $25,285.00 | 2024-10-22 | Verified |
|
||||
| SKU-4563 | Software | $120.00 | $18,720.00 | | Critical |
|
||||
| SKU-7728 | Accessories | $15.75 | $14,726.25 | 2024-11-01 | |
|
||||
| SKU-3345 | Electronics | $67.00 | $22,445.00 | 2024-10-18 | |
|
||||
| SKU-5512 | Hardware | $89.00 | $31,150.00 | | Pending |
|
||||
| SKU-6678 | Software | $200.00 | $42,000.00 | 2024-10-25 | High Value |
|
||||
| SKU-7789 | Accessories | $8.50 | $5,950.00 | 2024-11-05 | |
|
||||
| SKU-2234 | Electronics | $125.00 | $35,000.00 | | |
|
||||
| SKU-1123 | Hardware | $55.00 | $27,500.00 | 2024-10-30 | Verified |
|
||||
Recommendations:
|
||||
1. Immediate review of SKU-4563 handling procedures. 2. Implement additional verification for critical
|
||||
items. 3. Schedule follow-up audit for high-value products (SKU-6678, SKU-2234).
|
||||
Approval:
|
||||
+62
@@ -0,0 +1,62 @@
|
||||
BOOKING ORDER
|
||||
Print Date 12/15/2024 14:30:22
|
||||
Page 1 of 1
|
||||
STARLIGHT CINEMAS
|
||||
Orders
|
||||
| Order / Rev: | 2024-12-5678 | | | Cinema: | | Downtown Multiplex |
|
||||
| ------------ | -------------- | --- | --- | ---------------- | --- | ------------------ |
|
||||
| Alt Order #: | SC-WINTER-2024 | | | Primary Contact: | | Sarah Johnson |
|
||||
Product Desc: Holiday Movie Marathon Package Location: NYC-01
|
||||
| Estimate: | EST-456 | | | Region: | | NORTHEAST |
|
||||
| -------------------- | ----------------------- | --- | --- | ------- | --- | --------- |
|
||||
| Booking Dates: | 12/20/2024 - 12/31/2024 | | | | | |
|
||||
| Original Date / Rev: | 12/01/24 / 12/10/24 | | | | | |
|
||||
| Order Type: | Premium Package | | | | | |
|
||||
Booking Agency
|
||||
| Name: | Premier Entertainment Group | | | | | |
|
||||
| ---------------- | --------------------------- | --- | --- | -------------- | --- | --------- |
|
||||
| | | | | Billing Type: | | Net 30 |
|
||||
| Contact: | Michael Chen | | | | | |
|
||||
| | | | | Payment Terms: | | Corporate |
|
||||
| Billing Contact: | accounting@premierent.com | | | | | |
|
||||
| | | | | Commission: | | 10% |
|
||||
555 Broadway Suite 1200
|
||||
New York, NY 10012
|
||||
Customer
|
||||
| Name: | Universal Studios Distribution | | | | | |
|
||||
| -------------- | ------------------------------ | --- | --- | --- | --- | --- |
|
||||
| Category: | Film Distributor | | | | | |
|
||||
| Contact Email: | bookings@universalstudios.com | | | | | |
|
||||
| Customer ID: | CUST-98765 | | | | | |
|
||||
| Revenue Code: | FILM-PREMIUM | | | | | |
|
||||
Booking Summary
|
||||
| Start Date | End Date | # Shows | Gross Amount | Net Amount | | |
|
||||
| ---------- | -------- | ------- | ------------ | ---------- | --- | --- |
|
||||
| 12/20/24 | 12/31/24 | 48 | $12,500.00 | $11,250.00 | | |
|
||||
Totals
|
||||
| Month | # Shows | Gross Amount | | Net Amount | | Occupancy |
|
||||
| ------------- | ------- | ------------ | --- | ---------- | --- | --------- |
|
||||
| December 2024 | 48 | $12,500.00 | | $11,250.00 | | 85% |
|
||||
| Totals | 48 | $12,500.00 | | $11,250.00 | | 85% |
|
||||
Account Representatives
|
||||
Representative Territory Region Start Date / End Date Commission %
|
||||
| Sarah Johnson | NYC Metro | NORTHEAST | 12/20/24 - 12/31/24 | | 100% | |
|
||||
| ------------- | --------- | --------- | ------------------- | --- | ---- | --- |
|
||||
Show Schedule Details
|
||||
Ln Screen Start End Movie Title Format Showtime Days Shows Rate Type Total
|
||||
1 SCR-1 12/20/24 12/25/24 Holiday Spectacular IMAX 3D 7:00 PM Daily 12 $250 PM $3,000
|
||||
(Runtime: 142 min); Holiday Season Premium
|
||||
2 SCR-2 12/20/24 12/31/24 Winter Wonderland Standard 4:30 PM Daily 24 $150 MT $3,600
|
||||
(Runtime: 98 min); Matinee Special
|
||||
3 SCR-1 12/26/24 12/31/24 New Year Mystery 4DX 9:30 PM Daily 12 $300 PM $3,600
|
||||
(Runtime: 116 min); Premium Experience
|
||||
Show Details
|
||||
| Show Screen | Date Range | Title | Showtime | Days Type | Rate | Revenue |
|
||||
| ----------- | ---------- | ----- | -------- | --------- | ---- | ------- |
|
||||
1 SCR-1 12/20-12/25 Holiday Spectacular 7:00 PM Daily PM $250 $3,000
|
||||
This booking order is subject to cinema availability and standard terms.
|
||||
2 SCR-2 12/20-12/31 Winter Wonderland 4:30 PM Daily MT $150 $3,600
|
||||
All showtimes are approximate and subject to change.
|
||||
3 SCR-1 12/26-12/31 New Year Mystery 9:30 PM Daily PM $300 $3,600
|
||||
| Total Revenue: | | | | | | $12,500.00 |
|
||||
| -------------- | --- | --- | --- | --- | --- | ---------- |
|
||||
@@ -0,0 +1,65 @@
|
||||
1
|
||||
|
||||
Introduction
|
||||
|
||||
Large language models (LLMs) are becoming a crucial building block in developing powerful agents
|
||||
that utilize LLMs for reasoning, tool usage, and adapting to new observations (Yao et al., 2022; Xi
|
||||
et al., 2023; Wang et al., 2023b) in many real-world tasks. Given the expanding tasks that could
|
||||
benefit from LLMs and the growing task complexity, an intuitive approach to scale up the power of
|
||||
agents is to use multiple agents that cooperate. Prior work suggests that multiple agents can help
|
||||
encourage divergent thinking (Liang et al., 2023), improve factuality and reasoning (Du et al., 2023),
|
||||
and provide validation (Wu et al., 2023). In light of the intuition and early evidence of promise, it is
|
||||
intriguing to ask the following question: how can we facilitate the development of LLM applications
|
||||
that could span a broad spectrum of domains and complexities based on the multi-agent approach?
|
||||
|
||||
Our insight is to use multi-agent conversations to achieve it. There are at least three reasons con-
|
||||
firming its general feasibility and utility thanks to recent advances in LLMs: First, because chat-
|
||||
optimized LLMs (e.g., GPT-4) show the ability to incorporate feedback, LLM agents can cooperate
|
||||
through conversations with each other or human(s), e.g., a dialog where agents provide and seek rea-
|
||||
soning, observations, critiques, and validation. Second, because a single LLM can exhibit a broad
|
||||
range of capabilities (especially when configured with the correct prompt and inference settings),
|
||||
conversations between differently configured agents can help combine these broad LLM capabilities
|
||||
in a modular and complementary manner. Third, LLMs have demonstrated ability to solve complex
|
||||
tasks when the tasks are broken into simpler subtasks. Multi-agent conversations can enable this
|
||||
partitioning and integration in an intuitive manner. How can we leverage the above insights and
|
||||
support different applications with the common requirement of coordinating multiple agents, poten-
|
||||
tially backed by LLMs, humans, or tools exhibiting different capacities? We desire a multi-agent
|
||||
conversation framework with generic abstraction and effective implementation that has the flexibil-
|
||||
ity to satisfy different application needs. Achieving this requires addressing two critical questions:
|
||||
(1) How can we design individual agents that are capable, reusable, customizable, and effective in
|
||||
multi-agent collaboration? (2) How can we develop a straightforward, unified interface that can
|
||||
accommodate a wide range of agent conversation patterns? In practice, applications of varying
|
||||
complexities may need distinct sets of agents with specific capabilities, and may require different
|
||||
conversation patterns, such as single- or multi-turn dialogs, different human involvement modes, and
|
||||
static vs. dynamic conversation. Moreover, developers may prefer the flexibility to program agent
|
||||
interactions in natural language or code. Failing to adequately address these two questions would
|
||||
limit the framework’s scope of applicability and generality.
|
||||
While there is contemporaneous exploration of multi-agent approaches,3 we present AutoGen, a
|
||||
generalized multi-agent conversation framework (Figure 1), based on the following new concepts.
|
||||
1 Customizable and conversable agents. AutoGen uses a generic design of agents that can lever-
|
||||
age LLMs, human inputs, tools, or a combination of them. The result is that developers can
|
||||
easily and quickly create agents with different roles (e.g., agents to write code, execute code,
|
||||
wire in human feedback, validate outputs, etc.) by selecting and configuring a subset of built-in
|
||||
capabilities. The agent’s backend can also be readily extended to allow more custom behaviors.
|
||||
To make these agents suitable for multi-agent conversation, every agent is made conversable –
|
||||
they can receive, react, and respond to messages. When configured properly, an agent can hold
|
||||
multiple turns of conversations with other agents autonomously or solicit human inputs at cer-
|
||||
tain rounds, enabling human agency and automation. The conversable agent design leverages the
|
||||
strong capability of the most advanced LLMs in taking feedback and making progress via chat
|
||||
and also allows combining capabilities of LLMs in a modular fashion. (Section 2.1)
|
||||
|
||||
2 Conversation programming. A fundamental insight of AutoGen is to simplify and unify com-
|
||||
plex LLM application workflows as multi-agent conversations. So AutoGen adopts a program-
|
||||
ming paradigm centered around these inter-agent conversations. We refer to this paradigm as
|
||||
conversation programming, which streamlines the development of intricate applications via two
|
||||
primary steps: (1) defining a set of conversable agents with specific capabilities and roles (as
|
||||
described above); (2) programming the interaction behavior between agents via conversation-
|
||||
centric computation and control. Both steps can be achieved via a fusion of natural and pro-
|
||||
gramming languages to build applications with a wide range of conversation patterns and agent
|
||||
behaviors. AutoGen provides ready-to-use implementations and also allows easy extension and
|
||||
experimentation for both steps. (Section 2.2)
|
||||
|
||||
3We refer to Appendix A for a detailed discussion.
|
||||
|
||||
2
|
||||
|
||||
@@ -0,0 +1,74 @@
|
||||
%PDF-1.3
|
||||
%“Œ‹ž ReportLab Generated PDF document (opensource)
|
||||
1 0 obj
|
||||
<<
|
||||
/F1 2 0 R /F2 3 0 R
|
||||
>>
|
||||
endobj
|
||||
2 0 obj
|
||||
<<
|
||||
/BaseFont /Helvetica /Encoding /WinAnsiEncoding /Name /F1 /Subtype /Type1 /Type /Font
|
||||
>>
|
||||
endobj
|
||||
3 0 obj
|
||||
<<
|
||||
/BaseFont /Helvetica-Bold /Encoding /WinAnsiEncoding /Name /F2 /Subtype /Type1 /Type /Font
|
||||
>>
|
||||
endobj
|
||||
4 0 obj
|
||||
<<
|
||||
/Contents 8 0 R /MediaBox [ 0 0 612 792 ] /Parent 7 0 R /Resources <<
|
||||
/Font 1 0 R /ProcSet [ /PDF /Text /ImageB /ImageC /ImageI ]
|
||||
>> /Rotate 0 /Trans <<
|
||||
|
||||
>>
|
||||
/Type /Page
|
||||
>>
|
||||
endobj
|
||||
5 0 obj
|
||||
<<
|
||||
/PageMode /UseNone /Pages 7 0 R /Type /Catalog
|
||||
>>
|
||||
endobj
|
||||
6 0 obj
|
||||
<<
|
||||
/Author (anonymous) /CreationDate (D:20260210121342+01'00') /Creator (anonymous) /Keywords () /ModDate (D:20260210121342+01'00') /Producer (ReportLab PDF Library - \(opensource\))
|
||||
/Subject (unspecified) /Title (untitled) /Trapped /False
|
||||
>>
|
||||
endobj
|
||||
7 0 obj
|
||||
<<
|
||||
/Count 1 /Kids [ 4 0 R ] /Type /Pages
|
||||
>>
|
||||
endobj
|
||||
8 0 obj
|
||||
<<
|
||||
/Filter [ /ASCII85Decode /FlateDecode ] /Length 2414
|
||||
>>
|
||||
stream
|
||||
Gat=m?#SK-&r,lL<tO"8con<J;5Cq;2s]18RrdR7Y[)>Ym#<31@rCsJ89.W.qa3u?82hU4/rD6bm_^2.o5\G6@H<5/\G85.&:2)\f,l]`/mA:-0HF*!^.%Yd0?rr<_LD*'1j8Q\=IJXu'N"=HL>KSX^]339h+)S%SB[D8U\2B8rL_pR7\MXONW%HeW99+,0hH$AU#^KYAoZ)6P-2'6m5cj7lZu'kGHQ:/\R1,Ma%hEl2eYq(:LZ"-`3OktM:dm<m,u<)W99/X#l.?0OO\Z_]Y4.9BoSuKGOrdaFbq^/)*_g%gm8s\<gU<a%e]re(gWm_H[^0bn(=;0X%(_^H%$;+Se<aM)L.FrW&=UUc*X3He'XMO]CgP3P*<]$#uOPN2Z#n\O]@7_]#$ZH.Gr&KZm_M+6]8I5lYVcZLH,)V@L:BCib%tuWd,p*A"0Gb=6gIkI+Y5.[<aH_D`iMaKNQpo.UHf=F]to-Ui6XS[Q;Qh\cD=LT#YQGpn9rsUVm:qR"1%pnrSZa/Mi*P+f4j?B6-uV5Em_Zqog)^@RtF[F-adqASQb%i[(eIIqZ)_CVEHpGDgIpX[[uQ4J6DNf5X^CB'JA+,d^#?/[fq)jH^+:rbdW>Y'H/a/1^A\lZD2qMb,5%-$pOaW5-%BjndGRZ<CV&?T^r@PWF)!H#gDKcZj?[/gATBZ;=XJ$_a;??F-qtH(HaQX?W#iIL#17<Y25AC[ePo/pO[]=c0(\#/j9R%W/]$do:5b%.e4%S0Z';YJm/!GO9jt-H8W>JTK5I,b-cnrpc!2H0BZZ`1%R*aB!ZE'JRRYNJ<J4B`!j/maqpD>*Rq$U:[Tq%Lr[m+DHGg*dP\Ee>\#VYo43^R>kA9W2b/WU:k/M#%^2;nC+,e'dAcEOp?t5Kk;4+.f4MU@-mf7iCT_29s_%g,%K_gB8!kWS28T%T6'u_$GK'qX*VP>7>5?dW_<?$QPg!n")cT(<-[c/-kEbS'`*BYR5SB9TPY<1jq1#Q/EWpCJrY=s;bQfH^=uT:DTR3.8/N>W)r8_SF*7+f;4415n3,ECi2P6&bjmn17t+qU8;D])\Qt.8QLi)?kJ`.t+lkW'Y4e876l-2di)Y?.3\K1<(0IrEfm1<:Oc^u?7B::q;On$J5_C7T<u%071ASb!ZD1u7Yd"g`I'`PJ>**>tRZrdD6q3W@5QfbW8242uIHro=(eV*P1KjY,oj4tW&obb>^q-Iur%F#A)mgu8+V*?E<bdEC6V0+Z7OS^l.$W4hmuq:sMdJ=Sk+94D3QtUBZ:AoIiBA%s3#GJdRDFCpZ)7\MZmitKhMID(%ic%oW#tD%ERrqpk,dD3ll!E6m)e):26BLNV!WiRV*d(+Ppl'p$%?J&MqeV<=uNJ_5,4P_NC:lWf`Iu3\u+^>Y]dUOk&c=m2^<YVV2cUoq[`<<W-]MTIC50Klu6rO5RUVZ"h`#"4adtt2qjs2b12hQi!@JBp4Jln>:1Dtc(*!NBU*DeAtLhuWu&JLWFQi:;ka#?AD6V.A_[>n$T,.]8d=tffJ,?'DbCKQ-BnKqTn_:1LGc865V]FFi=AAF`DGhW(F]2^o?>VbGN:;=!-s;ea7]Ll\f+eiZ8XZb0*mZp%8*K_pf+1"2fKuO1pNK%7f_(mPTD@0&ljSV?o$5BpUmleYs^Faq_SM'jX.o\d*6%j(EtY.N"m2B'E@[.Y_8Be+m(58m$\dcqm$?,0it)/=9@9kRfJB;N7D9t\'F<:#c$P82`UKqgN]$kU]5eLPZMR=0bO[rPk"\?hu>sT^KFg`B>!pml-a[ImSeWp!_l3s!E>gFKq4ng:"n=N:m57rHjN)GML<=a1ktQpUT8:?[D:c7+Gm@2q;uN1Q3)hpeThe-&[#`KYZ4e_=o]kk1KH/^jo:"<0_nRJingk\[1Jltc<,.Jq2\*]=AVcIiY#?iMASrc$Bp)4m=NdIOJ&,H=+<MC=^7]?Tb>M"H6ZdXTX2Ba;Gp=J-m]$,8ZCU/77rHJ,%1.[/DlnkH:pIIV$Oh.
;:t?5e3.cs^[G:H=e;i>c+>B=)C&l7T)S<Bld"_W)BtgI(/F`Le;ULQ,!FM!^<8Kk?L6b_>G8Jp-TG;!V1144#29r2%;n-RmNHrGdR!76&H"R_D-]`c"1FCgZl*",7SUVuqc0oapDQ=^`nj#FFk@2%[K[V45$!KQIH[=;SUpTE8T!QLliC=5-9]nkQpBVdHM6-g)tYBAPuOqr^qkn[Wh4C;6L89J;D>5@cYM$2Y/24scnNiWp4jWhfJAF^ck!@I(VPV*s,pdkPKn<Zg-T3I%d.sSl"^f-Gm=*riV,>(\770jbu^lf\h1+IH>c;Bo;Pdg;!fA)'kmg$"\P3oX=/N5/rUltb3K-BdRTR;-W)J1bDbE?g<MKG;cK`l?D4l>.,O@6id::q]JXBH\Ws#0[#'8-5JQL>/c~>endstream
|
||||
endobj
|
||||
xref
|
||||
0 9
|
||||
0000000000 65535 f
|
||||
0000000061 00000 n
|
||||
0000000102 00000 n
|
||||
0000000209 00000 n
|
||||
0000000321 00000 n
|
||||
0000000514 00000 n
|
||||
0000000582 00000 n
|
||||
0000000843 00000 n
|
||||
0000000902 00000 n
|
||||
trailer
|
||||
<<
|
||||
/ID
|
||||
[<e319d5c305edb8c0fb6be9e44c6178fa><e319d5c305edb8c0fb6be9e44c6178fa>]
|
||||
% ReportLab generated PDF document -- digest (opensource)
|
||||
|
||||
/Info 6 0 R
|
||||
/Root 5 0 R
|
||||
/Size 9
|
||||
>>
|
||||
startxref
|
||||
3407
|
||||
%%EOF
|
||||
@@ -1,5 +1,6 @@
|
||||
#!/usr/bin/env python3 -m pytest
|
||||
"""Tests for PDF table extraction functionality."""
|
||||
|
||||
import os
|
||||
import re
|
||||
import pytest
|
||||
@@ -650,6 +651,332 @@ class TestPdfTableExtraction:
|
||||
result.text_content.strip() == ""
|
||||
), f"Scanned PDF should have empty extraction, got: '{result.text_content[:100]}...'"
|
||||
|
||||
def test_movie_theater_booking_pdf_extraction(self, markitdown):
    """Convert the movie theater booking PDF and check its key content.

    The test is skipped when the sample PDF is not present. Otherwise the
    converted text must contain at least one pipe character, and each group
    of expected strings below is handed to ``validate_strings`` in order.

    NOTE(review): ``validate_strings`` is defined elsewhere in this module;
    presumably it asserts that every string occurs in the converted text.
    """
    booking_pdf = os.path.join(TEST_FILES_DIR, "movie-theater-booking-2024.pdf")
    if not os.path.exists(booking_pdf):
        pytest.skip(f"Test file not found: {booking_pdf}")

    result = markitdown.convert(booking_pdf)

    # A converted form is expected to be rendered as pipe-separated tables.
    assert "|" in result.text_content, "Booking order should contain pipe separators"

    # Groups are validated one at a time, in this order, so a failure points
    # at the section of the document that went missing.
    expected_groups = (
        # Key booking information
        [
            "BOOKING ORDER",
            "2024-12-5678",  # Order number
            "Holiday Movie Marathon Package",  # Product description
            "12/20/2024 - 12/31/2024",  # Booking dates
            "SC-WINTER-2024",  # Alt order number
            "STARLIGHT CINEMAS",  # Cinema brand
        ],
        # Agency information
        [
            "Premier Entertainment Group",  # Agency name
            "Michael Chen",  # Contact
            "Sarah Johnson",  # Primary contact
            "Downtown Multiplex",  # Cinema name
        ],
        # Customer information
        [
            "Universal Studios Distribution",  # Customer name
            "Film Distributor",  # Category
            "CUST-98765",  # Customer ID
        ],
        # Booking summary totals
        [
            "$12,500.00",  # Gross amount
            "$11,250.00",  # Net amount
            "December 2024",  # Month
            "48",  # Number of shows
        ],
        # Show schedule details
        [
            "Holiday Spectacular",  # Movie title
            "Winter Wonderland",  # Movie title
            "New Year Mystery",  # Movie title
            "IMAX 3D",  # Format
            "$250",  # Rate
            "$300",  # Rate
            "$3,000",  # Revenue
            "$3,600",  # Revenue
        ],
    )
    for group in expected_groups:
        validate_strings(result, group)
|
||||
|
||||
|
||||
class TestPdfFullOutputComparison:
|
||||
"""Test that PDF extraction produces expected complete outputs."""
|
||||
|
||||
@pytest.fixture
def markitdown(self):
    """Build the MarkItDown converter handed to tests that request it."""
    converter = MarkItDown()
    return converter
|
||||
|
||||
def test_movie_theater_full_output(self, markitdown):
    """Check the overall shape of the movie theater booking conversion.

    Skipped when either the sample PDF or its expected-output markdown file
    is missing. The expected file is used only for a line-count comparison
    (tolerance of two lines); the remaining checks look at pipe and
    separator counts, a few required strings, and the number of table rows.
    """
    source_pdf = os.path.join(TEST_FILES_DIR, "movie-theater-booking-2024.pdf")
    golden_md = os.path.join(
        TEST_FILES_DIR, "expected_outputs", "movie-theater-booking-2024.md"
    )

    if not os.path.exists(source_pdf):
        pytest.skip(f"Test file not found: {source_pdf}")
    if not os.path.exists(golden_md):
        pytest.skip(f"Expected output not found: {golden_md}")

    converted = markitdown.convert(source_pdf).text_content

    with open(golden_md, "r", encoding="utf-8") as golden_file:
        golden_text = golden_file.read()

    # Trailing whitespace is ignored on both sides.
    actual_lines = list(map(str.rstrip, converted.split("\n")))
    expected_lines = list(map(str.rstrip, golden_text.split("\n")))

    # Line counts may drift by at most two lines.
    line_delta = abs(len(actual_lines) - len(expected_lines))
    assert line_delta <= 2, (
        f"Line count mismatch: actual={len(actual_lines)}, "
        f"expected={len(expected_lines)}"
    )

    # Structural markers of the rendered tables.
    pipe_total = converted.count("|")
    assert pipe_total > 80, "Should have many pipe separators"
    rule_total = converted.count("---")
    assert rule_total > 8, "Should have table separators"

    # Strings that must survive the conversion.
    required = (
        "BOOKING ORDER",
        "STARLIGHT CINEMAS",
        "2024-12-5678",
        "Holiday Spectacular",
        "$12,500.00",
    )
    for needle in required:
        assert needle in converted, f"Missing section: {needle}"

    # Rows rendered as table lines start with a pipe.
    row_total = sum(1 for row in actual_lines if row.startswith("|"))
    assert row_total > 15, f"Should have >15 table rows, got {row_total}"
|
||||
|
||||
def test_sparse_borderless_table_full_output(self, markitdown):
    """Check the overall shape of the SPARSE borderless-table conversion.

    Skipped when either the sample PDF or its expected-output markdown file
    is missing. The expected file is used only for a line-count comparison
    (tolerance of two lines); the remaining checks look at the pipe count
    and a few required strings.
    """
    base_name = "SPARSE-2024-INV-1234_borderless_table"
    source_pdf = os.path.join(TEST_FILES_DIR, base_name + ".pdf")
    golden_md = os.path.join(TEST_FILES_DIR, "expected_outputs", base_name + ".md")

    if not os.path.exists(source_pdf):
        pytest.skip(f"Test file not found: {source_pdf}")
    if not os.path.exists(golden_md):
        pytest.skip(f"Expected output not found: {golden_md}")

    converted = markitdown.convert(source_pdf).text_content

    with open(golden_md, "r", encoding="utf-8") as golden_file:
        golden_text = golden_file.read()

    # Trailing whitespace is ignored on both sides.
    actual_lines = list(map(str.rstrip, converted.split("\n")))
    expected_lines = list(map(str.rstrip, golden_text.split("\n")))

    # Line counts may drift by at most two lines.
    line_delta = abs(len(actual_lines) - len(expected_lines))
    assert line_delta <= 2, (
        f"Line count mismatch: actual={len(actual_lines)}, "
        f"expected={len(expected_lines)}"
    )

    # Structural marker of the rendered tables.
    pipe_total = converted.count("|")
    assert pipe_total > 50, "Should have many pipe separators"

    # Strings that must survive the conversion.
    required = (
        "INVENTORY RECONCILIATION REPORT",
        "SPARSE-2024-INV-1234",
        "SKU-8847",
        "SKU-9201",
        "Variance Analysis",
    )
    for needle in required:
        assert needle in converted, f"Missing section: {needle}"
|
||||
|
||||
def test_repair_multipage_full_output(self, markitdown):
    """Convert the REPAIR multipage invoice PDF and compare it with the stored reference.

    Checks that the line count is within two of the reference, that the
    output contains more than 40 pipe characters, and that a handful of key
    strings are present. Skipped when either fixture file is absent.
    """
    source_pdf = os.path.join(TEST_FILES_DIR, "REPAIR-2022-INV-001_multipage.pdf")
    reference_md = os.path.join(
        TEST_FILES_DIR, "expected_outputs", "REPAIR-2022-INV-001_multipage.md"
    )

    # Missing fixtures are reported as skips; the PDF is checked first.
    if not os.path.exists(source_pdf):
        pytest.skip(f"Test file not found: {source_pdf}")
    if not os.path.exists(reference_md):
        pytest.skip(f"Expected output not found: {reference_md}")

    actual_output = markitdown.convert(source_pdf).text_content
    with open(reference_md, encoding="utf-8") as handle:
        expected_output = handle.read()

    # Only the number of lines is compared, not their content.
    actual_total = len(actual_output.split("\n"))
    expected_total = len(expected_output.split("\n"))
    assert abs(actual_total - expected_total) <= 2, (
        f"Line count mismatch: actual={actual_total}, "
        f"expected={expected_total}"
    )

    # Structural check: count the pipe characters in the converted text.
    pipe_total = actual_output.count("|")
    assert pipe_total > 40, "Should have many pipe separators"

    # Key strings that must appear somewhere in the converted text.
    required_sections = (
        "ZAVA AUTO REPAIR",
        "Gabriel Diaz",
        "Jeep",
        "Grand Cherokee",
        "GRAND TOTAL",
    )
    for section in required_sections:
        assert section in actual_output, f"Missing section: {section}"
|
||||
|
||||
def test_receipt_full_output(self, markitdown):
    """Convert the RECEIPT retail-purchase PDF and compare it with the stored reference.

    Checks that the line count is within two of the reference and that a
    handful of key strings are present. Skipped when either fixture file is
    absent.
    """
    source_pdf = os.path.join(
        TEST_FILES_DIR, "RECEIPT-2024-TXN-98765_retail_purchase.pdf"
    )
    reference_md = os.path.join(
        TEST_FILES_DIR,
        "expected_outputs",
        "RECEIPT-2024-TXN-98765_retail_purchase.md",
    )

    # Missing fixtures are reported as skips; the PDF is checked first.
    if not os.path.exists(source_pdf):
        pytest.skip(f"Test file not found: {source_pdf}")
    if not os.path.exists(reference_md):
        pytest.skip(f"Expected output not found: {reference_md}")

    actual_output = markitdown.convert(source_pdf).text_content
    with open(reference_md, encoding="utf-8") as handle:
        expected_output = handle.read()

    # Only the number of lines is compared, not their content.
    actual_total = len(actual_output.split("\n"))
    expected_total = len(expected_output.split("\n"))
    assert abs(actual_total - expected_total) <= 2, (
        f"Line count mismatch: actual={actual_total}, "
        f"expected={expected_total}"
    )

    # Key strings that must appear somewhere in the converted text.
    required_sections = (
        "TECHMART ELECTRONICS",
        "TXN-98765-2024",
        "Sarah Mitchell",
        "$821.14",
        "RETURN POLICY",
    )
    for section in required_sections:
        assert section in actual_output, f"Missing section: {section}"
|
||||
|
||||
def test_academic_paper_full_output(self, markitdown):
    """Convert the academic paper PDF and compare it with the stored reference.

    Checks that the line count is within two of the reference, that the
    output contains no pipe characters at all, and that a handful of key
    strings are present. Skipped when either fixture file is absent.
    """
    source_pdf = os.path.join(TEST_FILES_DIR, "test.pdf")
    reference_md = os.path.join(TEST_FILES_DIR, "expected_outputs", "test.md")

    # Missing fixtures are reported as skips; the PDF is checked first.
    if not os.path.exists(source_pdf):
        pytest.skip(f"Test file not found: {source_pdf}")
    if not os.path.exists(reference_md):
        pytest.skip(f"Expected output not found: {reference_md}")

    actual_output = markitdown.convert(source_pdf).text_content
    with open(reference_md, encoding="utf-8") as handle:
        expected_output = handle.read()

    # Only the number of lines is compared, not their content.
    actual_total = len(actual_output.split("\n"))
    expected_total = len(expected_output.split("\n"))
    assert abs(actual_total - expected_total) <= 2, (
        f"Line count mismatch: actual={actual_total}, "
        f"expected={expected_total}"
    )

    # This document must come out without a single pipe character.
    assert "|" not in actual_output, "Academic paper should not have pipe separators"

    # Key strings that must appear somewhere in the converted text.
    required_sections = (
        "Introduction",
        "Large language models",
        "agents",
        "multi-agent",
    )
    for section in required_sections:
        assert section in actual_output, f"Missing section: {section}"
|
||||
|
||||
def test_medical_scan_full_output(self, markitdown):
    """Convert the scanned medical report PDF and require empty output.

    Both the converted text and the stored reference file must be blank
    once surrounding whitespace is stripped. Skipped when either fixture
    file is absent.
    """
    source_pdf = os.path.join(
        TEST_FILES_DIR, "MEDRPT-2024-PAT-3847_medical_report_scan.pdf"
    )
    reference_md = os.path.join(
        TEST_FILES_DIR,
        "expected_outputs",
        "MEDRPT-2024-PAT-3847_medical_report_scan.md",
    )

    # Missing fixtures are reported as skips; the PDF is checked first.
    if not os.path.exists(source_pdf):
        pytest.skip(f"Test file not found: {source_pdf}")
    if not os.path.exists(reference_md):
        pytest.skip(f"Expected output not found: {reference_md}")

    actual_output = markitdown.convert(source_pdf).text_content
    with open(reference_md, encoding="utf-8") as handle:
        expected_output = handle.read()

    # A stripped string is falsy exactly when it is empty.
    assert not actual_output.strip(), "Scanned PDF should produce empty output"
    assert not expected_output.strip(), "Expected output should be empty for scanned PDF"
|
||||
|
||||
|
||||
class TestPdfTableMarkdownFormat:
|
||||
"""Test that extracted tables have proper markdown formatting."""
|
||||
|
||||
Reference in New Issue
Block a user