Skip to content

Commit c0b20b0

Browse files
Merge pull request #89 from developer0hye/ralph/phase17-upstream-mitigations
Mitigate upstream dependency failures (docx-rs, umya-spreadsheet)
2 parents 391d5fa + 54ccb6c commit c0b20b0

File tree

3 files changed

+148
-7
lines changed

3 files changed

+148
-7
lines changed

crates/office2pdf/src/parser/docx.rs

Lines changed: 22 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -967,7 +967,7 @@ impl Parser for DocxParser {
967967
};
968968

969969
let docx = docx_rs::read_docx(data)
970-
.map_err(|e| ConvertError::Parse(format!("Failed to parse DOCX: {e}")))?;
970+
.map_err(|e| ConvertError::Parse(format!("Failed to parse DOCX (docx-rs): {e}")))?;
971971

972972
// Populate locale-specific footnote/endnote style IDs from docx styles
973973
notes.populate_style_ids(&docx.styles);
@@ -1029,11 +1029,18 @@ impl Parser for DocxParser {
10291029

10301030
match result {
10311031
Ok(elems) => elements.extend(elems),
1032-
Err(_) => {
1032+
Err(panic_info) => {
1033+
let detail = if let Some(s) = panic_info.downcast_ref::<String>() {
1034+
s.clone()
1035+
} else if let Some(s) = panic_info.downcast_ref::<&str>() {
1036+
(*s).to_string()
1037+
} else {
1038+
"unknown panic".to_string()
1039+
};
10331040
warnings.push(ConvertWarning::ParseSkipped {
10341041
format: "DOCX".to_string(),
10351042
reason: format!(
1036-
"document element at index {idx} processing panicked; skipped"
1043+
"upstream panic caught (docx-rs): element at index {idx}: {detail}"
10371044
),
10381045
});
10391046
}
@@ -2301,6 +2308,18 @@ mod tests {
23012308
}
23022309
}
23032310

2311+
#[test]
2312+
fn test_parse_error_includes_library_name() {
2313+
let parser = DocxParser;
2314+
let result = parser.parse(b"not a valid docx file", &ConvertOptions::default());
2315+
let err = result.unwrap_err();
2316+
let msg = err.to_string();
2317+
assert!(
2318+
msg.contains("docx-rs"),
2319+
"Parse error should include upstream library name 'docx-rs', got: {msg}"
2320+
);
2321+
}
2322+
23042323
// ----- Text style defaults -----
23052324

23062325
#[test]

crates/office2pdf/src/parser/xlsx.rs

Lines changed: 18 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1041,8 +1041,9 @@ impl XlsxParser {
10411041
chunk_size: usize,
10421042
) -> Result<(Vec<Document>, Vec<ConvertWarning>), ConvertError> {
10431043
let cursor = Cursor::new(data);
1044-
let book = umya_spreadsheet::reader::xlsx::read_reader(cursor, true)
1045-
.map_err(|e| ConvertError::Parse(format!("Failed to parse XLSX: {e}")))?;
1044+
let book = umya_spreadsheet::reader::xlsx::read_reader(cursor, true).map_err(|e| {
1045+
ConvertError::Parse(format!("Failed to parse XLSX (umya-spreadsheet): {e}"))
1046+
})?;
10461047

10471048
let metadata = extract_xlsx_metadata(&book);
10481049
let mut chart_map = extract_charts_with_anchors(data);
@@ -1127,8 +1128,9 @@ impl Parser for XlsxParser {
11271128
options: &ConvertOptions,
11281129
) -> Result<(Document, Vec<ConvertWarning>), ConvertError> {
11291130
let cursor = Cursor::new(data);
1130-
let book = umya_spreadsheet::reader::xlsx::read_reader(cursor, true)
1131-
.map_err(|e| ConvertError::Parse(format!("Failed to parse XLSX: {e}")))?;
1131+
let book = umya_spreadsheet::reader::xlsx::read_reader(cursor, true).map_err(|e| {
1132+
ConvertError::Parse(format!("Failed to parse XLSX (umya-spreadsheet): {e}"))
1133+
})?;
11321134

11331135
// Extract metadata from umya-spreadsheet properties
11341136
let metadata = extract_xlsx_metadata(&book);
@@ -1511,6 +1513,18 @@ mod tests {
15111513
);
15121514
}
15131515

1516+
#[test]
1517+
fn test_parse_error_includes_library_name() {
1518+
let parser = XlsxParser;
1519+
let result = parser.parse(b"not an xlsx file", &ConvertOptions::default());
1520+
let err = result.unwrap_err();
1521+
let msg = err.to_string();
1522+
assert!(
1523+
msg.contains("umya-spreadsheet"),
1524+
"Parse error should include upstream library name 'umya-spreadsheet', got: {msg}"
1525+
);
1526+
}
1527+
15141528
// ----- Empty cell content -----
15151529

15161530
#[test]

docs/xlsx-parser-evaluation.md

Lines changed: 108 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,108 @@
1+
# XLSX Parser Evaluation: calamine vs umya-spreadsheet
2+
3+
**Date**: 2026-03-01
4+
**Context**: Issue #83 — 22 XLSX files fail due to upstream `umya-spreadsheet` bugs
5+
**Purpose**: Evaluate `calamine` as a potential alternative for XLSX parsing
6+
7+
## 1. Library Overview
8+
9+
| Property | umya-spreadsheet | calamine |
10+
|---|---|---|
11+
| Version evaluated | 2.3.3 | 0.33.0 |
12+
| License | MIT | MIT |
13+
| MSRV || 1.75.0 |
14+
| Read support | Yes | Yes |
15+
| Write support | Yes | No (read-only) |
16+
| Formats | XLSX | XLSX, XLS, ODS, XLSB |
17+
| Monthly downloads | ~30k | ~619k |
18+
19+
## 2. API Coverage Comparison
20+
21+
| Feature | umya-spreadsheet | calamine | Notes |
22+
|---|---|---|---|
23+
| Cell values (text, number, date) | Yes | Yes | Both extract cell data |
24+
| Cell formatting (font, color, border) | Yes | **No** | calamine is value-only |
25+
| Column widths / row heights | Yes | **No** | Not exposed in API |
26+
| Merged cells | Yes | Yes | calamine added in v0.26.0 |
27+
| Number formats | Yes | Partial | calamine uses internally for datetime detection only |
28+
| Charts | Partial | **No** | umya stores chart data; calamine has no chart support |
29+
| Images | Yes | Yes | calamine via `picture` feature flag |
30+
| Formulas | Yes | Yes | Both can read formula strings |
31+
| Sheet names / selection | Yes | Yes | Both support |
32+
| Conditional formatting | Yes | **No** | Not supported |
33+
| Print area | Yes | **No** | Not supported |
34+
| Headers / footers | Yes | **No** | Not supported |
35+
| Page breaks | Yes | **No** | Not supported |
36+
37+
## 3. Robustness: Failing Fixture Test Results
38+
39+
Tested the 22 XLSX files that fail with `umya-spreadsheet` against `calamine`:
40+
41+
| File | umya-spreadsheet | calamine | Notes |
42+
|---|---|---|---|
43+
| libreoffice/chart_hyperlink.xlsx | PANIC (FileNotFound) | **PASS** | 2 sheets, 36 cells |
44+
| libreoffice/hyperlink.xlsx | PANIC (FileNotFound) | **PASS** | 2 sheets, 0 cells |
45+
| libreoffice/tdf130959.xlsx | PANIC (FileNotFound) | **PASS** | 1 sheet, 0 cells |
46+
| libreoffice/test_115192.xlsx | PANIC (FileNotFound) | **PASS** | 2 sheets, 0 cells |
47+
| poi/47504.xlsx | PANIC (FileNotFound) | **PASS** | 1 sheet, 0 cells |
48+
| poi/bug63189.xlsx | PANIC (FileNotFound) | **PASS** | 4 sheets, 1 cell |
49+
| poi/ConditionalFormattingSamples.xlsx | PANIC (FileNotFound) | **PASS** | 18 sheets, 2469 cells |
50+
| libreoffice/check-boolean.xlsx | PANIC (ParseFloatError) | **PASS** | 1 sheet, 2 cells |
51+
| libreoffice/functions-excel-2010.xlsx | PANIC (ParseIntError) | **PASS** | 2 sheets, 1635 cells |
52+
| poi/FormulaEvalTestData_Copy.xlsx | PANIC (ParseIntError) | **PASS** | 4 sheets, 58057 cells |
53+
| libreoffice/tdf100709.xlsx | PANIC (unwrap on None) | **PASS** | 1 sheet, 156 cells |
54+
| poi/64450.xlsx | PANIC (unwrap on None) | **PASS** | 2 sheets, 16 cells |
55+
| poi/sample-beta.xlsx | PANIC (unwrap on None) | FAIL | shared string index error |
56+
| libreoffice/tdf162948.xlsx | PANIC (dataBar) | **PASS** | 1 sheet, 8 cells |
57+
| poi/NewStyleConditionalFormattings.xlsx | PANIC (dataBar) | **PASS** | 1 sheet, 357 cells |
58+
| libreoffice/forcepoint107.xlsx | ERROR (invalid checksum) | FAIL | range parsing error |
59+
| libreoffice/tdf121887.xlsx | ERROR (ZipError) | **PASS** | 1 sheet, 1 cell |
60+
| libreoffice/tdf131575.xlsx | ERROR (ZipError) | FAIL | unrecognized sheet type |
61+
| libreoffice/tdf76115.xlsx | ERROR (ZipError) | FAIL | unrecognized sheet type |
62+
| poi/49609.xlsx | ERROR (ZipError) | FAIL | unrecognized sheet type |
63+
| poi/56278.xlsx | ERROR (ZipError) | **PASS** | 10 sheets, 889 cells |
64+
| poi/59021.xlsx | ERROR (ZipError) | **PASS** | 1 sheet, 16 cells |
65+
66+
**Summary**: calamine passes **17/22** files (77%) vs **0/22** for umya-spreadsheet.
67+
68+
calamine still fails on 5 files:
69+
- 3 files with non-standard sheet types (calamine doesn't recognize them)
70+
- 1 file with corrupt zip entry (both libraries fail)
71+
- 1 file with invalid shared string reference
72+
73+
## 4. Performance
74+
75+
Not benchmarked in this evaluation. Both libraries are pure Rust and expected to have comparable performance for read operations. calamine is widely used (~619k monthly downloads) and likely well-optimized for read-only workloads.
76+
77+
## 5. Maintenance
78+
79+
| Metric | umya-spreadsheet | calamine |
80+
|---|---|---|
81+
| Active development | Yes | Yes |
82+
| Open issues | ~50 | ~39 |
83+
| Panic reduction planned | Yes (v3.0.0 milestone) | N/A (fewer panics) |
84+
| Maintainer responsiveness | Moderate | Active (jmcnamara) |
85+
| Last release | Recent | Feb 2026 |
86+
87+
## 6. MSRV Compatibility
88+
89+
calamine requires MSRV 1.75.0. Our project's MSRV is 1.85+ (edition 2024). calamine is compatible.
90+
91+
## 7. Recommendation
92+
93+
**Stay with umya-spreadsheet** for now, but monitor calamine for formatting support.
94+
95+
### Rationale
96+
97+
- **calamine cannot extract cell formatting** (fonts, colors, borders, fills, column widths). This is a fundamental requirement for PDF conversion with visual fidelity (PRD §3.1 XLSX P1 features).
98+
- Switching to calamine would **regress formatting quality** — all cell styling would be lost.
99+
- calamine's formatting support is tracked in [calamine#404](https://github.com/tafia/calamine/issues/404) but has no timeline.
100+
- umya-spreadsheet's panic issues are tracked in [umya-spreadsheet#271](https://github.com/MathNya/umya-spreadsheet/issues/271) (v3.0.0 milestone) and our filed issue [umya-spreadsheet#310](https://github.com/MathNya/umya-spreadsheet/issues/310).
101+
102+
### Possible future approaches
103+
104+
1. **Partial hybrid**: Use calamine as a fallback when umya-spreadsheet panics/fails — extract cell values without formatting. This would increase success rate from 0% to 77% on the failing files, with degraded visual quality.
105+
2. **Custom parser**: Build a focused XLSX parser using `quick-xml` + `zip` that extracts exactly the data we need (cell values + formatting). Higher effort but full control.
106+
3. **Wait for upstream fixes**: Monitor umya-spreadsheet v3.0.0 for panic reduction and calamine for formatting support.
107+
108+
For now, our `catch_unwind` wrapper and improved error messages (phase 17) provide adequate mitigation for the 22 failing files (0.8% failure rate).

0 commit comments

Comments
 (0)