Skip to content

Commit c4fed54

Browse files
Merge pull request #86 from developer0hye/ralph/phase14-denylist-and-infra
fix: denylist adversarial fixtures and wrap parsing with catch_unwind
2 parents a6efe31 + 81a49d7 commit c4fed54

File tree

3 files changed

+138
-6
lines changed

3 files changed

+138
-6
lines changed

crates/office2pdf/src/lib.rs

Lines changed: 37 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -53,6 +53,17 @@ use config::{ConvertOptions, Format};
5353
use error::{ConvertError, ConvertMetrics, ConvertResult};
5454
use parser::Parser;
5555

56+
/// Extract a human-readable message from a caught panic payload.
57+
fn extract_panic_message(payload: &Box<dyn std::any::Any + Send>) -> String {
58+
if let Some(s) = payload.downcast_ref::<String>() {
59+
s.clone()
60+
} else if let Some(s) = payload.downcast_ref::<&str>() {
61+
(*s).to_string()
62+
} else {
63+
"unknown panic".to_string()
64+
}
65+
}
66+
5667
/// Convert a file at the given path to PDF bytes with warnings.
5768
///
5869
/// Detects the format from the file extension (`.docx`, `.pptx`, `.xlsx`).
@@ -131,8 +142,20 @@ pub fn convert_bytes(
131142
};
132143

133144
// Stage 1: Parse (OOXML → IR)
145+
// Wrap with catch_unwind to convert upstream panics (e.g. unwrap() in
146+
// umya-spreadsheet / docx-rs) into ConvertError::Parse.
134147
let parse_start = Instant::now();
135-
let (doc, warnings) = parser.parse(data, options)?;
148+
let parse_result =
149+
std::panic::catch_unwind(std::panic::AssertUnwindSafe(|| parser.parse(data, options)));
150+
let (doc, warnings) = match parse_result {
151+
Ok(result) => result?,
152+
Err(panic_info) => {
153+
return Err(ConvertError::Parse(format!(
154+
"upstream parser panicked: {}",
155+
extract_panic_message(&panic_info)
156+
)));
157+
}
158+
};
136159
let parse_duration = parse_start.elapsed();
137160
let page_count = doc.pages.len() as u32;
138161

@@ -188,8 +211,20 @@ fn convert_bytes_streaming_xlsx(
188211
let xlsx_parser = parser::xlsx::XlsxParser;
189212

190213
// Stage 1: Parse into chunks
214+
// Wrap with catch_unwind (same rationale as convert_bytes).
191215
let parse_start = Instant::now();
192-
let (chunk_docs, warnings) = xlsx_parser.parse_streaming(data, options, chunk_size)?;
216+
let parse_result = std::panic::catch_unwind(std::panic::AssertUnwindSafe(|| {
217+
xlsx_parser.parse_streaming(data, options, chunk_size)
218+
}));
219+
let (chunk_docs, warnings) = match parse_result {
220+
Ok(result) => result?,
221+
Err(panic_info) => {
222+
return Err(ConvertError::Parse(format!(
223+
"upstream parser panicked: {}",
224+
extract_panic_message(&panic_info)
225+
)));
226+
}
227+
};
193228
let parse_duration = parse_start.elapsed();
194229

195230
if chunk_docs.is_empty() {

crates/office2pdf/tests/bulk_conversion.rs

Lines changed: 71 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -20,11 +20,59 @@ use office2pdf::config::{ConvertOptions, Format};
2020
// ---------------------------------------------------------------------------
2121

2222
const DENYLIST: &[&str] = &[
23-
// Fuzzer-generated corrupted file (invalid checksum)
23+
// ── DOCX — fuzzer-generated / corrupted zip structures ───────────
24+
"clusterfuzz-testcase-minimized-POIFuzzer-6709287337197568.docx",
25+
"clusterfuzz-testcase-minimized-POIXWPFFuzzer-4791943399604224.docx",
26+
"clusterfuzz-testcase-minimized-POIXWPFFuzzer-4959857092198400.docx",
27+
"clusterfuzz-testcase-minimized-POIXWPFFuzzer-4961551840247808.docx",
28+
"clusterfuzz-testcase-minimized-POIXWPFFuzzer-5166796835258368.docx",
29+
"clusterfuzz-testcase-minimized-POIXWPFFuzzer-5313273089884160.docx",
30+
"clusterfuzz-testcase-minimized-POIXWPFFuzzer-5564805011079168.docx",
31+
"clusterfuzz-testcase-minimized-POIXWPFFuzzer-5569740188549120.docx",
32+
"clusterfuzz-testcase-minimized-POIXWPFFuzzer-6061520554164224.docx",
33+
"clusterfuzz-testcase-minimized-POIXWPFFuzzer-6120975439364096.docx",
34+
"clusterfuzz-testcase-minimized-POIXWPFFuzzer-6442791109263360.docx",
2435
"clusterfuzz-testcase-minimized-POIXWPFFuzzer-6733884933668864.docx",
25-
// Fuzzer-generated corrupted files
36+
// Crash reporter — corrupted zip
37+
"crash-517626e815e0afa9decd0ebb6d1dee63fb9907dd.docx",
38+
// Deeply nested table cells — stack overflow risk
39+
"deep-table-cell.docx",
40+
// Truncated archive — incomplete zip
41+
"truncated62886.docx",
42+
// ── PPTX — fuzzer-generated / corrupted zip structures ───────────
43+
"clusterfuzz-testcase-minimized-POIFuzzer-5205835528404992.pptx",
44+
"clusterfuzz-testcase-minimized-POIXSLFFuzzer-4838644450394112.pptx",
45+
"clusterfuzz-testcase-minimized-POIXSLFFuzzer-4986044400861184.pptx",
46+
"clusterfuzz-testcase-minimized-POIXSLFFuzzer-5463285576892416.pptx",
47+
"clusterfuzz-testcase-minimized-POIXSLFFuzzer-5471515212382208.pptx",
48+
"clusterfuzz-testcase-minimized-POIXSLFFuzzer-5611274456596480.pptx",
49+
"clusterfuzz-testcase-minimized-POIXSLFFuzzer-6071540680032256.pptx",
50+
"clusterfuzz-testcase-minimized-POIXSLFFuzzer-6254434927378432.pptx",
51+
"clusterfuzz-testcase-minimized-POIXSLFFuzzer-6372932378820608.pptx",
52+
"clusterfuzz-testcase-minimized-POIXSLFFuzzer-6435650376957952.pptx",
53+
// Corrupted archive (OOM / hang)
54+
"Divino_Revelado.pptx",
55+
// ── XLSX — fuzzer-generated / corrupted zip structures ───────────
56+
"clusterfuzz-testcase-minimized-POIFuzzer-5040805309710336.xlsx",
57+
"clusterfuzz-testcase-minimized-POIXSSFFuzzer-4828727001088000.xlsx",
58+
"clusterfuzz-testcase-minimized-POIXSSFFuzzer-5089447305609216.xlsx",
59+
"clusterfuzz-testcase-minimized-POIXSSFFuzzer-5185049589579776.xlsx",
2660
"clusterfuzz-testcase-minimized-POIXSSFFuzzer-5265527465181184.xlsx",
2761
"clusterfuzz-testcase-minimized-POIXSSFFuzzer-5937385319563264.xlsx",
62+
"clusterfuzz-testcase-minimized-POIXSSFFuzzer-6123461607817216.xlsx",
63+
"clusterfuzz-testcase-minimized-POIXSSFFuzzer-6419366255919104.xlsx",
64+
"clusterfuzz-testcase-minimized-POIXSSFFuzzer-6448258963341312.xlsx",
65+
"clusterfuzz-testcase-minimized-XLSX2CSVFuzzer-5025401116950528.xlsx",
66+
"clusterfuzz-testcase-minimized-XLSX2CSVFuzzer-5542865479270400.xlsx",
67+
"clusterfuzz-testcase-minimized-XLSX2CSVFuzzer-5636439151607808.xlsx",
68+
"clusterfuzz-testcase-minimized-XLSX2CSVFuzzer-6504225896792064.xlsx",
69+
"clusterfuzz-testcase-minimized-XLSX2CSVFuzzer-6594557414080512.xlsx",
70+
// Crash reporters — corrupted zip
71+
"crash-274d6342e4842d61be0fb48eaadad6208ae767ae.xlsx",
72+
"crash-9bf3cd4bd6f50a8a9339d363c2c7af14b536865c.xlsx",
73+
// Corrupted / truncated archive
74+
"58616.xlsx",
75+
// ── XLSX — adversarial / OOM-inducing ────────────────────────────
2876
// XML billion-laughs attack PoCs
2977
"poc-xmlbomb.xlsx",
3078
"poc-xmlbomb-empty.xlsx",
@@ -35,6 +83,8 @@ const DENYLIST: &[&str] = &[
3583
"poc-shared-strings.xlsx",
3684
// Extreme dimensions stress test (OOM)
3785
"too-many-cols-rows.xlsx",
86+
// Hangs during conversion (CI timeout)
87+
"bug62181.xlsx",
3888
];
3989

4090
/// Returns `true` if the file should be skipped due to being on the denylist.
@@ -443,15 +493,32 @@ fn test_bulk_all_formats() {
443493
/// and does not reject normal files.
444494
#[test]
445495
fn test_denylist_filtering() {
446-
// Every entry in DENYLIST should be recognized
496+
// Every entry in DENYLIST should be recognized regardless of parent directory
447497
for name in DENYLIST {
448-
let path = PathBuf::from(format!("tests/fixtures/xlsx/poi/{name}"));
498+
let path = PathBuf::from(format!("tests/fixtures/any/dir/{name}"));
449499
assert!(
450500
is_denylisted(&path),
451501
"Expected {name} to be denylisted, but it was not"
452502
);
453503
}
454504

505+
// Denylist should cover all three formats
506+
let docx_count = DENYLIST.iter().filter(|n| n.ends_with(".docx")).count();
507+
let pptx_count = DENYLIST.iter().filter(|n| n.ends_with(".pptx")).count();
508+
let xlsx_count = DENYLIST.iter().filter(|n| n.ends_with(".xlsx")).count();
509+
assert!(
510+
docx_count >= 14,
511+
"Expected ≥14 DOCX entries, got {docx_count}"
512+
);
513+
assert!(
514+
pptx_count >= 10,
515+
"Expected ≥10 PPTX entries, got {pptx_count}"
516+
);
517+
assert!(
518+
xlsx_count >= 15,
519+
"Expected ≥15 XLSX entries, got {xlsx_count}"
520+
);
521+
455522
// Normal files must not be denylisted
456523
let normal = PathBuf::from("tests/fixtures/xlsx/poi/sample.xlsx");
457524
assert!(

crates/office2pdf/tests/xlsx_fixtures.rs

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -382,6 +382,36 @@ xlsx_fixture_tests!(
382382
"SH108-SimpleFormattedCell.xlsx"
383383
);
384384

385+
// --- Upstream panic safety ---------------------------------------------------
386+
387+
/// Verifies that files which trigger upstream panics (umya-spreadsheet unwrap())
388+
/// return `Err` instead of panicking, thanks to catch_unwind in convert_bytes().
389+
#[test]
390+
fn upstream_panics_return_error() {
391+
let cases: &[(&str, office2pdf::config::Format)] = &[
392+
// umya-spreadsheet: unwrap() on missing zip entry (FileNotFound)
393+
(
394+
"libreoffice/chart_hyperlink.xlsx",
395+
office2pdf::config::Format::Xlsx,
396+
),
397+
// umya-spreadsheet: ParseFloatError on boolean cell
398+
(
399+
"libreoffice/check-boolean.xlsx",
400+
office2pdf::config::Format::Xlsx,
401+
),
402+
];
403+
for (name, format) in cases {
404+
let path = fixture_path(name);
405+
if !path.exists() {
406+
eprintln!("Skipping {name}: fixture not available");
407+
continue;
408+
}
409+
let data = std::fs::read(&path).unwrap();
410+
let result = office2pdf::convert_bytes(&data, *format, &ConvertOptions::default());
411+
assert!(result.is_err(), "{name} should return Err (not panic)");
412+
}
413+
}
414+
385415
// --- MIT: calamine (Rust) --------------------------------------------------
386416

387417
xlsx_fixture_tests!(date_1904, "date_1904.xlsx");

0 commit comments

Comments
 (0)