Skip to content

Commit 23de658

Browse files
Merge pull request #87 from developer0hye/ralph/phase15-typst-codegen-fixes
fix: Typst codegen bugs for math markup and edge cases
2 parents c4fed54 + 0200b47 commit 23de658

File tree

2 files changed

+355
-9
lines changed

2 files changed

+355
-9
lines changed

crates/office2pdf/src/parser/omml.rs

Lines changed: 223 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -94,6 +94,10 @@ fn parse_omml_children(reader: &mut Reader<&[u8]>, out: &mut String, end_tag: &[
9494
ensure_math_separator(out);
9595
parse_sub_superscript(reader, out);
9696
}
97+
b"groupChr" => {
98+
ensure_math_separator(out);
99+
parse_group_chr(reader, out);
100+
}
97101
b"d" => parse_delimiter(reader, out),
98102
b"r" => parse_math_run(reader, out),
99103
b"oMath" => parse_omml_children(reader, out, b"oMath"),
@@ -238,10 +242,12 @@ fn parse_radical(reader: &mut Reader<&[u8]>, out: &mut String) {
238242
}
239243
}
240244

245+
// Use placeholder for empty radicand to avoid Typst "missing argument" error
246+
let radicand = if content.is_empty() { "\"\"" } else { &content };
241247
if deg_hide || deg.is_empty() {
242-
let _ = std::fmt::Write::write_fmt(out, format_args!("sqrt({content})"));
248+
let _ = std::fmt::Write::write_fmt(out, format_args!("sqrt({radicand})"));
243249
} else {
244-
let _ = std::fmt::Write::write_fmt(out, format_args!("root({deg}, {content})"));
250+
let _ = std::fmt::Write::write_fmt(out, format_args!("root({deg}, {radicand})"));
245251
}
246252
}
247253

@@ -289,7 +295,15 @@ fn parse_delimiter(reader: &mut Reader<&[u8]>, out: &mut String) {
289295
let beg = map_delimiter(&beg_chr);
290296
let end = map_delimiter(&end_chr);
291297
let content = elements.join(", ");
292-
let _ = std::fmt::Write::write_fmt(out, format_args!("{beg}{content}{end}"));
298+
// If either delimiter is empty, omit both to avoid unbalanced delimiters in Typst
299+
if beg.is_empty() && end.is_empty() {
300+
out.push_str(&content);
301+
} else if beg.is_empty() || end.is_empty() {
302+
// One-sided invisible delimiter: emit content without delimiters
303+
out.push_str(&content);
304+
} else {
305+
let _ = std::fmt::Write::write_fmt(out, format_args!("{beg}{content}{end}"));
306+
}
293307
}
294308

295309
fn map_delimiter(chr: &str) -> &str {
@@ -391,19 +405,35 @@ fn map_math_text(input: &str) -> String {
391405
let mut word_buf = String::new();
392406
let mut last_was_name = false;
393407

408+
let mut non_ascii_buf = String::new();
409+
394410
for ch in input.chars() {
395411
if ch.is_ascii_alphabetic() {
412+
// Flush non-ASCII buffer first
413+
if !non_ascii_buf.is_empty() {
414+
flush_non_ascii_text(&mut result, &non_ascii_buf, &mut last_was_name);
415+
non_ascii_buf.clear();
416+
}
396417
word_buf.push(ch);
397418
continue;
398419
}
399420

400421
// Flush accumulated word before processing this character
401422
if !word_buf.is_empty() {
423+
// Flush non-ASCII buffer first
424+
if !non_ascii_buf.is_empty() {
425+
flush_non_ascii_text(&mut result, &non_ascii_buf, &mut last_was_name);
426+
non_ascii_buf.clear();
427+
}
402428
flush_math_word(&mut result, &word_buf, &mut last_was_name);
403429
word_buf.clear();
404430
}
405431

406432
if let Some(name) = unicode_to_typst(ch) {
433+
if !non_ascii_buf.is_empty() {
434+
flush_non_ascii_text(&mut result, &non_ascii_buf, &mut last_was_name);
435+
non_ascii_buf.clear();
436+
}
407437
if !result.is_empty()
408438
&& (last_was_name || result.chars().last().is_some_and(|c| c.is_alphanumeric()))
409439
{
@@ -412,25 +442,52 @@ fn map_math_text(input: &str) -> String {
412442
result.push_str(name);
413443
last_was_name = true;
414444
} else if ch.is_ascii_digit() {
445+
if !non_ascii_buf.is_empty() {
446+
flush_non_ascii_text(&mut result, &non_ascii_buf, &mut last_was_name);
447+
non_ascii_buf.clear();
448+
}
415449
if last_was_name {
416450
result.push(' ');
417451
}
418452
result.push(ch);
419453
last_was_name = false;
454+
} else if !ch.is_ascii() && ch.is_alphabetic() {
455+
// Non-ASCII alphabetic (Cyrillic, CJK, etc.) — accumulate for upright() wrapping
456+
non_ascii_buf.push(ch);
420457
} else {
458+
if !non_ascii_buf.is_empty() {
459+
flush_non_ascii_text(&mut result, &non_ascii_buf, &mut last_was_name);
460+
non_ascii_buf.clear();
461+
}
421462
result.push(ch);
422463
last_was_name = false;
423464
}
424465
}
425466

426-
// Flush remaining word
467+
// Flush remaining buffers
427468
if !word_buf.is_empty() {
428469
flush_math_word(&mut result, &word_buf, &mut last_was_name);
429470
}
471+
if !non_ascii_buf.is_empty() {
472+
flush_non_ascii_text(&mut result, &non_ascii_buf, &mut last_was_name);
473+
}
430474

431475
result
432476
}
433477

478+
/// Flush accumulated non-ASCII alphabetic text as `upright("text")` for Typst math mode.
///
/// A single space is written first when `result` is non-empty and either the
/// previous token was a named symbol (`*last_was_name`) or `result` currently
/// ends in an alphanumeric character, so the call does not fuse with the
/// token before it.
///
/// `"` and `\` occurring in `text` are backslash-escaped so the emitted Typst
/// string literal cannot be terminated early by its own content.
///
/// On return `*last_was_name` is `true`.
fn flush_non_ascii_text(result: &mut String, text: &str, last_was_name: &mut bool) {
    let ends_in_word = *last_was_name
        || result
            .chars()
            .next_back()
            .is_some_and(|c| c.is_alphanumeric());
    if !result.is_empty() && ends_in_word {
        result.push(' ');
    }

    result.push_str("upright(\"");
    for ch in text.chars() {
        // Keep the string literal well-formed regardless of what the caller passes.
        if matches!(ch, '"' | '\\') {
            result.push('\\');
        }
        result.push(ch);
    }
    result.push_str("\")");
    *last_was_name = true;
}
490+
434491
/// Flush an accumulated word of ASCII letters to the result.
435492
///
436493
/// Known math function names (cos, sin, etc.) are kept intact.
@@ -811,6 +868,52 @@ fn parse_bar_props(reader: &mut Reader<&[u8]>, pos: &mut String) {
811868
}
812869
}
813870

871+
fn parse_group_chr(reader: &mut Reader<&[u8]>, out: &mut String) {
872+
let mut chr = "\u{23DF}".to_string(); // default: underbrace ⏟
873+
let mut content = String::new();
874+
875+
loop {
876+
match reader.read_event() {
877+
Ok(Event::Start(ref e)) => match e.local_name().as_ref() {
878+
b"groupChrPr" => parse_group_chr_props(reader, &mut chr),
879+
b"e" => content = parse_sub_element(reader, b"e"),
880+
other => skip_element(reader, other),
881+
},
882+
Ok(Event::End(ref e)) if e.local_name().as_ref() == b"groupChr" => break,
883+
Ok(Event::Eof) | Err(_) => break,
884+
_ => {}
885+
}
886+
}
887+
888+
let func = match chr.as_str() {
889+
"\u{23DE}" => "overbrace", // ⏞
890+
"\u{23DF}" => "underbrace", // ⏟
891+
_ => "underbrace",
892+
};
893+
let _ = std::fmt::Write::write_fmt(out, format_args!("{func}({content})"));
894+
}
895+
896+
fn parse_group_chr_props(reader: &mut Reader<&[u8]>, chr: &mut String) {
897+
loop {
898+
match reader.read_event() {
899+
Ok(Event::Start(ref e)) | Ok(Event::Empty(ref e)) => {
900+
if e.local_name().as_ref() == b"chr" {
901+
for attr in e.attributes().flatten() {
902+
if attr.key.local_name().as_ref() == b"val"
903+
&& let Ok(v) = attr.unescape_value()
904+
{
905+
*chr = v.to_string();
906+
}
907+
}
908+
}
909+
}
910+
Ok(Event::End(ref e)) if e.local_name().as_ref() == b"groupChrPr" => break,
911+
Ok(Event::Eof) | Err(_) => break,
912+
_ => {}
913+
}
914+
}
915+
}
916+
814917
fn parse_matrix(reader: &mut Reader<&[u8]>, out: &mut String) {
815918
let mut rows: Vec<Vec<String>> = Vec::new();
816919

@@ -1316,4 +1419,120 @@ mod tests {
13161419
let xml = "<m:r><m:t>α+β</m:t></m:r>";
13171420
assert_eq!(omml_to_typst(xml), "alpha+beta");
13181421
}
1422+
1423+
// --- US-310: groupChr (overbrace/underbrace) tests ---
1424+
1425+
/// A `groupChr` whose `chr` is the top curly bracket converts to `overbrace(...)`.
#[test]
fn test_group_chr_overbrace() {
    let omml = r#"<m:groupChr><m:groupChrPr><m:chr m:val="⏞"/><m:pos m:val="top"/></m:groupChrPr><m:e><m:r><m:t>a+b</m:t></m:r></m:e></m:groupChr>"#;
    let typst = omml_to_typst(omml);
    assert_eq!(typst, "overbrace(a+b)");
}
1430+
1431+
/// A `groupChr` whose `chr` is the bottom curly bracket converts to `underbrace(...)`.
#[test]
fn test_group_chr_underbrace() {
    let omml = r#"<m:groupChr><m:groupChrPr><m:chr m:val="⏟"/><m:pos m:val="bot"/></m:groupChrPr><m:e><m:r><m:t>x+y</m:t></m:r></m:e></m:groupChr>"#;
    let typst = omml_to_typst(omml);
    assert_eq!(typst, "underbrace(x+y)");
}
1436+
1437+
/// With no `chr` element in `groupChrPr`, the conversion falls back to `underbrace(...)`.
#[test]
fn test_group_chr_default_underbrace() {
    let omml = r#"<m:groupChr><m:groupChrPr><m:pos m:val="bot"/></m:groupChrPr><m:e><m:r><m:t>z</m:t></m:r></m:e></m:groupChr>"#;
    let typst = omml_to_typst(omml);
    assert_eq!(typst, "underbrace(z)");
}
1443+
1444+
// --- US-311: subscript/superscript parentheses tests ---
1445+
1446+
/// A superscript made of several tokens is parenthesised after `^`.
#[test]
fn test_superscript_multi_token_parens() {
    let omml = "<m:sSup><m:e><m:r><m:t>x</m:t></m:r></m:e><m:sup><m:r><m:t>n+1</m:t></m:r></m:sup></m:sSup>";
    let typst = omml_to_typst(omml);
    assert_eq!(typst, "x^(n+1)");
}
1451+
1452+
/// A subscript made of several tokens is parenthesised after `_`.
#[test]
fn test_subscript_multi_token_parens() {
    let omml = "<m:sSub><m:e><m:r><m:t>a</m:t></m:r></m:e><m:sub><m:r><m:t>i+1</m:t></m:r></m:sub></m:sSub>";
    let typst = omml_to_typst(omml);
    assert_eq!(typst, "a_(i+1)");
}
1457+
1458+
// --- US-312: empty radicand tests ---
1459+
1460+
/// A radical with an empty `<m:e>` must still yield a `sqrt(...)` call that
/// carries some argument rather than a bare `sqrt()`.
#[test]
fn test_radical_empty_radicand() {
    let omml = r#"<m:rad><m:radPr><m:degHide m:val="1"/></m:radPr><m:deg/><m:e></m:e></m:rad>"#;
    let result = omml_to_typst(omml);

    let has_sqrt_call = result.contains("sqrt(");
    let is_closed = result.ends_with(')');
    assert!(
        has_sqrt_call && is_closed,
        "Empty radicand should produce valid sqrt(): got '{result}'"
    );

    // A placeholder argument is required; an empty argument list is not acceptable.
    assert_ne!(result, "sqrt()", "Empty radicand should have a placeholder");
}
1471+
1472+
/// A radical with a visible degree and an empty `<m:e>` must still yield a
/// closed `root(3, ...)` call.
#[test]
fn test_root_empty_radicand_with_degree() {
    let omml = r#"<m:rad><m:radPr><m:degHide m:val="0"/></m:radPr><m:deg><m:r><m:t>3</m:t></m:r></m:deg><m:e></m:e></m:rad>"#;
    let result = omml_to_typst(omml);

    let opens_with_degree = result.starts_with("root(3,");
    let is_closed = result.ends_with(')');
    assert!(
        opens_with_degree && is_closed,
        "Empty radicand with degree should produce valid root(): got '{result}'"
    );
}
1481+
1482+
// --- US-313: delimiter balancing tests ---
1483+
1484+
/// An empty `begChr` paired with `)` must not leave a closing parenthesis
/// without an opening one in the output.
#[test]
fn test_delimiter_empty_begin_chr() {
    let omml = r#"<m:d><m:dPr><m:begChr m:val=""/><m:endChr m:val=")"/></m:dPr><m:e><m:r><m:t>x</m:t></m:r></m:e></m:d>"#;
    let result = omml_to_typst(omml);

    // Unmatched means: ends with `)` while no `(` appears anywhere.
    let has_unmatched_close = result.ends_with(')') && !result.contains('(');
    assert!(
        !has_unmatched_close,
        "Empty begChr should not produce unmatched ')': got '{result}'"
    );
}
1495+
1496+
/// `(` paired with an empty `endChr` must not leave an opening parenthesis
/// without a closing one in the output.
#[test]
fn test_delimiter_empty_end_chr() {
    let omml = r#"<m:d><m:dPr><m:begChr m:val="("/><m:endChr m:val=""/></m:dPr><m:e><m:r><m:t>x</m:t></m:r></m:e></m:d>"#;
    let result = omml_to_typst(omml);

    // Unmatched means: starts with `(` while no `)` appears anywhere.
    let has_unmatched_open = result.starts_with('(') && !result.contains(')');
    assert!(
        !has_unmatched_open,
        "Empty endChr should not produce unmatched '(': got '{result}'"
    );
}
1507+
1508+
/// With both `begChr` and `endChr` empty, only the enclosed content is emitted.
#[test]
fn test_delimiter_both_empty() {
    let omml = r#"<m:d><m:dPr><m:begChr m:val=""/><m:endChr m:val=""/></m:dPr><m:e><m:r><m:t>x</m:t></m:r></m:e></m:d>"#;
    let result = omml_to_typst(omml);
    assert_eq!(
        result, "x",
        "Both empty delimiters should emit bare content: got '{result}'"
    );
}
1518+
1519+
// --- US-314: non-ASCII text in math context ---
1520+
1521+
/// Cyrillic letters inside a math run are wrapped in `upright(...)` so Typst
/// does not treat them as unknown variables.
#[test]
fn test_non_ascii_cyrillic_in_math() {
    let omml = r#"<m:r><m:t>если</m:t></m:r>"#;
    let result = omml_to_typst(omml);

    let is_wrapped = result.contains("upright(");
    assert!(
        is_wrapped,
        "Cyrillic text in math should be wrapped in upright(): got '{result}'"
    );
}
1531+
1532+
/// A non-ASCII character that has a Typst symbol name is emitted as that name,
/// not wrapped in `upright(...)`.
#[test]
fn test_non_ascii_single_char_passthrough() {
    let omml = r#"<m:r><m:t>α</m:t></m:r>"#;
    let typst = omml_to_typst(omml);
    assert_eq!(typst, "alpha");
}
13191538
}

0 commit comments

Comments
 (0)