diff --git a/.gitignore b/.gitignore index 5adc64b..fbeeb49 100644 --- a/.gitignore +++ b/.gitignore @@ -6,3 +6,7 @@ test.pdf test_output.md images/ examples/ +pdfs/* +!pdfs/.gitkeep +output/* +!output/.gitkeep diff --git a/Cargo.lock b/Cargo.lock index 43b0929..0c49b21 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -141,6 +141,7 @@ dependencies = [ "image", "lopdf", "png", + "regex", "reqwest", "scraper", "serde", diff --git a/Cargo.toml b/Cargo.toml index e0b7ea8..ed563e5 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -31,3 +31,6 @@ scraper = "0.21" dirs = "5" serde = { version = "1", features = ["derive"] } serde_json = "1" + +[dev-dependencies] +regex = "1" diff --git a/README.md b/README.md index c4aa2b4..dac08e6 100644 --- a/README.md +++ b/README.md @@ -8,11 +8,11 @@ CLI utility in Rust for converting various sources to Markdown. Supports PDF fil Download the latest release from the [Releases page](https://github.com/Ivlad003/any2md/releases/latest). -| Platform | File | -|----------|------| +| Platform | File | +| ----------------------------- | -------------------------------------- | | macOS (Apple Silicon & Intel) | `any2md-vX.Y.Z-macos-universal.tar.gz` | -| Linux x86_64 | `any2md-vX.Y.Z-linux-x86_64.tar.gz` | -| Windows x86_64 | `any2md-vX.Y.Z-windows-x86_64.zip` | +| Linux x86_64 | `any2md-vX.Y.Z-linux-x86_64.tar.gz` | +| Windows x86_64 | `any2md-vX.Y.Z-windows-x86_64.zip` | ```bash # macOS / Linux @@ -27,16 +27,18 @@ Expand-Archive any2md-*.zip -DestinationPath . ### Build from Source +Use this when developing locally or when you want to install the CLI from a checked-out copy of the repository. + #### Prerequisites -| Feature | Requirement | -|---------|-------------| -| PDF | None (built-in) | -| Image OCR (local) | [Tesseract](https://github.com/tesseract-ocr/tesseract) installed | -| Image OCR (cloud) | `OPENAI_API_KEY` environment variable | -| Audio (local) | Auto-downloads Whisper model on first use. Requires `cmake` at build time. 
| -| Audio (cloud) | `OPENAI_API_KEY` environment variable | -| Website | None (built-in) | +| Feature | Requirement | +| ----------------- | -------------------------------------------------------------------------- | +| PDF | None (built-in) | +| Image OCR (local) | [Tesseract](https://github.com/tesseract-ocr/tesseract) installed | +| Image OCR (cloud) | `OPENAI_API_KEY` environment variable | +| Audio (local) | Auto-downloads Whisper model on first use. Requires `cmake` at build time. | +| Audio (cloud) | `OPENAI_API_KEY` environment variable | +| Website | None (built-in) | ```bash # macOS @@ -44,9 +46,14 @@ brew install tesseract cmake # Ubuntu/Debian sudo apt install tesseract-ocr cmake +``` -# Build -cargo build --release +#### Install Locally + +From the repository root: + +```bash +cargo install --path . ``` The binary will be at `target/release/any2md`. @@ -111,8 +118,8 @@ any2md document.pdf --debug Regular paragraph text with **bold** and *italic* formatting. | Column 1 | Column 2 | Column 3 | -| --- | --- | --- | -| Data | Data | Data | +| -------- | -------- | -------- | +| Data | Data | Data | - List item one - List item two @@ -262,12 +269,12 @@ Good point. Let me pull up the metrics from last month. 
## Supported Formats -| Format | Engine | Notes | -|--------|--------|-------| -| PDF | Built-in (`lopdf`) | 4-phase pipeline: extract, detect tables, classify, assemble | -| Website | `reqwest` + `scraper` | Reader-mode extraction, SSRF protection | -| Image OCR | Tesseract CLI / OpenAI Vision | Local or cloud via `--engine` flag | -| Audio | Whisper.cpp / OpenAI Whisper API | Local or cloud, file or live mic | +| Format | Engine | Notes | +| --------- | -------------------------------- | ------------------------------------------------------------ | +| PDF | Built-in (`lopdf`) | 4-phase pipeline: extract, detect tables, classify, assemble | +| Website | `reqwest` + `scraper` | Reader-mode extraction, SSRF protection | +| Image OCR | Tesseract CLI / OpenAI Vision | Local or cloud via `--engine` flag | +| Audio | Whisper.cpp / OpenAI Whisper API | Local or cloud, file or live mic | ## Architecture @@ -355,16 +362,16 @@ This triggers the release workflow which builds binaries for all platforms and c ## Dependencies -| Crate | Purpose | -|-------|---------| -| `lopdf` | PDF parsing | -| `whisper-rs` | Local speech-to-text (whisper.cpp bindings) | -| `cpal` | Cross-platform audio capture | -| `symphonia` | Audio format decoding (MP3, OGG, FLAC, WAV, AAC) | -| `reqwest` | HTTP client (web fetch, cloud APIs) | -| `scraper` | HTML DOM parsing | -| `clap` | CLI argument parsing | -| `tracing` | Structured logging | -| `serde_json` | JSON parsing for cloud API responses | -| `base64` | Base64 encoding for inline images and cloud OCR | -| `dirs` | Home directory resolution for model storage | +| Crate | Purpose | +| ------------ | ------------------------------------------------ | +| `lopdf` | PDF parsing | +| `whisper-rs` | Local speech-to-text (whisper.cpp bindings) | +| `cpal` | Cross-platform audio capture | +| `symphonia` | Audio format decoding (MP3, OGG, FLAC, WAV, AAC) | +| `reqwest` | HTTP client (web fetch, cloud APIs) | +| `scraper` | HTML DOM parsing | +| `clap` 
/// Returns `true` when `text` is the recurring page header
/// "PICTO ERP - Spécifications fonctionnelles" that is repeated on every page
/// and must not be treated as document content or as the document title.
///
/// The comparison ignores surrounding whitespace, letter case, the `fi`
/// ligature (U+FB01) and the acute/grave accent on `e`.
///
/// Lowercasing happens *before* the accent fold so that an upper-case `É`/`È`
/// is folded as well.
pub fn is_recurring_page_chrome(text: &str) -> bool {
    const CHROME: &str = "picto erp - specifications fonctionnelles";

    let trimmed = text.trim();
    let mut normalized = String::with_capacity(trimmed.len());
    for ch in trimmed.chars().flat_map(char::to_lowercase) {
        match ch {
            // U+FB01 LATIN SMALL LIGATURE FI, written as an escape so the
            // literal survives Unicode normalization of the source file.
            '\u{FB01}' => normalized.push_str("fi"),
            // U+00E9 (e acute) and U+00E8 (e grave).
            '\u{E9}' | '\u{E8}' => normalized.push('e'),
            other => normalized.push(other),
        }
    }
    normalized == CHROME
}
/// Forces `row` to have exactly `columns` cells.
///
/// * A row that is too short is padded on the right with empty cells.
/// * A row that is too long is truncated; the trimmed text of every
///   non-blank surplus cell is appended, space-separated, to the last cell
///   that is kept, so no text is lost.
///
/// When `columns` is `0` there is no cell left to receive the surplus text,
/// so the result is an empty row.
fn fit_row_to_columns(mut row: Vec<String>, columns: usize) -> Vec<String> {
    if row.len() <= columns {
        row.resize(columns, String::new());
        return row;
    }

    let overflow = row.split_off(columns);
    if let Some(last) = row.last_mut() {
        let surplus = overflow
            .iter()
            .map(|cell| cell.trim())
            .filter(|cell| !cell.is_empty());
        for extra in surplus {
            // Only separate with a space when there is already visible text.
            if !last.trim().is_empty() {
                last.push(' ');
            }
            last.push_str(extra);
        }
    }
    row
}
"Name FR:", "Name EN:", "Sources:", "Figma:"]; + + let mut positions: Vec<(usize, &str)> = LABELS + .iter() + .filter_map(|label| text.find(label).map(|pos| (pos, *label))) + .collect(); + positions.sort_by_key(|(pos, _)| *pos); + + if positions.len() < 2 || positions.first().map(|(pos, _)| *pos) != Some(0) { + return vec![Element::Paragraph { + text: Self::rich_text_from_block(text, block), + }]; + } + + let mut elements = Vec::with_capacity(positions.len()); + for (idx, (start, _)) in positions.iter().enumerate() { + let end = positions + .get(idx + 1) + .map(|(next_start, _)| *next_start) + .unwrap_or_else(|| text.len()); + let segment = text[*start..end].trim(); + if !segment.is_empty() { + elements.push(Element::Paragraph { + text: Self::rich_text_from_block(segment, block), + }); + } + } + + elements + } } #[cfg(test)] @@ -436,6 +507,7 @@ mod tests { title: None, author: None, date: None, + date_label: None, } } diff --git a/src/converter/pdf/mod.rs b/src/converter/pdf/mod.rs index 19e5717..8201544 100644 --- a/src/converter/pdf/mod.rs +++ b/src/converter/pdf/mod.rs @@ -56,11 +56,15 @@ impl Converter for PdfConverter { // with a Y position greater than the table's Y position let insert_pos = classified .iter() - .position(|el| match el { - ClassifiedElement::Text(b, _) => b.y > table.y_position, - _ => false, - }) + .position( + |el| matches!(el, ClassifiedElement::Text(b, _) if b.y > table.y_position), + ) .unwrap_or(classified.len()); + let insert_pos = classified + .iter() + .rposition(|el| matches!(el, ClassifiedElement::Image(_))) + .map(|pos| pos + 1) + .unwrap_or(insert_pos); classified.insert(insert_pos, ClassifiedElement::PreBuilt(table.element)); } @@ -68,10 +72,14 @@ impl Converter for PdfConverter { } debug!("Phase 3: Building metadata"); + let title = pdf_meta + .title + .filter(|title| !Assembler::is_recurring_page_chrome(title)); let metadata = Metadata { - title: pdf_meta.title, + title, author: pdf_meta.author, date: pdf_meta.date, + 
date_label: Some("Created".to_string()), }; debug!("Phase 4: Assembling document"); diff --git a/src/converter/pdf/table_detector.rs b/src/converter/pdf/table_detector.rs index 494db0b..f1d15e0 100644 --- a/src/converter/pdf/table_detector.rs +++ b/src/converter/pdf/table_detector.rs @@ -403,6 +403,19 @@ impl TableDetector { .map(|(i, _)| i) } + fn nearest_column(x: f64, columns: &[Column]) -> Option { + columns + .iter() + .enumerate() + .min_by(|(_, a), (_, b)| { + (x - a.mean_x) + .abs() + .partial_cmp(&(x - b.mean_x).abs()) + .unwrap_or(std::cmp::Ordering::Equal) + }) + .map(|(i, _)| i) + } + /// Detect rows within the table region. /// A new row starts when a Y-line has blocks in >= MIN_TABLE_COLUMNS distinct columns. /// Lines with fewer columns are continuation lines of the previous row. @@ -572,7 +585,9 @@ impl TableDetector { if Self::is_cell_noise(&block.text) { continue; } - if let Some(ci) = Self::find_column(block.x, columns, col_dist) { + if let Some(ci) = Self::find_column(block.x, columns, col_dist) + .or_else(|| Self::nearest_column(block.x, columns)) + { let cleaned = Self::clean_cell_text(&block.text); if !cleaned.is_empty() { cells[ci].push(cleaned); @@ -594,12 +609,125 @@ impl TableDetector { return None; } - let headers = all_rows.remove(0); + let mut headers = all_rows.remove(0); + let rows = Self::normalize_table(&mut headers, all_rows); + + Some(Element::Table { headers, rows }) + } + + fn normalize_table(headers: &mut Vec, rows: Vec>) -> Vec> { + for header in headers.iter_mut() { + *header = Self::normalize_header(header); + } + + let mut merged_rows: Vec> = Vec::new(); + for row in rows { + let first_empty = row + .first() + .map(|cell| cell.trim().is_empty()) + .unwrap_or(true); + if first_empty && !row.iter().all(|cell| cell.trim().is_empty()) { + if let Some(prev) = merged_rows.last_mut() { + Self::merge_row_into_previous(prev, &row); + continue; + } + } + merged_rows.push(row); + } + + Self::merge_empty_header_columns(headers, &mut 
merged_rows); + + if headers.len() == 6 + && headers.first().is_some_and(|h| h == "Liste des champs") + && headers.iter().any(|h| h == "Mandatory?") + { + headers.insert(0, "#".to_string()); + headers.push("Notes".to_string()); + for (idx, row) in merged_rows.iter_mut().enumerate() { + row.insert(0, (idx + 1).to_string()); + row.push(String::new()); + } + } - Some(Element::Table { - headers, - rows: all_rows, - }) + if headers.iter().any(|h| h == "List of fields Type") { + if let Some(idx) = headers.iter().position(|h| h == "List of fields Type") { + headers[idx] = "List of fields".to_string(); + headers.insert(idx + 1, "Type".to_string()); + for row in &mut merged_rows { + row.insert(idx + 1, String::new()); + } + } + } + + if headers.len() == 8 + && headers.first().is_some_and(|h| h == "#") + && headers.iter().any(|h| h.contains("Retour PICTO")) + && headers.iter().any(|h| h.contains("Règles")) + { + headers.push("Notes".to_string()); + for row in &mut merged_rows { + row.push(String::new()); + } + } + + merged_rows + } + + fn normalize_header(header: &str) -> String { + match header.trim() { + "Mandator y?" | "Mandatory ?" 
/// Removes every column whose header is blank and folds its cells into the
/// nearest column to the left that has a non-blank header.
///
/// For each row, the trimmed text of the removed cell replaces the target
/// cell when that cell is blank, and is appended after a single space
/// otherwise. Rows too short to contain the removed column are left alone.
/// Blank-header columns with no titled column to their left are kept.
fn merge_empty_header_columns(headers: &mut Vec<String>, rows: &mut [Vec<String>]) {
    let mut idx = 0;
    while idx < headers.len() {
        if !headers[idx].trim().is_empty() {
            idx += 1;
            continue;
        }

        let Some(target_idx) = headers[..idx]
            .iter()
            .rposition(|header| !header.trim().is_empty())
        else {
            // Nothing on the left to merge into; keep this column.
            idx += 1;
            continue;
        };

        for row in rows.iter_mut() {
            if idx >= row.len() {
                continue;
            }
            // `target_idx < idx`, so removing `idx` never shifts the target.
            let removed = row.remove(idx);
            let value = removed.trim();
            if value.is_empty() {
                continue;
            }
            let target = &mut row[target_idx];
            if target.trim().is_empty() {
                target.clear();
            } else {
                target.push(' ');
            }
            target.push_str(value);
        }
        // Do not advance: the next header has shifted into position `idx`.
        headers.remove(idx);
    }
}
date)); + let label = meta.date_label.as_deref().unwrap_or("Date"); + meta_parts.push(format!("**{}:** {}", label, date)); } if !meta_parts.is_empty() { out.push_str(&meta_parts.join(" | ")); diff --git a/tests/converter_tests.rs b/tests/converter_tests.rs index 23120e6..7b58096 100644 --- a/tests/converter_tests.rs +++ b/tests/converter_tests.rs @@ -21,6 +21,7 @@ impl Converter for MockConverter { title: Some("Mock".to_string()), author: None, date: None, + date_label: None, }, pages: vec![], }) diff --git "a/tests/fixtures/pdf/Article \342\200\223 Certifications:Label (admin).pdf" "b/tests/fixtures/pdf/Article \342\200\223 Certifications:Label (admin).pdf" new file mode 100644 index 0000000..2c1b0b2 Binary files /dev/null and "b/tests/fixtures/pdf/Article \342\200\223 Certifications:Label (admin).pdf" differ diff --git "a/tests/fixtures/pdf/Article \342\200\223 Create an Article (admin) .pdf" "b/tests/fixtures/pdf/Article \342\200\223 Create an Article (admin) .pdf" new file mode 100644 index 0000000..ca10fb2 Binary files /dev/null and "b/tests/fixtures/pdf/Article \342\200\223 Create an Article (admin) .pdf" differ diff --git a/tests/fixtures/pdf/Client - bundle of Services (Prestations).pdf b/tests/fixtures/pdf/Client - bundle of Services (Prestations).pdf new file mode 100644 index 0000000..e343284 Binary files /dev/null and b/tests/fixtures/pdf/Client - bundle of Services (Prestations).pdf differ diff --git "a/tests/fixtures/pdf/Documents \342\200\223 List of documents.pdf" "b/tests/fixtures/pdf/Documents \342\200\223 List of documents.pdf" new file mode 100644 index 0000000..eeff9fe Binary files /dev/null and "b/tests/fixtures/pdf/Documents \342\200\223 List of documents.pdf" differ diff --git a/tests/integration_test.rs b/tests/integration_test.rs index 9e5965c..23f34c0 100644 --- a/tests/integration_test.rs +++ b/tests/integration_test.rs @@ -22,6 +22,7 @@ fn test_full_pipeline_with_document() { title: Some("Integration Test".to_string()), author: 
Some("Test Author".to_string()), date: None, + date_label: None, }, pages: vec![ Page { diff --git a/tests/model_tests.rs b/tests/model_tests.rs index b57b1c4..eab2950 100644 --- a/tests/model_tests.rs +++ b/tests/model_tests.rs @@ -9,6 +9,7 @@ fn test_document_creation() { title: Some("Test".to_string()), author: None, date: None, + date_label: None, }, pages: vec![Page { elements: vec![ diff --git a/tests/pdf_fixture_tests.rs b/tests/pdf_fixture_tests.rs new file mode 100644 index 0000000..56642fb --- /dev/null +++ b/tests/pdf_fixture_tests.rs @@ -0,0 +1,418 @@ +// Phase 0 fixture-based regression tests for the PDF -> Markdown pipeline. +// +// These tests encode regression coverage for fixture-specific PDF -> Markdown +// behavior that previously broke in realistic documents. + +use any2md::converter::pdf::PdfConverter; +use any2md::converter::Converter; +use any2md::model::options::{ConvertOptions, ImageMode}; +use any2md::renderer::markdown::MarkdownRenderer; +use regex::Regex; +use std::path::PathBuf; + +// ── Fixture filenames (preserve unicode/special chars) ────────────── + +const CERTIFICATIONS_PDF: &str = "Article – Certifications:Label (admin).pdf"; +const CREATE_ARTICLE_PDF: &str = "Article – Create an Article (admin) .pdf"; +const BUNDLE_PDF: &str = "Client - bundle of Services (Prestations).pdf"; +const DOCUMENTS_PDF: &str = "Documents – List of documents.pdf"; + +// ── Helpers ──────────────────────────────────────────────────────── + +fn fixture(name: &str) -> PathBuf { + PathBuf::from(env!("CARGO_MANIFEST_DIR")) + .join("tests/fixtures/pdf") + .join(name) +} + +fn convert_to_md(name: &str) -> String { + let conv = PdfConverter; + // Use Inline image mode so tests don't write image files to disk. 
/// Counts the columns of the first GFM table in `md`.
///
/// The first line that starts with `|` (ignoring leading whitespace) is taken
/// as the header row. Columns are counted from its pipe separators:
/// `| a | b | c |` has 4 pipes for 3 columns and yields `Some(3)`.
/// A pipe escaped as `\|` is cell content, not a separator, and is skipped.
///
/// Returns `None` when no such line exists or it has fewer than two
/// separators.
fn first_table_column_count(md: &str) -> Option<usize> {
    let header = md.lines().find(|line| line.trim_start().starts_with('|'))?;

    let mut separators = 0usize;
    let mut escaped = false;
    for ch in header.chars() {
        match ch {
            '\\' if !escaped => escaped = true,
            '|' if !escaped => separators += 1,
            // Any other character, or an escaped `|` / `\`, ends the escape.
            _ => escaped = false,
        }
    }

    separators.checked_sub(1).filter(|&columns| columns >= 1)
}
/// The first table rendered from the Certifications fixture must have eight
/// columns.
#[test]
fn test_certifications_table_has_eight_columns() {
    let md = convert_to_md(CERTIFICATIONS_PDF);
    let cols = first_table_column_count(&md).unwrap_or_else(|| {
        // Take the preview by characters: byte-slicing at a fixed offset can
        // land inside a multi-byte character and panic before the real
        // failure message is shown.
        let preview: String = md.chars().take(800).collect();
        panic!(
            "no table found in rendered MD; first 800 chars: {}",
            preview
        )
    });
    assert_eq!(cols, 8, "expected 8 columns, got {}", cols);
}
/// The first line mentioning `Status:` and the first line mentioning
/// `Name FR:` must be different lines of the rendered Markdown.
#[test]
fn test_certifications_status_field_separate_from_name_fr() {
    let md = convert_to_md(CERTIFICATIONS_PDF);
    // Index and text of the first rendered line that contains `needle`.
    let first_line_with = |needle: &str| {
        md.lines()
            .enumerate()
            .find(|(_, line)| line.contains(needle))
    };

    let (status_idx, status_line) =
        first_line_with("Status:").expect("Status: line not found");
    let (name_fr_idx, _) = first_line_with("Name FR:").expect("Name FR: line not found");

    assert_ne!(
        status_idx, name_fr_idx,
        "Status and Name FR rendered on same line: {:?}",
        status_line
    );
}
/// Two bullet items that share one table cell must stay visibly separated:
/// the text between them has to contain an HTML line break or a `•` marker.
#[test]
fn test_certifications_bullet_structure_preserved_in_cell() {
    const FIRST_BULLET: &str = "The user can change the name";
    const SECOND_BULLET: &str = "The Items are ordered";

    let md = convert_to_md(CERTIFICATIONS_PDF);
    let block = first_table_block(&md);
    let cell_row = block
        .iter()
        .find(|l| l.contains(FIRST_BULLET) && l.contains(SECOND_BULLET))
        .copied()
        .unwrap_or_else(|| {
            panic!(
                "expected a row containing both 'The user can change the name' and 'The Items are ordered'; \
                 first table block: {:?}",
                block
            )
        });

    let between_idx = cell_row
        .find(FIRST_BULLET)
        .expect("row was selected because it contains the first bullet");
    let next_idx = cell_row
        .find(SECOND_BULLET)
        .expect("row was selected because it contains the second bullet");
    // Slicing with a reversed range would panic with an unhelpful message.
    assert!(
        between_idx < next_idx,
        "expected {:?} to appear before {:?} in row: {:?}",
        FIRST_BULLET,
        SECOND_BULLET,
        cell_row
    );

    let segment = &cell_row[between_idx..next_idx];
    // NOTE(review): the exact break tag is assumed; matching on the `<br`
    // prefix accepts `<br>`, `<br/>` and `<br />` — confirm against the renderer.
    let has_br = segment.contains("<br");
    let has_bullet_marker = segment.contains('•');
    assert!(
        has_br || has_bullet_marker,
        "expected <br> or bullet marker between two bullets in same cell; segment between them: {:?}",
        segment
    );
}
/// No fixture may render a SharePoint URL longer than 120 characters.
/// Shorter SharePoint URLs are tolerated by this check.
#[test]
fn test_no_sharepoint_url_in_any_rendered_body() {
    let url_re = Regex::new(r"https?://\S*sharepoint\.com\S*").unwrap();
    let fixtures = [
        CERTIFICATIONS_PDF,
        CREATE_ARTICLE_PDF,
        BUNDLE_PDF,
        DOCUMENTS_PDF,
    ];

    for name in fixtures {
        let md = convert_to_md(name);
        let too_long = url_re
            .find_iter(&md)
            .map(|found| found.as_str())
            .find(|url| url.len() > 120);
        if let Some(url) = too_long {
            panic!(
                "long sharepoint URL ({} chars) leaked into rendered MD of {:?}: {}",
                url.len(),
                name,
                url
            );
        }
    }
}
/// In the Certifications fixture the first inline image line must come
/// before the first table row in the rendered Markdown.
#[test]
fn test_image_appears_before_first_table_when_source_has_image_above_table() {
    let md = convert_to_md(CERTIFICATIONS_PDF);

    let first_image = md
        .lines()
        .position(|line| line.contains("![") && line.contains("]("));
    let first_table_row = md
        .lines()
        .position(|line| line.trim_start().starts_with('|'));

    let Some(image_idx) = first_image else {
        panic!("expected an inline image in rendered MD; none found")
    };
    let Some(table_idx) = first_table_row else {
        panic!("expected a table in rendered MD; none found")
    };

    assert!(
        image_idx < table_idx,
        "expected image (line {}) to appear before first table row (line {})",
        image_idx,
        table_idx
    );
}
None, author: None, date: None, + date_label: None, }, pages: vec![Page { elements: vec![Element::List { @@ -215,6 +221,7 @@ fn test_render_ordered_list() { title: None, author: None, date: None, + date_label: None, }, pages: vec![Page { elements: vec![Element::List { @@ -245,6 +252,7 @@ fn test_render_nested_list() { title: None, author: None, date: None, + date_label: None, }, pages: vec![Page { elements: vec![Element::List { @@ -272,6 +280,7 @@ fn test_render_table() { title: None, author: None, date: None, + date_label: None, }, pages: vec![Page { elements: vec![Element::Table { @@ -298,6 +307,7 @@ fn test_render_horizontal_rule() { title: None, author: None, date: None, + date_label: None, }, pages: vec![Page { elements: vec![Element::HorizontalRule], @@ -315,6 +325,7 @@ fn test_render_blockquote() { title: None, author: None, date: None, + date_label: None, }, pages: vec![Page { elements: vec![Element::BlockQuote { @@ -334,6 +345,7 @@ fn test_render_image_inline() { title: None, author: None, date: None, + date_label: None, }, pages: vec![Page { elements: vec![Element::Image { @@ -357,6 +369,7 @@ fn test_render_multiple_pages_single_file() { title: None, author: None, date: None, + date_label: None, }, pages: vec![ Page { @@ -385,6 +398,7 @@ fn test_render_metadata_header() { title: Some("My Doc".to_string()), author: Some("Author".to_string()), date: Some("2026-01-01".to_string()), + date_label: None, }, pages: vec![], }; @@ -402,6 +416,7 @@ fn test_render_empty_page_skipped() { title: None, author: None, date: None, + date_label: None, }, pages: vec![ Page { elements: vec![] }, @@ -429,6 +444,7 @@ fn test_render_image_extract_mode_saves_file() { title: None, author: None, date: None, + date_label: None, }, pages: vec![Page { elements: vec![