diff --git a/Dockerfile b/Dockerfile index d5a6f52..73dcefc 100644 --- a/Dockerfile +++ b/Dockerfile @@ -31,19 +31,62 @@ WORKDIR /app # Pin Datadog Java agent version ARG DD_JAVA_AGENT_VERSION=1.52.1 -# Install runtime dependencies including Tesseract OCR -# Note: Ubuntu 22.04 (Jammy) ships with Tesseract 4.1.1 -# Tess4J 5.16.0 is compatible with Tesseract 4.x and 5.x +# Install runtime dependencies including Tesseract OCR and Leptonica +# Use PPA for newer Tesseract/Leptonica versions that match lept4j ABI expectations RUN apt-get update && apt-get install -y --no-install-recommends \ + software-properties-common \ + && add-apt-repository -y ppa:alex-p/tesseract-ocr-devel \ + && apt-get update && apt-get install -y --no-install-recommends \ curl \ gosu \ tesseract-ocr \ - tesseract-ocr-eng && \ - rm -rf /var/lib/apt/lists/* && \ - useradd -u 10001 -m -s /bin/sh appuser - -# Verify Tesseract is installed and working (4.x or 5.x) -RUN tesseract --version 2>&1 | head -1 + tesseract-ocr-eng \ + tesseract-ocr-osd \ + libtesseract-dev \ + libleptonica-dev \ + && ln -sf /usr/lib/x86_64-linux-gnu/liblept.so /usr/lib/x86_64-linux-gnu/libleptonica.so \ + && ldconfig \ + && rm -rf /var/lib/apt/lists/* \ + && useradd -u 10001 -m -s /bin/sh appuser + +# Verify Tesseract installation and detect tessdata directory +# Using tess4j 5.9.0 which is compatible with Ubuntu 22.04's Leptonica 1.82.0 (NOTE(review): libleptonica-dev is installed after the alex-p PPA is added, so the installed Leptonica version may differ; confirm against the dpkg output printed below) +# Tesseract 5.x from PPA uses /usr/share/tesseract-ocr/5/tessdata +# NOTE: Detection runs at BUILD TIME and writes result for RUNTIME use +RUN set -eux; \ + echo "=== Tesseract version ==="; \ + tesseract --version 2>&1 | head -3; \ + echo ""; \ + echo "=== Installed Leptonica version ==="; \ + dpkg -l | grep leptonica || true; \ + echo ""; \ + echo "=== Detecting tessdata directory ==="; \ + if [ -f "/usr/share/tesseract-ocr/5/tessdata/eng.traineddata" ]; then \ + detectedTessdataDirectory="/usr/share/tesseract-ocr/5/tessdata"; \ + elif [ -f 
"/usr/share/tesseract-ocr/4.00/tessdata/eng.traineddata" ]; then \ + detectedTessdataDirectory="/usr/share/tesseract-ocr/4.00/tessdata"; \ + elif [ -f "/usr/share/tessdata/eng.traineddata" ]; then \ + detectedTessdataDirectory="/usr/share/tessdata"; \ + else \ + echo "ERROR: eng.traineddata not found in any standard location"; \ + echo "Searched paths:"; \ + ls -la /usr/share/tesseract-ocr/ || true; \ + ls -la /usr/share/tessdata/ 2>/dev/null || true; \ + exit 1; \ + fi; \ + echo "✓ Found tessdata directory: $detectedTessdataDirectory"; \ + ls -l "$detectedTessdataDirectory/eng.traineddata"; \ + echo ""; \ + echo "=== Persisting detected path for runtime ==="; \ + mkdir -p /etc/profile.d; \ + echo "export TESSDATA_PREFIX=$detectedTessdataDirectory" > /etc/profile.d/tessdata.sh; \ + chmod 644 /etc/profile.d/tessdata.sh; \ + echo ""; \ + echo "=== Configuration Summary ==="; \ + echo "Tesseract: $(tesseract --version 2>&1 | head -1)"; \ + echo "Leptonica: $(dpkg -l | grep libleptonica | awk '{print $3}')"; \ + echo "Tessdata: $detectedTessdataDirectory"; \ + echo "Using tess4j 5.9.0 (compatible with Leptonica 1.82.0)" # Copy the boot JAR from build stage COPY --from=build /src/build/libs/app.jar /app/app.jar @@ -63,11 +106,15 @@ RUN set -eux; \ fi || echo "Datadog agent download failed, continuing without it" # Environment variables for Tesseract +# Tesseract 5.x from alex-p PPA uses /usr/share/tesseract-ocr/5/tessdata +# Tess4J expects the directory that contains *.traineddata files +# NOTE: docker-compose.yml can override this ENV, but when the app is started through load-secrets.sh the build-time detected path in /etc/profile.d/tessdata.sh is sourced afterwards and takes precedence ENV OMP_THREAD_LIMIT=1 -ENV TESSDATA_PREFIX=/usr/share/tesseract-ocr/4.00/tessdata +ENV TESSDATA_PREFIX=/usr/share/tesseract-ocr/5/tessdata # Free-tier friendly memory settings (prevents OOM on t3.micro/small) -ENV JAVA_TOOL_OPTIONS="-javaagent:/app/dd-java-agent.jar -XX:MaxRAMPercentage=75.0 -XX:+UseSerialGC" +# java.io.tmpdir/jna.tmpdir=/tmp control where JNA extracts its bundled native library (/tmp must be writable and mounted exec, even with a read-only root filesystem; critical for standalone runs); jna.nosys=false only lets JNA try a system-installed jnidispatch library before extracting +ENV 
JAVA_TOOL_OPTIONS="-javaagent:/app/dd-java-agent.jar -XX:MaxRAMPercentage=75.0 -XX:+UseSerialGC -Djava.io.tmpdir=/tmp -Djna.tmpdir=/tmp -Djna.nosys=false" ENV HOME=/tmp EXPOSE 8080 diff --git a/build.gradle.kts b/build.gradle.kts index 8135b93..1953a61 100644 --- a/build.gradle.kts +++ b/build.gradle.kts @@ -48,7 +48,9 @@ dependencies { implementation("org.springdoc:springdoc-openapi-starter-webmvc-ui:2.8.13") // OCR and PDF processing - implementation("net.sourceforge.tess4j:tess4j:5.16.0") + // Using tess4j 5.9.0 which is compatible with Leptonica 1.82.0 (Ubuntu 22.04 default) + // Newer versions (5.10.0+) require Leptonica 1.84.0+ which isn't available in Ubuntu repos + implementation("net.sourceforge.tess4j:tess4j:5.9.0") implementation("org.apache.pdfbox:pdfbox:3.0.6") implementation("net.java.dev.jna:jna:5.18.1") diff --git a/docker-compose.yml b/docker-compose.yml index 00a45da..56ff758 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -44,14 +44,16 @@ services: condition: service_healthy ports: [ "8080:8080" ] read_only: true - tmpfs: [ "/tmp","/var/tmp" ] + tmpfs: + - /tmp:rw,exec,nosuid,nodev,mode=1777 + - /var/tmp:rw,exec,nosuid,nodev,mode=1777 cap_drop: [ ALL ] cap_add: [ SETUID, SETGID ] security_opt: [ "no-new-privileges:true" ] init: true environment: # Datadog - JAVA_TOOL_OPTIONS: "-javaagent:/app/dd-java-agent.jar -XX:MaxRAMPercentage=75.0 -XX:+UseSerialGC" + JAVA_TOOL_OPTIONS: "-javaagent:/app/dd-java-agent.jar -XX:MaxRAMPercentage=75.0 -XX:+UseSerialGC -Djava.io.tmpdir=/tmp -Djna.tmpdir=/tmp -Djna.nosys=false" DD_AGENT_HOST: datadog DD_SERVICE: file-system-app DD_ENV: dev @@ -66,6 +68,10 @@ services: MANAGEMENT_ENDPOINT_HEALTH_PROBES_ENABLED: "true" MANAGEMENT_ENDPOINTS_WEB_EXPOSURE_INCLUDE: "health,info,prometheus" + # Tesseract/OCR Configuration + TESSDATA_PREFIX: /usr/share/tesseract-ocr/5/tessdata + JNA_TMPDIR: /tmp + # DB SPRING_DATASOURCE_URL: jdbc:postgresql://postgres-db:5432/file_system_db SPRING_DATASOURCE_USERNAME: user diff 
--git a/load-secrets.sh b/load-secrets.sh index 2b9f3b4..cd6aead 100644 --- a/load-secrets.sh +++ b/load-secrets.sh @@ -36,20 +36,23 @@ fi # Note: AWS credentials now come from EC2 instance role via IMDSv2 # AWS_REGION and AWS_S3_BUCKET are set as environment variables in docker-compose.yml -# Create a temp directory for JNA (needed for Tesseract/JNA native library loading) -# This avoids issues with /tmp being mounted noexec -# Use /var/tmp which is typically not noexec and world-writable -mkdir -p /var/tmp/jna 2>/dev/null || true -chmod 1777 /var/tmp/jna 2>/dev/null || true +# Source the detected tessdata path (set during Docker build) +# This ensures TESSDATA_PREFIX matches the actual Tesseract version installed; sourcing it replaces any TESSDATA_PREFIX already present in the environment +if [ -f /etc/profile.d/tessdata.sh ]; then + . /etc/profile.d/tessdata.sh + echo "Loaded TESSDATA_PREFIX from detection: ${TESSDATA_PREFIX}" +else + echo "Warning: /etc/profile.d/tessdata.sh not found, using default TESSDATA_PREFIX" +fi # Drop privileges and start Java app # Ubuntu Jammy uses gosu (installed in Dockerfile) -# Set jna.tmpdir to avoid /tmp noexec issues with native libraries +# JNA temp directory is now set via JAVA_TOOL_OPTIONS (-Djna.tmpdir=/tmp) in the Dockerfile and docker-compose.yml if command -v gosu >/dev/null 2>&1; then - exec gosu appuser java -Djna.tmpdir=/var/tmp/jna -jar /app/app.jar "$@" + exec gosu appuser java -jar /app/app.jar "$@" elif command -v runuser >/dev/null 2>&1; then - exec runuser -u appuser -- java -Djna.tmpdir=/var/tmp/jna -jar /app/app.jar "$@" + exec runuser -u appuser -- java -jar /app/app.jar "$@" else - exec su -s /bin/sh -c 'exec java -Djna.tmpdir=/var/tmp/jna -jar /app/app.jar "$@"' appuser -- "$@" + exec su -s /bin/sh -c 'exec java -jar /app/app.jar "$@"' appuser -- "$@" fi diff --git a/src/main/java/org/ddamme/service/ai/OcrJobHandler.java b/src/main/java/org/ddamme/service/ai/OcrJobHandler.java index 466bdf4..aea71dc 100644 --- a/src/main/java/org/ddamme/service/ai/OcrJobHandler.java +++ 
b/src/main/java/org/ddamme/service/ai/OcrJobHandler.java @@ -54,6 +54,7 @@ public class OcrJobHandler implements JobHandler { "page_corrupt", // Per-page OCR failure "oom_guard", // OutOfMemoryError caught "s3_not_found", // NoSuchKeyException + "native_library_load_failed", // UnsatisfiedLinkError (JNA/Tesseract load failure) "unknown" // Catch-all for unexpected errors ); @@ -181,22 +182,32 @@ private Path downloadToTemp(FileMetadata metadata) throws IOException { * For PDFs, first attempts native text extraction before falling back to OCR. */ private OcrService.OcrResult extractText(Path file, String contentType) throws Exception { - if (isPdf(contentType)) { - // Try native text extraction first (huge CPU savings for digital PDFs) - OcrService.OcrResult nativeText = tryPdfTextExtraction(file); - if (nativeText != null) { - log.info("Using native PDF text extraction (skipping OCR): {} chars extracted", - nativeText.text().length()); - return nativeText; - } + try { + if (isPdf(contentType)) { + // Try native text extraction first (huge CPU savings for digital PDFs) + OcrService.OcrResult nativeText = tryPdfTextExtraction(file); + if (nativeText != null) { + log.info("Using native PDF text extraction (skipping OCR): {} chars extracted", + nativeText.text().length()); + return nativeText; + } - // Fall back to OCR for scanned PDFs - log.debug("Native text extraction insufficient, using OCR"); - return ocrService.extractTextFromPdf(file); - } else if (isImage(contentType)) { - return ocrService.extractTextFromImage(file); - } else { - throw new IllegalArgumentException("Unsupported content type for OCR: " + contentType); + // Fall back to OCR for scanned PDFs + log.debug("Native text extraction insufficient, using OCR"); + return ocrService.extractTextFromPdf(file); + } else if (isImage(contentType)) { + return ocrService.extractTextFromImage(file); + } else { + throw new IllegalArgumentException("Unsupported content type for OCR: " + contentType); + } + } catch 
(UnsatisfiedLinkError unsatisfiedLinkError) { + // Native library load failure (likely tmpfs noexec or missing JNA dependencies) + recordError("native_library_load_failed"); + log.error("OCR native library (JNA/Tesseract/Leptonica) failed to load. " + + "Check tmpfs has exec permissions, presence of libleptonica/libtesseract shared libraries, " + + "and JNA temp directory is accessible.", unsatisfiedLinkError); + throw new RuntimeException("OCR native library failed to load: " + unsatisfiedLinkError.getMessage(), + unsatisfiedLinkError); } } diff --git a/src/main/java/org/ddamme/service/ai/OcrService.java b/src/main/java/org/ddamme/service/ai/OcrService.java index 799c5a7..6d77b1b 100644 --- a/src/main/java/org/ddamme/service/ai/OcrService.java +++ b/src/main/java/org/ddamme/service/ai/OcrService.java @@ -175,23 +175,43 @@ public OcrResult extractTextFromPdf(Path pdfPath) throws IOException, TesseractE /** * Create and configure Tesseract instance. + * + * Tess4J expects the directory that contains *.traineddata files directly. + * With the Tesseract 5.x packages from the alex-p PPA this is /usr/share/tesseract-ocr/5/tessdata; stock Ubuntu 22.04 (Tesseract 4.x) typically uses /usr/share/tesseract-ocr/4.00/tessdata */ private Tesseract createTesseract() { - Tesseract tesseract = new Tesseract(); + Tesseract tesseractInstance = new Tesseract(); + + String tessdataDirectoryPath = properties.getOcr().getDataPath(); + + // Validate language data file exists (helps with debugging path issues) + String languageCode = properties.getOcr().getLanguage(); + Path languageDataFilePath = Path.of(tessdataDirectoryPath, languageCode + ".traineddata"); + + if (!java.nio.file.Files.exists(languageDataFilePath)) { + String errorMessage = String.format( + "Tesseract language file not found at %s. 
" + + "Verify TESSDATA_PREFIX points to directory containing *.traineddata files " + + "(e.g., /usr/share/tesseract-ocr/4.00/tessdata)", + languageDataFilePath); + log.error(errorMessage); + throw new IllegalStateException(errorMessage); + } - // Set data path for tessdata - tesseract.setDatapath(properties.getOcr().getDataPath()); + // Pass the directory that actually contains *.traineddata files + tesseractInstance.setDatapath(tessdataDirectoryPath); + log.debug("Tesseract datapath set to: {} (language: {})", tessdataDirectoryPath, languageCode); // Set language from configuration - tesseract.setLanguage(properties.getOcr().getLanguage()); + tesseractInstance.setLanguage(languageCode); // PSM 3: Fully automatic page segmentation (default) - tesseract.setPageSegMode(3); + tesseractInstance.setPageSegMode(3); // OEM 1: Neural nets LSTM engine only (best accuracy) - tesseract.setOcrEngineMode(1); + tesseractInstance.setOcrEngineMode(1); - return tesseract; + return tesseractInstance; } /** diff --git a/src/main/resources/application.yml b/src/main/resources/application.yml index 1374e7a..8f17f07 100644 --- a/src/main/resources/application.yml +++ b/src/main/resources/application.yml @@ -140,7 +140,7 @@ ai: ocr: max-pages: 50 # Cost control for large PDFs language: eng # Tesseract language (eng, fra, deu, spa, etc.) - data-path: ${TESSDATA_PREFIX:/usr/share/tesseract-ocr/4.00/tessdata} + data-path: ${TESSDATA_PREFIX:/usr/share/tesseract-ocr/5/tessdata} auto-create: ${AI_OCR_AUTO_CREATE:true} file-types: - application/pdf