ddamme05 · ddamme05 · Nov 8, 2025 · Nov 8, 2025 · Nov 8, 2025
diff --git a/Dockerfile b/Dockerfile
@@ -31,19 +31,62 @@ WORKDIR /app
 # Pin Datadog Java agent version
 ARG DD_JAVA_AGENT_VERSION=1.52.1
 
-# Install runtime dependencies including Tesseract OCR
-# Note: Ubuntu 22.04 (Jammy) ships with Tesseract 4.1.1
-# Tess4J 5.16.0 is compatible with Tesseract 4.x and 5.x
+# Install runtime dependencies including Tesseract OCR and Leptonica
+# Use PPA for newer Tesseract/Leptonica versions that match lept4j ABI expectations
 RUN apt-get update && apt-get install -y --no-install-recommends \
+      software-properties-common \
+    && add-apt-repository -y ppa:alex-p/tesseract-ocr-devel \
+    && apt-get update && apt-get install -y --no-install-recommends \
       curl \
       gosu \
       tesseract-ocr \
-      tesseract-ocr-eng && \
-    rm -rf /var/lib/apt/lists/* && \
-    useradd -u 10001 -m -s /bin/sh appuser
-
-# Verify Tesseract is installed and working (4.x or 5.x)
-RUN tesseract --version 2>&1 | head -1
+      tesseract-ocr-eng \
+      tesseract-ocr-osd \
+      libtesseract-dev \
+      libleptonica-dev \
+    && ln -sf /usr/lib/x86_64-linux-gnu/liblept.so /usr/lib/x86_64-linux-gnu/libleptonica.so \
+    && ldconfig \
+    && rm -rf /var/lib/apt/lists/* \
+    && useradd -u 10001 -m -s /bin/sh appuser
+
+# Verify Tesseract installation and detect tessdata directory
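+# Using tess4j 5.9.0, chosen for compatibility with Leptonica 1.82.0 (Ubuntu 22.04's stock version); NOTE(review): the PPA above may install a newer Leptonica - check the dpkg output printed below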
+# Tesseract 5.x from PPA uses /usr/share/tesseract-ocr/5/tessdata
+# NOTE: Detection runs at BUILD TIME and writes result for RUNTIME use
+RUN set -eux; \
+    echo "=== Tesseract version ==="; \
+    tesseract --version 2>&1 | head -3; \
+    echo ""; \
+    echo "=== Installed Leptonica version ==="; \
+    dpkg -l | grep leptonica || true; \
+    echo ""; \
+    echo "=== Detecting tessdata directory ==="; \
+    if [ -f "/usr/share/tesseract-ocr/5/tessdata/eng.traineddata" ]; then \
+      detectedTessdataDirectory="/usr/share/tesseract-ocr/5/tessdata"; \
+    elif [ -f "/usr/share/tesseract-ocr/4.00/tessdata/eng.traineddata" ]; then \
+      detectedTessdataDirectory="/usr/share/tesseract-ocr/4.00/tessdata"; \
+    elif [ -f "/usr/share/tessdata/eng.traineddata" ]; then \
+      detectedTessdataDirectory="/usr/share/tessdata"; \
+    else \
+      echo "ERROR: eng.traineddata not found in any standard location"; \
+      echo "Searched paths:"; \
+      ls -la /usr/share/tesseract-ocr/ || true; \
+      ls -la /usr/share/tessdata/ 2>/dev/null || true; \
+      exit 1; \
+    fi; \
+    echo "✓ Found tessdata directory: $detectedTessdataDirectory"; \
+    ls -l "$detectedTessdataDirectory/eng.traineddata"; \
+    echo ""; \
+    echo "=== Persisting detected path for runtime ==="; \
+    mkdir -p /etc/profile.d; \
+    echo "export TESSDATA_PREFIX=$detectedTessdataDirectory" > /etc/profile.d/tessdata.sh; \
+    chmod 644 /etc/profile.d/tessdata.sh; \
+    echo ""; \
+    echo "=== Configuration Summary ==="; \
+    echo "Tesseract: $(tesseract --version 2>&1 | head -1)"; \
+    echo "Leptonica: $(dpkg -l | grep libleptonica | awk '{print $3}')"; \
+    echo "Tessdata: $detectedTessdataDirectory"; \
+    echo "Using tess4j 5.9.0 (compatible with Leptonica 1.82.0)"
 
 # Copy the boot JAR from build stage
 COPY --from=build /src/build/libs/app.jar /app/app.jar
@@ -63,11 +106,15 @@ RUN set -eux; \
     fi || echo "Datadog agent download failed, continuing without it"
 
 # Environment variables for Tesseract
+# Tesseract 5.x from alex-p PPA uses /usr/share/tesseract-ocr/5/tessdata
+# Tess4J expects the directory that contains *.traineddata files
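+# NOTE: docker-compose.yml can override this, but load-secrets.sh re-exports the build-time detected path from /etc/profile.d/tessdata.sh whenever that file exists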
 ENV OMP_THREAD_LIMIT=1
-ENV TESSDATA_PREFIX=/usr/share/tesseract-ocr/4.00/tessdata
+ENV TESSDATA_PREFIX=/usr/share/tesseract-ocr/5/tessdata
 
 # Free-tier friendly memory settings (prevents OOM on t3.micro/small)
-ENV JAVA_TOOL_OPTIONS="-javaagent:/app/dd-java-agent.jar -XX:MaxRAMPercentage=75.0 -XX:+UseSerialGC"
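+# -Djna.tmpdir=/tmp makes JNA unpack its native library into the writable tmpfs (critical with a read-only root filesystem and for standalone runs); -Djna.nosys=false lets JNA use a system-installed copy if one is present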
+ENV JAVA_TOOL_OPTIONS="-javaagent:/app/dd-java-agent.jar -XX:MaxRAMPercentage=75.0 -XX:+UseSerialGC -Djava.io.tmpdir=/tmp -Djna.tmpdir=/tmp -Djna.nosys=false"
 ENV HOME=/tmp
 
 EXPOSE 8080

diff --git a/build.gradle.kts b/build.gradle.kts
@@ -48,7 +48,9 @@ dependencies {
     implementation("org.springdoc:springdoc-openapi-starter-webmvc-ui:2.8.13")
 
     // OCR and PDF processing
-    implementation("net.sourceforge.tess4j:tess4j:5.16.0")
+    // Using tess4j 5.9.0 which is compatible with Leptonica 1.82.0 (Ubuntu 22.04 default)
+    // Newer versions (5.10.0+) require Leptonica 1.84.0+ which isn't available in Ubuntu repos
+    implementation("net.sourceforge.tess4j:tess4j:5.9.0")
     implementation("org.apache.pdfbox:pdfbox:3.0.6")
     implementation("net.java.dev.jna:jna:5.18.1")
 

diff --git a/docker-compose.yml b/docker-compose.yml
@@ -44,14 +44,16 @@ services:
         condition: service_healthy
     ports: [ "8080:8080" ]
     read_only: true
-    tmpfs: [ "/tmp","/var/tmp" ]
+    tmpfs:
+      - /tmp:rw,exec,nosuid,nodev,mode=1777
+      - /var/tmp:rw,exec,nosuid,nodev,mode=1777
     cap_drop: [ ALL ]
     cap_add: [ SETUID, SETGID ]
     security_opt: [ "no-new-privileges:true" ]
     init: true
     environment:
       # Datadog
-      JAVA_TOOL_OPTIONS: "-javaagent:/app/dd-java-agent.jar -XX:MaxRAMPercentage=75.0 -XX:+UseSerialGC"
+      JAVA_TOOL_OPTIONS: "-javaagent:/app/dd-java-agent.jar -XX:MaxRAMPercentage=75.0 -XX:+UseSerialGC -Djava.io.tmpdir=/tmp -Djna.tmpdir=/tmp -Djna.nosys=false"
       DD_AGENT_HOST: datadog
       DD_SERVICE: file-system-app
       DD_ENV: dev
@@ -66,6 +68,10 @@ services:
       MANAGEMENT_ENDPOINT_HEALTH_PROBES_ENABLED: "true"
       MANAGEMENT_ENDPOINTS_WEB_EXPOSURE_INCLUDE: "health,info,prometheus"
 
+      # Tesseract/OCR Configuration
+      TESSDATA_PREFIX: /usr/share/tesseract-ocr/5/tessdata
+      JNA_TMPDIR: /tmp
+
       # DB
       SPRING_DATASOURCE_URL: jdbc:postgresql://postgres-db:5432/file_system_db
       SPRING_DATASOURCE_USERNAME: user

diff --git a/load-secrets.sh b/load-secrets.sh
@@ -36,20 +36,23 @@ fi
 # Note: AWS credentials now come from EC2 instance role via IMDSv2
 # AWS_REGION and AWS_S3_BUCKET are set as environment variables in docker-compose.yml
 
-# Create a temp directory for JNA (needed for Tesseract/JNA native library loading)
-# This avoids issues with /tmp being mounted noexec
-# Use /var/tmp which is typically not noexec and world-writable
-mkdir -p /var/tmp/jna 2>/dev/null || true
-chmod 1777 /var/tmp/jna 2>/dev/null || true
+# Source the detected tessdata path (set during Docker build)
+# This ensures TESSDATA_PREFIX matches the actual Tesseract version installed
+if [ -f /etc/profile.d/tessdata.sh ]; then
+  . /etc/profile.d/tessdata.sh
+  echo "Loaded TESSDATA_PREFIX from detection: ${TESSDATA_PREFIX}"
+else
+  echo "Warning: /etc/profile.d/tessdata.sh not found, using default TESSDATA_PREFIX"
+fi
 
 # Drop privileges and start Java app
 # Ubuntu Jammy uses gosu (installed in Dockerfile)
-# Set jna.tmpdir to avoid /tmp noexec issues with native libraries
+# JNA temp directory (-Djna.tmpdir) is now set via JAVA_TOOL_OPTIONS in the Dockerfile and docker-compose.yml
 if command -v gosu >/dev/null 2>&1; then
-  exec gosu appuser java -Djna.tmpdir=/var/tmp/jna -jar /app/app.jar "$@"
+  exec gosu appuser java -jar /app/app.jar "$@"
 elif command -v runuser >/dev/null 2>&1; then
-  exec runuser -u appuser -- java -Djna.tmpdir=/var/tmp/jna -jar /app/app.jar "$@"
+  exec runuser -u appuser -- java -jar /app/app.jar "$@"
 else
-  exec su -s /bin/sh -c 'exec java -Djna.tmpdir=/var/tmp/jna -jar /app/app.jar "$@"' appuser -- "$@"
+  exec su -s /bin/sh -c 'exec java -jar /app/app.jar "$@"' appuser -- "$@"
 fi
 
diff --git a/src/main/java/org/ddamme/service/ai/OcrJobHandler.java b/src/main/java/org/ddamme/service/ai/OcrJobHandler.java
@@ -54,6 +54,7 @@ public class OcrJobHandler implements JobHandler {
         "page_corrupt",       // Per-page OCR failure
         "oom_guard",          // OutOfMemoryError caught
         "s3_not_found",       // NoSuchKeyException
+        "native_library_load_failed", // UnsatisfiedLinkError (JNA/Tesseract load failure)
         "unknown"             // Catch-all for unexpected errors
     );
 
@@ -181,22 +182,32 @@ private Path downloadToTemp(FileMetadata metadata) throws IOException {
      * For PDFs, first attempts native text extraction before falling back to OCR.
      */
     private OcrService.OcrResult extractText(Path file, String contentType) throws Exception {
-        if (isPdf(contentType)) {
-            // Try native text extraction first (huge CPU savings for digital PDFs)
-            OcrService.OcrResult nativeText = tryPdfTextExtraction(file);
-            if (nativeText != null) {
-                log.info("Using native PDF text extraction (skipping OCR): {} chars extracted",
-                        nativeText.text().length());
-                return nativeText;
-            }
+        try {
+            if (isPdf(contentType)) {
+                // Try native text extraction first (huge CPU savings for digital PDFs)
+                OcrService.OcrResult nativeText = tryPdfTextExtraction(file);
+                if (nativeText != null) {
+                    log.info("Using native PDF text extraction (skipping OCR): {} chars extracted",
+                            nativeText.text().length());
+                    return nativeText;
+                }
 
-            // Fall back to OCR for scanned PDFs
-            log.debug("Native text extraction insufficient, using OCR");
-            return ocrService.extractTextFromPdf(file);
-        } else if (isImage(contentType)) {
-            return ocrService.extractTextFromImage(file);
-        } else {
-            throw new IllegalArgumentException("Unsupported content type for OCR: " + contentType);
+                // Fall back to OCR for scanned PDFs
+                log.debug("Native text extraction insufficient, using OCR");
+                return ocrService.extractTextFromPdf(file);
+            } else if (isImage(contentType)) {
+                return ocrService.extractTextFromImage(file);
+            } else {
+                throw new IllegalArgumentException("Unsupported content type for OCR: " + contentType);
+            }
+        } catch (UnsatisfiedLinkError unsatisfiedLinkError) {
+            // Native library load failure (likely tmpfs noexec or missing JNA dependencies)
+            recordError("native_library_load_failed");
+            log.error("OCR native library (JNA/Tesseract/Leptonica) failed to load. " +
+                     "Check tmpfs has exec permissions, presence of libleptonica/libtesseract shared libraries, " +
+                     "and JNA temp directory is accessible.", unsatisfiedLinkError);
+            throw new RuntimeException("OCR native library failed to load: " + unsatisfiedLinkError.getMessage(),
+                                     unsatisfiedLinkError);
         }
     }
 

diff --git a/src/main/java/org/ddamme/service/ai/OcrService.java b/src/main/java/org/ddamme/service/ai/OcrService.java
@@ -175,23 +175,43 @@ public OcrResult extractTextFromPdf(Path pdfPath) throws IOException, TesseractE
 
     /**
      * Create and configure Tesseract instance.
+     *
+     * Tess4J expects the directory that contains *.traineddata files directly.
+     * With the Tesseract 5.x packages installed by the Dockerfile this is /usr/share/tesseract-ocr/5/tessdata (Tesseract 4.x packages use /usr/share/tesseract-ocr/4.00/tessdata)
      */
     private Tesseract createTesseract() {
-        Tesseract tesseract = new Tesseract();
+        Tesseract tesseractInstance = new Tesseract();
+
+        String tessdataDirectoryPath = properties.getOcr().getDataPath();
+
+        // Validate language data file exists (helps with debugging path issues)
+        String languageCode = properties.getOcr().getLanguage();
+        Path languageDataFilePath = Path.of(tessdataDirectoryPath, languageCode + ".traineddata");
+
+        if (!java.nio.file.Files.exists(languageDataFilePath)) {
+            String errorMessage = String.format(
+                "Tesseract language file not found at %s. " +
+                "Verify TESSDATA_PREFIX points to directory containing *.traineddata files " +
+                "(e.g., /usr/share/tesseract-ocr/4.00/tessdata)",
+                languageDataFilePath);
+            log.error(errorMessage);
+            throw new IllegalStateException(errorMessage);
+        }
 
-        // Set data path for tessdata
-        tesseract.setDatapath(properties.getOcr().getDataPath());
+        // Pass the directory that actually contains *.traineddata files
+        tesseractInstance.setDatapath(tessdataDirectoryPath);
+        log.debug("Tesseract datapath set to: {} (language: {})", tessdataDirectoryPath, languageCode);
 
         // Set language from configuration
-        tesseract.setLanguage(properties.getOcr().getLanguage());
+        tesseractInstance.setLanguage(languageCode);
 
         // PSM 3: Fully automatic page segmentation (default)
-        tesseract.setPageSegMode(3);
+        tesseractInstance.setPageSegMode(3);
 
         // OEM 1: Neural nets LSTM engine only (best accuracy)
-        tesseract.setOcrEngineMode(1);
+        tesseractInstance.setOcrEngineMode(1);
 
-        return tesseract;
+        return tesseractInstance;
     }
 
     /**

diff --git a/src/main/resources/application.yml b/src/main/resources/application.yml
@@ -140,7 +140,7 @@ ai:
     ocr:
       max-pages: 50  # Cost control for large PDFs
       language: eng  # Tesseract language (eng, fra, deu, spa, etc.)
-      data-path: ${TESSDATA_PREFIX:/usr/share/tesseract-ocr/4.00/tessdata}
+      data-path: ${TESSDATA_PREFIX:/usr/share/tesseract-ocr/5/tessdata}
       auto-create: ${AI_OCR_AUTO_CREATE:true}
       file-types:
         - application/pdf