Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
69 changes: 58 additions & 11 deletions Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -31,19 +31,62 @@ WORKDIR /app
# Pin Datadog Java agent version
ARG DD_JAVA_AGENT_VERSION=1.52.1

# Install runtime dependencies including Tesseract OCR
# Note: Ubuntu 22.04 (Jammy) ships with Tesseract 4.1.1
# Tess4J 5.16.0 is compatible with Tesseract 4.x and 5.x
# Install runtime dependencies including Tesseract OCR and Leptonica
# Use PPA for newer Tesseract/Leptonica versions that match lept4j ABI expectations
RUN apt-get update && apt-get install -y --no-install-recommends \
software-properties-common \
&& add-apt-repository -y ppa:alex-p/tesseract-ocr-devel \
&& apt-get update && apt-get install -y --no-install-recommends \
curl \
gosu \
tesseract-ocr \
tesseract-ocr-eng && \
rm -rf /var/lib/apt/lists/* && \
useradd -u 10001 -m -s /bin/sh appuser

# Verify Tesseract is installed and working (4.x or 5.x)
RUN tesseract --version 2>&1 | head -1
tesseract-ocr-eng \
tesseract-ocr-osd \
libtesseract-dev \
libleptonica-dev \
&& ln -sf /usr/lib/x86_64-linux-gnu/liblept.so /usr/lib/x86_64-linux-gnu/libleptonica.so \
&& ldconfig \
&& rm -rf /var/lib/apt/lists/* \
&& useradd -u 10001 -m -s /bin/sh appuser

# Verify Tesseract installation and detect tessdata directory
# Using tess4j 5.9.0 which is compatible with Ubuntu 22.04's Leptonica 1.82.0
# NOTE(review): the apt step above enables the alex-p PPA, so the installed
# Leptonica may be newer than 1.82.0 - confirm against the version this step prints.
# Tesseract 5.x from PPA uses /usr/share/tesseract-ocr/5/tessdata
# NOTE: Detection runs at BUILD TIME and writes result for RUNTIME use
# (/etc/profile.d/tessdata.sh exports TESSDATA_PREFIX for shells that source it).
#
# `tesseract --version` is captured with a plain assignment instead of being piped
# straight into `head`: the default /bin/sh has no pipefail, so a pipeline reports
# only head's exit status and a tesseract binary that cannot start (for example a
# shared-library mismatch) would not fail the build. With `set -e`, a failing
# command substitution in an assignment aborts this step.
RUN set -eux; \
    echo "=== Tesseract version ==="; \
    tesseractVersionOutput="$(tesseract --version 2>&1)"; \
    printf '%s\n' "$tesseractVersionOutput" | head -3; \
    echo ""; \
    echo "=== Installed Leptonica version ==="; \
    dpkg -l | grep leptonica || true; \
    echo ""; \
    echo "=== Detecting tessdata directory ==="; \
    if [ -f "/usr/share/tesseract-ocr/5/tessdata/eng.traineddata" ]; then \
        detectedTessdataDirectory="/usr/share/tesseract-ocr/5/tessdata"; \
    elif [ -f "/usr/share/tesseract-ocr/4.00/tessdata/eng.traineddata" ]; then \
        detectedTessdataDirectory="/usr/share/tesseract-ocr/4.00/tessdata"; \
    elif [ -f "/usr/share/tessdata/eng.traineddata" ]; then \
        detectedTessdataDirectory="/usr/share/tessdata"; \
    else \
        echo "ERROR: eng.traineddata not found in any standard location"; \
        echo "Searched paths:"; \
        ls -la /usr/share/tesseract-ocr/ || true; \
        ls -la /usr/share/tessdata/ 2>/dev/null || true; \
        exit 1; \
    fi; \
    echo "✓ Found tessdata directory: $detectedTessdataDirectory"; \
    ls -l "$detectedTessdataDirectory/eng.traineddata"; \
    echo ""; \
    echo "=== Persisting detected path for runtime ==="; \
    mkdir -p /etc/profile.d; \
    echo "export TESSDATA_PREFIX=$detectedTessdataDirectory" > /etc/profile.d/tessdata.sh; \
    chmod 644 /etc/profile.d/tessdata.sh; \
    echo ""; \
    echo "=== Configuration Summary ==="; \
    echo "Tesseract: $(printf '%s\n' "$tesseractVersionOutput" | head -1)"; \
    echo "Leptonica: $(dpkg -l | grep libleptonica | awk '{print $3}')"; \
    echo "Tessdata: $detectedTessdataDirectory"; \
    echo "Using tess4j 5.9.0 (compatible with Leptonica 1.82.0)"

# Copy the boot JAR from build stage
COPY --from=build /src/build/libs/app.jar /app/app.jar
Expand All @@ -63,11 +106,15 @@ RUN set -eux; \
fi || echo "Datadog agent download failed, continuing without it"

# Environment variables for Tesseract
# Tesseract 5.x from alex-p PPA uses /usr/share/tesseract-ocr/5/tessdata
# Tess4J expects the directory that contains *.traineddata files
# NOTE: docker-compose.yml can override this if needed
ENV OMP_THREAD_LIMIT=1
ENV TESSDATA_PREFIX=/usr/share/tesseract-ocr/4.00/tessdata
ENV TESSDATA_PREFIX=/usr/share/tesseract-ocr/5/tessdata
Comment thread
cursor[bot] marked this conversation as resolved.

# Free-tier friendly memory settings (prevents OOM on t3.micro/small)
ENV JAVA_TOOL_OPTIONS="-javaagent:/app/dd-java-agent.jar -XX:MaxRAMPercentage=75.0 -XX:+UseSerialGC"
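# -Djava.io.tmpdir=/tmp and -Djna.tmpdir=/tmp point temp files and JNA's native-library extraction at /tmp,
# which must be writable and executable when the root filesystem is read-only (critical for standalone runs).
# -Djna.nosys=false lets JNA try a system-installed libjnidispatch before falling back to the copy bundled in the JAR.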
ENV JAVA_TOOL_OPTIONS="-javaagent:/app/dd-java-agent.jar -XX:MaxRAMPercentage=75.0 -XX:+UseSerialGC -Djava.io.tmpdir=/tmp -Djna.tmpdir=/tmp -Djna.nosys=false"
ENV HOME=/tmp

EXPOSE 8080
Expand Down
4 changes: 3 additions & 1 deletion build.gradle.kts
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,9 @@ dependencies {
implementation("org.springdoc:springdoc-openapi-starter-webmvc-ui:2.8.13")

// OCR and PDF processing
implementation("net.sourceforge.tess4j:tess4j:5.16.0")
// Using tess4j 5.9.0 which is compatible with Leptonica 1.82.0 (Ubuntu 22.04 default)
// Newer versions (5.10.0+) require Leptonica 1.84.0+ which isn't available in Ubuntu repos
implementation("net.sourceforge.tess4j:tess4j:5.9.0")
implementation("org.apache.pdfbox:pdfbox:3.0.6")
implementation("net.java.dev.jna:jna:5.18.1")

Expand Down
10 changes: 8 additions & 2 deletions docker-compose.yml
Original file line number Diff line number Diff line change
Expand Up @@ -44,14 +44,16 @@ services:
condition: service_healthy
ports: [ "8080:8080" ]
read_only: true
tmpfs: [ "/tmp","/var/tmp" ]
tmpfs:
- /tmp:rw,exec,nosuid,nodev,mode=1777
- /var/tmp:rw,exec,nosuid,nodev,mode=1777
cap_drop: [ ALL ]
cap_add: [ SETUID, SETGID ]
security_opt: [ "no-new-privileges:true" ]
init: true
environment:
# Datadog
JAVA_TOOL_OPTIONS: "-javaagent:/app/dd-java-agent.jar -XX:MaxRAMPercentage=75.0 -XX:+UseSerialGC"
JAVA_TOOL_OPTIONS: "-javaagent:/app/dd-java-agent.jar -XX:MaxRAMPercentage=75.0 -XX:+UseSerialGC -Djava.io.tmpdir=/tmp -Djna.tmpdir=/tmp -Djna.nosys=false"
DD_AGENT_HOST: datadog
DD_SERVICE: file-system-app
DD_ENV: dev
Expand All @@ -66,6 +68,10 @@ services:
MANAGEMENT_ENDPOINT_HEALTH_PROBES_ENABLED: "true"
MANAGEMENT_ENDPOINTS_WEB_EXPOSURE_INCLUDE: "health,info,prometheus"

# Tesseract/OCR Configuration
TESSDATA_PREFIX: /usr/share/tesseract-ocr/5/tessdata
JNA_TMPDIR: /tmp

# DB
SPRING_DATASOURCE_URL: jdbc:postgresql://postgres-db:5432/file_system_db
SPRING_DATASOURCE_USERNAME: user
Expand Down
21 changes: 12 additions & 9 deletions load-secrets.sh
Original file line number Diff line number Diff line change
Expand Up @@ -36,20 +36,23 @@ fi
# Note: AWS credentials now come from EC2 instance role via IMDSv2
# AWS_REGION and AWS_S3_BUCKET are set as environment variables in docker-compose.yml

# Create a temp directory for JNA (needed for Tesseract/JNA native library loading)
# This avoids issues with /tmp being mounted noexec
# Use /var/tmp which is typically not noexec and world-writable
mkdir -p /var/tmp/jna 2>/dev/null || true
chmod 1777 /var/tmp/jna 2>/dev/null || true
# Pick up the tessdata location that the Docker build detected and wrote to
# /etc/profile.d/tessdata.sh, so TESSDATA_PREFIX matches the Tesseract version
# actually installed in the image. If the file is missing, only warn and keep
# whatever TESSDATA_PREFIX the environment already provides.
tessdata_profile_script=/etc/profile.d/tessdata.sh
if [ ! -f "$tessdata_profile_script" ]; then
    echo "Warning: /etc/profile.d/tessdata.sh not found, using default TESSDATA_PREFIX"
else
    . "$tessdata_profile_script"
    echo "Loaded TESSDATA_PREFIX from detection: ${TESSDATA_PREFIX}"
fi

# Drop privileges and start Java app
# Ubuntu Jammy uses gosu (installed in Dockerfile)
# Set jna.tmpdir to avoid /tmp noexec issues with native libraries
# JNA temp directory (-Djna.tmpdir) is now set via JAVA_TOOL_OPTIONS in the Dockerfile and docker-compose.yml
if command -v gosu >/dev/null 2>&1; then
exec gosu appuser java -Djna.tmpdir=/var/tmp/jna -jar /app/app.jar "$@"
exec gosu appuser java -jar /app/app.jar "$@"
elif command -v runuser >/dev/null 2>&1; then
exec runuser -u appuser -- java -Djna.tmpdir=/var/tmp/jna -jar /app/app.jar "$@"
exec runuser -u appuser -- java -jar /app/app.jar "$@"
else
exec su -s /bin/sh -c 'exec java -Djna.tmpdir=/var/tmp/jna -jar /app/app.jar "$@"' appuser -- "$@"
exec su -s /bin/sh -c 'exec java -jar /app/app.jar "$@"' appuser -- "$@"
fi

41 changes: 26 additions & 15 deletions src/main/java/org/ddamme/service/ai/OcrJobHandler.java
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,7 @@ public class OcrJobHandler implements JobHandler {
"page_corrupt", // Per-page OCR failure
"oom_guard", // OutOfMemoryError caught
"s3_not_found", // NoSuchKeyException
"native_library_load_failed", // UnsatisfiedLinkError (JNA/Tesseract load failure)
"unknown" // Catch-all for unexpected errors
);

Expand Down Expand Up @@ -181,22 +182,32 @@ private Path downloadToTemp(FileMetadata metadata) throws IOException {
* For PDFs, first attempts native text extraction before falling back to OCR.
*/
private OcrService.OcrResult extractText(Path file, String contentType) throws Exception {
if (isPdf(contentType)) {
// Try native text extraction first (huge CPU savings for digital PDFs)
OcrService.OcrResult nativeText = tryPdfTextExtraction(file);
if (nativeText != null) {
log.info("Using native PDF text extraction (skipping OCR): {} chars extracted",
nativeText.text().length());
return nativeText;
}
try {
if (isPdf(contentType)) {
// Try native text extraction first (huge CPU savings for digital PDFs)
OcrService.OcrResult nativeText = tryPdfTextExtraction(file);
if (nativeText != null) {
log.info("Using native PDF text extraction (skipping OCR): {} chars extracted",
nativeText.text().length());
return nativeText;
}

// Fall back to OCR for scanned PDFs
log.debug("Native text extraction insufficient, using OCR");
return ocrService.extractTextFromPdf(file);
} else if (isImage(contentType)) {
return ocrService.extractTextFromImage(file);
} else {
throw new IllegalArgumentException("Unsupported content type for OCR: " + contentType);
// Fall back to OCR for scanned PDFs
log.debug("Native text extraction insufficient, using OCR");
return ocrService.extractTextFromPdf(file);
} else if (isImage(contentType)) {
return ocrService.extractTextFromImage(file);
} else {
throw new IllegalArgumentException("Unsupported content type for OCR: " + contentType);
}
} catch (UnsatisfiedLinkError unsatisfiedLinkError) {
// Native library load failure (likely tmpfs noexec or missing JNA dependencies)
recordError("native_library_load_failed");
log.error("OCR native library (JNA/Tesseract/Leptonica) failed to load. " +
"Check tmpfs has exec permissions, presence of libleptonica/libtesseract shared libraries, " +
"and JNA temp directory is accessible.", unsatisfiedLinkError);
throw new RuntimeException("OCR native library failed to load: " + unsatisfiedLinkError.getMessage(),
unsatisfiedLinkError);
}
}

Expand Down
34 changes: 27 additions & 7 deletions src/main/java/org/ddamme/service/ai/OcrService.java
Original file line number Diff line number Diff line change
Expand Up @@ -175,23 +175,43 @@ public OcrResult extractTextFromPdf(Path pdfPath) throws IOException, TesseractE

/**
* Create and configure Tesseract instance.
*
* Tess4J expects the directory that contains *.traineddata files directly.
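* On Ubuntu 22.04 this is /usr/share/tesseract-ocr/5/tessdata with Tesseract 5.x from the alex-p PPA,
* or /usr/share/tesseract-ocr/4.00/tessdata with the distribution's stock Tesseract 4.x packages.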
*/
private Tesseract createTesseract() {
Tesseract tesseract = new Tesseract();
Tesseract tesseractInstance = new Tesseract();

String tessdataDirectoryPath = properties.getOcr().getDataPath();

// Validate language data file exists (helps with debugging path issues)
String languageCode = properties.getOcr().getLanguage();
Path languageDataFilePath = Path.of(tessdataDirectoryPath, languageCode + ".traineddata");

if (!java.nio.file.Files.exists(languageDataFilePath)) {
String errorMessage = String.format(
"Tesseract language file not found at %s. " +
"Verify TESSDATA_PREFIX points to directory containing *.traineddata files " +
"(e.g., /usr/share/tesseract-ocr/4.00/tessdata)",
languageDataFilePath);
log.error(errorMessage);
throw new IllegalStateException(errorMessage);
}

// Set data path for tessdata
tesseract.setDatapath(properties.getOcr().getDataPath());
// Pass the directory that actually contains *.traineddata files
tesseractInstance.setDatapath(tessdataDirectoryPath);
log.debug("Tesseract datapath set to: {} (language: {})", tessdataDirectoryPath, languageCode);

// Set language from configuration
tesseract.setLanguage(properties.getOcr().getLanguage());
tesseractInstance.setLanguage(languageCode);

// PSM 3: Fully automatic page segmentation (default)
tesseract.setPageSegMode(3);
tesseractInstance.setPageSegMode(3);

// OEM 1: Neural nets LSTM engine only (best accuracy)
tesseract.setOcrEngineMode(1);
tesseractInstance.setOcrEngineMode(1);

return tesseract;
return tesseractInstance;
}

/**
Expand Down
2 changes: 1 addition & 1 deletion src/main/resources/application.yml
Original file line number Diff line number Diff line change
Expand Up @@ -140,7 +140,7 @@ ai:
ocr:
max-pages: 50 # Cost control for large PDFs
language: eng # Tesseract language (eng, fra, deu, spa, etc.)
data-path: ${TESSDATA_PREFIX:/usr/share/tesseract-ocr/4.00/tessdata}
data-path: ${TESSDATA_PREFIX:/usr/share/tesseract-ocr/5/tessdata}
auto-create: ${AI_OCR_AUTO_CREATE:true}
file-types:
- application/pdf
Expand Down