Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
23 commits
Select commit Hold shift + click to select a range
c2315ca
clear
ddamme05 Nov 7, 2025
fb347f4
chore(docker): migrate from Alpine to Ubuntu Jammy for Tesseract support
ddamme05 Nov 7, 2025
22485fa
fix(docker): configure JNA temp directory to avoid noexec /tmp issues
ddamme05 Nov 7, 2025
6e45293
build: add Tesseract OCR and PDF processing dependencies
ddamme05 Nov 7, 2025
35b7742
feat(database): add AI-related fields to FileMetadata entity
ddamme05 Nov 7, 2025
8b72233
fix(database): use underscore notation for nested property access in …
ddamme05 Nov 7, 2025
bcdb556
feat(storage): add downloadToFile method for local file access
ddamme05 Nov 7, 2025
552312a
feat(files): automatically create OCR jobs for PDFs and images on upload
ddamme05 Nov 7, 2025
26716ea
feat(errors): add global exception handler for InvalidRequestException
ddamme05 Nov 7, 2025
ac7f9c8
feat(config): add AI worker configuration and optimize connection poo…
ddamme05 Nov 7, 2025
b0aa84e
chore(config): reduce log noise in production
ddamme05 Nov 7, 2025
66337da
feat(frontend): add search page route and navigation
ddamme05 Nov 7, 2025
5ac134e
build(frontend): add DOMPurify for XSS protection
ddamme05 Nov 7, 2025
d72fbd5
feat(api): add HEAD request method to API client
ddamme05 Nov 7, 2025
f85d900
chore(git): ignore generated snapshot file
ddamme05 Nov 7, 2025
85ed25a
style: code quality fixes for test configuration
ddamme05 Nov 7, 2025
a0e6df6
feat(ai): add AI worker infrastructure
ddamme05 Nov 7, 2025
aba6c76
feat(database): add AI job tables and file metadata extensions
ddamme05 Nov 7, 2025
2ec7799
feat(search): add full-text search API
ddamme05 Nov 7, 2025
4ce7fb1
feat(frontend): add search page and OCR status badge
ddamme05 Nov 7, 2025
1859b81
test: add integration tests for AI and search functionality
ddamme05 Nov 7, 2025
fbb5bf3
correct Timestamp to Instant conversion
ddamme05 Nov 8, 2025
082b16e
[API-01] Fixed an OCR naming mismatch.
ddamme05 Nov 8, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,9 @@ aws/
*.md
!**/README.md

# Generated snapshots (regenerate as needed)
p1_integration_snapshot.md

# Frontend build outputs and dependencies
client/node_modules/
client/dist/
Expand Down
31 changes: 25 additions & 6 deletions Dockerfile
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
# ============================================
# Stage 1: Build the Spring Boot application
# ============================================
# Pin Gradle and JDK versions to prevent unexpected upgrades
FROM gradle:8.8-jdk21-alpine AS build

WORKDIR /src
Expand All @@ -19,17 +20,30 @@ COPY src ./src
RUN ./gradlew --no-daemon clean bootJar

# ============================================
# Stage 2: Runtime image
# Stage 2: Runtime image (Ubuntu Jammy for Tesseract)
# ============================================
FROM amazoncorretto:21-alpine
# Pin specific JRE version for reproducible builds
# Format: eclipse-temurin:{java_version}_{build_version}-jre-jammy
FROM eclipse-temurin:21.0.5_11-jre-jammy

WORKDIR /app

# Pin Datadog Java agent version
ARG DD_JAVA_AGENT_VERSION=1.52.1

# Install runtime dependencies (curl for healthchecks, su-exec for privilege dropping)
RUN apk add --no-cache curl su-exec && \
adduser -D -u 10001 -s /bin/sh appuser
# Install runtime dependencies including Tesseract OCR
# Note: Ubuntu 22.04 (Jammy) ships with Tesseract 4.1.1
# Tess4J 5.16.0 is compatible with Tesseract 4.x and 5.x
RUN apt-get update && apt-get install -y --no-install-recommends \
curl \
gosu \
tesseract-ocr \
tesseract-ocr-eng && \
rm -rf /var/lib/apt/lists/* && \
useradd -u 10001 -m -s /bin/sh appuser

# Verify Tesseract is installed and working (4.x or 5.x)
RUN tesseract --version 2>&1 | head -1

# Copy the boot JAR from build stage
COPY --from=build /src/build/libs/app.jar /app/app.jar
Expand All @@ -48,11 +62,16 @@ RUN set -eux; \
awk '{print $1" /app/dd-java-agent.jar"}' /tmp/dd.jar.sha512 | sha512sum -c -; \
fi || echo "Datadog agent download failed, continuing without it"

# Environment variables for Tesseract
ENV OMP_THREAD_LIMIT=1
ENV TESSDATA_PREFIX=/usr/share/tesseract-ocr/4.00/tessdata

# Free-tier friendly memory settings (prevents OOM on t3.micro/small)
ENV JAVA_TOOL_OPTIONS="-XX:MaxRAMPercentage=65.0 -XX:+UseSerialGC"
ENV JAVA_TOOL_OPTIONS="-javaagent:/app/dd-java-agent.jar -XX:MaxRAMPercentage=75.0 -XX:+UseSerialGC"
ENV HOME=/tmp

EXPOSE 8080

# Start as root to read secrets, then drop privileges in entrypoint
ENTRYPOINT ["/app/load-secrets.sh"]

125 changes: 0 additions & 125 deletions architecture.md

This file was deleted.

8 changes: 7 additions & 1 deletion build.gradle.kts
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ dependencies {

runtimeOnly("io.jsonwebtoken:jjwt-impl:0.12.6")
runtimeOnly("io.jsonwebtoken:jjwt-jackson:0.12.6")
runtimeOnly("org.postgresql:postgresql")
implementation("org.postgresql:postgresql")

compileOnly("org.projectlombok:lombok")

Expand All @@ -46,6 +46,12 @@ dependencies {
implementation("com.bucket4j:bucket4j-core:8.10.1")
implementation("com.github.ben-manes.caffeine:caffeine:3.2.2")
implementation("org.springdoc:springdoc-openapi-starter-webmvc-ui:2.8.13")

// OCR and PDF processing
implementation("net.sourceforge.tess4j:tess4j:5.16.0")
implementation("org.apache.pdfbox:pdfbox:3.0.6")
implementation("net.java.dev.jna:jna:5.18.1")

annotationProcessor("org.projectlombok:lombok")
annotationProcessor("org.springframework.boot:spring-boot-configuration-processor")
testImplementation("org.springframework.boot:spring-boot-starter-test")
Expand Down
17 changes: 17 additions & 0 deletions client/package-lock.json

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions client/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
},
"dependencies": {
"@tanstack/react-query": "^5.60.0",
"dompurify": "^3.3.0",
"lucide-react": "^0.453.0",
"react": "^19.2.0",
"react-dom": "^19.2.0",
Expand Down
11 changes: 11 additions & 0 deletions client/src/App.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ import {UploadPanel} from '@/components/files/UploadPanel';
import {LoginPage} from '@/pages/LoginPage';
import {RegisterPage} from '@/pages/RegisterPage';
import {FilesPage} from '@/pages/FilesPage';
import {SearchPage} from '@/pages/SearchPage';

const queryClient = new QueryClient({
defaultOptions: {
Expand Down Expand Up @@ -42,6 +43,16 @@ export function App() {
</AuthGuard>
}
/>
<Route
path="/search"
element={
<AuthGuard>
<AppShell>
<SearchPage/>
</AppShell>
</AuthGuard>
}
/>
<Route path="*" element={<Navigate to="/" replace/>}/>
</Routes>
</BrowserRouter>
Expand Down
8 changes: 8 additions & 0 deletions client/src/api/client.ts
Original file line number Diff line number Diff line change
Expand Up @@ -143,6 +143,14 @@ export async function apiRequest<T>(endpoint: string, options?: RequestInit): Pr

export const api = {
get: <T>(url: string) => apiRequest<T>(url, {method: 'GET'}),
head: (url: string) => fetch(url, {
method: 'HEAD',
headers: {
...(localStorage.getItem('auth_token') && {
Authorization: `Bearer ${localStorage.getItem('auth_token')}`
}),
},
}),
post: <T>(url: string, data?: unknown) =>
apiRequest<T>(url, {
method: 'POST',
Expand Down
63 changes: 63 additions & 0 deletions client/src/components/files/OcrStatusBadge.tsx
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
import { useHasText } from '../../hooks/useSearch';
import { FileText, Loader2, CheckCircle } from 'lucide-react';

/** Props accepted by the `OcrStatusBadge` component. */
interface OcrStatusBadgeProps {
/** Identifier of the file whose OCR text status is queried (forwarded to `useHasText`). */
fileId: number;
}

/**
 * Compact indicator of whether OCR-extracted text exists for a file.
 *
 * The lookup is delegated to the `useHasText` hook; the badge then renders one
 * of four states:
 *   - a spinner with "Checking..." while the query is loading,
 *   - a grey "Unknown" label when the query reports an error,
 *   - a grey "No text" label when the result does not flag any text,
 *   - a green "Text available (…)" label with the formatted character count.
 *
 * NOTE(review): the lookup is expected to be a body-less HEAD request inside
 * `useHasText` — confirm against the hook, which is defined elsewhere.
 *
 * @example
 * ```tsx
 * <OcrStatusBadge fileId={file.id} />
 * ```
 */
export function OcrStatusBadge({ fileId }: OcrStatusBadgeProps) {
  const { data: textInfo, isLoading: pending, error: failure } = useHasText(fileId);

  // The "Unknown" and "No text" states share identical greyed-out markup.
  const mutedBadge = (label: string) => (
    <div className="flex items-center gap-1 text-gray-400 text-sm">
      <FileText size={16} />
      <span>{label}</span>
    </div>
  );

  if (pending) {
    return (
      <div className="flex items-center gap-1 text-gray-500 text-sm">
        <Loader2 size={16} className="animate-spin" />
        <span>Checking...</span>
      </div>
    );
  }

  // An error takes precedence over whatever (possibly stale) data is present.
  if (failure) {
    return mutedBadge('Unknown');
  }

  if (textInfo?.hasText) {
    return (
      <div className="flex items-center gap-1 text-green-600 text-sm">
        <CheckCircle size={16} />
        <span>Text available ({formatTextLength(textInfo.textLength)})</span>
      </div>
    );
  }

  // Missing data and an explicit "no text" result are displayed the same way.
  return mutedBadge('No text');
}

/**
 * Formats a character count as a short human-readable label.
 *
 * - Non-finite, zero, or negative counts yield `'0 chars'`.
 * - Counts below 1000 are spelled out (`'1 char'`, `'999 chars'`).
 * - Counts that display as less than 10K keep one decimal (`'1.5K'`).
 * - Everything else is rounded to whole thousands (`'12K'`).
 *
 * @param chars - Number of characters of extracted text.
 * @returns The label shown inside the badge.
 */
function formatTextLength(chars: number): string {
  // Guard against NaN/undefined-at-runtime/negative input so the badge never
  // shows "NaNK" or a negative count.
  if (!Number.isFinite(chars) || chars <= 0) return '0 chars';
  if (chars < 1000) return `${chars} char${chars !== 1 ? 's' : ''}`;

  const thousands = chars / 1000;
  const oneDecimal = thousands.toFixed(1);
  // toFixed can round values just under 10K up to "10.0"; only keep the
  // decimal form when the displayed value is really below 10, so 9999 renders
  // as "10K" exactly like 10000 does.
  if (Number(oneDecimal) < 10) return `${oneDecimal}K`;
  return `${Math.round(thousands)}K`;
}



Loading