Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
25 changes: 25 additions & 0 deletions apache-tika/pom.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<!-- Maven build descriptor for the apache-tika module. -->
<modelVersion>4.0.0</modelVersion>
<!-- Coordinates of this module. -->
<groupId>com.baeldung</groupId>
<artifactId>apache-tika</artifactId>
<version>0.0.1-SNAPSHOT</version>

<!-- Inherits shared build configuration from the parent-modules POM. -->
<parent>
<groupId>com.baeldung</groupId>
<artifactId>parent-modules</artifactId>
<version>1.0.0-SNAPSHOT</version>
</parent>

<!-- Dependency versions are declared once here and referenced below via ${...}. -->
<properties>
<tika.version>1.17</tika.version>
</properties>

<dependencies>
<!-- Apache Tika parsers artifact; its version is taken from the tika.version property. -->
<dependency>
<groupId>org.apache.tika</groupId>
<artifactId>tika-parsers</artifactId>
<version>${tika.version}</version>
</dependency>
</dependencies>
</project>
67 changes: 67 additions & 0 deletions apache-tika/src/main/java/com/baeldung/tika/TikaAnalysis.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
package com.baeldung.tika;

import java.io.BufferedInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.Reader;

import org.apache.tika.Tika;
import org.apache.tika.detect.DefaultDetector;
import org.apache.tika.detect.Detector;
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.sax.BodyContentHandler;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;

/**
 * Static helpers showing three Apache Tika tasks -- media type detection, text extraction and
 * metadata extraction -- each done once with the lower-level API ({@link Detector} /
 * {@link Parser}) and once with the {@link Tika} facade.
 */
public class TikaAnalysis {
    /**
     * Detects the media type of a document with a {@link DefaultDetector}.
     *
     * @param stream document content; wrapped in a {@link BufferedInputStream} when it does not
     *               support mark/reset
     * @return the detected media type rendered as a string
     * @throws IOException if the stream cannot be read
     */
    public static String detectDocTypeUsingDetector(InputStream stream) throws IOException {
        Detector detector = new DefaultDetector();
        Metadata metadata = new Metadata();

        // Detectors expect a stream that supports mark/reset so they can peek at the leading
        // bytes and rewind; give them a buffered wrapper when the caller's stream cannot do that.
        InputStream markable = stream;
        if (stream != null && !stream.markSupported()) {
            markable = new BufferedInputStream(stream);
        }

        MediaType mediaType = detector.detect(markable, metadata);
        return mediaType.toString();
    }

    /**
     * Detects the media type of a document with the {@link Tika} facade.
     *
     * @param stream document content
     * @return the detected media type
     * @throws IOException if the stream cannot be read
     */
    public static String detectDocTypeUsingFacade(InputStream stream) throws IOException {
        Tika tika = new Tika();
        String mediaType = tika.detect(stream);
        return mediaType;
    }

    /**
     * Extracts the textual body of a document with an {@link AutoDetectParser} feeding a
     * {@link BodyContentHandler}.
     *
     * @param stream document content
     * @return the text collected by the content handler
     * @throws IOException   if the stream cannot be read
     * @throws TikaException if the document cannot be parsed
     * @throws SAXException  if the content handler fails to process the parser output
     */
    public static String extractContentUsingParser(InputStream stream) throws IOException, TikaException, SAXException {
        Parser parser = new AutoDetectParser();
        ContentHandler handler = new BodyContentHandler();
        Metadata metadata = new Metadata();
        ParseContext context = new ParseContext();

        parser.parse(stream, handler, metadata, context);
        return handler.toString();
    }

    /**
     * Extracts the textual body of a document with the {@link Tika} facade.
     *
     * @param stream document content
     * @return the extracted text
     * @throws IOException   if the stream cannot be read
     * @throws TikaException if the document cannot be parsed
     */
    public static String extractContentUsingFacade(InputStream stream) throws IOException, TikaException {
        Tika tika = new Tika();
        String content = tika.parseToString(stream);
        return content;
    }

    /**
     * Extracts document metadata with an {@link AutoDetectParser}; the parsed body text is
     * collected by the handler but not returned.
     *
     * <p>The spelling of the method name is kept as-is because existing callers use it.
     *
     * @param stream document content
     * @return the metadata populated during parsing
     * @throws IOException   if the stream cannot be read
     * @throws SAXException  if the content handler fails to process the parser output
     * @throws TikaException if the document cannot be parsed
     */
    public static Metadata extractMetadatatUsingParser(InputStream stream) throws IOException, SAXException, TikaException {
        Parser parser = new AutoDetectParser();
        ContentHandler handler = new BodyContentHandler();
        Metadata metadata = new Metadata();
        ParseContext context = new ParseContext();

        parser.parse(stream, handler, metadata, context);
        return metadata;
    }

    /**
     * Extracts document metadata with the {@link Tika} facade.
     *
     * <p>The facade hands back a {@link Reader} over the parsed text. That reader is read to
     * the end and closed here so the parse has finished, and its resources are released,
     * before the metadata is returned.
     *
     * <p>The spelling of the method name is kept as-is because existing callers use it.
     *
     * @param stream document content
     * @return the metadata populated during parsing
     * @throws IOException   if the stream cannot be read or the parse fails while being consumed
     * @throws TikaException declared for interface compatibility with existing callers
     */
    public static Metadata extractMetadatatUsingFacade(InputStream stream) throws IOException, TikaException {
        Tika tika = new Tika();
        Metadata metadata = new Metadata();

        try (Reader reader = tika.parse(stream, metadata)) {
            // Only the metadata is of interest: consume and throw away the text.
            char[] discarded = new char[4096];
            while (reader.read(discarded) != -1) {
                // keep draining until end of document
            }
        }
        return metadata;
    }
}
79 changes: 79 additions & 0 deletions apache-tika/src/test/java/com/baeldung/tika/TikaUnitTest.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,79 @@
package com.baeldung.tika;

import static org.hamcrest.CoreMatchers.containsString;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertThat;

import java.io.IOException;
import java.io.InputStream;

import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
import org.junit.Test;
import org.xml.sax.SAXException;

/**
 * Unit tests for {@link TikaAnalysis}, run against the fixture documents on the test classpath.
 *
 * <p>Every test opens its fixture in a try-with-resources block so the stream is closed even
 * when an assertion fails or the code under test throws.
 */
public class TikaUnitTest {
    @Test
    public void whenUsingDetector_thenDocumentTypeIsReturned() throws IOException {
        // NOTE(review): the fixture is named .txt but the expected type is PDF -- presumably a
        // PDF with a misleading extension to show content-based detection; confirm.
        try (InputStream stream = openResource("tika.txt")) {
            String mediaType = TikaAnalysis.detectDocTypeUsingDetector(stream);

            assertEquals("application/pdf", mediaType);
        }
    }

    @Test
    public void whenUsingFacade_thenDocumentTypeIsReturned() throws IOException {
        try (InputStream stream = openResource("tika.txt")) {
            String mediaType = TikaAnalysis.detectDocTypeUsingFacade(stream);

            assertEquals("application/pdf", mediaType);
        }
    }

    @Test
    public void whenUsingParser_thenContentIsReturned() throws IOException, TikaException, SAXException {
        try (InputStream stream = openResource("tika.docx")) {
            String content = TikaAnalysis.extractContentUsingParser(stream);

            assertThat(content, containsString("Apache Tika - a content analysis toolkit"));
            assertThat(content, containsString("detects and extracts metadata and text"));
        }
    }

    @Test
    public void whenUsingFacade_thenContentIsReturned() throws IOException, TikaException {
        try (InputStream stream = openResource("tika.docx")) {
            String content = TikaAnalysis.extractContentUsingFacade(stream);

            assertThat(content, containsString("Apache Tika - a content analysis toolkit"));
            assertThat(content, containsString("detects and extracts metadata and text"));
        }
    }

    @Test
    public void whenUsingParser_thenMetadataIsReturned() throws IOException, TikaException, SAXException {
        try (InputStream stream = openResource("tika.xlsx")) {
            Metadata metadata = TikaAnalysis.extractMetadatatUsingParser(stream);

            assertEquals("org.apache.tika.parser.DefaultParser", metadata.get("X-Parsed-By"));
            assertEquals("Microsoft Office User", metadata.get("Author"));
        }
    }

    @Test
    public void whenUsingFacade_thenMetadataIsReturned() throws IOException, TikaException {
        try (InputStream stream = openResource("tika.xlsx")) {
            Metadata metadata = TikaAnalysis.extractMetadatatUsingFacade(stream);

            assertEquals("org.apache.tika.parser.DefaultParser", metadata.get("X-Parsed-By"));
            assertEquals("Microsoft Office User", metadata.get("Author"));
        }
    }

    /** Opens a fixture from the test classpath through this class's class loader. */
    private InputStream openResource(String name) {
        return this.getClass().getClassLoader().getResourceAsStream(name);
    }
}
Binary file added apache-tika/src/test/resources/tika.docx
Binary file not shown.
Binary file added apache-tika/src/test/resources/tika.txt
Binary file not shown.
Binary file added apache-tika/src/test/resources/tika.xlsx
Binary file not shown.
1 change: 1 addition & 0 deletions pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@
<module>apache-cxf</module>
<module>apache-fop</module>
<module>apache-poi</module>
<module>apache-tika</module>
<module>apache-thrift</module>
<module>autovalue</module>
<module>axon</module>
Expand Down