From d8c16a2990b795bdf721cf83cbb817ea243094cc Mon Sep 17 00:00:00 2001 From: madhawa-gunasekara Date: Mon, 4 Apr 2016 00:14:19 +0530 Subject: [PATCH 1/3] Adding NGram range support for feature extraction --- .../tools/doccat/NGramFeatureGenerator.java | 23 +++++++++++++++++-- 1 file changed, 21 insertions(+), 2 deletions(-) diff --git a/opennlp-tools/src/main/java/opennlp/tools/doccat/NGramFeatureGenerator.java b/opennlp-tools/src/main/java/opennlp/tools/doccat/NGramFeatureGenerator.java index 1c9441113..fd1901893 100644 --- a/opennlp-tools/src/main/java/opennlp/tools/doccat/NGramFeatureGenerator.java +++ b/opennlp-tools/src/main/java/opennlp/tools/doccat/NGramFeatureGenerator.java @@ -27,12 +27,31 @@ */ public class NGramFeatureGenerator implements FeatureGenerator { + //default values for bigrams + private int minGram = 2; + private int maxGram = 2; + + public NGramFeatureGenerator() { + } + + public void NGramFeatureGenerator(int minGram, int maxGram) { + this.minGram = minGram; + this.maxGram = maxGram; + } + public Collection extractFeatures(String[] text, Map extraInfo) { List features = new ArrayList(); - for (int i = 0; i < text.length - 1; i++) { - features.add("ng=" + text[i] + ":" + text[i + 1]); + for (int i = 0; i <= text.length - minGram; i++) { + String feature = "ng="; + for (int y = 0; y < maxGram && i + y < text.length; y++) { + feature = feature + ":" + text[i + y]; + int gramCount = y + 1; + if (maxGram >= gramCount && gramCount >= minGram) { + features.add(feature); + } + } } return features; From 6376eaeea968c900efb8894482b2680439e0fe56 Mon Sep 17 00:00:00 2001 From: madhawa-gunasekara Date: Tue, 5 Apr 2016 08:21:14 +0530 Subject: [PATCH 2/3] Adding validation for N-Gram range --- .../tools/doccat/NGramFeatureGenerator.java | 21 ++++++++++++++++--- 1 file changed, 18 insertions(+), 3 deletions(-) diff --git a/opennlp-tools/src/main/java/opennlp/tools/doccat/NGramFeatureGenerator.java 
b/opennlp-tools/src/main/java/opennlp/tools/doccat/NGramFeatureGenerator.java index fd1901893..079b9a370 100644 --- a/opennlp-tools/src/main/java/opennlp/tools/doccat/NGramFeatureGenerator.java +++ b/opennlp-tools/src/main/java/opennlp/tools/doccat/NGramFeatureGenerator.java @@ -17,6 +17,8 @@ package opennlp.tools.doccat; +import opennlp.tools.util.InvalidFormatException; + import java.util.ArrayList; import java.util.Collection; import java.util.List; @@ -34,11 +36,24 @@ public class NGramFeatureGenerator implements FeatureGenerator { public NGramFeatureGenerator() { } - public void NGramFeatureGenerator(int minGram, int maxGram) { - this.minGram = minGram; - this.maxGram = maxGram; + public void NGramFeatureGenerator(int minGram, int maxGram) throws InvalidFormatException { + if (minGram <= maxGram) { + this.minGram = minGram; + this.maxGram = maxGram; + } else { + throw new InvalidFormatException("minimum range value (minGram) should be less than or equal to maximum range value (maxGram)!"); + } } + /** + * Extract N-Grams from the given text fragments according to the N-Gram range, + * e.g. a,b,c,d with minGram = 2 & maxGram = 3 then features would be a:b, a:b:c, b:c, b:c:d, c:d + * e.g. 
a,b,c,d with minGram = 2 & maxGram = 2 then features would be a:b, b:c, c:d + * + * @param text the text fragments to extract features from + * @param extraInfo optional extra information to be used by the feature generator + * @return a collection of features + */ public Collection extractFeatures(String[] text, Map extraInfo) { List features = new ArrayList(); From 81597ad1f60d52607f59a7910709340cb62beea7 Mon Sep 17 00:00:00 2001 From: madhawa-gunasekara Date: Tue, 5 Apr 2016 11:00:24 +0530 Subject: [PATCH 3/3] Fixing non-negative validation --- .../tools/doccat/NGramFeatureGenerator.java | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/opennlp-tools/src/main/java/opennlp/tools/doccat/NGramFeatureGenerator.java b/opennlp-tools/src/main/java/opennlp/tools/doccat/NGramFeatureGenerator.java index 079b9a370..76bbaf8f3 100644 --- a/opennlp-tools/src/main/java/opennlp/tools/doccat/NGramFeatureGenerator.java +++ b/opennlp-tools/src/main/java/opennlp/tools/doccat/NGramFeatureGenerator.java @@ -36,12 +36,16 @@ public class NGramFeatureGenerator implements FeatureGenerator { public NGramFeatureGenerator() { } - public void NGramFeatureGenerator(int minGram, int maxGram) throws InvalidFormatException { - if (minGram <= maxGram) { - this.minGram = minGram; - this.maxGram = maxGram; + public NGramFeatureGenerator(int minGram, int maxGram) throws InvalidFormatException { + if (minGram > 0 && maxGram > 0) { + if (minGram <= maxGram) { + this.minGram = minGram; + this.maxGram = maxGram; + } else { + throw new InvalidFormatException("Minimum range value (minGram) should be less than or equal to maximum range value (maxGram)!"); + } } else { - throw new InvalidFormatException("minimum range value (minGram) should be less than or equal to maximum range value (maxGram)!"); + throw new InvalidFormatException("Both minimum range value (minGram) & maximum range value (maxGram) should be greater than or equal to 1!"); } }