diff --git a/opennlp-tools/src/main/java/opennlp/tools/doccat/NGramFeatureGenerator.java b/opennlp-tools/src/main/java/opennlp/tools/doccat/NGramFeatureGenerator.java index 1c9441113..76bbaf8f3 100644 --- a/opennlp-tools/src/main/java/opennlp/tools/doccat/NGramFeatureGenerator.java +++ b/opennlp-tools/src/main/java/opennlp/tools/doccat/NGramFeatureGenerator.java @@ -17,6 +17,8 @@ package opennlp.tools.doccat; +import opennlp.tools.util.InvalidFormatException; + import java.util.ArrayList; import java.util.Collection; import java.util.List; @@ -27,12 +29,48 @@ */ public class NGramFeatureGenerator implements FeatureGenerator { + //default values for bigrams + private int minGram = 2; + private int maxGram = 2; + + public NGramFeatureGenerator() { + } + + public NGramFeatureGenerator(int minGram, int maxGram) throws InvalidFormatException { + if (minGram > 0 && maxGram > 0) { + if (minGram <= maxGram) { + this.minGram = minGram; + this.maxGram = maxGram; + } else { + throw new InvalidFormatException("Minimum range value (minGram) should be less than or equal to maximum range value (maxGram)!"); + } + } else { + throw new InvalidFormatException("Both minimum range value (minGram) & maximum range value (maxGram) should be greater than or equal to 1!"); + } + } + + /** + * Extract N-Grams from the given text fragments according to the N-Gram range, + * i.e. a,b,c,d with minGram = 2 & maxGram = 3 then features would be a:b, a:b:c, b:c, b:c:d, c:d + * i.e. a,b,c,d with minGram = 2 & maxGram = 2 then features would be a:b, b:c, c:d + * + * @param text the text fragments to extract features from + * @param extraInfo optional extra information to be used by the feature generator + * @return a collection of features + */ public Collection extractFeatures(String[] text, Map extraInfo) { List features = new ArrayList(); - for (int i = 0; i < text.length - 1; i++) { - features.add("ng=" + text[i] + ":" + text[i + 1]); + for (int i = 0; i <= text.length - minGram; i++) { + String feature = "ng="; + for (int y = 0; y < maxGram && i + y < text.length; y++) { + feature = feature + ":" + text[i + y]; + int gramCount = y + 1; + if (maxGram >= gramCount && gramCount >= minGram) { + features.add(feature); + } + } } return features;