Skip to content
Closed
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,8 @@

package opennlp.tools.doccat;

import opennlp.tools.util.InvalidFormatException;

import java.util.ArrayList;
import java.util.Collection;
import java.util.List;
Expand All @@ -27,12 +29,48 @@
*/
public class NGramFeatureGenerator implements FeatureGenerator {

// Default n-gram range: both bounds are 2, so only bigrams are generated
// unless the two-argument constructor overrides them.
private int minGram = 2;
private int maxGram = 2;

/**
 * Creates a generator that keeps the default n-gram range
 * (minGram = 2, maxGram = 2), i.e. bigrams only.
 */
public NGramFeatureGenerator() {
}

/**
 * Creates a generator for a custom n-gram range.
 *
 * @param minGram smallest n-gram size to generate; must be at least 1
 * @param maxGram largest n-gram size to generate; must be at least 1 and
 *                not smaller than {@code minGram}
 * @throws InvalidFormatException if either bound is less than 1, or if
 *                                {@code minGram} is greater than {@code maxGram}
 */
public NGramFeatureGenerator(int minGram, int maxGram) throws InvalidFormatException {
  // Review note from the pull request: "This is good, but what if
  // minGram/maxGram are negative?" -- zero and negative bounds are rejected
  // by this first guard.
  boolean bothPositive = minGram > 0 && maxGram > 0;
  if (!bothPositive) {
    throw new InvalidFormatException("Both minimum range value (minGram) & maximum range value (maxGram) should be greater than or equal to 1!");
  }

  // The range must not be inverted.
  if (minGram > maxGram) {
    throw new InvalidFormatException("Minimum range value (minGram) should be less than or equal to maximum range value (maxGram)!");
  }

  this.minGram = minGram;
  this.maxGram = maxGram;
}

/**
* Extract N-Grams from the given text fragments according to the N-Gram range,
* e.g. a,b,c,d with minGram = 2 & maxGram = 3 then features would be a:b, a:b:c, b:c, b:c:d, c:d
* e.g. a,b,c,d with minGram = 2 & maxGram = 2 then features would be a:b, b:c, c:d
*
* @param text the text fragments to extract features from
* @param extraInfo optional extra information to be used by the feature generator
* @return a collection of features
*/
public Collection<String> extractFeatures(String[] text, Map<String, Object> extraInfo) {

List<String> features = new ArrayList<String>();

for (int i = 0; i < text.length - 1; i++) {
features.add("ng=" + text[i] + ":" + text[i + 1]);
for (int i = 0; i <= text.length - minGram; i++) {
String feature = "ng=";
for (int y = 0; y < maxGram && i + y < text.length; y++) {
feature = feature + ":" + text[i + y];
int gramCount = y + 1;
if (maxGram >= gramCount && gramCount >= minGram) {
features.add(feature);
}
}
}

return features;
Expand Down