Skip to content

Commit

Permalink
OPENNLP-1261: The Language Detector should not ignore ngram counts.
Browse files (browse the repository at this point in the history)
git push
  • Loading branch information
kottmann committed Jun 6, 2019
1 parent cfa7bb6 commit 07a39d4
Show file tree
Hide file tree
Showing 2 changed files with 12 additions and 9 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -20,8 +20,7 @@
import java.util.ArrayList;
import java.util.Collection;

import opennlp.tools.ngram.NGramModel;
import opennlp.tools.util.StringList;
import opennlp.tools.util.StringUtil;
import opennlp.tools.util.normalizer.AggregateCharSequenceNormalizer;
import opennlp.tools.util.normalizer.CharSequenceNormalizer;

Expand Down Expand Up @@ -58,14 +57,19 @@ public DefaultLanguageDetectorContextGenerator(int minLength, int maxLength,
/**
 * Builds the context (feature) strings for a document from its character n-grams.
 *
 * <p>NOTE(review): this span is a rendered diff without +/- markers, so lines of the
 * earlier {@code NGramModel}-based implementation and lines of its replacement are
 * interleaved below; the braces do not balance and it is not compilable as shown.
 * Which lines are "old" and which are "new" is inferred from the hunk headers and the
 * import changes -- confirm against the actual commit.
 *
 * @param document the text to extract n-grams from
 * @return the collected n-grams, in the order they were added
 */
public String[] getContext(CharSequence document) {
Collection<String> context = new ArrayList<>();

// Presumably the removed implementation: feed the normalized text into an NGramModel.
NGramModel model = new NGramModel();
model.add(normalizer.normalize(document), minLength, maxLength);
// Presumably the replacement: normalize once and slice the n-grams out directly.
CharSequence chars = normalizer.normalize(document);

// Presumably removed: add the first token of every non-empty entry of the model.
for (StringList tokenList : model) {
if (tokenList.size() > 0) {
context.add(tokenList.getToken(0));
// Presumably added: iterate over every n-gram length from minLength to maxLength inclusive.
for (int lengthIndex = minLength; lengthIndex < maxLength + 1; lengthIndex++) {
// Slide a window of lengthIndex characters; the bound keeps the window inside chars.
for (int textIndex = 0;
textIndex + lengthIndex - 1 < chars.length(); textIndex++) {

String gram = StringUtil.toLowerCase(
chars.subSequence(textIndex, textIndex + lengthIndex));

// Every occurrence is appended to the list, so a repeated n-gram appears repeatedly.
context.add(gram);
}
}

return context.toArray(new String[context.size()]);
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,6 @@ public void missclassified(LanguageSample reference,
cv.evaluate(sampleStream, 2);

Assert.assertEquals(99, cv.getDocumentCount());
Assert.assertEquals(0.98989898989899, cv.getDocumentAccuracy(), 0.01);
Assert.assertEquals(1, cv.getDocumentAccuracy(), 0.01);
}

}

0 comments on commit 07a39d4

Please sign in to comment.