From 07a39d40167cc1ee4666855b7b2417474f0d2a83 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?J=C3=B6rn=20Kottmann?=
Date: Wed, 22 May 2019 16:07:09 +0200
Subject: [PATCH] OPENNLP-1261: The Language Detector should not ignore ngram counts.

git push
---
 ...efaultLanguageDetectorContextGenerator.java | 18 +++++++++++-------
 .../LanguageDetectorCrossValidatorTest.java    |  3 +--
 2 files changed, 12 insertions(+), 9 deletions(-)

diff --git a/opennlp-tools/src/main/java/opennlp/tools/langdetect/DefaultLanguageDetectorContextGenerator.java b/opennlp-tools/src/main/java/opennlp/tools/langdetect/DefaultLanguageDetectorContextGenerator.java
index 41f94900d..8bb9083b1 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/langdetect/DefaultLanguageDetectorContextGenerator.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/langdetect/DefaultLanguageDetectorContextGenerator.java
@@ -20,8 +20,7 @@
 import java.util.ArrayList;
 import java.util.Collection;
 
-import opennlp.tools.ngram.NGramModel;
-import opennlp.tools.util.StringList;
+import opennlp.tools.util.StringUtil;
 import opennlp.tools.util.normalizer.AggregateCharSequenceNormalizer;
 import opennlp.tools.util.normalizer.CharSequenceNormalizer;
 
@@ -58,14 +57,19 @@ public DefaultLanguageDetectorContextGenerator(int minLength, int maxLength,
   public String[] getContext(CharSequence document) {
     Collection<String> context = new ArrayList<>();
 
-    NGramModel model = new NGramModel();
-    model.add(normalizer.normalize(document), minLength, maxLength);
+    CharSequence chars = normalizer.normalize(document);
 
-    for (StringList tokenList : model) {
-      if (tokenList.size() > 0) {
-        context.add(tokenList.getToken(0));
+    for (int lengthIndex = minLength; lengthIndex < maxLength + 1; lengthIndex++) {
+      for (int textIndex = 0;
+          textIndex + lengthIndex - 1 < chars.length(); textIndex++) {
+
+        String gram = StringUtil.toLowerCase(
+            chars.subSequence(textIndex, textIndex + lengthIndex));
+
+        context.add(gram);
       }
     }
+
     return context.toArray(new String[context.size()]);
   }
 }
diff --git a/opennlp-tools/src/test/java/opennlp/tools/langdetect/LanguageDetectorCrossValidatorTest.java b/opennlp-tools/src/test/java/opennlp/tools/langdetect/LanguageDetectorCrossValidatorTest.java
index 520fc717e..46faaf2e3 100644
--- a/opennlp-tools/src/test/java/opennlp/tools/langdetect/LanguageDetectorCrossValidatorTest.java
+++ b/opennlp-tools/src/test/java/opennlp/tools/langdetect/LanguageDetectorCrossValidatorTest.java
@@ -58,7 +58,6 @@ public void missclassified(LanguageSample reference,
     cv.evaluate(sampleStream, 2);
 
     Assert.assertEquals(99, cv.getDocumentCount());
-    Assert.assertEquals(0.98989898989899, cv.getDocumentAccuracy(), 0.01);
+    Assert.assertEquals(1, cv.getDocumentAccuracy(), 0.01);
   }
-
 }