Merge pull request #245 from WorksApplications/feature/201-lookup

Add lookup and oovMorpheme method
WorksApplications · Nov 26, 2024 · 148d072 · 148d072
2 parents cacdf10 + 08c09cc
commit 148d072
Show file tree

Hide file tree

Showing 17 changed files with 885 additions and 290 deletions.
diff --git a/src/main/java/com/worksap/nlp/sudachi/Dictionary.java b/src/main/java/com/worksap/nlp/sudachi/Dictionary.java
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021 Works Applications Co., Ltd.
+ * Copyright (c) 2021-2024 Works Applications Co., Ltd.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -56,6 +56,56 @@ public interface Dictionary extends AutoCloseable {
     @Override
     public void close() throws IOException;
 
+    /**
+     * Lookup entries in the dictionary without performing an analysis.
+     * 
+     * The specified surface will be normalized. This works like performing analysis
+     * on the given headword and finding paths with a single morpheme, but returns
+     * all such paths instead of only the lowest cost one.
+     * 
+     * @param surface
+     *            to lookup. Will be normalized beforehand.
+     * @return a list of morphemes that match the surface. Their begin/end will be
+     *         0/length of their headword.
+     */
+    public List<Morpheme> lookup(CharSequence surface);
+
+    /**
+     * Create an out-of-vocabulary morpheme from the pos id and string forms.
+     * 
+     * Begin/end will be set based on the surface.
+     * 
+     * @param posId
+     *            part-of-speech id of the morpheme
+     * @param surface
+     *            surface of the morpheme
+     * @param reading
+     *            reading form of the morpheme
+     * @param normalizedForm
+     *            normalized form of the morpheme
+     * @param dictionaryForm
+     *            dictionary form of the morpheme
+     * @return an oov morpheme with given information
+     */
+    public Morpheme oovMorpheme(short posId, String surface, String reading, String normalizedForm,
+            String dictionaryForm);
+
+    /**
+     * Create an out-of-vocabulary morpheme from the pos id and the surface.
+     * 
+     * Uses the surface for the other string forms. Begin/end will be set based on
+     * the surface.
+     * 
+     * @param posId
+     *            part-of-speech id of the morpheme
+     * @param surface
+     *            surface of the morpheme
+     * @return an oov morpheme with given information
+     */
+    public default Morpheme oovMorpheme(short posId, String surface) {
+        return oovMorpheme(posId, surface, surface, surface, surface);
+    }
+
     /**
      * Returns the number of types of part-of-speech.
      *

diff --git a/src/main/java/com/worksap/nlp/sudachi/JapaneseDictionary.java b/src/main/java/com/worksap/nlp/sudachi/JapaneseDictionary.java
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2022 Works Applications Co., Ltd.
+ * Copyright (c) 2017-2024 Works Applications Co., Ltd.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -127,6 +127,40 @@ public void close() throws IOException {
         }
     }
 
+    @Override
+    public List<Morpheme> lookup(CharSequence surface) {
+        UTF8InputTextBuilder builder = new UTF8InputTextBuilder(surface, grammar);
+        for (InputTextPlugin plugin : inputTextPlugins) { // let every input text plugin rewrite the builder before it is built
+            plugin.rewrite(builder);
+        }
+        UTF8InputText input = builder.build();
+        byte[] bytes = input.getByteText(); // byte text of the rewritten input; the lookup below runs on these bytes
+
+        List<Morpheme> morphemes = new ArrayList<>();
+        WordLookup wordLookup = lexicon.makeLookup();
+        wordLookup.reset(bytes, 0, bytes.length); // look up starting from offset 0 of the byte text
+        while (wordLookup.next()) {
+            int end = wordLookup.getEndOffset();
+            if (end != bytes.length) { // skip results that do not end exactly at the end of the input
+                continue;
+            }
+            int numWords = wordLookup.getNumWords();
+            int[] wordIds = wordLookup.getWordsIds();
+            for (int word = 0; word < numWords; ++word) { // only the first numWords entries of wordIds are read
+                int wordId = wordIds[word];
+                Morpheme morpheme = new SingleMorphemeImpl(getGrammar(), getLexicon(), wordId);
+                morphemes.add(morpheme);
+            }
+        }
+        return morphemes; // empty list when no result spans the whole input
+    }
+
+    @Override
+    public Morpheme oovMorpheme(short posId, String surface, String reading, String normalizedForm,
+            String dictionaryForm) {
+        return new SingleMorphemeImpl(getGrammar(), posId, surface, reading, normalizedForm, dictionaryForm);
+    }
+
     @Override
     public Tokenizer tokenizer() {
         if (grammar == null || lexicon == null) {

diff --git a/src/main/java/com/worksap/nlp/sudachi/LatticeNode.java b/src/main/java/com/worksap/nlp/sudachi/LatticeNode.java
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021 Works Applications Co., Ltd.
+ * Copyright (c) 2021-2024 Works Applications Co., Ltd.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -98,25 +98,41 @@ public interface LatticeNode {
      */
     public WordInfo getWordInfo();
 
+    /**
+     * Returns the string information of the node.
+     * 
+     * @return the string information of the node.
+     * @see StringsCache
+     */
+    public StringsCache getStrings();
+
     /**
      * @return the text of node.
      */
-    public String getSurface();
+    public default String getSurface() {
+        return getStrings().getSurface();
+    }
 
     /**
      * @return the reading form of node.
      */
-    public String getReading();
+    public default String getReading() {
+        return getStrings().getReading();
+    }
 
     /**
      * @return the normalized form of node.
      */
-    public String getNormalizedForm();
+    public default String getNormalizedForm() {
+        return getStrings().getNormalizedForm();
+    }
 
     /**
      * @return the dictionary form of node.
      */
-    public String getDictionaryForm();
+    public default String getDictionaryForm() {
+        return getStrings().getDictionaryForm();
+    }
 
     /**
      * Sets the morpheme information to the node.

diff --git a/src/main/java/com/worksap/nlp/sudachi/LatticeNodeImpl.java b/src/main/java/com/worksap/nlp/sudachi/LatticeNodeImpl.java
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021 Works Applications Co., Ltd.
+ * Copyright (c) 2021-2024 Works Applications Co., Ltd.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -42,13 +42,13 @@ public class LatticeNodeImpl implements LatticeNode {
     LatticeNodeImpl bestPreviousNode;
 
     // either Lexicon or StringsCache object
-    Object lexicon;
+    Object lexiconOrStrings;
 
     // Empty wordInfo for special words.
     static final WordInfo UNDEFINED_WORDINFO = new WordInfo((short) 0, (short) -1);
 
     LatticeNodeImpl(Lexicon lexicon, long params, int wordId) {
-        this.lexicon = lexicon;
+        this.lexiconOrStrings = lexicon;
         this.leftId = WordParameters.leftId(params);
         this.rightId = WordParameters.rightId(params);
         this.cost = WordParameters.cost(params);
@@ -82,7 +82,7 @@ static LatticeNodeImpl makeOov(int begin, int end, short posId, String surface,
         LatticeNodeImpl node = new LatticeNodeImpl();
         node.wordId = WordId.makeOov(posId);
         node.wordInfo = new WordInfo((short) (end - begin), posId);
-        node.lexicon = new StringsCache(surface, readingForm, normalizedForm, dictionaryForm);
+        node.lexiconOrStrings = new StringsCache(surface, readingForm, normalizedForm, dictionaryForm);
         node.begin = begin;
         node.end = end;
         return node;
@@ -110,10 +110,10 @@ public void setParameter(long params) {
     }
 
     private Lexicon lexicon() {
-        if (lexicon instanceof Lexicon) {
-            return (Lexicon) lexicon;
-        } else if (lexicon instanceof StringsCache) {
-            return ((StringsCache) lexicon).lexicon;
+        if (lexiconOrStrings instanceof Lexicon) {
+            return (Lexicon) lexiconOrStrings;
+        } else if (lexiconOrStrings instanceof StringsCache) {
+            return ((StringsCache) lexiconOrStrings).getLexicon();
         } else {
             throw new IllegalStateException("lexicon was null probably");
         }
@@ -191,26 +191,6 @@ public boolean isConnectedToBOS() {
         return bestPreviousNode != null;
     }
 
-    @Override
-    public String getSurface() {
-        return strings().getSurface(this);
-    }
-
-    @Override
-    public String getReading() {
-        return strings().getReading(this);
-    }
-
-    @Override
-    public String getNormalizedForm() {
-        return strings().getNormalizedForm(this);
-    }
-
-    @Override
-    public String getDictionaryForm() {
-        return strings().getDictionaryForm(this);
-    }
-
     @Override
     public String toString() {
         String surface = getSurface();
@@ -220,11 +200,12 @@ public String toString() {
                 cost);
     }
 
-    private StringsCache strings() {
-        Object l = lexicon;
+    @Override
+    public StringsCache getStrings() {
+        Object l = lexiconOrStrings;
         if (l instanceof Lexicon) {
-            StringsCache c = new StringsCache((Lexicon) l);
-            lexicon = c;
+            StringsCache c = new StringsCache((Lexicon) l, wordId);
+            lexiconOrStrings = c;
             return c;
         } else if (l instanceof StringsCache) {
             return (StringsCache) l;
@@ -274,82 +255,6 @@ private void appendSplitsTo(List<LatticeNodeImpl> result, int[] splitsId) {
         }
     }
 
-    /**
-     * Cache to reduce the access to the lexicon. Also used to mock the lexicon for
-     * OOV nodes.
-     */
-    private static final class StringsCache {
-        private final Lexicon lexicon;
-        private String surface;
-        private String reading;
-        private String normalizedForm;
-        private String dictionaryForm;
-
-        public StringsCache(Lexicon lexicon) {
-            this.lexicon = lexicon;
-        }
-
-        public StringsCache(String surface, String readingForm, String normalizedForm, String dictionaryForm) {
-            this.lexicon = null;
-            this.surface = surface;
-            this.reading = readingForm;
-            this.normalizedForm = normalizedForm;
-            this.dictionaryForm = dictionaryForm;
-        }
-
-        public String getSurface(LatticeNodeImpl node) {
-            // benign data race pattern
-            // https://shipilev.net/blog/2016/close-encounters-of-jmm-kind/#wishful-benign-is-resilient
-            String s = surface;
-            if (s == null) {
-                WordInfo wi = node.getWordInfo();
-                int headwordPtr = wi.getHeadword();
-                int dic = WordId.dic(node.getWordId());
-                s = lexicon.string(dic, headwordPtr);
-                surface = s;
-            }
-            return s;
-        }
-
-        public String getReading(LatticeNodeImpl node) {
-            String s = reading;
-            if (s == null) {
-                WordInfo wi = node.getWordInfo();
-                int readingPtr = wi.getReadingForm();
-                int dic = WordId.dic(node.getWordId());
-                s = lexicon.string(dic, readingPtr);
-                reading = s;
-            }
-            return s;
-        }
-
-        public String getNormalizedForm(LatticeNodeImpl node) {
-            String s = normalizedForm;
-            if (s == null) {
-                WordInfo wi = node.getWordInfo();
-                int wordref = wi.getNormalizedForm();
-                int dic = WordId.refDic(wordref, WordId.dic(node.wordId));
-                int headwordPtr = lexicon.wordInfos(dic).headwordPtr(WordId.word(wordref));
-                s = lexicon.string(dic, headwordPtr);
-                normalizedForm = s;
-            }
-            return s;
-        }
-
-        public String getDictionaryForm(LatticeNodeImpl node) {
-            String s = dictionaryForm;
-            if (s == null) {
-                WordInfo wi = node.getWordInfo();
-                int wordref = wi.getDictionaryForm();
-                int dic = WordId.refDic(wordref, WordId.dic(node.wordId));
-                int headwordPtr = lexicon.wordInfos(dic).headwordPtr(WordId.word(wordref));
-                s = lexicon.string(dic, headwordPtr);
-                dictionaryForm = s;
-            }
-            return s;
-        }
-    }
-
     /** Alias for {@link OOVFactory} constructor. */
     public static OOVFactory oovFactory(short leftId, short rightId, short cost, short posId) {
         return new OOVFactory(leftId, rightId, cost, posId);