Skip to content

Commit

Permalink
Merge pull request #245 from WorksApplications/feature/201-lookup
Browse files Browse the repository at this point in the history
Add lookup and oovMorpheme method
  • Loading branch information
mh-northlander authored Nov 26, 2024
2 parents cacdf10 + 08c09cc commit 148d072
Show file tree
Hide file tree
Showing 17 changed files with 885 additions and 290 deletions.
52 changes: 51 additions & 1 deletion src/main/java/com/worksap/nlp/sudachi/Dictionary.java
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2021 Works Applications Co., Ltd.
* Copyright (c) 2021-2024 Works Applications Co., Ltd.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -56,6 +56,56 @@ public interface Dictionary extends AutoCloseable {
@Override
public void close() throws IOException;

/**
* Looks up entries in the dictionary without performing an analysis.
*
* The specified surface is normalized first. The result is the same as
* analyzing the given headword and collecting the paths that consist of a
* single morpheme, except that all such paths are returned instead of only the
* lowest cost one.
*
* @param surface
* the text to look up. It is normalized beforehand.
* @return a list of morphemes that match the surface. Their begin/end will be
* 0/length of their headword.
*/
public List<Morpheme> lookup(CharSequence surface);

/**
* Creates an out-of-vocabulary (OOV) morpheme from the part-of-speech id and
* the given string forms.
*
* Begin/end will be set based on the surface.
*
* @param posId
* part-of-speech id of the morpheme
* @param surface
* surface of the morpheme
* @param reading
* reading form of the morpheme
* @param normalizedForm
* normalized form of the morpheme
* @param dictionaryForm
* dictionary form of the morpheme
* @return an OOV morpheme with the given information
*/
public Morpheme oovMorpheme(short posId, String surface, String reading, String normalizedForm,
String dictionaryForm);

/**
* Creates an out-of-vocabulary (OOV) morpheme from the part-of-speech id and
* the surface.
*
* The surface is also used as the reading, normalized and dictionary forms.
* Begin/end will be set based on the surface.
*
* @param posId
* part-of-speech id of the morpheme
* @param surface
* surface of the morpheme
* @return an OOV morpheme with the given information
*/
public default Morpheme oovMorpheme(short posId, String surface) {
// Delegate to the full overload, reusing the surface for every string form.
return oovMorpheme(posId, surface, surface, surface, surface);
}

/**
* Returns the number of types of part-of-speech.
*
Expand Down
36 changes: 35 additions & 1 deletion src/main/java/com/worksap/nlp/sudachi/JapaneseDictionary.java
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2017-2022 Works Applications Co., Ltd.
* Copyright (c) 2017-2024 Works Applications Co., Ltd.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -127,6 +127,40 @@ public void close() throws IOException {
}
}

/**
 * Looks up the dictionary entries that cover the whole surface, without running
 * an analysis.
 *
 * The surface is first rewritten by every input text plugin. The lexicon is then
 * searched from the start of the rewritten text, and only the entries that end
 * exactly at the end of the text are kept.
 *
 * @param surface
 *            the text to look up
 * @return one morpheme per matching word id; empty if nothing matches
 * @throws IllegalStateException
 *             if the grammar or the lexicon is not available
 */
@Override
public List<Morpheme> lookup(CharSequence surface) {
    // Same guard as tokenizer(): fail with a clear message instead of a
    // NullPointerException somewhere inside the lookup.
    if (grammar == null || lexicon == null) {
        throw new IllegalStateException(
                "grammar and lexicon are not available; the dictionary may have been closed");
    }

    UTF8InputTextBuilder builder = new UTF8InputTextBuilder(surface, grammar);
    for (InputTextPlugin plugin : inputTextPlugins) {
        plugin.rewrite(builder);
    }
    UTF8InputText input = builder.build();
    byte[] bytes = input.getByteText();

    List<Morpheme> morphemes = new ArrayList<>();
    WordLookup wordLookup = lexicon.makeLookup();
    wordLookup.reset(bytes, 0, bytes.length);
    while (wordLookup.next()) {
        // Prefix matches that stop before the end of the text are not entries
        // for the requested surface.
        if (wordLookup.getEndOffset() != bytes.length) {
            continue;
        }
        int numWords = wordLookup.getNumWords();
        int[] wordIds = wordLookup.getWordsIds();
        // Only the first numWords slots of the id array belong to this match.
        for (int word = 0; word < numWords; ++word) {
            morphemes.add(new SingleMorphemeImpl(getGrammar(), getLexicon(), wordIds[word]));
        }
    }
    return morphemes;
}

/**
* Creates an out-of-vocabulary morpheme backed by this dictionary's grammar.
*
* The part-of-speech id and the four string forms are passed to
* {@code SingleMorphemeImpl} unchanged.
*/
@Override
public Morpheme oovMorpheme(short posId, String surface, String reading, String normalizedForm,
String dictionaryForm) {
return new SingleMorphemeImpl(getGrammar(), posId, surface, reading, normalizedForm, dictionaryForm);
}

@Override
public Tokenizer tokenizer() {
if (grammar == null || lexicon == null) {
Expand Down
26 changes: 21 additions & 5 deletions src/main/java/com/worksap/nlp/sudachi/LatticeNode.java
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2021 Works Applications Co., Ltd.
* Copyright (c) 2021-2024 Works Applications Co., Ltd.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -98,25 +98,41 @@ public interface LatticeNode {
*/
public WordInfo getWordInfo();

/**
* Returns the string information of the node.
*
* The default implementations of {@link #getSurface()}, {@link #getReading()},
* {@link #getNormalizedForm()} and {@link #getDictionaryForm()} in this
* interface read their values from the returned object.
*
* @return the string information of the node.
* @see StringsCache
*/
public StringsCache getStrings();

/**
* Returns the surface text of the node.
*
* The default implementation reads it from {@link #getStrings()}.
*
* @return the text of node.
*/
public String getSurface();
public default String getSurface() {
return getStrings().getSurface();
}

/**
* Returns the reading form of the node.
*
* The default implementation reads it from {@link #getStrings()}.
*
* @return the reading form of node.
*/
public String getReading();
public default String getReading() {
return getStrings().getReading();
}

/**
* Returns the normalized form of the node.
*
* The default implementation reads it from {@link #getStrings()}.
*
* @return the normalized form of node.
*/
public String getNormalizedForm();
public default String getNormalizedForm() {
return getStrings().getNormalizedForm();
}

/**
* Returns the dictionary form of the node.
*
* The default implementation reads it from {@link #getStrings()}.
*
* @return the dictionary form of node.
*/
public String getDictionaryForm();
public default String getDictionaryForm() {
return getStrings().getDictionaryForm();
}

/**
* Sets the morpheme information to the node.
Expand Down
121 changes: 13 additions & 108 deletions src/main/java/com/worksap/nlp/sudachi/LatticeNodeImpl.java
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2021 Works Applications Co., Ltd.
* Copyright (c) 2021-2024 Works Applications Co., Ltd.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -42,13 +42,13 @@ public class LatticeNodeImpl implements LatticeNode {
LatticeNodeImpl bestPreviousNode;

// either Lexicon or StringsCache object
Object lexicon;
Object lexiconOrStrings;

// Empty wordInfo for special words.
static final WordInfo UNDEFINED_WORDINFO = new WordInfo((short) 0, (short) -1);

LatticeNodeImpl(Lexicon lexicon, long params, int wordId) {
this.lexicon = lexicon;
this.lexiconOrStrings = lexicon;
this.leftId = WordParameters.leftId(params);
this.rightId = WordParameters.rightId(params);
this.cost = WordParameters.cost(params);
Expand Down Expand Up @@ -82,7 +82,7 @@ static LatticeNodeImpl makeOov(int begin, int end, short posId, String surface,
LatticeNodeImpl node = new LatticeNodeImpl();
node.wordId = WordId.makeOov(posId);
node.wordInfo = new WordInfo((short) (end - begin), posId);
node.lexicon = new StringsCache(surface, readingForm, normalizedForm, dictionaryForm);
node.lexiconOrStrings = new StringsCache(surface, readingForm, normalizedForm, dictionaryForm);
node.begin = begin;
node.end = end;
return node;
Expand Down Expand Up @@ -110,10 +110,10 @@ public void setParameter(long params) {
}

private Lexicon lexicon() {
if (lexicon instanceof Lexicon) {
return (Lexicon) lexicon;
} else if (lexicon instanceof StringsCache) {
return ((StringsCache) lexicon).lexicon;
if (lexiconOrStrings instanceof Lexicon) {
return (Lexicon) lexiconOrStrings;
} else if (lexiconOrStrings instanceof StringsCache) {
return ((StringsCache) lexiconOrStrings).getLexicon();
} else {
throw new IllegalStateException("lexicon was null probably");
}
Expand Down Expand Up @@ -191,26 +191,6 @@ public boolean isConnectedToBOS() {
return bestPreviousNode != null;
}

/** Returns the surface of this node, obtained through the node's string cache. */
@Override
public String getSurface() {
return strings().getSurface(this);
}

/** Returns the reading form of this node, obtained through the node's string cache. */
@Override
public String getReading() {
return strings().getReading(this);
}

/** Returns the normalized form of this node, obtained through the node's string cache. */
@Override
public String getNormalizedForm() {
return strings().getNormalizedForm(this);
}

/** Returns the dictionary form of this node, obtained through the node's string cache. */
@Override
public String getDictionaryForm() {
return strings().getDictionaryForm(this);
}

@Override
public String toString() {
String surface = getSurface();
Expand All @@ -220,11 +200,12 @@ public String toString() {
cost);
}

private StringsCache strings() {
Object l = lexicon;
@Override
public StringsCache getStrings() {
Object l = lexiconOrStrings;
if (l instanceof Lexicon) {
StringsCache c = new StringsCache((Lexicon) l);
lexicon = c;
StringsCache c = new StringsCache((Lexicon) l, wordId);
lexiconOrStrings = c;
return c;
} else if (l instanceof StringsCache) {
return (StringsCache) l;
Expand Down Expand Up @@ -274,82 +255,6 @@ private void appendSplitsTo(List<LatticeNodeImpl> result, int[] splitsId) {
}
}

/**
 * Holder for the four string forms of a lattice node.
 *
 * Forms that are still {@code null} are fetched from the lexicon when first
 * requested and then stored in a field, which reduces access to the lexicon.
 * Instances created from explicit strings have no lexicon and stand in for it
 * on OOV nodes.
 */
private static final class StringsCache {
    private final Lexicon lexicon;
    private String surface;
    private String reading;
    private String normalizedForm;
    private String dictionaryForm;

    /** Creates a cache whose forms are fetched from the given lexicon on demand. */
    public StringsCache(Lexicon lexicon) {
        this.lexicon = lexicon;
    }

    /** Creates a cache pre-filled with the given forms and no backing lexicon. */
    public StringsCache(String surface, String readingForm, String normalizedForm, String dictionaryForm) {
        this.surface = surface;
        this.reading = readingForm;
        this.normalizedForm = normalizedForm;
        this.dictionaryForm = dictionaryForm;
        this.lexicon = null;
    }

    /** Returns the surface of the node, fetching it from the lexicon if not stored yet. */
    public String getSurface(LatticeNodeImpl node) {
        // benign data race pattern: the field is read once into a local and
        // only that local is returned
        // https://shipilev.net/blog/2016/close-encounters-of-jmm-kind/#wishful-benign-is-resilient
        String cached = surface;
        if (cached != null) {
            return cached;
        }
        cached = stringAt(node, node.getWordInfo().getHeadword());
        surface = cached;
        return cached;
    }

    /** Returns the reading form of the node, fetching it from the lexicon if not stored yet. */
    public String getReading(LatticeNodeImpl node) {
        String cached = reading;
        if (cached != null) {
            return cached;
        }
        cached = stringAt(node, node.getWordInfo().getReadingForm());
        reading = cached;
        return cached;
    }

    /** Returns the normalized form of the node, fetching it from the lexicon if not stored yet. */
    public String getNormalizedForm(LatticeNodeImpl node) {
        String cached = normalizedForm;
        if (cached != null) {
            return cached;
        }
        cached = headwordOf(node, node.getWordInfo().getNormalizedForm());
        normalizedForm = cached;
        return cached;
    }

    /** Returns the dictionary form of the node, fetching it from the lexicon if not stored yet. */
    public String getDictionaryForm(LatticeNodeImpl node) {
        String cached = dictionaryForm;
        if (cached != null) {
            return cached;
        }
        cached = headwordOf(node, node.getWordInfo().getDictionaryForm());
        dictionaryForm = cached;
        return cached;
    }

    /** Reads the string at the given pointer from the dictionary part of the node's word id. */
    private String stringAt(LatticeNodeImpl node, int stringPtr) {
        int dic = WordId.dic(node.getWordId());
        return lexicon.string(dic, stringPtr);
    }

    /** Reads the headword of the word that the given word reference points to. */
    private String headwordOf(LatticeNodeImpl node, int wordref) {
        int dic = WordId.refDic(wordref, WordId.dic(node.wordId));
        int headwordPtr = lexicon.wordInfos(dic).headwordPtr(WordId.word(wordref));
        return lexicon.string(dic, headwordPtr);
    }
}

/** Alias for {@link OOVFactory} constructor. */
public static OOVFactory oovFactory(short leftId, short rightId, short cost, short posId) {
return new OOVFactory(leftId, rightId, cost, posId);
Expand Down
Loading

0 comments on commit 148d072

Please sign in to comment.