Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add lookup and oovMorpheme method #245

Merged
merged 11 commits into from
Nov 26, 2024
52 changes: 51 additions & 1 deletion src/main/java/com/worksap/nlp/sudachi/Dictionary.java
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2021 Works Applications Co., Ltd.
* Copyright (c) 2021-2024 Works Applications Co., Ltd.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -56,6 +56,56 @@ public interface Dictionary extends AutoCloseable {
@Override
public void close() throws IOException;

/**
* Looks up entries in the dictionary without performing an analysis.
*
* The specified surface is normalized first. The result is similar to analyzing
* the given headword and collecting every path that consists of a single
* morpheme, except that all such paths are returned instead of only the
* lowest-cost one.
*
* @param surface
* the text to look up. It is normalized beforehand.
* @return a list of morphemes that match the surface. Their begin/end will be
* 0/length of their headword.
*/
public List<Morpheme> lookup(CharSequence surface);

/**
* Creates an out-of-vocabulary (OOV) morpheme from the pos id and the string
* forms.
*
* Begin/end will be set based on the surface.
*
* @param posId
* part-of-speech id of the morpheme
* @param surface
* surface of the morpheme
* @param reading
* reading form of the morpheme
* @param normalizedForm
* normalized form of the morpheme
* @param dictionaryForm
* dictionary form of the morpheme
* @return an OOV morpheme with the given information
*/
public Morpheme oovMorpheme(short posId, String surface, String reading, String normalizedForm,
String dictionaryForm);

/**
* Creates an out-of-vocabulary (OOV) morpheme from the pos id and the surface.
*
* The surface is also used as the reading form, the normalized form and the
* dictionary form. Begin/end will be set based on the surface.
*
* @param posId
* part-of-speech id of the morpheme
* @param surface
* surface of the morpheme
* @return an OOV morpheme with the given information
*/
public default Morpheme oovMorpheme(short posId, String surface) {
// Delegate to the full overload, reusing the surface for every string form.
return oovMorpheme(posId, surface, surface, surface, surface);
}

/**
* Returns the number of types of part-of-speech.
*
Expand Down
36 changes: 35 additions & 1 deletion src/main/java/com/worksap/nlp/sudachi/JapaneseDictionary.java
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2017-2022 Works Applications Co., Ltd.
* Copyright (c) 2017-2024 Works Applications Co., Ltd.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -127,6 +127,40 @@ public void close() throws IOException {
}
}

/**
 * Collects every lexicon entry whose headword spans the whole of the given
 * surface, after the surface has been rewritten by the input text plugins.
 * No lattice is built and no path search is performed.
 *
 * @param surface
 *            the text to look up
 * @return the matching entries as single morphemes; empty when nothing
 *         matches
 */
@Override
public List<Morpheme> lookup(CharSequence surface) {
    // Apply the same input rewriting that the plugins perform on analyzed text.
    UTF8InputTextBuilder textBuilder = new UTF8InputTextBuilder(surface, grammar);
    for (InputTextPlugin rewriter : inputTextPlugins) {
        rewriter.rewrite(textBuilder);
    }
    byte[] text = textBuilder.build().getByteText();

    List<Morpheme> found = new ArrayList<>();
    WordLookup entries = lexicon.makeLookup();
    entries.reset(text, 0, text.length);
    while (entries.next()) {
        // Only entries that end exactly at the end of the text cover it fully;
        // shorter prefix matches are ignored.
        if (entries.getEndOffset() == text.length) {
            int count = entries.getNumWords();
            int[] ids = entries.getWordsIds();
            // Iterate by count: only the first `count` ids are read, as in the
            // original loop.
            for (int i = 0; i < count; i++) {
                found.add(new SingleMorphemeImpl(getGrammar(), getLexicon(), ids[i]));
            }
        }
    }
    return found;
}

/**
* Builds the OOV morpheme as a SingleMorphemeImpl, passing this dictionary's
* grammar together with the given pos id and string forms unchanged.
*/
@Override
public Morpheme oovMorpheme(short posId, String surface, String reading, String normalizedForm,
String dictionaryForm) {
return new SingleMorphemeImpl(getGrammar(), posId, surface, reading, normalizedForm, dictionaryForm);
}

@Override
public Tokenizer tokenizer() {
if (grammar == null || lexicon == null) {
Expand Down
26 changes: 21 additions & 5 deletions src/main/java/com/worksap/nlp/sudachi/LatticeNode.java
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2021 Works Applications Co., Ltd.
* Copyright (c) 2021-2024 Works Applications Co., Ltd.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -98,25 +98,41 @@ public interface LatticeNode {
*/
public WordInfo getWordInfo();

/**
* Returns the string information of the node.
*
* The default implementations of the surface, reading form, normalized form
* and dictionary form getters of this interface read their values from the
* returned object.
*
* @return the string information of the node.
* @see StringsCache
*/
public StringsCache getStrings();

/**
* @return the text of node.
*/
public String getSurface();
public default String getSurface() {
return getStrings().getSurface();
}

/**
* @return the reading form of node.
*/
public String getReading();
public default String getReading() {
return getStrings().getReading();
}

/**
* @return the normalized form of node.
*/
public String getNormalizedForm();
public default String getNormalizedForm() {
return getStrings().getNormalizedForm();
}

/**
* @return the dictionary form of node.
*/
public String getDictionaryForm();
public default String getDictionaryForm() {
return getStrings().getDictionaryForm();
}

/**
* Sets the morpheme information to the node.
Expand Down
121 changes: 13 additions & 108 deletions src/main/java/com/worksap/nlp/sudachi/LatticeNodeImpl.java
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2021 Works Applications Co., Ltd.
* Copyright (c) 2021-2024 Works Applications Co., Ltd.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -42,13 +42,13 @@ public class LatticeNodeImpl implements LatticeNode {
LatticeNodeImpl bestPreviousNode;

// either Lexicon or StringsCache object
Object lexicon;
Object lexiconOrStrings;

// Empty wordInfo for special words.
static final WordInfo UNDEFINED_WORDINFO = new WordInfo((short) 0, (short) -1);

LatticeNodeImpl(Lexicon lexicon, long params, int wordId) {
this.lexicon = lexicon;
this.lexiconOrStrings = lexicon;
this.leftId = WordParameters.leftId(params);
this.rightId = WordParameters.rightId(params);
this.cost = WordParameters.cost(params);
Expand Down Expand Up @@ -82,7 +82,7 @@ static LatticeNodeImpl makeOov(int begin, int end, short posId, String surface,
LatticeNodeImpl node = new LatticeNodeImpl();
node.wordId = WordId.makeOov(posId);
node.wordInfo = new WordInfo((short) (end - begin), posId);
node.lexicon = new StringsCache(surface, readingForm, normalizedForm, dictionaryForm);
node.lexiconOrStrings = new StringsCache(surface, readingForm, normalizedForm, dictionaryForm);
node.begin = begin;
node.end = end;
return node;
Expand Down Expand Up @@ -110,10 +110,10 @@ public void setParameter(long params) {
}

private Lexicon lexicon() {
if (lexicon instanceof Lexicon) {
return (Lexicon) lexicon;
} else if (lexicon instanceof StringsCache) {
return ((StringsCache) lexicon).lexicon;
if (lexiconOrStrings instanceof Lexicon) {
return (Lexicon) lexiconOrStrings;
} else if (lexiconOrStrings instanceof StringsCache) {
return ((StringsCache) lexiconOrStrings).getLexicon();
} else {
throw new IllegalStateException("lexicon was null probably");
}
Expand Down Expand Up @@ -191,26 +191,6 @@ public boolean isConnectedToBOS() {
return bestPreviousNode != null;
}

@Override
public String getSurface() {
return strings().getSurface(this);
}

@Override
public String getReading() {
return strings().getReading(this);
}

@Override
public String getNormalizedForm() {
return strings().getNormalizedForm(this);
}

@Override
public String getDictionaryForm() {
return strings().getDictionaryForm(this);
}

@Override
public String toString() {
String surface = getSurface();
Expand All @@ -220,11 +200,12 @@ public String toString() {
cost);
}

private StringsCache strings() {
Object l = lexicon;
@Override
public StringsCache getStrings() {
Object l = lexiconOrStrings;
if (l instanceof Lexicon) {
StringsCache c = new StringsCache((Lexicon) l);
lexicon = c;
StringsCache c = new StringsCache((Lexicon) l, wordId);
lexiconOrStrings = c;
return c;
} else if (l instanceof StringsCache) {
return (StringsCache) l;
Expand Down Expand Up @@ -274,82 +255,6 @@ private void appendSplitsTo(List<LatticeNodeImpl> result, int[] splitsId) {
}
}

/**
* Cache to reduce the access to the lexicon. Also used to mock the lexicon for
* OOV nodes.
*
* Each string form is resolved from the lexicon the first time its getter is
* called and is kept in a field afterwards. When the cache is created from
* explicit strings, no lexicon is attached and the supplied strings are
* returned as they are.
*/
private static final class StringsCache {
// Source of the strings; null when the cache was created from explicit strings.
private final Lexicon lexicon;
// Cached string forms; null means "not resolved yet".
private String surface;
private String reading;
private String normalizedForm;
private String dictionaryForm;

/**
* Creates an empty cache whose strings are resolved lazily from the lexicon.
*
* @param lexicon
* lexicon to read the strings from
*/
public StringsCache(Lexicon lexicon) {
this.lexicon = lexicon;
}

/**
* Creates a pre-filled cache without a lexicon.
*
* NOTE: a null argument leaves the matching field unresolved, and the getter
* would then dereference the null lexicon.
*
* @param surface
* surface string
* @param readingForm
* reading form string
* @param normalizedForm
* normalized form string
* @param dictionaryForm
* dictionary form string
*/
public StringsCache(String surface, String readingForm, String normalizedForm, String dictionaryForm) {
this.lexicon = null;
this.surface = surface;
this.reading = readingForm;
this.normalizedForm = normalizedForm;
this.dictionaryForm = dictionaryForm;
}

/**
* Returns the surface, resolving it from the lexicon on first use.
*
* @param node
* node whose word info and word id locate the string
* @return the cached or freshly resolved surface
*/
public String getSurface(LatticeNodeImpl node) {
// benign data race pattern
// https://shipilev.net/blog/2016/close-encounters-of-jmm-kind/#wishful-benign-is-resilient
// The field is read exactly once into a local; the value returned is always
// that local, never a second read of the field.
String s = surface;
if (s == null) {
WordInfo wi = node.getWordInfo();
int headwordPtr = wi.getHeadword();
int dic = WordId.dic(node.getWordId());
s = lexicon.string(dic, headwordPtr);
surface = s;
}
return s;
}

/**
* Returns the reading form, resolving it from the lexicon on first use.
*
* @param node
* node whose word info and word id locate the string
* @return the cached or freshly resolved reading form
*/
public String getReading(LatticeNodeImpl node) {
// Same read-once / write-once shape as getSurface.
String s = reading;
if (s == null) {
WordInfo wi = node.getWordInfo();
int readingPtr = wi.getReadingForm();
int dic = WordId.dic(node.getWordId());
s = lexicon.string(dic, readingPtr);
reading = s;
}
return s;
}

/**
* Returns the normalized form, resolving it from the lexicon on first use.
*
* @param node
* node whose word info and word id locate the string
* @return the cached or freshly resolved normalized form
*/
public String getNormalizedForm(LatticeNodeImpl node) {
String s = normalizedForm;
if (s == null) {
WordInfo wi = node.getWordInfo();
// The word info holds a reference to another word, not a string pointer;
// the result is the headword of the referenced word.
int wordref = wi.getNormalizedForm();
// presumably refDic picks the dictionary the referenced word lives in,
// falling back to the node's own dictionary -- TODO confirm
int dic = WordId.refDic(wordref, WordId.dic(node.wordId));
int headwordPtr = lexicon.wordInfos(dic).headwordPtr(WordId.word(wordref));
s = lexicon.string(dic, headwordPtr);
normalizedForm = s;
}
return s;
}

/**
* Returns the dictionary form, resolving it from the lexicon on first use.
*
* @param node
* node whose word info and word id locate the string
* @return the cached or freshly resolved dictionary form
*/
public String getDictionaryForm(LatticeNodeImpl node) {
String s = dictionaryForm;
if (s == null) {
WordInfo wi = node.getWordInfo();
// Resolved through a word reference, in the same way as the normalized form.
int wordref = wi.getDictionaryForm();
int dic = WordId.refDic(wordref, WordId.dic(node.wordId));
int headwordPtr = lexicon.wordInfos(dic).headwordPtr(WordId.word(wordref));
s = lexicon.string(dic, headwordPtr);
dictionaryForm = s;
}
return s;
}
}

/** Alias for {@link OOVFactory} constructor. */
public static OOVFactory oovFactory(short leftId, short rightId, short cost, short posId) {
return new OOVFactory(leftId, rightId, cost, posId);
Expand Down
Loading