Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: introduce Lucene-Hunspell spellchecker module #1052

Draft
wants to merge 1 commit into
base: master
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions settings.gradle
Original file line number Diff line number Diff line change
Expand Up @@ -21,5 +21,6 @@ include("machinetranslators:apertium",
"aligner",
"theme",
"spellchecker:hunspell",
"spellchecker:lucene",
"spellchecker:morfologik",
"tipoftheday")
40 changes: 40 additions & 0 deletions spellchecker/lucene/build.gradle
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
// Apply Gradle's java-library plugin to this module.
plugins {
    id('java-library')
}

dependencies {
    // Compile against the root project, leaving out its Lucene and
    // LanguageTool artifacts.
    compileOnly(project.rootProject) {
        exclude(group: 'org.apache.lucene')
        exclude(group: 'org.languagetool')
    }

    // Use jars from providedLibsDir when that directory exists; otherwise
    // fall back to the version catalog / explicit coordinates.
    if (providedLibsDir.directory) {
        compileOnly(fileTree(dir: providedLibsDir, includes: ['**/commons-io-*.jar']))
        implementation(fileTree(dir: providedLibsDir, includes: ['**/lucene-analyzers-common-8*.jar']))
    } else {
        compileOnly(libs.commons.io)
        implementation('org.apache.lucene:lucene-analyzers-common:8.11.3')
    }

    // Test-only dependencies.
    testImplementation(libs.junit4)
    testImplementation(libs.assertj)
    testImplementation(testFixtures(project.rootProject))
    testImplementation(libs.commons.io)
}

jar {
    // Plugin metadata that is copied into the jar manifest below.
    def props = loadProperties(file('plugin.properties'))

    archiveAppendix.set("spellchecker")
    destinationDirectory.set(rootProject.layout.buildDirectory.dir("modules").get())
    duplicatesStrategy = DuplicatesStrategy.INCLUDE

    // Bundle the runtime classpath into the jar: directories are copied
    // as-is, archives are unpacked.
    from configurations.runtimeClasspath.collect { entry ->
        entry.isDirectory() ? entry : zipTree(entry)
    }

    manifest {
        attributes(
            'License': 'GNU Public License version 3 or later',
            'Implementation-Version': props.Version,
            'OmegaT-Plugins': props.Class,
            'Plugin-Author': props.Author,
            'Plugin-Version': props.Version,
            'Plugin-Name': props.Name,
            'Plugin-Category': props.Category,
            'Plugin-Description': props.Description
        )
    }
}
31 changes: 31 additions & 0 deletions spellchecker/lucene/plugin.properties
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
#
# OmegaT - Computer Assisted Translation (CAT) tool
# with fuzzy matching, translation memory, keyword search,
# glossaries, and translation leveraging into updated projects.
#
# Copyright (C) 2024 Hiroshi Miura
# Home page: https://www.omegat.org/
# Support center: https://omegat.org/support
#
# This file is part of OmegaT.
#
# OmegaT is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# OmegaT is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.
#
License=GNU Public License version 3 or later
Author=OmegaT team
Version=6.0.0
Category=spellchecker
Name=Default spell checker with Lucene Hunspell
Class=org.omegat.spellchecker.lucene.LuceneHunSpellChecker
Description=Lucene spellchecker module
Original file line number Diff line number Diff line change
@@ -0,0 +1,134 @@
/*******************************************************************************
* OmegaT - Computer Assisted Translation (CAT) tool
* with fuzzy matching, translation memory, keyword search,
* glossaries, and translation leveraging into updated projects.
*
* Copyright (C) 2024 Hiroshi Miura
* Home page: https://www.omegat.org/
* Support center: https://omegat.org/support
*
* This file is part of OmegaT.
*
* OmegaT is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* OmegaT is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <https://www.gnu.org/licenses/>.
******************************************************************************/

package org.omegat.spellchecker.lucene;

import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.nio.file.Path;
import java.text.ParseException;
import java.util.List;
import java.util.Optional;

import org.apache.commons.io.FileUtils;
import org.apache.lucene.analysis.hunspell.Dictionary;
import org.apache.lucene.analysis.hunspell.Hunspell;
import org.apache.lucene.store.NIOFSDirectory;

import org.omegat.core.Core;
import org.omegat.core.spellchecker.AbstractSpellChecker;
import org.omegat.core.spellchecker.ISpellChecker;
import org.omegat.core.spellchecker.ISpellCheckerProvider;
import org.omegat.core.spellchecker.SpellCheckerManager;
import org.omegat.util.Preferences;

/**
 * Spell checker that delegates to the Hunspell implementation shipped with
 * Apache Lucene.
 * <p>
 * For a given language the dictionary is looked up below the configured
 * spell-checker dictionary directory as {@code <language>/index.aff} and
 * {@code <language>/index.dic}.
 */
public class LuceneHunSpellChecker extends AbstractSpellChecker implements ISpellChecker {

    /**
     * Name of the affix file inside a language's dictionary directory.
     */
    public static final String SC_AFFIX_FILENAME = "index.aff";

    /**
     * Name of the word-list file inside a language's dictionary directory.
     */
    public static final String SC_DICTIONARY_FILENAME = "index.dic";

    /**
     * Register plugins into OmegaT.
     */
    public static void loadPlugins() {
        Core.registerSpellCheckClass(LuceneHunSpellChecker.class);
    }

    /**
     * Plugin unload hook; this plugin has nothing to undo.
     */
    public static void unloadPlugins() {
    }

    public LuceneHunSpellChecker() {
        super();
    }

    /**
     * Creates a provider for the given language.
     *
     * @param language
     *            name of the sub-directory (below the dictionary directory)
     *            holding the affix and dictionary files
     * @return a provider backed by the language's dictionary, or an empty
     *         {@link Optional} when either file is rejected by
     *         {@code isInvalidFile} or the dictionary cannot be loaded
     */
    @Override
    protected Optional<ISpellCheckerProvider> initializeWithLanguage(final String language) {
        String dictionaryDir = Preferences.getPreferenceDefault(Preferences.SPELLCHECKER_DICTIONARY_DIRECTORY,
                SpellCheckerManager.DEFAULT_DICTIONARY_DIR.getPath());

        // Both files live in the same per-language directory; resolve it once.
        Path languageDir = Path.of(dictionaryDir).resolve(language);
        File affixName = languageDir.resolve(SC_AFFIX_FILENAME).toFile();
        File dictionaryName = languageDir.resolve(SC_DICTIONARY_FILENAME).toFile();

        if (isInvalidFile(affixName) || isInvalidFile(dictionaryName)) {
            return Optional.empty();
        }

        try {
            ISpellCheckerProvider result = new LuceneProvider(dictionaryName, affixName);
            return Optional.of(result);
        } catch (Exception ex) {
            // Deliberately broad: any failure while loading the dictionary
            // means "no spell checker for this language", not a crash.
            // NOTE(review): consider routing this through the application's
            // logging facility instead of stderr.
            ex.printStackTrace();
        }
        return Optional.empty();
    }

    /**
     * Provider that answers spelling queries from an in-memory Hunspell
     * dictionary.
     */
    private static final class LuceneProvider implements ISpellCheckerProvider {
        private final Hunspell hunspell;

        /**
         * Loads the dictionary from the given files.
         * <p>
         * The input streams and the temporary directory handle are only
         * needed while the {@link Dictionary} is being built, so they are
         * released here, including when construction fails, rather than
         * being kept open until {@link #destroy()}.
         *
         * @param dictName
         *            Hunspell word-list ({@code .dic}) file
         * @param affixName
         *            Hunspell affix ({@code .aff}) file
         * @throws IOException
         *             if a file cannot be opened or read
         * @throws ParseException
         *             if the dictionary data is malformed
         */
        private LuceneProvider(File dictName, File affixName) throws IOException, ParseException {
            Path tempDir = Path.of(FileUtils.getTempDirectoryPath());
            try (InputStream dictInputStream = new FileInputStream(dictName);
                    InputStream affixInputStream = new FileInputStream(affixName);
                    NIOFSDirectory tempDirectory = new NIOFSDirectory(tempDir)) {
                Dictionary dict = new Dictionary(tempDirectory, "omegat", affixInputStream, dictInputStream);
                hunspell = new Hunspell(dict);
            }
        }

        @Override
        public boolean isCorrect(final String word) {
            return hunspell.spell(word);
        }

        @Override
        public List<String> suggest(final String word) {
            return hunspell.suggest(word);
        }

        /**
         * Learning words is not supported by this provider; the call is a
         * no-op.
         */
        @Override
        public void learnWord(final String word) {
        }

        /**
         * Nothing to release: every resource opened while loading the
         * dictionary is already closed by the constructor.
         */
        @Override
        public void destroy() {
        }
    }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,104 @@
/*******************************************************************************
* OmegaT - Computer Assisted Translation (CAT) tool
* with fuzzy matching, translation memory, keyword search,
* glossaries, and translation leveraging into updated projects.
*
* Copyright (C) 2008 Alex Buloichik
* Home page: https://www.omegat.org/
* Support center: https://omegat.org/support
*
* This file is part of OmegaT.
*
* OmegaT is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* OmegaT is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <https://www.gnu.org/licenses/>.
******************************************************************************/

package org.omegat.spellchecker;

import static org.junit.Assert.assertTrue;

import java.io.ByteArrayOutputStream;
import java.io.File;
import java.util.ArrayList;
import java.util.List;

import org.apache.commons.io.FileUtils;
import org.junit.Test;

/**
* Test for copyright notes exists in source files.
*
* @author Alex Buloichik
*/
public class CopyrightTest {
    /**
     * Fragments of the license header that every checked file must contain.
     */
    protected static final String[] MUST_EXIST = new String[]{
            "OmegaT - Computer Assisted Translation (CAT) tool", "Copyright (C)",
            "Home page: https://www.omegat.org/", "This file is part of OmegaT",
            "OmegaT is free software: you can redistribute it and/or modify",
            "it under the terms of the GNU General Public License as published by",
            "the Free Software Foundation, either version 3 of the License, or",
            "(at your option) any later version.",
            "OmegaT is distributed in the hope that it will be useful,",
            "but WITHOUT ANY WARRANTY; without even the implied warranty of",
            "MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the",
            "GNU General Public License for more details.",
            "You should have received a copy of the GNU General Public License",
            "along with this program. If not, see <https://www.gnu.org/licenses/>."};

    /**
     * Collects the candidate files below {@code src} and checks each one for
     * the license header. Files under {@code src/gen/} are skipped.
     */
    @Test
    public void testCopyright() throws Exception {
        List<File> sourceFiles = new ArrayList<>();
        list(new File("src"), sourceFiles);
        ByteArrayOutputStream fdata = new ByteArrayOutputStream();
        for (File f : sourceFiles) {
            if (f.getPath().replace('\\', '/').startsWith("src/gen/")) {
                // skip jaxb generated files
                continue;
            }
            FileUtils.copyFile(f, fdata);
            String data = fdata.toString("ISO-8859-1");
            checkNote(f, data);
            // Reuse the same buffer for the next file.
            fdata.reset();
        }
    }

    /**
     * Asserts that {@code data} contains every entry of {@link #MUST_EXIST}.
     * When the text contains a {@code package} declaration, only the part
     * before it is inspected.
     *
     * @param f
     *            file the text was read from; used in the failure message
     * @param data
     *            content of the file
     */
    protected void checkNote(File f, String data) {
        int pos = data.indexOf("\npackage ");
        if (pos > 0) {
            data = data.substring(0, pos);
        }
        for (String con : MUST_EXIST) {
            assertTrue("There is no copyright note in '" + f.getAbsolutePath() + "' : " + con,
                    data.contains(con));
        }
    }

    /**
     * Recursively collects the files to check below {@code dir}: all
     * {@code .java} and {@code .html} files, {@code build.xml}, and
     * {@code .properties} files whose name starts with {@code Version},
     * {@code Bundle} or {@code project}.
     *
     * @param dir
     *            directory to scan
     * @param files
     *            list the matching files are appended to
     */
    protected void list(File dir, List<File> files) {
        File[] entries = dir.listFiles();
        // listFiles() returns null when dir is missing or unreadable; fail
        // with a clear message instead of a bare NullPointerException.
        assertTrue("Cannot list directory '" + dir.getAbsolutePath() + "'", entries != null);
        for (File f : entries) {
            if (f.isDirectory()) {
                // Test directories first so that one whose name happens to
                // match a file pattern is descended into, not read as a file.
                list(f, files);
                continue;
            }
            String fn = f.getName();
            if (fn.endsWith(".java")) {
                files.add(f);
            } else if (fn.equals("build.xml")) {
                files.add(f);
            } else if (fn.endsWith(".properties")) {
                if (fn.startsWith("Version") || fn.startsWith("Bundle") || fn.startsWith("project")) {
                    files.add(f);
                }
            } else if (fn.endsWith(".html")) {
                files.add(f);
            }
        }
    }
}
Loading
Loading