Skip to content

Commit

Permalink
OPENNLP-912: Rule based sentence detector
Browse files Browse the repository at this point in the history
  • Loading branch information
Alanscut committed Jan 28, 2021
1 parent af6a6e0 commit 238554d
Show file tree
Hide file tree
Showing 11 changed files with 1,674 additions and 0 deletions.
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package opennlp.tools.sentdetect.segment;

/**
 * A single text substitution: a regular expression paired with the string
 * that replaces each of its matches.
 */
public class Clean {

  String regex;
  String replacement;

  /**
   * Creates a substitution from a pattern and its replacement text.
   *
   * @param regex the regular expression to search for
   * @param replacement the string substituted for each match
   */
  public Clean(String regex, String replacement) {
    this.replacement = replacement;
    this.regex = regex;
  }

  /**
   * @return the regular expression of this substitution
   */
  public String getRegex() {
    return this.regex;
  }

  /**
   * @return the string substituted for each match
   */
  public String getReplacement() {
    return this.replacement;
  }

  /**
   * @return a debug representation listing the pattern and the replacement,
   *     each wrapped in single quotes
   */
  @Override
  public String toString() {
    StringBuilder description = new StringBuilder("Clean{");
    description.append("regex='").append(regex).append('\'');
    description.append(", replacement='").append(replacement).append('\'');
    description.append('}');
    return description.toString();
  }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,87 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package opennlp.tools.sentdetect.segment;

import java.util.ArrayList;
import java.util.List;

/**
 * Removes errant newlines, xhtml, inline formatting, etc. from text by running
 * an ordered list of regex substitutions over it.
 *
 * <p>The list starts empty; {@link #rules()}, {@link #html()} and {@link #pdf()}
 * each append a group of substitutions to it. Calling one of them more than
 * once appends its group again.
 */
public class Cleaner {

  /** Substitutions applied by {@link #clean(String)}, in insertion order. */
  public List<Clean> cleanList = new ArrayList<Clean>();

  /**
   * Runs every registered substitution over the text, in list order. Each
   * substitution operates on the output of the previous one.
   *
   * @param text the text to clean
   * @return the text after all substitutions have been applied
   */
  public String clean(String text) {
    String cleaned = text;
    for (Clean step : cleanList) {
      cleaned = cleaned.replaceAll(step.getRegex(), step.getReplacement());
    }
    return cleaned;
  }

  /**
   * Removes all registered substitutions. Does nothing when the list is null.
   */
  public void clear() {
    if (cleanList == null) {
      return;
    }
    cleanList.clear();
  }

  /**
   * Appends the general-purpose substitutions (newline handling, leftover
   * markup, quote normalisation).
   *
   * TODO: Move rules into profiles
   */
  public void rules() {

    // Drop a newline that is followed by a line holding only one or two letters.
    register("\\n(?=[a-zA-Z]{1,2}\\n)", "");

    // Collapse newline-space-newline and doubled newlines into one newline.
    register("\\n \\n", "\n");

    register("\\n\\n", "\n");

    // Drop a newline that precedes a period followed by whitespace.
    register("\\n(?=\\.(\\s|\\n))", "");
    // Drop a newline that is preceded by whitespace.
    register("(?<=\\s)\\n", "");
    // Surround a newline squeezed between two non-whitespace characters with spaces.
    register("(?<=\\S)\\n(?=\\S)", " \n ");
    // Replaces each newline with a newline, so it leaves the text unchanged.
    register("\\n", "\n");
    // A literal backslash followed by 'n' becomes a real newline.
    register("\\\\n", "\n");
    // A literal backslash, a space and 'n' become a real newline.
    register("\\\\\\ n", "\n");

    // Strip {b^>digits<b^} markers, in both entity-escaped and raw form.
    register("\\{b\\^&gt;\\d*&lt;b\\^\\}|\\{b\\^>\\d*<b\\^\\}","");

    // Four or more dots followed by a number (optionally a dashed range)
    // become a carriage return.
    register("\\.{4,}\\s*\\d+-*\\d*","\r");

    // cleanList.add(new Clean("\\.{5,}", " "));
    // Remove runs of exactly three slashes.
    register("\\/{3}", "");

    // cleanList.add(new Clean("(?<=[a-z])\\.(?=[A-Z])", ". "));
    // cleanList.add(new Clean("(?<=\\d)\\.(?=[A-Z])", ". "));

    // A newline in front of a bullet followed by an apostrophe becomes a carriage return.
    register("\\n(?=•')", "\r");
    // Two apostrophes or two backticks become a double quote.
    register("''", "\"");
    register("``", "\"");

  }

  /**
   * Appends the substitutions that strip markup tags, both in raw form and in
   * the entity-escaped form that starts with {@code &lt;}.
   */
  public void html() {
    register("<\\/?\\w+((\\s+\\w+(\\s*=\\s*(?:\\\".*?\\\"|'.*?'|" +
        "[\\^'\\\">\\s]+))?)+\\s*|\\s*)\\/?>", "");
    register("&lt;\\/?[^gt;]*gt;", "");
  }

  /**
   * Appends the substitutions aimed at line breaks inside running text: a
   * newline after "non-newline character + whitespace" and before
   * non-whitespace is removed, and a newline before a lowercase letter becomes
   * a space.
   */
  public void pdf() {
    register("(?<=[^\\n]\\s)\\n(?=\\S)", "");
    register("\\n(?=[a-z])", " ");
  }

  /** Appends one substitution to the end of {@link #cleanList}. */
  private void register(String regex, String replacement) {
    cleanList.add(new Clean(regex, replacement));
  }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,120 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package opennlp.tools.sentdetect.segment;

import java.util.ArrayList;

/**
 * Builds the segmentation rule set named "eng".
 *
 * <p>Each instance owns its own {@link LanguageRule}, filled once by the
 * constructor. The rule groups are added in a fixed order: common, number,
 * name, between-punctuation, list.
 *
 * <p>NOTE(review): {@code Rule} is not visible from this file. The comments
 * below assume its constructor arguments are (isBreak, beforePattern,
 * afterPattern), i.e. {@code true} marks a sentence break and {@code false} an
 * exception that suppresses one — confirm against {@code Rule}.
 *
 * TODO: Move rules into profiles
 */
public class EnglishRule {

  /**
   * Alternation of words that are taken to open a new sentence. It is shared
   * by the rules that look at what follows a period.
   */
  private static final String SENTENCE_STARTERS = "A |Being|Did|For|He|"
      + "How|However|I|In|It|Millions|More|She|That|The|There|They|We|What|When|Where|Who|Why";

  // Per-instance on purpose: the constructor appends every rule to this
  // object, so a shared static instance would collect a duplicate copy of the
  // whole rule set each time an EnglishRule is created.
  private final LanguageRule languageRule = new LanguageRule("eng", new ArrayList<Rule>());

  /**
   * Creates the rule set and populates it with all rule groups.
   */
  public EnglishRule() {
    common();
    number();
    name();
    betweenPunctuation();
    list();
  }

  /**
   * @return the language rule populated by this instance
   */
  public LanguageRule getLanguageRule() {
    return languageRule;
  }


  /** Adds the general rules built around newlines and terminal punctuation. */
  private void common() {

    languageRule.addRule(new Rule(true, "\\n", ""));
    languageRule.addRule(new Rule(true, " ", "\\n"));

    // Terminal punctuation plus whitespace, followed by anything but a period.
    languageRule.addRule(new Rule(true, "[\\.\\?!]+\\s+", "[^\\.]"));

    // Terminal punctuation directly followed by a sentence-starter word.
    languageRule.addRule(new Rule(true, "[\\.\\?!]+", "\\s*(" + SENTENCE_STARTERS + ")"));

    // Punctuation, a closing quote and whitespace, followed by an uppercase letter.
    languageRule.addRule(new Rule(true, "[!?\\.-][\\\"\\'“”]\\s+", "[A-Z]"));

    // Three or more '!' / '?' after a non-whitespace character.
    languageRule.addRule(new Rule(true, "(?<=\\S)(!|\\?){3,}", "(?=(\\s|\\Z|$))"));

    // Punctuation that is itself followed by more punctuation.
    languageRule.addRule(new Rule(false, "[\\.\\?!]+\\s*", "(?=[\\.\\?!])"));

    // Letter + degree sign + period followed by digits. The character class
    // is [a-zA-Z]; the former [a-zA-z] also matched '[', '\', ']', '^', '_'
    // and the backtick.
    languageRule.addRule(new Rule(false, "([a-zA-Z]°)\\.\\s*", "(?=\\d+)"));

    // Whitespace followed by a lowercase letter.
    languageRule.addRule(new Rule(false, "\\s", "(?=[a-z])"));
  }

  /** Adds the rule for a period standing between two digits. */
  private void number() {
    languageRule.addRule(new Rule(false, "\\d\\.", "(?=\\d)"));

  }

  /** Adds the rules for titles, abbreviations and initials ending in a period. */
  private void name() {

    // The inner dots of "p.m" and "a.m" are escaped so they match a literal
    // period only. Unescaped they matched any character, e.g. "pam." in "spam.".
    languageRule.addRule(new Rule(false, "(Mr|Mrs|Ms|Dr|p\\.m|a\\.m|tel)\\.", "\\s*"));

    languageRule.addRule(new Rule(true, "(P\\.M\\.|A\\.M\\.)", "\\s+"));

    // One uppercase letter (optionally plus one lowercase letter) and a
    // period, not followed by a sentence-starter word.
    languageRule.addRule(new Rule(false, "(?<=(?<=^)[A-Z]\\.\\s+|(?<=\\A)[A-Z]\\.\\s+|" +
        "[A-Z]\\.\\s+|(?<=^)[A-Z][a-z]\\.\\s+|(?<=\\A)[A-Z][a-z]\\.\\s+|(?<=\\s)[A-Z]" +
        "[a-z]\\.\\s)", "(?!(" + SENTENCE_STARTERS + "))"));
  }

  /**
   * Adds one rule per kind of paired delimiter (quotes, guillemets, brackets,
   * parentheses, double dashes). Each rule matches the whole delimited span.
   */
  private void betweenPunctuation() {

    languageRule.addRule(new Rule(false, "(?<=\\s)'(?:[^']|'[a-zA-Z])*'", ""));

    languageRule.addRule(new Rule(false, "(?<=\\s)‘(?:[^’]|’[a-zA-Z])*’", ""));

    languageRule.addRule(new Rule(false, "\"(?>[^\"\\\\]+|\\\\{2}|\\\\.)*\"", ""));

    languageRule.addRule(new Rule(false, "«(?>[^»\\\\]+|\\\\{2}|\\\\.)*»", ""));

    languageRule.addRule(new Rule(false, "“(?>[^”\\\\]+|\\\\{2}|\\\\.)*”", ""));

    languageRule.addRule(new Rule(false, "\\[(?>[^\\]\\\\]+|\\\\{2}|\\\\.)*\\]", ""));

    languageRule.addRule(new Rule(false, "\\((?>[^\\(\\)\\\\]+|\\\\{2}|\\\\.)*\\)", ""));

    languageRule.addRule(new Rule(false, "(?<=\\s)\\-\\-(?>[^\\-\\-])*\\-\\-", ""));
  }

  /** Adds the rules for alphabetical and numbered list markers. */
  private void list() {

    // A single lowercase letter and a period at the start of the input or
    // after whitespace, not followed by a sentence-starter word.
    languageRule.addRule(new Rule(false, "((?<=^)[a-z]\\.|(?<=\\A)[a-z]\\.|(?<=\\s)[a-z]\\.)",
        "\\s*(?!(" + SENTENCE_STARTERS + "))"));

    //number_list
    // One- or two-digit markers such as "1. ", "1.)", "-1. ", "•1. " or "1)".
    languageRule.addRule(new Rule(false, "(?<=\\s)\\d{1,2}\\.(\\s)|^\\d{1,2}\\.(\\s)|" +
        "(?<=\\s)\\d{1,2}\\.(\\))|^\\d{1,2}\\.(\\))|(?<=\\s\\-)\\d{1,2}\\.(\\s)|" +
        "(?<=^\\-)\\d{1,2}\\.(\\s)|(?<=\\s\\⁃)\\d{1,2}\\.(\\s)|(?<=^\\⁃)\\d{1,2}\\.(\\s)|" +
        "(?<=\\s\\-)\\d{1,2}\\.(\\))|(?<=^\\-)\\d{1,2}\\.(\\))|(?<=\\s\\⁃)\\d{1,2}\\.(\\))|" +
        "(?<=^\\⁃)\\d{1,2}\\.(\\))|(\\•)\\s*\\d{1,2}\\.(\\s)|(?<=\\s)\\d{1,2}(\\))", "\\s*"));

    //number_list
    // Empty first pattern; the second matches whitespace followed by one of
    // the numbered-list markers above.
    languageRule.addRule(new Rule(true, "", "\\s+((?<=\\s)\\d{1,2}\\.(?=\\s)|" +
        "^\\d{1,2}\\.(?=\\s)|(?<=\\s)\\d{1,2}\\.(?=\\))|^\\d{1,2}\\.(?=\\))|((?<=\\s)\\-)" +
        "\\d{1,2}\\.(?=\\s)|(^\\-)\\d{1,2}\\.(?=\\s)|((?<=\\s)\\⁃)\\d{1,2}\\.(?=\\s)|" +
        "(^\\⁃)\\d{1,2}\\.(?=\\s)|((?<=\\s)\\-)\\d{1,2}\\.(?=\\))|(^\\-)\\d{1,2}\\.(?=\\))|" +
        "((?<=\\s)\\⁃)\\d{1,2}\\.(?=\\))|(^\\⁃)\\d{1,2}\\.(?=\\))|(\\•)\\s*\\d{1,2}\\.(\\s)|" +
        "(?<=\\s)\\d{1,2}(?=\\)))"));
  }

}
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package opennlp.tools.sentdetect.segment;

import java.util.ArrayList;
import java.util.Collections;
import java.util.List;

/**
 * A named, ordered collection of {@link Rule}s used to segment text in one
 * language.
 */
public class LanguageRule {

  private String name;

  private List<Rule> ruleList;

  /**
   * Creates a language rule holding the given rules.
   *
   * @param name language rule name
   * @param ruleList initial rules; the list is shallow-copied, so later
   *     changes to the argument do not affect this object
   */
  public LanguageRule(String name, List<Rule> ruleList) {
    List<Rule> ownCopy = new ArrayList<Rule>(ruleList);
    this.ruleList = ownCopy;
    this.name = name;
  }

  /**
   * Creates a language rule with no rules.
   *
   * @param name language rule name
   */
  public LanguageRule(String name) {
    this(name, Collections.<Rule>emptyList());
  }

  /**
   * @return a read-only view of the rules, in the order they were added
   */
  public List<Rule> getRuleList() {
    List<Rule> readOnlyView = Collections.unmodifiableList(ruleList);
    return readOnlyView;
  }

  /**
   * Appends a rule to the end of the rule list.
   *
   * @param rule the rule to append
   */
  public void addRule(Rule rule) {
    this.ruleList.add(rule);
  }

  /**
   * @return the name of this language rule
   */
  public String getName() {
    return this.name;
  }

}
Loading

0 comments on commit 238554d

Please sign in to comment.