OPENNLP-912: Rule based sentence detector

apache · Jan 28, 2021 · 238554d · 238554d
1 parent af6a6e0
commit 238554d
Show file tree

Hide file tree

Showing 11 changed files with 1,674 additions and 0 deletions.
diff --git a/opennlp-tools/src/main/java/opennlp/tools/sentdetect/segment/Clean.java b/opennlp-tools/src/main/java/opennlp/tools/sentdetect/segment/Clean.java
@@ -0,0 +1,49 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.sentdetect.segment;
+
+/**
+ * Immutable pairing of a regular expression with the replacement text that is
+ * substituted for each of its matches.
+ */
+public class Clean {
+
+  // Assigned exactly once in the constructor and only exposed through the getters,
+  // so an instance cannot change after it was added to a list of cleaning steps.
+  private final String regex;
+  private final String replacement;
+
+  /**
+   * @param regex the regular expression to which this string is to be matched
+   * @param replacement the string to be substituted for each match
+   */
+  public Clean(String regex, String replacement) {
+    this.regex = regex;
+    this.replacement = replacement;
+  }
+
+  /**
+   * @return the regular expression passed to the constructor
+   */
+  public String getRegex() {
+    return regex;
+  }
+
+  /**
+   * @return the replacement string passed to the constructor
+   */
+  public String getReplacement() {
+    return replacement;
+  }
+
+  /**
+   * @return a debug representation of the form
+   *     {@code Clean{regex='...', replacement='...'}}
+   */
+  @Override
+  public String toString() {
+    return "Clean{" +
+        "regex='" + regex + '\'' +
+        ", replacement='" + replacement + '\'' +
+        '}';
+  }
+}
diff --git a/opennlp-tools/src/main/java/opennlp/tools/sentdetect/segment/Cleaner.java b/opennlp-tools/src/main/java/opennlp/tools/sentdetect/segment/Cleaner.java
@@ -0,0 +1,87 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.sentdetect.segment;
+
+import java.util.ArrayList;
+import java.util.List;
+
+/**
+ * removes errant newlines, xhtml, inline formatting, etc.
+ */
+public class Cleaner {
+
+  public List<Clean> cleanList = new ArrayList<Clean>();
+
+  public String clean(String text) {
+    for (Clean clean : cleanList) {
+      text = text.replaceAll(clean.getRegex(), clean.getReplacement());
+    }
+    return text;
+  }
+
+  public void clear() {
+    if (cleanList != null) {
+      cleanList.clear();
+    }
+  }
+
+  /**
+   * TODO: Move rules into profiles
+   */
+  public void rules() {
+
+    cleanList.add(new Clean("\\n(?=[a-zA-Z]{1,2}\\n)", ""));
+
+    cleanList.add(new Clean("\\n \\n", "\n"));
+
+    cleanList.add(new Clean("\\n\\n", "\n"));
+
+    cleanList.add(new Clean("\\n(?=\\.(\\s|\\n))", ""));
+    cleanList.add(new Clean("(?<=\\s)\\n", ""));
+    cleanList.add(new Clean("(?<=\\S)\\n(?=\\S)", " \n "));
+    cleanList.add(new Clean("\\n", "\n"));
+    cleanList.add(new Clean("\\\\n", "\n"));
+    cleanList.add(new Clean("\\\\\\ n", "\n"));
+
+    cleanList.add(new Clean("\\{b\\^&gt;\\d*&lt;b\\^\\}|\\{b\\^>\\d*<b\\^\\}",""));
+
+    cleanList.add(new Clean("\\.{4,}\\s*\\d+-*\\d*","\r"));
+
+//    cleanList.add(new Clean("\\.{5,}", " "));
+    cleanList.add(new Clean("\\/{3}", ""));
+
+//    cleanList.add(new Clean("(?<=[a-z])\\.(?=[A-Z])", ". "));
+//    cleanList.add(new Clean("(?<=\\d)\\.(?=[A-Z])", ". "));
+
+    cleanList.add(new Clean("\\n(?=•')", "\r"));
+    cleanList.add(new Clean("''", "\""));
+    cleanList.add(new Clean("``", "\""));
+
+  }
+
+  public void html() {
+    cleanList.add(new Clean("<\\/?\\w+((\\s+\\w+(\\s*=\\s*(?:\\\".*?\\\"|'.*?'|" +
+        "[\\^'\\\">\\s]+))?)+\\s*|\\s*)\\/?>", ""));
+    cleanList.add(new Clean("&lt;\\/?[^gt;]*gt;", ""));
+  }
+
+  public void pdf() {
+    cleanList.add(new Clean("(?<=[^\\n]\\s)\\n(?=\\S)", ""));
+    cleanList.add(new Clean("\\n(?=[a-z])", " "));
+  }
+}
diff --git a/opennlp-tools/src/main/java/opennlp/tools/sentdetect/segment/EnglishRule.java b/opennlp-tools/src/main/java/opennlp/tools/sentdetect/segment/EnglishRule.java
@@ -0,0 +1,120 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.sentdetect.segment;
+
+import java.util.ArrayList;
+
+/**
+ * Assembles the sentence segmentation rules for English into a
+ * {@link LanguageRule} named "eng".
+ * <p>
+ * NOTE(review): {@link Rule} is not visible from this file. The constructor is
+ * called as {@code new Rule(boolean, String, String)}; presumably the flag
+ * marks a break rule ({@code true}) versus a no-break exception ({@code false})
+ * and the two strings are the patterns before and after the candidate
+ * position - confirm against {@link Rule}.
+ * <p>
+ * TODO: Move rules into profiles
+ */
+public class EnglishRule {
+
+  // Alternation of capitalised words shared by several rules below. Note that
+  // the first alternative is "A " including the trailing space.
+  private static final String SENTENCE_STARTERS = "A |Being|Did|For|He|How|However|I|In|It|" +
+      "Millions|More|She|That|The|There|They|We|What|When|Where|Who|Why";
+
+  // Per instance on purpose: the constructor appends all rules, so a shared static
+  // list would receive the complete rule set again for every EnglishRule created.
+  private final LanguageRule languageRule = new LanguageRule("eng", new ArrayList<Rule>());
+
+  /**
+   * Creates the rule set. Rules are added in the order common, number, name,
+   * between-punctuation, list.
+   */
+  public EnglishRule() {
+    common();
+    number();
+    name();
+    betweenPunctuation();
+    list();
+  }
+
+  /**
+   * @return the language rule holding all rules added by the constructor
+   */
+  public LanguageRule getLanguageRule() {
+    return languageRule;
+  }
+
+
+  /** Adds the general rules around newlines and sentence-final punctuation. */
+  private void common() {
+
+    // newline, and space followed by newline
+    languageRule.addRule(new Rule(true, "\\n", ""));
+    languageRule.addRule(new Rule(true, " ", "\\n"));
+
+    // punctuation run plus whitespace, followed by anything but a period
+    languageRule.addRule(new Rule(true, "[\\.\\?!]+\\s+", "[^\\.]"));
+
+    // punctuation run followed (after optional whitespace) by a sentence starter word
+    languageRule.addRule(new Rule(true, "[\\.\\?!]+", "\\s*(" + SENTENCE_STARTERS + ")"));
+
+    // punctuation + closing quote + whitespace, followed by an upper-case letter
+    languageRule.addRule(new Rule(true, "[!?\\.-][\\\"\\'“”]\\s+", "[A-Z]"));
+
+    // three or more '!' / '?' after a non-whitespace character, before whitespace or end
+    languageRule.addRule(new Rule(true, "(?<=\\S)(!|\\?){3,}", "(?=(\\s|\\Z|$))"));
+
+    // punctuation run followed by further punctuation
+    languageRule.addRule(new Rule(false, "[\\.\\?!]+\\s*", "(?=[\\.\\?!])"));
+
+    // letter + degree sign + period, followed by digits. The class is [a-zA-Z];
+    // "A-z" would additionally match the characters [ \ ] ^ _ and the backtick.
+    languageRule.addRule(new Rule(false, "([a-zA-Z]°)\\.\\s*", "(?=\\d+)"));
+
+    // whitespace followed by a lower-case letter
+    languageRule.addRule(new Rule(false, "\\s", "(?=[a-z])"));
+  }
+
+  /** Adds the rule for a period between digits. */
+  private void number() {
+    languageRule.addRule(new Rule(false, "\\d\\.", "(?=\\d)"));
+
+  }
+
+  /** Adds the rules for titles, time abbreviations and initials. */
+  private void name() {
+
+    // The dots inside p.m / a.m are escaped so that ordinary words such as
+    // "alarm." or "spam." are not mistaken for the abbreviation; this mirrors
+    // the escaped upper-case variant in the next rule.
+    languageRule.addRule(new Rule(false, "(Mr|Mrs|Ms|Dr|p\\.m|a\\.m|tel)\\.", "\\s*"));
+
+    languageRule.addRule(new Rule(true, "(P\\.M\\.|A\\.M\\.)", "\\s+"));
+
+    // one upper-case letter (or upper + lower case letter) followed by a period and
+    // whitespace, unless a sentence starter word follows
+    languageRule.addRule(new Rule(false, "(?<=(?<=^)[A-Z]\\.\\s+|(?<=\\A)[A-Z]\\.\\s+|" +
+        "[A-Z]\\.\\s+|(?<=^)[A-Z][a-z]\\.\\s+|(?<=\\A)[A-Z][a-z]\\.\\s+|(?<=\\s)[A-Z]" +
+        "[a-z]\\.\\s)", "(?!(" + SENTENCE_STARTERS + "))"));
+  }
+
+  /** Adds the rules for text enclosed in quotes, brackets, parentheses or double dashes. */
+  private void betweenPunctuation() {
+
+    // '...' after whitespace; an apostrophe followed by a letter does not close the span
+    languageRule.addRule(new Rule(false, "(?<=\\s)'(?:[^']|'[a-zA-Z])*'", ""));
+
+    // ‘...’ after whitespace, same handling for ’ followed by a letter
+    languageRule.addRule(new Rule(false, "(?<=\\s)‘(?:[^’]|’[a-zA-Z])*’", ""));
+
+    // "..." allowing backslash escapes inside
+    languageRule.addRule(new Rule(false, "\"(?>[^\"\\\\]+|\\\\{2}|\\\\.)*\"", ""));
+
+    // «...»
+    languageRule.addRule(new Rule(false, "«(?>[^»\\\\]+|\\\\{2}|\\\\.)*»", ""));
+
+    // “...”
+    languageRule.addRule(new Rule(false, "“(?>[^”\\\\]+|\\\\{2}|\\\\.)*”", ""));
+
+    // [...]
+    languageRule.addRule(new Rule(false, "\\[(?>[^\\]\\\\]+|\\\\{2}|\\\\.)*\\]", ""));
+
+    // (...) without nested parentheses
+    languageRule.addRule(new Rule(false, "\\((?>[^\\(\\)\\\\]+|\\\\{2}|\\\\.)*\\)", ""));
+
+    // --...-- after whitespace
+    languageRule.addRule(new Rule(false, "(?<=\\s)\\-\\-(?>[^\\-\\-])*\\-\\-", ""));
+  }
+
+  /** Adds the rules for alphabetical and numbered list markers. */
+  private void list() {
+
+    // single lower-case letter plus period at line/input start or after whitespace,
+    // unless a sentence starter word follows
+    languageRule.addRule(new Rule(false, "((?<=^)[a-z]\\.|(?<=\\A)[a-z]\\.|(?<=\\s)[a-z]\\.)",
+        "\\s*(?!(" + SENTENCE_STARTERS + "))"));
+
+    //number_list
+    languageRule.addRule(new Rule(false, "(?<=\\s)\\d{1,2}\\.(\\s)|^\\d{1,2}\\.(\\s)|" +
+        "(?<=\\s)\\d{1,2}\\.(\\))|^\\d{1,2}\\.(\\))|(?<=\\s\\-)\\d{1,2}\\.(\\s)|" +
+        "(?<=^\\-)\\d{1,2}\\.(\\s)|(?<=\\s\\⁃)\\d{1,2}\\.(\\s)|(?<=^\\⁃)\\d{1,2}\\.(\\s)|" +
+        "(?<=\\s\\-)\\d{1,2}\\.(\\))|(?<=^\\-)\\d{1,2}\\.(\\))|(?<=\\s\\⁃)\\d{1,2}\\.(\\))|" +
+        "(?<=^\\⁃)\\d{1,2}\\.(\\))|(\\•)\\s*\\d{1,2}\\.(\\s)|(?<=\\s)\\d{1,2}(\\))", "\\s*"));
+
+    //number_list
+    languageRule.addRule(new Rule(true, "", "\\s+((?<=\\s)\\d{1,2}\\.(?=\\s)|" +
+        "^\\d{1,2}\\.(?=\\s)|(?<=\\s)\\d{1,2}\\.(?=\\))|^\\d{1,2}\\.(?=\\))|((?<=\\s)\\-)" +
+        "\\d{1,2}\\.(?=\\s)|(^\\-)\\d{1,2}\\.(?=\\s)|((?<=\\s)\\⁃)\\d{1,2}\\.(?=\\s)|" +
+        "(^\\⁃)\\d{1,2}\\.(?=\\s)|((?<=\\s)\\-)\\d{1,2}\\.(?=\\))|(^\\-)\\d{1,2}\\.(?=\\))|" +
+        "((?<=\\s)\\⁃)\\d{1,2}\\.(?=\\))|(^\\⁃)\\d{1,2}\\.(?=\\))|(\\•)\\s*\\d{1,2}\\.(\\s)|" +
+        "(?<=\\s)\\d{1,2}(?=\\)))"));
+  }
+
+}
diff --git a/opennlp-tools/src/main/java/opennlp/tools/sentdetect/segment/LanguageRule.java b/opennlp-tools/src/main/java/opennlp/tools/sentdetect/segment/LanguageRule.java
@@ -0,0 +1,77 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.sentdetect.segment;
+
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.List;
+
+/**
+ * A named, ordered collection of {@link Rule} objects describing how text in
+ * one language is segmented.
+ *
+ */
+public class LanguageRule {
+
+  private String name;
+
+  private List<Rule> ruleList;
+
+  /**
+   * Creates a language rule from an existing list of rules.
+   *
+   * @param name language rule name
+   * @param ruleList initial rules; the list is shallow copied, so later changes
+   *     to the passed list do not affect this object
+   */
+  public LanguageRule(String name, List<Rule> ruleList) {
+    this.name = name;
+    this.ruleList = new ArrayList<Rule>(ruleList);
+  }
+
+  /**
+   * Creates a language rule that contains no rules yet.
+   *
+   * @param name language rule name
+   */
+  public LanguageRule(String name) {
+    this(name, Collections.<Rule>emptyList());
+  }
+
+  /**
+   * @return language rule name
+   */
+  public String getName() {
+    return name;
+  }
+
+  /**
+   * @return read-only view of the rules; rules added later through
+   *     {@link #addRule(Rule)} are visible through it
+   */
+  public List<Rule> getRuleList() {
+    List<Rule> readOnlyView = Collections.unmodifiableList(ruleList);
+    return readOnlyView;
+  }
+
+  /**
+   * Appends a rule to the end of the rule list.
+   *
+   * @param rule the rule to append
+   */
+  public void addRule(Rule rule) {
+    ruleList.add(rule);
+  }
+
+}