diff --git a/opennlp-tools/src/main/java/opennlp/tools/sentdetect/segment/Clean.java b/opennlp-tools/src/main/java/opennlp/tools/sentdetect/segment/Clean.java new file mode 100644 index 000000000..9021745fd --- /dev/null +++ b/opennlp-tools/src/main/java/opennlp/tools/sentdetect/segment/Clean.java @@ -0,0 +1,49 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
/**
 * A single text-cleaning rule: a regular expression paired with the text that
 * replaces each of its matches.
 *
 * <p>Instances are immutable; both values are fixed at construction time.
 */
public class Clean {

  private final String regex;
  private final String replacement;

  /**
   * Creates a cleaning rule.
   *
   * @param regex the regular expression whose matches are to be replaced
   * @param replacement the string to be substituted for each match
   */
  public Clean(String regex, String replacement) {
    this.regex = regex;
    this.replacement = replacement;
  }

  /**
   * @return the regular expression whose matches are to be replaced
   */
  public String getRegex() {
    return regex;
  }

  /**
   * @return the string substituted for each match
   */
  public String getReplacement() {
    return replacement;
  }

  @Override
  public String toString() {
    return "Clean{"
        + "regex='" + regex + '\''
        + ", replacement='" + replacement + '\''
        + '}';
  }
}
+ */ +public class Cleaner { + + public List cleanList = new ArrayList(); + + public String clean(String text) { + for (Clean clean : cleanList) { + text = text.replaceAll(clean.getRegex(), clean.getReplacement()); + } + return text; + } + + public void clear() { + if (cleanList != null) { + cleanList.clear(); + } + } + + /** + * TODO: Move rules into profiles + */ + public void rules() { + + cleanList.add(new Clean("\\n(?=[a-zA-Z]{1,2}\\n)", "")); + + cleanList.add(new Clean("\\n \\n", "\n")); + + cleanList.add(new Clean("\\n\\n", "\n")); + + cleanList.add(new Clean("\\n(?=\\.(\\s|\\n))", "")); + cleanList.add(new Clean("(?<=\\s)\\n", "")); + cleanList.add(new Clean("(?<=\\S)\\n(?=\\S)", " \n ")); + cleanList.add(new Clean("\\n", "\n")); + cleanList.add(new Clean("\\\\n", "\n")); + cleanList.add(new Clean("\\\\\\ n", "\n")); + + cleanList.add(new Clean("\\{b\\^>\\d*<b\\^\\}|\\{b\\^>\\d*\\s]+))?)+\\s*|\\s*)\\/?>", "")); + cleanList.add(new Clean("<\\/?[^gt;]*gt;", "")); + } + + public void pdf() { + cleanList.add(new Clean("(?<=[^\\n]\\s)\\n(?=\\S)", "")); + cleanList.add(new Clean("\\n(?=[a-z])", " ")); + } +} diff --git a/opennlp-tools/src/main/java/opennlp/tools/sentdetect/segment/EnglishRule.java b/opennlp-tools/src/main/java/opennlp/tools/sentdetect/segment/EnglishRule.java new file mode 100644 index 000000000..8cf4a3346 --- /dev/null +++ b/opennlp-tools/src/main/java/opennlp/tools/sentdetect/segment/EnglishRule.java @@ -0,0 +1,120 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package opennlp.tools.sentdetect.segment; + +import java.util.ArrayList; + +/** + * TODO: Move rules into profiles + */ +public class EnglishRule { + private static LanguageRule languageRule = new LanguageRule("eng", new ArrayList()); + + public EnglishRule() { + common(); + number(); + name(); + betweenPunctuation(); + list(); + } + + public LanguageRule getLanguageRule() { + return languageRule; + } + + + private void common() { + + languageRule.addRule(new Rule(true, "\\n", "")); + languageRule.addRule(new Rule(true, " ", "\\n")); + + languageRule.addRule(new Rule(true, "[\\.\\?!]+\\s+", "[^\\.]")); + + languageRule.addRule(new Rule(true, "[\\.\\?!]+", "\\s*(A |Being|Did|For|He|" + + "How|However|I|In|It|Millions|More|She|That|The|There|They|We|What|When|Where|Who|Why)")); + + languageRule.addRule(new Rule(true, "[!?\\.-][\\\"\\'“”]\\s+", "[A-Z]")); + + languageRule.addRule(new Rule(true, "(?<=\\S)(!|\\?){3,}", "(?=(\\s|\\Z|$))")); + + languageRule.addRule(new Rule(false, "[\\.\\?!]+\\s*", "(?=[\\.\\?!])")); + + languageRule.addRule(new Rule(false, "([a-zA-z]°)\\.\\s*", "(?=\\d+)")); + + languageRule.addRule(new Rule(false, "\\s", "(?=[a-z])")); + } + + private void number() { + languageRule.addRule(new Rule(false, "\\d\\.", "(?=\\d)")); + + } + + private void name() { + + languageRule.addRule(new Rule(false, "(Mr|Mrs|Ms|Dr|p.m|a.m|tel)\\.", "\\s*")); + + languageRule.addRule(new Rule(true, "(P\\.M\\.|A\\.M\\.)", "\\s+")); + + languageRule.addRule(new Rule(false, "(?<=(?<=^)[A-Z]\\.\\s+|(?<=\\A)[A-Z]\\.\\s+|" + + 
"[A-Z]\\.\\s+|(?<=^)[A-Z][a-z]\\.\\s+|(?<=\\A)[A-Z][a-z]\\.\\s+|(?<=\\s)[A-Z]" + + "[a-z]\\.\\s)", "(?!(A |Being|Did|For|He|How|However|I|In|It|Millions|" + + "More|She|That|The|There|They|We|What|When|Where|Who|Why))")); + } + + private void betweenPunctuation() { + + languageRule.addRule(new Rule(false, "(?<=\\s)'(?:[^']|'[a-zA-Z])*'", "")); + + languageRule.addRule(new Rule(false, "(?<=\\s)‘(?:[^’]|’[a-zA-Z])*’", "")); + + languageRule.addRule(new Rule(false, "\"(?>[^\"\\\\]+|\\\\{2}|\\\\.)*\"", "")); + + languageRule.addRule(new Rule(false, "«(?>[^»\\\\]+|\\\\{2}|\\\\.)*»", "")); + + languageRule.addRule(new Rule(false, "“(?>[^”\\\\]+|\\\\{2}|\\\\.)*”", "")); + + languageRule.addRule(new Rule(false, "\\[(?>[^\\]\\\\]+|\\\\{2}|\\\\.)*\\]", "")); + + languageRule.addRule(new Rule(false, "\\((?>[^\\(\\)\\\\]+|\\\\{2}|\\\\.)*\\)", "")); + + languageRule.addRule(new Rule(false, "(?<=\\s)\\-\\-(?>[^\\-\\-])*\\-\\-", "")); + } + + private void list() { + + languageRule.addRule(new Rule(false, "((?<=^)[a-z]\\.|(?<=\\A)[a-z]\\.|(?<=\\s)[a-z]\\.)", + "\\s*(?!(A |Being|Did|For|He|How|However|I|In|It|Millions|More|She|That|The|There|" + + "They|We|What|When|Where|Who|Why))")); + + //number_list + languageRule.addRule(new Rule(false, "(?<=\\s)\\d{1,2}\\.(\\s)|^\\d{1,2}\\.(\\s)|" + + "(?<=\\s)\\d{1,2}\\.(\\))|^\\d{1,2}\\.(\\))|(?<=\\s\\-)\\d{1,2}\\.(\\s)|" + + "(?<=^\\-)\\d{1,2}\\.(\\s)|(?<=\\s\\⁃)\\d{1,2}\\.(\\s)|(?<=^\\⁃)\\d{1,2}\\.(\\s)|" + + "(?<=\\s\\-)\\d{1,2}\\.(\\))|(?<=^\\-)\\d{1,2}\\.(\\))|(?<=\\s\\⁃)\\d{1,2}\\.(\\))|" + + "(?<=^\\⁃)\\d{1,2}\\.(\\))|(\\•)\\s*\\d{1,2}\\.(\\s)|(?<=\\s)\\d{1,2}(\\))", "\\s*")); + + //number_list + languageRule.addRule(new Rule(true, "", "\\s+((?<=\\s)\\d{1,2}\\.(?=\\s)|" + + "^\\d{1,2}\\.(?=\\s)|(?<=\\s)\\d{1,2}\\.(?=\\))|^\\d{1,2}\\.(?=\\))|((?<=\\s)\\-)" + + "\\d{1,2}\\.(?=\\s)|(^\\-)\\d{1,2}\\.(?=\\s)|((?<=\\s)\\⁃)\\d{1,2}\\.(?=\\s)|" + + "(^\\⁃)\\d{1,2}\\.(?=\\s)|((?<=\\s)\\-)\\d{1,2}\\.(?=\\))|(^\\-)\\d{1,2}\\.(?=\\))|" + + 
"((?<=\\s)\\⁃)\\d{1,2}\\.(?=\\))|(^\\⁃)\\d{1,2}\\.(?=\\))|(\\•)\\s*\\d{1,2}\\.(\\s)|" + + "(?<=\\s)\\d{1,2}(?=\\)))")); + } + +} diff --git a/opennlp-tools/src/main/java/opennlp/tools/sentdetect/segment/LanguageRule.java b/opennlp-tools/src/main/java/opennlp/tools/sentdetect/segment/LanguageRule.java new file mode 100644 index 000000000..bb85d2877 --- /dev/null +++ b/opennlp-tools/src/main/java/opennlp/tools/sentdetect/segment/LanguageRule.java @@ -0,0 +1,77 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package opennlp.tools.sentdetect.segment; + +import java.util.ArrayList; +import java.util.Collections; +import java.util.List; + +/** + * Represents rule for segmenting text in some language. Contains {@link Rule} + * list. + * + */ +public class LanguageRule { + + private List ruleList; + + private String name; + + /** + * Creates language rule. + * + * @param name language rule name + * @param ruleList rule list (it will be shallow copied) + */ + public LanguageRule(String name, List ruleList) { + this.ruleList = new ArrayList(ruleList); + this.name = name; + } + + /** + * Creates empty language rule. 
+ * + * @param name language rule name + */ + public LanguageRule(String name) { + this(name, new ArrayList()); + } + + /** + * @return unmodifiable rules list + */ + public List getRuleList() { + return Collections.unmodifiableList(ruleList); + } + + /** + * Adds rule to the end of rule list. + * @param rule + */ + public void addRule(Rule rule) { + ruleList.add(rule); + } + + /** + * @return language rule name + */ + public String getName() { + return name; + } + +} diff --git a/opennlp-tools/src/main/java/opennlp/tools/sentdetect/segment/LanguageTool.java b/opennlp-tools/src/main/java/opennlp/tools/sentdetect/segment/LanguageTool.java new file mode 100644 index 000000000..c1ae94843 --- /dev/null +++ b/opennlp-tools/src/main/java/opennlp/tools/sentdetect/segment/LanguageTool.java @@ -0,0 +1,145 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package opennlp.tools.sentdetect.segment; + +import java.util.ArrayList; +import java.util.Collections; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.concurrent.ConcurrentHashMap; +import java.util.regex.Pattern; + +public class LanguageTool { + + private LanguageRule languageRule; + + private String languageName; + + private Map cache; + + public static final String MAX_LOOKBEHIND_LENGTH_PARAM = "maxLookbehindLength"; + + public static final int DEFAULT_MAX_LOOKBEHIND_LENGTH = 100; + + private int maxLookbehindLength; + + private Map parameterMap; + + private List breakRuleList; + + private Pattern noBreakPattern; + + public LanguageTool(String languageName, LanguageRule languageRule) { + this(languageName, languageRule, Collections.emptyMap()); + } + + public LanguageTool(String languageName, LanguageRule languageRule, Map paramMap) { + this.languageRule = languageRule; + this.languageName = languageName; + parameterMap = new HashMap(paramMap); + if (parameterMap.get(MAX_LOOKBEHIND_LENGTH_PARAM) != null) { + this.maxLookbehindLength = (int) parameterMap.get(MAX_LOOKBEHIND_LENGTH_PARAM); + } else { + this.maxLookbehindLength = DEFAULT_MAX_LOOKBEHIND_LENGTH; + } + init(); + } + + private void init() { + + this.cache = new ConcurrentHashMap(); + this.breakRuleList = new ArrayList(); + StringBuilder noBreakPatternBuilder = new StringBuilder(); + + for (Rule rule : languageRule.getRuleList()) { + + if (rule.isBreak()) { + breakRuleList.add(rule); + } else { + if (noBreakPatternBuilder.length() > 0) { + noBreakPatternBuilder.append('|'); + } + String patternString = createNoBreakPatternString(rule); + noBreakPatternBuilder.append(patternString); + } + } + + if (noBreakPatternBuilder.length() > 0) { + String noBreakPatternString = noBreakPatternBuilder.toString(); + noBreakPattern = compile(noBreakPatternString); + } else { + noBreakPattern = null; + } + + } + + public Map getParameterMap() { + return parameterMap; + 
} + + public LanguageRule getLanguageRule() { + return languageRule; + } + + public String getLanguageName() { + return languageName; + } + + public Map getCache() { + return cache; + } + + public Pattern compile(String regex) { + String key = "PATTERN_" + regex; + Pattern pattern = (Pattern) getCache().get(key); + if (pattern == null) { + pattern = Pattern.compile(regex); + getCache().put(key, pattern); + } + return pattern; + } + + public List getBreakRuleList() { + return breakRuleList; + } + + public Pattern getNoBreakPattern() { + return noBreakPattern; + } + + private String createNoBreakPatternString(Rule rule) { + + StringBuilder patternBuilder = new StringBuilder(); + + // As Java does not allow infinite length patterns + // in lookbehind, before pattern need to be shortened. + String beforePattern = RuleUtil.finitize(rule.getBeforePattern(), maxLookbehindLength); + String afterPattern = rule.getAfterPattern(); + + patternBuilder.append("(?:"); + if (beforePattern.length() > 0) { + patternBuilder.append(beforePattern); + } + if (afterPattern.length() > 0) { + patternBuilder.append(afterPattern); + } + patternBuilder.append(")"); + return patternBuilder.toString(); + } +} diff --git a/opennlp-tools/src/main/java/opennlp/tools/sentdetect/segment/Rule.java b/opennlp-tools/src/main/java/opennlp/tools/sentdetect/segment/Rule.java new file mode 100644 index 000000000..a466d55f4 --- /dev/null +++ b/opennlp-tools/src/main/java/opennlp/tools/sentdetect/segment/Rule.java @@ -0,0 +1,66 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
/**
 * Represents break or exception rule. Contains the pattern matching text
 * before the break position and the pattern matching text after it.
 *
 * <p>Instances are immutable.
 */
public class Rule {

  private final boolean breaking;

  private final String beforePattern;

  private final String afterPattern;

  /**
   * Creates rule.
   *
   * @param breaking type of rule; true - break rule, false - exception rule
   * @param beforePattern pattern matching text before break
   * @param afterPattern pattern matching text after break
   */
  public Rule(boolean breaking, String beforePattern, String afterPattern) {
    this.breaking = breaking;
    this.beforePattern = beforePattern;
    this.afterPattern = afterPattern;
  }

  /**
   * @return type of rule; true - break rule, false - exception rule
   */
  public boolean isBreak() {
    return breaking;
  }

  /**
   * @return pattern matching text before break
   */
  public String getBeforePattern() {
    return beforePattern;
  }

  /**
   * @return pattern matching text after break
   */
  public String getAfterPattern() {
    return afterPattern;
  }
}
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package opennlp.tools.sentdetect.segment; + +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +/** + * Thank Jarek Lipski and + * segment} + * for the inspiration for many of the design + * components of this detector. + */ +public class RuleUtil { + + private static final Pattern STAR_PATTERN = Pattern + .compile("(?<=(? breakSections; + + private List
noBreakSections; + + public SentenceTokenizer(LanguageTool languageTool, CharSequence text) { + this.text = text; + this.reader = null; + this.bufferLength = text.length(); + this.languageTool = languageTool; + this.sentence = null; + this.start = 0; + this.end = 0; + } + + public SentenceTokenizer(LanguageTool languageTool, Reader reader, int bufferLength) { + if (bufferLength <= 0) { + throw new IllegalArgumentException("Buffer size: " + bufferLength + + " must be positive."); + } + this.text = null; + this.reader = reader; + this.bufferLength = bufferLength; + this.languageTool = languageTool; + this.sentence = null; + this.start = 0; + this.end = 0; + } + + public List sentenceTokenizer() { + + List sentenceList = new ArrayList<>(); + CharSequence text = getText(); + if (breakSections == null) { + getBreakSections(); + } + for (Integer breakSection : breakSections) { + if (breakSection == 0) { + continue; + } + if (breakSection >= text.length()) { + break; + } + end = breakSection; + if (!isBreak()) { + continue; + } + sentence = text.subSequence(start, end).toString(); + start = end; + + sentence = removeSpace(sentence); + if (sentence != null) { + sentenceList.add(sentence); + } + } + if (end < text.length()) { + end = text.length(); + sentence = text.subSequence(start, end).toString(); + sentence = removeSpace(sentence); + if (sentence != null) { + sentenceList.add(sentence); + } + } + return sentenceList; + } + + public String removeSpace(String segment) { + if (segment != null) { + int first = 0; + int last = segment.length(); + while (first < segment.length() && StringUtil.isWhitespace(segment.charAt(first))) { + first++; + } + while (last > 0 && StringUtil.isWhitespace(segment.charAt(last - 1))) { + last--; + } + if (last - first > 0) { + return segment.substring(first, last); + } + } + return null; + } + + public Set getBreakSections() { + if (breakSections == null) { + breakSections = new TreeSet(); + for (Rule rule : languageTool.getBreakRuleList()) { 
+ + Pattern beforePattern = languageTool.compile(rule.getBeforePattern()); + Pattern afterPattern = languageTool.compile(rule.getAfterPattern()); + this.beforeMatcher = beforePattern.matcher(text); + this.afterMatcher = afterPattern.matcher(text); + this.found = true; + while (find()) { + breakSections.add(getBreakPosition()); + } + } + } + return breakSections; + } + + private boolean find() { + found = false; + while ((!found) && beforeMatcher.find()) { + afterMatcher.region(beforeMatcher.end(), text.length()); + found = afterMatcher.lookingAt(); + } + return found; + } + + private int getBreakPosition() { + return afterMatcher.start(); + } + + public List
getNoBreakSections() { + if (noBreakSections == null) { + noBreakSections = new ArrayList
(); + Pattern pattern = languageTool.getNoBreakPattern(); + Matcher matcher = pattern.matcher(getText()); + while (matcher.find()) { + noBreakSections.add(new Section(matcher.start(), matcher.end())); + } + } + return noBreakSections; + } + + public CharSequence getText() { + if (text == null) { + text = read(bufferLength + 1); + } + return text; + } + + private String read(int amount) { + char[] charBuffer = new char[amount]; + int count = read(reader, charBuffer); + + String result; + if (count == amount) { + result = new String(charBuffer, 0, count - 1); + } else if (count > 0 && count < amount) { + result = new String(charBuffer, 0, count); + } else { + result = ""; + } + + return result; + } + + private int read(Reader reader, char[] buffer) { + + int start = 0; + int count; + + try { + while (true) { + if (!(((count = reader.read(buffer, start, buffer.length - start)) != -1) + && start < buffer.length)) { + break; + } + start += count; + } + } catch (IOException e) { + e.printStackTrace(); + } + return start; + } + + private boolean isBreak() { + if (noBreakSections == null) { + getNoBreakSections(); + } + if (noBreakSections != null && noBreakSections.size() > 0) { + for (Section section : noBreakSections) { + if (end >= section.getLeft() && end <= section.getRight()) { + return false; + } + } + } + return true; + } + +} diff --git a/opennlp-tools/src/test/java/opennlp/tools/sentdetect/segment/GoldenRulesTest.java b/opennlp-tools/src/test/java/opennlp/tools/sentdetect/segment/GoldenRulesTest.java new file mode 100644 index 000000000..4dee24759 --- /dev/null +++ b/opennlp-tools/src/test/java/opennlp/tools/sentdetect/segment/GoldenRulesTest.java @@ -0,0 +1,511 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package opennlp.tools.sentdetect.segment; + +import java.util.HashMap; +import java.util.List; +import java.util.Map; + +import org.junit.Assert; +import org.junit.Ignore; +import org.junit.Test; + +/** + * Thanks for the GoldenRules of + * pragmatic_segmenter + */ +public class GoldenRulesTest { + + public Cleaner cleaner = new Cleaner(); + + public List segment(String text) { + if (cleaner != null) { + text = cleaner.clean(text); + } + LanguageRule languageRule = new EnglishRule().getLanguageRule(); + Map paramMap = new HashMap<>(); + paramMap.put(LanguageTool.MAX_LOOKBEHIND_LENGTH_PARAM, 50); + LanguageTool languageTool = new LanguageTool("eng", languageRule, paramMap); + SentenceTokenizer sentenceTokenizer = new SentenceTokenizer(languageTool, text); + + return sentenceTokenizer.sentenceTokenizer(); + } + + @Test + public void test1() { + List segment = segment("Hello World. My name is Jonas."); + System.out.println(segment); + Assert.assertEquals(2, segment.size()); + Assert.assertEquals("[Hello World., My name is Jonas.]", segment.toString()); + } + + @Test + public void test2() { + List segment = segment("What is your name? 
My name is Jonas."); + System.out.println(segment); + Assert.assertEquals(2, segment.size()); + Assert.assertEquals("[What is your name?, My name is Jonas.]", segment.toString()); + } + + @Test + public void test3() { + List segment = segment("There it is! I found it."); + System.out.println(segment); + Assert.assertEquals(2, segment.size()); + Assert.assertEquals("[There it is!, I found it.]", segment.toString()); + } + + @Test + public void test4() { + List segment = segment("My name is Jonas E. Smith."); + System.out.println(segment); + Assert.assertEquals(1, segment.size()); + Assert.assertEquals("[My name is Jonas E. Smith.]", segment.toString()); + } + + @Test + public void test5() { + List segment = segment("Please turn to p. 55."); + System.out.println(segment); + Assert.assertEquals(1, segment.size()); + Assert.assertEquals("[Please turn to p. 55.]", segment.toString()); + } + + @Test + public void test6() { + List segment = segment("Were Jane and co. at the party?"); + System.out.println(segment); + Assert.assertEquals(1, segment.size()); + Assert.assertEquals("[Were Jane and co. at the party?]", segment.toString()); + } + + @Test + public void test7() { + List segment = segment("They closed the deal with Pitt, Briggs & Co. at noon."); + System.out.println(segment); + Assert.assertEquals(1, segment.size()); + Assert.assertEquals("[They closed the deal with Pitt, Briggs & Co. at noon.]", + segment.toString()); + } + + @Test + public void test8() { + List segment = segment("Let's ask Jane and co. They should know."); + System.out.println(segment); + Assert.assertEquals(2, segment.size()); + Assert.assertEquals("[Let's ask Jane and co., They should know.]", segment.toString()); + } + + @Test + public void test9() { + List segment = segment("They closed the deal with Pitt, Briggs & Co. 
It closed yesterday."); + System.out.println(segment); + Assert.assertEquals(2, segment.size()); + Assert.assertEquals("[They closed the deal with Pitt, Briggs & Co., It closed yesterday.]", + segment.toString()); + } + + @Test + public void test10() { + List segment = segment("I can see Mt. Fuji from here."); + System.out.println(segment); + Assert.assertEquals(1, segment.size()); + Assert.assertEquals("[I can see Mt. Fuji from here.]", segment.toString()); + } + + @Test + public void test11() { + List segment = segment("St. Michael's Church is on 5th st. near the light."); + System.out.println(segment); + Assert.assertEquals(1, segment.size()); + Assert.assertEquals("[St. Michael's Church is on 5th st. near the light.]", + segment.toString()); + } + + @Test + public void test12() { + List segment = segment("That is JFK Jr.'s book."); + System.out.println(segment); + Assert.assertEquals(1, segment.size()); + Assert.assertEquals("[That is JFK Jr.'s book.]", segment.toString()); + } + + @Test + public void test13() { + List segment = segment("I visited the U.S.A. last year."); + System.out.println(segment); + Assert.assertEquals(1, segment.size()); + Assert.assertEquals("[I visited the U.S.A. last year.]", segment.toString()); + } + + @Test + public void test14() { + List segment = segment("I live in the E.U. How about you?"); + System.out.println(segment); + Assert.assertEquals(2, segment.size()); + Assert.assertEquals("[I live in the E.U., How about you?]", segment.toString()); + } + + @Test + public void test15() { + List segment = segment("I live in the U.S. How about you?"); + System.out.println(segment); + Assert.assertEquals(2, segment.size()); + Assert.assertEquals("[I live in the U.S., How about you?]", segment.toString()); + } + + @Test + public void test16() { + List segment = segment("I work for the U.S. Government in Virginia."); + System.out.println(segment); + Assert.assertEquals(1, segment.size()); + Assert.assertEquals("[I work for the U.S. 
Government in Virginia.]", segment.toString()); + } + + @Test + public void test17() { + List segment = segment("I have lived in the U.S. for 20 years."); + System.out.println(segment); + Assert.assertEquals(1, segment.size()); + Assert.assertEquals("[I have lived in the U.S. for 20 years.]", segment.toString()); + } + + @Test + public void test18() { + List segment = segment("At 5 a.m. Mr. Smith went to the bank. He left the" + + " bank at 6 P.M. Mr. Smith then went to the store."); + System.out.println(segment); + Assert.assertEquals(3, segment.size()); + Assert.assertEquals("[At 5 a.m. Mr. Smith went to the bank., He left the bank at" + + " 6 P.M., Mr. Smith then went to the store.]", segment.toString()); + } + + @Test + public void test19() { + List segment = segment("She has $100.00 in her bag."); + System.out.println(segment); + Assert.assertEquals(1, segment.size()); + Assert.assertEquals("[She has $100.00 in her bag.]", segment.toString()); + } + + @Test + public void test20() { + List segment = segment("She has $100.00. It is in her bag."); + System.out.println(segment); + Assert.assertEquals(2, segment.size()); + Assert.assertEquals("[She has $100.00., It is in her bag.]", segment.toString()); + } + + @Test + public void test21() { + List segment = segment("He teaches science (He previously worked for" + + " 5 years as an engineer.) at the local University."); + System.out.println(segment); + Assert.assertEquals(1, segment.size()); + Assert.assertEquals("[He teaches science (He previously worked for 5 years" + + " as an engineer.) at the local University.]", segment.toString()); + } + + @Test + public void test22() { + List segment = segment("Her email is Jane.Doe@example.com. 
I sent her an email."); + System.out.println(segment); + Assert.assertEquals(2, segment.size()); + Assert.assertEquals("[Her email is Jane.Doe@example.com., I sent her an email.]", + segment.toString()); + } + + @Test + public void test23() { + List segment = segment("The site is: https://www.example.50.com/new-site/" + + "awesome_content.html. Please check it out."); + System.out.println(segment); + Assert.assertEquals(2, segment.size()); + Assert.assertEquals("[The site is: https://www.example.50.com/new-site/" + + "awesome_content.html., Please check it out.]", segment.toString()); + } + + @Test + public void test24() { + List segment = segment("She turned to him, 'This is great.' she said."); + System.out.println(segment); + Assert.assertEquals(1, segment.size()); + Assert.assertEquals("[She turned to him, 'This is great.' she said.]", + segment.toString()); + } + + @Test + public void test25() { + List segment = segment("She turned to him, \"This is great.\" she said."); + System.out.println(segment); + Assert.assertEquals(1, segment.size()); + Assert.assertEquals("[She turned to him, \"This is great.\" she said.]", + segment.toString()); + } + + @Test + public void test26() { + List segment = segment("She turned to him, \"This is great.\"" + + " She held the book out to show him."); + System.out.println(segment); + Assert.assertEquals(2, segment.size()); + Assert.assertEquals("[She turned to him, \"This is great.\", " + + "She held the book out to show him.]", segment.toString()); + } + + @Test + public void test27() { + List segment = segment("Hello!! Long time no see."); + System.out.println(segment); + Assert.assertEquals(2, segment.size()); + Assert.assertEquals("[Hello!!, Long time no see.]", segment.toString()); + } + + @Test + public void test28() { + List segment = segment("Hello?? 
Who is there?"); + System.out.println(segment); + Assert.assertEquals(2, segment.size()); + Assert.assertEquals("[Hello??, Who is there?]", segment.toString()); + } + + @Test + public void test29() { + List segment = segment("Hello!? Is that you?"); + System.out.println(segment); + Assert.assertEquals(2, segment.size()); + Assert.assertEquals("[Hello!?, Is that you?]", segment.toString()); + } + + @Test + public void test30() { + List segment = segment("Hello?! Is that you?"); + System.out.println(segment); + Assert.assertEquals(2, segment.size()); + Assert.assertEquals("[Hello?!, Is that you?]", segment.toString()); + } + + @Test + public void test31() { + List segment = segment("1.) The first item 2.) The second item"); + System.out.println(segment); + Assert.assertEquals(2, segment.size()); + Assert.assertEquals("[1.) The first item, 2.) The second item]", segment.toString()); + } + + @Test + public void test32() { + List segment = segment("1.) The first item. 2.) The second item."); + System.out.println(segment); + Assert.assertEquals(2, segment.size()); + Assert.assertEquals("[1.) The first item., 2.) The second item.]", segment.toString()); + } + + @Test + public void test33() { + List segment = segment("1) The first item 2) The second item"); + System.out.println(segment); + Assert.assertEquals(2, segment.size()); + Assert.assertEquals("[1) The first item, 2) The second item]", segment.toString()); + } + + @Test + public void test34() { + List segment = segment("1) The first item. 2) The second item."); + System.out.println(segment); + Assert.assertEquals(2, segment.size()); + Assert.assertEquals("[1) The first item., 2) The second item.]", segment.toString()); + } + + @Test + public void test35() { + List segment = segment("1. The first item 2. The second item"); + System.out.println(segment); + Assert.assertEquals(2, segment.size()); + Assert.assertEquals("[1. The first item, 2. 
The second item]", segment.toString()); + } + + @Test + public void test36() { + List segment = segment("1. The first item. 2. The second item."); + System.out.println(segment); + Assert.assertEquals(2, segment.size()); + Assert.assertEquals("[1. The first item., 2. The second item.]", segment.toString()); + } + + @Test + public void test37() { + List segment = segment("• 9. The first item • 10. The second item"); + System.out.println(segment); + Assert.assertEquals(2, segment.size()); + Assert.assertEquals("[• 9. The first item, • 10. The second item]", segment.toString()); + } + + @Test + public void test38() { + List segment = segment("⁃9. The first item ⁃10. The second item"); + System.out.println(segment); + Assert.assertEquals(2, segment.size()); + Assert.assertEquals("[⁃9. The first item, ⁃10. The second item]", segment.toString()); + } + + @Ignore + @Test + public void test39() { + List segment = segment("a. The first item b. The second item c. The third list item"); + System.out.println(segment); + Assert.assertEquals(3, segment.size()); + Assert.assertEquals("[a. The first item, b. The second item," + + " c. 
The third list item]", segment.toString()); + } + + @Test + public void test40() { + cleaner.pdf(); + List segment = segment("This is a sentence\ncut off in the middle because pdf."); + System.out.println(segment); + Assert.assertEquals(1, segment.size()); + Assert.assertEquals("[This is a sentence cut off in the middle because pdf.]", + segment.toString()); + } + + @Test + public void test41() { + cleaner.pdf(); + List segment = segment("It was a cold \nnight in the city."); + System.out.println(segment); + Assert.assertEquals(1, segment.size()); + Assert.assertEquals("[It was a cold night in the city.]", segment.toString()); + } + + @Test + public void test42() { + cleaner.rules(); + List segment = segment("features\ncontact manager\nevents, activities\n"); + System.out.println(segment); + Assert.assertEquals(3, segment.size()); + Assert.assertEquals("[features, contact manager, events, activities]", + segment.toString()); + } + + @Test + public void test43() { + List segment = segment("You can find it at N°. 1026.253.553." + + " That is where the treasure is."); + System.out.println(segment); + Assert.assertEquals(2, segment.size()); + Assert.assertEquals("[You can find it at N°. 1026.253.553.," + + " That is where the treasure is.]", segment.toString()); + } + + @Test + public void test44() { + List segment = segment("She works at Yahoo! in the accounting department."); + System.out.println(segment); + Assert.assertEquals(1, segment.size()); + Assert.assertEquals("[She works at Yahoo! in the accounting department.]", + segment.toString()); + } + + @Test + public void test45() { + List segment = segment("We make a good team, you and I." + + " Did you see Albert I. Jones yesterday?"); + System.out.println(segment); + Assert.assertEquals(2, segment.size()); + Assert.assertEquals("[We make a good team, you and I., Did you see Albert" + + " I. 
Jones yesterday?]", segment.toString()); + } + + @Test + public void test46() { + List segment = segment("Thoreau argues that by simplifying one’s life," + + " “the laws of the universe will appear less complex. . . .”"); + System.out.println(segment); + Assert.assertEquals(1, segment.size()); + Assert.assertEquals("[Thoreau argues that by simplifying one’s life, " + + "“the laws of the universe will appear less complex. . . .”]", segment.toString()); + } + + @Test + public void test47() { + List segment = segment("\"Bohr [...] used the analogy of parallel" + + " stairways [...]\" (Smith 55)."); + System.out.println(segment); + Assert.assertEquals(1, segment.size()); + Assert.assertEquals("[\"Bohr [...] used the analogy of parallel stairways" + + " [...]\" (Smith 55).]", segment.toString()); + } + + @Test + public void test48() { + List segment = segment("If words are left off at the end of a sentence," + + " and that is all that is omitted, indicate the omission with ellipsis marks " + + "(preceded and followed by a space) and then indicate the end of the sentence " + + "with a period . . . . Next sentence."); + System.out.println(segment); + Assert.assertEquals(2, segment.size()); + Assert.assertEquals("[If words are left off at the end of a sentence, " + + "and that is all that is omitted, indicate the omission with ellipsis marks " + + "(preceded and followed by a space) and then indicate the end of the sentence " + + "with a period . . . ., Next sentence.]", segment.toString()); + } + + @Test + public void test49() { + List segment = segment("I never meant that.... She left the store."); + System.out.println(segment); + Assert.assertEquals(2, segment.size()); + Assert.assertEquals("[I never meant that...., She left the store.]", segment.toString()); + } + + @Ignore + @Test + public void test50() { + List segment = segment("I wasn’t really ... well, what I mean...see . . . " + + "what I'm saying, the thing is . . . 
I didn’t mean it."); + System.out.println(segment); + Assert.assertEquals(1, segment.size()); + Assert.assertEquals("[I wasn’t really ... well, what I mean...see . . . what " + + "I'm saying, the thing is . . . I didn’t mean it.", segment.toString()); + } + + @Ignore + @Test + public void test51() { + List segment = segment("One further habit which was somewhat weakened " + + ". . . was that of combining words into self-interpreting compounds. . . . " + + "The practice was not abandoned. . . ."); + System.out.println(segment); + Assert.assertEquals(2, segment.size()); + Assert.assertEquals("[One further habit which was somewhat weakened . . . " + + "was that of combining words into self-interpreting compounds., . . . " + + "The practice was not abandoned. . . .]", segment.toString()); + } + + @Ignore + @Test + public void test52() { + List segment = segment("Hello world.Today is Tuesday.Mr. Smith went to " + + "the store and bought 1,000.That is a lot."); + System.out.println(segment); + Assert.assertEquals(4, segment.size()); + Assert.assertEquals("[Hello world., Today is Tuesday., Mr. Smith went to the" + + " store and bought 1,000., That is a lot.]", segment.toString()); + } +} diff --git a/opennlp-tools/src/test/resources/opennlp/tools/sentdetect/golden-rules.txt b/opennlp-tools/src/test/resources/opennlp/tools/sentdetect/golden-rules.txt new file mode 100644 index 000000000..d3d6bcbdc --- /dev/null +++ b/opennlp-tools/src/test/resources/opennlp/tools/sentdetect/golden-rules.txt @@ -0,0 +1,207 @@ +1.) Simple period to end sentence +Hello World. My name is Jonas. +=> ["Hello World.", "My name is Jonas."] + +2.) Question mark to end sentence +What is your name? My name is Jonas. +=> ["What is your name?", "My name is Jonas."] + +3.) Exclamation point to end sentence +There it is! I found it. +=> ["There it is!", "I found it."] + +4.) One letter upper case abbreviations +My name is Jonas E. Smith. +=> ["My name is Jonas E. Smith."] + +5.) 
One letter lower case abbreviations +Please turn to p. 55. +=> ["Please turn to p. 55."] + +6.) Two letter lower case abbreviations in the middle of a sentence +Were Jane and co. at the party? +=> ["Were Jane and co. at the party?"] + +7.) Two letter upper case abbreviations in the middle of a sentence +They closed the deal with Pitt, Briggs & Co. at noon. +=> ["They closed the deal with Pitt, Briggs & Co. at noon."] + +8.) Two letter lower case abbreviations at the end of a sentence +Let's ask Jane and co. They should know. +=> ["Let's ask Jane and co.", "They should know."] + +9.) Two letter upper case abbreviations at the end of a sentence +They closed the deal with Pitt, Briggs & Co. It closed yesterday. +=> ["They closed the deal with Pitt, Briggs & Co.", "It closed yesterday."] + +10.) Two letter (prepositive) abbreviations +I can see Mt. Fuji from here. +=> ["I can see Mt. Fuji from here."] + +11.) Two letter (prepositive & postpositive) abbreviations +St. Michael's Church is on 5th st. near the light. +=> ["St. Michael's Church is on 5th st. near the light."] + +12.) Possessive two letter abbreviations +That is JFK Jr.'s book. +=> ["That is JFK Jr.'s book."] + +13.) Multi-period abbreviations in the middle of a sentence +I visited the U.S.A. last year. +=> ["I visited the U.S.A. last year."] + +14.) Multi-period abbreviations at the end of a sentence +I live in the E.U. How about you? +=> ["I live in the E.U.", "How about you?"] + +15.) U.S. as sentence boundary +I live in the U.S. How about you? +=> ["I live in the U.S.", "How about you?"] + +16.) U.S. as non sentence boundary with next word capitalized +I work for the U.S. Government in Virginia. +=> ["I work for the U.S. Government in Virginia."] + +17.) U.S. as non sentence boundary +I have lived in the U.S. for 20 years. +=> ["I have lived in the U.S. for 20 years."] + +18.) A.M. / P.M. as non sentence boundary and sentence boundary +At 5 a.m. Mr. Smith went to the bank. He left the bank at 6 P.M. Mr. 
Smith then went to the store. +=> ["At 5 a.m. Mr. Smith went to the bank.", "He left the bank at 6 P.M.", "Mr. Smith then went to the store."] + +19.) Number as non sentence boundary +She has $100.00 in her bag. +=> ["She has $100.00 in her bag."] + +20.) Number as sentence boundary +She has $100.00. It is in her bag. +=> ["She has $100.00.", "It is in her bag."] + +21.) Parenthetical inside sentence +He teaches science (He previously worked for 5 years as an engineer.) at the local University. +=> ["He teaches science (He previously worked for 5 years as an engineer.) at the local University."] + +22.) Email addresses +Her email is Jane.Doe@example.com. I sent her an email. +=> ["Her email is Jane.Doe@example.com.", "I sent her an email."] + +23.) Web addresses +The site is: https://www.example.50.com/new-site/awesome_content.html. Please check it out. +=> ["The site is: https://www.example.50.com/new-site/awesome_content.html.", "Please check it out."] + +24.) Single quotations inside sentence +She turned to him, 'This is great.' she said. +=> ["She turned to him, 'This is great.' she said."] + +25.) Double quotations inside sentence +She turned to him, "This is great." she said. +=> ["She turned to him, \"This is great.\" she said."] + +26.) Double quotations at the end of a sentence +She turned to him, "This is great." She held the book out to show him. +=> ["She turned to him, \"This is great.\"", "She held the book out to show him."] + +27.) Double punctuation (exclamation point) +Hello!! Long time no see. +=> ["Hello!!", "Long time no see."] + +28.) Double punctuation (question mark) +Hello?? Who is there? +=> ["Hello??", "Who is there?"] + +29.) Double punctuation (exclamation point / question mark) +Hello!? Is that you? +=> ["Hello!?", "Is that you?"] + +30.) Double punctuation (question mark / exclamation point) +Hello?! Is that you? +=> ["Hello?!", "Is that you?"] + +31.) List (period followed by parens and no period to end item) +1.) 
The first item 2.) The second item +=> ["1.) The first item", "2.) The second item"] + +32.) List (period followed by parens and period to end item) +1.) The first item. 2.) The second item. +=> ["1.) The first item.", "2.) The second item."] + +33.) List (parens and no period to end item) +1) The first item 2) The second item +=> ["1) The first item", "2) The second item"] + +34.) List (parens and period to end item) +1) The first item. 2) The second item. +=> ["1) The first item.", "2) The second item."] + +35.) List (period to mark list and no period to end item) +1. The first item 2. The second item +=> ["1. The first item", "2. The second item"] + +36.) List (period to mark list and period to end item) +1. The first item. 2. The second item. +=> ["1. The first item.", "2. The second item."] + +37.) List with bullet +• 9. The first item • 10. The second item +=> ["• 9. The first item", "• 10. The second item"] + +38.) List with hyphen +⁃9. The first item ⁃10. The second item +=> ["⁃9. The first item", "⁃10. The second item"] + +39.) Alphabetical list +a. The first item b. The second item c. The third list item +=> ["a. The first item", "b. The second item", "c. The third list item"] + +40.) Errant newline in the middle of a sentence (PDF) +This is a sentence\ncut off in the middle because pdf. +=> ["This is a sentence cut off in the middle because pdf."] + +41.) Errant newline in the middle of a sentence +It was a cold \nnight in the city. +=> ["It was a cold night in the city."] + +42.) Lower case list separated by newline +features\ncontact manager\nevents, activities\n +=> ["features", "contact manager", "events, activities"] + +43.) Geo Coordinates +You can find it at N°. 1026.253.553. That is where the treasure is. +=> ["You can find it at N°. 1026.253.553.", "That is where the treasure is."] + +44.) Named entities with an exclamation point +She works at Yahoo! in the accounting department. +=> ["She works at Yahoo! 
in the accounting department."] + +45.) I as a sentence boundary and I as an abbreviation +We make a good team, you and I. Did you see Albert I. Jones yesterday? +=> ["We make a good team, you and I.", "Did you see Albert I. Jones yesterday?"] + +46.) Ellipsis at end of quotation +Thoreau argues that by simplifying one’s life, “the laws of the universe will appear less complex. . . .” +=> ["Thoreau argues that by simplifying one’s life, “the laws of the universe will appear less complex. . . .”"] + +47.) Ellipsis with square brackets +"Bohr [...] used the analogy of parallel stairways [...]" (Smith 55). +=> ["\"Bohr [...] used the analogy of parallel stairways [...]\" (Smith 55)."] + +48.) Ellipsis as sentence boundary (standard ellipsis rules) +If words are left off at the end of a sentence, and that is all that is omitted, indicate the omission with ellipsis marks (preceded and followed by a space) and then indicate the end of the sentence with a period . . . . Next sentence. +=> ["If words are left off at the end of a sentence, and that is all that is omitted, indicate the omission with ellipsis marks (preceded and followed by a space) and then indicate the end of the sentence with a period . . . .", "Next sentence."] + +49.) Ellipsis as sentence boundary (non-standard ellipsis rules) +I never meant that.... She left the store. +=> ["I never meant that....", "She left the store."] + +50.) Ellipsis as non sentence boundary +I wasn’t really ... well, what I mean...see . . . what I'm saying, the thing is . . . I didn’t mean it. +=> ["I wasn’t really ... well, what I mean...see . . . what I'm saying, the thing is . . . I didn’t mean it."] + +51.) 4-dot ellipsis +One further habit which was somewhat weakened . . . was that of combining words into self-interpreting compounds. . . . The practice was not abandoned. . . . +=> ["One further habit which was somewhat weakened . . . was that of combining words into self-interpreting compounds.", ". . . 
The practice was not abandoned. . . ."] + +52.) No whitespace in between sentences Credit: Don_Patrick +Hello world.Today is Tuesday.Mr. Smith went to the store and bought 1,000.That is a lot. +=> ["Hello world.", "Today is Tuesday.", "Mr. Smith went to the store and bought 1,000.", "That is a lot."] \ No newline at end of file