From 2bf73ce780b378287c7e11ba354bffdd6f02ee36 Mon Sep 17 00:00:00 2001 From: Andrew Yoon Date: Sun, 10 Mar 2024 12:42:36 -0400 Subject: [PATCH] Support shifting punctuation to the right of quotes --- CHANGELOG.md | 4 + src/postprocessing.ts | 164 ++++++++++++----------- test/testPostprocessing.ts | 268 +++++++++++++++++++------------------ 3 files changed, 225 insertions(+), 211 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index d96b1a8..10cf015 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,6 +6,10 @@ ending before quotation marks. For instance, `"Test." test` will now correct to `"Test." Test` where previously it would not. * Fix visual line breaks when backslash is followed by whitespace +* Update punctuation cleanup postprocessing to shift punctuation to the + right of quotes (single, double, or underscores). For example `"test" .` + will now correct to `"test".`. Note that this does not shift punctuation + to the inside of quotes, as this is context and style dependent. ### 0.1.9 * Update experimental in-eval ref lookup. Now exposed by two diff --git a/src/postprocessing.ts b/src/postprocessing.ts index 9059063..046434a 100644 --- a/src/postprocessing.ts +++ b/src/postprocessing.ts @@ -12,77 +12,81 @@ const TRAILING_WHITESPACE_RE = /\s+$/; * 5. Ensuring the text ends with a single line break */ export function whitespaceCleanup(text: string): string { - let out = ''; - let atDocStart = true; - let lastLineWasBlank = false; - for (let line of text.split('\n')) { - let isBlank = BLANK_LINE_RE.test(line); - - if (atDocStart) { - if (isBlank) { - // Skip blank lines at start of document - continue; - } else { - atDocStart = false; - } - } + let out = ''; + let atDocStart = true; + let lastLineWasBlank = false; + for (let line of text.split('\n')) { + let isBlank = BLANK_LINE_RE.test(line); + + if (atDocStart) { + if (isBlank) { + // Skip blank lines at start of document + continue; + } else { + atDocStart = false; + } + } - if (isBlank) { - if (lastLineWasBlank) { - // Skip runs of blank lines - continue; - } - // Lines consisting of only whitespace should - // become simply blank lines - line = ''; - } else { - // intra-line cleanups - line = line.replace(TRAILING_WHITESPACE_RE, ''); - let rewrittenLine = ''; - let atLineStart = true; - let lastCharWasSpace = false; - for (let char of line) { - let charIsSpace = char === ' '; - if (!atLineStart && lastCharWasSpace && charIsSpace) { - continue; + if (isBlank) { + if (lastLineWasBlank) { + // Skip runs of blank lines + continue; + } + // Lines consisting of only whitespace should + // become simply blank lines + line = ''; } else { - if (!charIsSpace) { - atLineStart = false; - } - rewrittenLine += char; - lastCharWasSpace = charIsSpace; + // intra-line cleanups + line = line.replace(TRAILING_WHITESPACE_RE, ''); + let rewrittenLine = ''; + let atLineStart = true; + let lastCharWasSpace = false; + for (let char of line) { + let charIsSpace = char === ' '; + if (!atLineStart && lastCharWasSpace && charIsSpace) { + continue; + } else { + if (!charIsSpace) { + atLineStart = false; + } + rewrittenLine += char; + lastCharWasSpace = charIsSpace; + } + } + line = rewrittenLine; } - } - line = rewrittenLine; - } - lastLineWasBlank = isBlank; + lastLineWasBlank = isBlank; - out += line + '\n'; - } + out += line + '\n'; + } - // Edge case: if input ended with a line break already, above code - // will result in \n\n ending the output. Correct this so output - // always terminates with a single \n - if (out.endsWith('\n\n')) { - out = out.substring(0, out.length - 1); - } + // Edge case: if input ended with a line break already, above code + // will result in \n\n ending the output. Correct this so output + // always terminates with a single \n + if (out.endsWith('\n\n')) { + out = out.substring(0, out.length - 1); + } - return out; + return out; } // Note the 3 dashes here are the different kinds, not the same character -const MISPLACED_WORD_ENDING_PUNC_RE = /([a-zA-Z0-9\xA0-\uFFFF])(\s+)([.,:;!?\-\–\—]+)/g; +const MISPLACED_WORD_ENDING_PUNC_RE = /([a-zA-Z0-9\xA0-\uFFFF"'_])(\s+)([.,:;!?\-\–\—]+)/g; /** * Performs simple English-like correction of whitespace around * punctuation marks. * - * - snap [, . : ; ! ?] to the end of preceding words when separated - * by whitespace (including line breaks.) + * Snap [, . : ; ! ?] to the end of preceding words, quotes, or underscores, + * when separated by whitespace (including line breaks.) + * + * Note that this will shift punctuation to the *right* of quotes ([" ' _]), + * but will not shift punctuation to the inside of quotes, as this is often + * dependent on style and context. */ export function punctuationCleanup(text: string): string { - return text.replace(MISPLACED_WORD_ENDING_PUNC_RE, '$1$3$2'); + return text.replace(MISPLACED_WORD_ENDING_PUNC_RE, '$1$3$2'); } @@ -97,18 +101,18 @@ const INCORRECT_CAPS_RE = /([.!?]["'_]?\s+|^\s*)(\p{Ll})/gu; * following a sentence-ending punctuation mark. */ export function capitalizationCleanup(text: string): string { - // Conforms to `text.replace` replacer function interface - function correctCaps(_match: string, p1: string, p2: string) { - return p1 + p2.toUpperCase(); - } + // Conforms to `text.replace` replacer function interface + function correctCaps(_match: string, p1: string, p2: string) { + return p1 + p2.toUpperCase(); + } - return text.replace(INCORRECT_CAPS_RE, correctCaps); + return text.replace(INCORRECT_CAPS_RE, correctCaps); } const VISUAL_LINE_BREAK_RE = /\\ *(\r?\n|\r)[ \t]*/g export function replaceVisualLineBreaks(text: string): string { - return text.replace(VISUAL_LINE_BREAK_RE, ' '); + return text.replace(VISUAL_LINE_BREAK_RE, ' '); } const INDEFINITE_ARTICLE_RE = /\b(a|an) ([\p{L}0-9]+)\b/igu @@ -117,27 +121,27 @@ const INDEFINITE_ARTICLE_RE = /\b(a|an) ([\p{L}0-9]+)\b/igu * Attempt to correct English indefinite articles (a / an) */ export function correctIndefiniteArticles(text: string) { - function upcaseFirstLetter(s: string): string { - if (s.length === 0) { - return s; - } else if (s.length === 1) { - return s.toUpperCase(); - } else { - return s[0].toUpperCase() + s.slice(1); + function upcaseFirstLetter(s: string): string { + if (s.length === 0) { + return s; + } else if (s.length === 1) { + return s.toUpperCase(); + } else { + return s[0].toUpperCase() + s.slice(1); + } } - } - // Conforms to `text.replace` replacer function interface - function correctArticle(_match: string, originalArticle: string, word: string) { - let article = indefinite(word, { articleOnly: true }); - if (originalArticle === 'a' || originalArticle === 'an') { - return article + ' ' + word; - } else if (originalArticle === 'A' || originalArticle === 'An') { - return upcaseFirstLetter(article) + ' ' + word; - } else { - // All caps - return article.toUpperCase() + ' ' + word; + // Conforms to `text.replace` replacer function interface + function correctArticle(_match: string, originalArticle: string, word: string) { + let article = indefinite(word, { articleOnly: true }); + if (originalArticle === 'a' || originalArticle === 'an') { + return article + ' ' + word; + } else if (originalArticle === 'A' || originalArticle === 'An') { + return upcaseFirstLetter(article) + ' ' + word; + } else { + // All caps + return article.toUpperCase() + ' ' + word; + } } - } - return text.replace(INDEFINITE_ARTICLE_RE, correctArticle); + return text.replace(INDEFINITE_ARTICLE_RE, correctArticle); } diff --git a/test/testPostprocessing.ts b/test/testPostprocessing.ts index 076a032..7116256 100644 --- a/test/testPostprocessing.ts +++ b/test/testPostprocessing.ts @@ -3,155 +3,161 @@ import * as postprocessing from '../src/postprocessing'; describe('replaceVisualLineBreaks', function() { - it('works in a basic case', function() { - expect(postprocessing.replaceVisualLineBreaks('foo\\\nbar')).toBe('foo bar'); - }); + it('works in a basic case', function() { + expect(postprocessing.replaceVisualLineBreaks('foo\\\nbar')).toBe('foo bar'); + }); - it('works backslash is followed by whitespace before line break', function() { - expect(postprocessing.replaceVisualLineBreaks('foo\\ \nbar')).toBe('foo bar'); - }); + it('works backslash is followed by whitespace before line break', function() { + expect(postprocessing.replaceVisualLineBreaks('foo\\ \nbar')).toBe('foo bar'); + }); }); describe('whitespaceCleanup', function() { - it('removes blank lines at start and end of string', function() { - expect(postprocessing.whitespaceCleanup('\n\n\n\nfoo\n \n')).toBe('foo\n'); - }); + it('removes blank lines at start and end of string', function() { + expect(postprocessing.whitespaceCleanup('\n\n\n\nfoo\n \n')).toBe('foo\n'); + }); - it('collapses runs of more than 1 blank line into 1', function() { - let input = 'foo\n\nbar\n\n\n\n\nbiz'; - let expectedOutput = 'foo\n\nbar\n\nbiz\n'; - expect(postprocessing.whitespaceCleanup(input)).toBe(expectedOutput); + it('collapses runs of more than 1 blank line into 1', function() { + let input = 'foo\n\nbar\n\n\n\n\nbiz'; + let expectedOutput = 'foo\n\nbar\n\nbiz\n'; + expect(postprocessing.whitespaceCleanup(input)).toBe(expectedOutput); - }); + }); - it('removes trailing whitespace on every line', function() { - expect(postprocessing.whitespaceCleanup('foo\n bar \n \n')).toBe('foo\n bar\n'); - }); + it('removes trailing whitespace on every line', function() { + expect(postprocessing.whitespaceCleanup('foo\n bar \n \n')).toBe('foo\n bar\n'); + }); - it('automatically inserts an EOF line break', function() { - expect(postprocessing.whitespaceCleanup('foo')).toBe('foo\n'); - }); + it('automatically inserts an EOF line break', function() { + expect(postprocessing.whitespaceCleanup('foo')).toBe('foo\n'); + }); - it('doesnt insert a redundant EOF line break when one already exists', function() { - expect(postprocessing.whitespaceCleanup('foo\n')).toBe('foo\n'); - }); + it('doesnt insert a redundant EOF line break when one already exists', function() { + expect(postprocessing.whitespaceCleanup('foo\n')).toBe('foo\n'); + }); - it('preserves leading whitespace on every line', function() { - expect(postprocessing.whitespaceCleanup(' foo')).toBe(' foo\n'); - }); + it('preserves leading whitespace on every line', function() { + expect(postprocessing.whitespaceCleanup(' foo')).toBe(' foo\n'); + }); - it('collapses runs of more than 1 whitespace in the middle of a line', function() { - expect(postprocessing.whitespaceCleanup(' foo bar')).toBe(' foo bar\n'); - }); + it('collapses runs of more than 1 whitespace in the middle of a line', function() { + expect(postprocessing.whitespaceCleanup(' foo bar')).toBe(' foo bar\n'); + }); }); describe('punctuationCleanup', function() { - it('snaps punctuation left', function() { - expect(postprocessing.punctuationCleanup('test . ')).toBe('test. '); - expect(postprocessing.punctuationCleanup('test , ')).toBe('test, '); - expect(postprocessing.punctuationCleanup('test : ')).toBe('test: '); - expect(postprocessing.punctuationCleanup('test ; ')).toBe('test; '); - expect(postprocessing.punctuationCleanup('test ! ')).toBe('test! '); - expect(postprocessing.punctuationCleanup('test ? ')).toBe('test? '); - // Hyphen and multiple hyphens - expect(postprocessing.punctuationCleanup('test - ')).toBe('test- '); - expect(postprocessing.punctuationCleanup('test --- ')).toBe('test--- '); - // En dash - expect(postprocessing.punctuationCleanup('test – ')).toBe('test– '); - // Em dash - expect(postprocessing.punctuationCleanup('test — ')).toBe('test— '); - }); - - it('snaps punctuation left with Chinese characters', function() { - expect(postprocessing.punctuationCleanup('道 . ')).toBe('道. '); - }); - - it('snaps groups of punctuation left together', function() { - expect(postprocessing.punctuationCleanup('test ?! ')).toBe('test?! '); - }); - - it('preserves whatever whitespace comes before', function() { - expect(postprocessing.punctuationCleanup('test \t?! ')).toBe('test?! \t '); - }); - - it('corrects across newlines too', function() { - expect(postprocessing.punctuationCleanup('test \n\n. ')).toBe('test. \n\n '); - }); - - it('does nothing on correctly written text', function() { - let src = 'test, test: test; test! test? '; - expect(postprocessing.punctuationCleanup(src)).toBe(src); - }); + it('snaps punctuation left', function() { + expect(postprocessing.punctuationCleanup('test . ')).toBe('test. '); + expect(postprocessing.punctuationCleanup('test , ')).toBe('test, '); + expect(postprocessing.punctuationCleanup('test : ')).toBe('test: '); + expect(postprocessing.punctuationCleanup('test ; ')).toBe('test; '); + expect(postprocessing.punctuationCleanup('test ! ')).toBe('test! '); + expect(postprocessing.punctuationCleanup('test ? ')).toBe('test? '); + // Hyphen and multiple hyphens + expect(postprocessing.punctuationCleanup('test - ')).toBe('test- '); + expect(postprocessing.punctuationCleanup('test --- ')).toBe('test--- '); + // En dash + expect(postprocessing.punctuationCleanup('test – ')).toBe('test– '); + // Em dash + expect(postprocessing.punctuationCleanup('test — ')).toBe('test— '); + }); + + it('snaps punctuation left with Chinese characters', function() { + expect(postprocessing.punctuationCleanup('道 . ')).toBe('道. '); + }); + + it('snaps groups of punctuation left together', function() { + expect(postprocessing.punctuationCleanup('test ?! ')).toBe('test?! '); + }); + + it('preserves whatever whitespace comes before', function() { + expect(postprocessing.punctuationCleanup('test \t?! ')).toBe('test?! \t '); + }); + + it('corrects across newlines too', function() { + expect(postprocessing.punctuationCleanup('test \n\n. ')).toBe('test. \n\n '); + }); + + it('corrects after quotes', function() { + expect(postprocessing.punctuationCleanup('"test" .')).toBe('"test". '); + expect(postprocessing.punctuationCleanup('\'test\' .')).toBe('\'test\'. '); + expect(postprocessing.punctuationCleanup('_test_ .')).toBe('_test_. '); + }); + + it('does nothing on correctly written text', function() { + let src = 'test, test: test; test! test? '; + expect(postprocessing.punctuationCleanup(src)).toBe(src); + }); }); describe('capitalizationCleanup', function() { - it('Does nothing on well-capitalized text', function() { - let src = 'Test. Test 2! 123 test? Test'; - expect(postprocessing.capitalizationCleanup(src)).toBe(src); - }); - - it('Capitalizes plain ASCII characters', function() { - let src = 'test. test.'; - expect(postprocessing.capitalizationCleanup(src)).toBe('Test. Test.'); - }); - - it('Capitalizes extended latin characters', function() { - let src = 'test! ä'; - expect(postprocessing.capitalizationCleanup(src)).toBe('Test! Ä'); - }); - - it('Works across line breaks', function() { - let src = 'test. \ntest.'; - expect(postprocessing.capitalizationCleanup(src)).toBe('Test. \nTest.'); - }); - - it('Works across quotation marks', function() { - let src = '"Test." test.'; - expect(postprocessing.capitalizationCleanup(src)).toBe('"Test." Test.'); - }); + it('Does nothing on well-capitalized text', function() { + let src = 'Test. Test 2! 123 test? Test'; + expect(postprocessing.capitalizationCleanup(src)).toBe(src); + }); + + it('Capitalizes plain ASCII characters', function() { + let src = 'test. test.'; + expect(postprocessing.capitalizationCleanup(src)).toBe('Test. Test.'); + }); + + it('Capitalizes extended latin characters', function() { + let src = 'test! ä'; + expect(postprocessing.capitalizationCleanup(src)).toBe('Test! Ä'); + }); + + it('Works across line breaks', function() { + let src = 'test. \ntest.'; + expect(postprocessing.capitalizationCleanup(src)).toBe('Test. \nTest.'); + }); + + it('Works across quotation marks', function() { + let src = '"Test." test.'; + expect(postprocessing.capitalizationCleanup(src)).toBe('"Test." Test.'); + }); }); describe('correctIndefiniteArticles', function() { - function testCase(input: string, output: string) { - expect(postprocessing.correctIndefiniteArticles(input)).toBe(output); - } - - it('Leaves correct cases intact', function() { - testCase('a dog', 'a dog'); - testCase('an apple', 'an apple'); - testCase('a union', 'a union'); - testCase('a 10', 'a 10'); - testCase('an 8', 'an 8'); - testCase('a UFO', 'a UFO'); - }); - - it('Corrects incorrect cases', function() { - testCase('an dog', 'a dog'); - testCase('a apple', 'an apple'); - testCase('an union', 'a union'); - testCase('an 10', 'a 10'); - testCase('a 8', 'an 8'); - testCase('an UFO', 'a UFO'); - }); - - it('Corrects multiple cases in a string', function() { - testCase('an dog\nand a apple', 'a dog\nand an apple'); - }); - - it('Preserves capitalization schemes', function() { - testCase('An dog', 'A dog'); - testCase('AN dog', 'A dog'); - testCase('A apple', 'An apple'); - testCase('a apple', 'an apple'); - testCase('AN apple', 'AN apple'); - }); - - it('Works on words with diacritics', function() { - testCase('an jalapeño', 'a jalapeño'); - }) - - it('Doesnt act on words spelled with article-like endings', function() { - // Regression test - testCase('can dog', 'can dog'); - }); + function testCase(input: string, output: string) { + expect(postprocessing.correctIndefiniteArticles(input)).toBe(output); + } + + it('Leaves correct cases intact', function() { + testCase('a dog', 'a dog'); + testCase('an apple', 'an apple'); + testCase('a union', 'a union'); + testCase('a 10', 'a 10'); + testCase('an 8', 'an 8'); + testCase('a UFO', 'a UFO'); + }); + + it('Corrects incorrect cases', function() { + testCase('an dog', 'a dog'); + testCase('a apple', 'an apple'); + testCase('an union', 'a union'); + testCase('an 10', 'a 10'); + testCase('a 8', 'an 8'); + testCase('an UFO', 'a UFO'); + }); + + it('Corrects multiple cases in a string', function() { + testCase('an dog\nand a apple', 'a dog\nand an apple'); + }); + + it('Preserves capitalization schemes', function() { + testCase('An dog', 'A dog'); + testCase('AN dog', 'A dog'); + testCase('A apple', 'An apple'); + testCase('a apple', 'an apple'); + testCase('AN apple', 'AN apple'); + }); + + it('Works on words with diacritics', function() { + testCase('an jalapeño', 'a jalapeño'); + }) + + it('Doesnt act on words spelled with article-like endings', function() { + // Regression test + testCase('can dog', 'can dog'); + }); });