From 2bf73ce780b378287c7e11ba354bffdd6f02ee36 Mon Sep 17 00:00:00 2001
From: Andrew Yoon <andrew@nothing-to-say.org>
Date: Sun, 10 Mar 2024 12:42:36 -0400
Subject: [PATCH] Support shifting punctuation to the right of quotes

---
 CHANGELOG.md               |   4 +
 src/postprocessing.ts      | 164 ++++++++++++-----------
 test/testPostprocessing.ts | 268 +++++++++++++++++++------------------
 3 files changed, 225 insertions(+), 211 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index d96b1a8..10cf015 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -6,6 +6,10 @@
   ending before quotation marks. For instance, `"Test." test` will now
   correct to `"Test." Test` where previously it would not.
 * Fix visual line breaks when backslash is followed by whitespace
+* Update punctuation cleanup postprocessing to shift punctuation to the
+  right of quotes (single, double, or underscores). For example, `"test" .`
+  will now correct to `"test".`. Note that this does not shift punctuation
+  to the inside of quotes, as this is context- and style-dependent.
 
 ### 0.1.9
 * Update experimental in-eval ref lookup. Now exposed by two
diff --git a/src/postprocessing.ts b/src/postprocessing.ts
index 9059063..046434a 100644
--- a/src/postprocessing.ts
+++ b/src/postprocessing.ts
@@ -12,77 +12,81 @@ const TRAILING_WHITESPACE_RE = /\s+$/;
  * 5. Ensuring the text ends with a single line break
  */
 export function whitespaceCleanup(text: string): string {
-  let out = '';
-  let atDocStart = true;
-  let lastLineWasBlank = false;
-  for (let line of text.split('\n')) {
-    let isBlank = BLANK_LINE_RE.test(line);
-
-    if (atDocStart) {
-      if (isBlank) {
-        // Skip blank lines at start of document
-        continue;
-      } else {
-        atDocStart = false;
-      }
-    }
+    let out = '';
+    let atDocStart = true;
+    let lastLineWasBlank = false;
+    for (let line of text.split('\n')) {
+        let isBlank = BLANK_LINE_RE.test(line);
+
+        if (atDocStart) {
+            if (isBlank) {
+                // Skip blank lines at start of document
+                continue;
+            } else {
+                atDocStart = false;
+            }
+        }
 
-    if (isBlank) {
-      if (lastLineWasBlank) {
-        // Skip runs of blank lines
-        continue;
-      }
-      // Lines consisting of only whitespace should
-      // become simply blank lines
-      line = '';
-    } else {
-      // intra-line cleanups
-      line = line.replace(TRAILING_WHITESPACE_RE, '');
-      let rewrittenLine = '';
-      let atLineStart = true;
-      let lastCharWasSpace = false;
-      for (let char of line) {
-        let charIsSpace = char === ' ';
-        if (!atLineStart && lastCharWasSpace && charIsSpace) {
-          continue;
+        if (isBlank) {
+            if (lastLineWasBlank) {
+                // Skip runs of blank lines
+                continue;
+            }
+            // Lines consisting of only whitespace should
+            // become simply blank lines
+            line = '';
         } else {
-          if (!charIsSpace) {
-            atLineStart = false;
-          }
-          rewrittenLine += char;
-          lastCharWasSpace = charIsSpace;
+            // intra-line cleanups
+            line = line.replace(TRAILING_WHITESPACE_RE, '');
+            let rewrittenLine = '';
+            let atLineStart = true;
+            let lastCharWasSpace = false;
+            for (let char of line) {
+                let charIsSpace = char === ' ';
+                if (!atLineStart && lastCharWasSpace && charIsSpace) {
+                    continue;
+                } else {
+                    if (!charIsSpace) {
+                        atLineStart = false;
+                    }
+                    rewrittenLine += char;
+                    lastCharWasSpace = charIsSpace;
+                }
+            }
+            line = rewrittenLine;
         }
-      }
-      line = rewrittenLine;
-    }
 
-    lastLineWasBlank = isBlank;
+        lastLineWasBlank = isBlank;
 
-    out += line + '\n';
-  }
+        out += line + '\n';
+    }
 
-  // Edge case: if input ended with a line break already, above code
-  // will result in \n\n ending the output. Correct this so output
-  // always terminates with a single \n
-  if (out.endsWith('\n\n')) {
-    out = out.substring(0, out.length - 1);
-  }
+    // Edge case: if input ended with a line break already, above code
+    // will result in \n\n ending the output. Correct this so output
+    // always terminates with a single \n
+    if (out.endsWith('\n\n')) {
+        out = out.substring(0, out.length - 1);
+    }
 
-  return out;
+    return out;
 }
 
 // Note the 3 dashes here are the different kinds, not the same character
-const MISPLACED_WORD_ENDING_PUNC_RE = /([a-zA-Z0-9\xA0-\uFFFF])(\s+)([.,:;!?\-\–\—]+)/g;
+const MISPLACED_WORD_ENDING_PUNC_RE = /([a-zA-Z0-9\xA0-\uFFFF"'_])(\s+)([.,:;!?\-\–\—]+)/g;
 
 /**
  * Performs simple English-like correction of whitespace around
  * punctuation marks.
  *
- * - snap [, . : ; ! ?] to the end of preceding words when separated
- *   by whitespace (including line breaks.)
+ * Snap [, . : ; ! ? - – —] to the end of preceding words, quotes, or
+ * underscores, when separated by whitespace (including line breaks).
+ *
+ * Note that this will shift punctuation to the *right* of quotes ([" ' _]),
+ * but will not shift punctuation to the inside of quotes, as this is often
+ * dependent on style and context.
  */
 export function punctuationCleanup(text: string): string {
-  return text.replace(MISPLACED_WORD_ENDING_PUNC_RE, '$1$3$2');
+    return text.replace(MISPLACED_WORD_ENDING_PUNC_RE, '$1$3$2');
 }
 
 
@@ -97,18 +101,18 @@ const INCORRECT_CAPS_RE = /([.!?]["'_]?\s+|^\s*)(\p{Ll})/gu;
  * following a sentence-ending punctuation mark.
  */
 export function capitalizationCleanup(text: string): string {
-  // Conforms to `text.replace` replacer function interface
-  function correctCaps(_match: string, p1: string, p2: string) {
-    return p1 + p2.toUpperCase();
-  }
+    // Conforms to `text.replace` replacer function interface
+    function correctCaps(_match: string, p1: string, p2: string) {
+        return p1 + p2.toUpperCase();
+    }
 
-  return text.replace(INCORRECT_CAPS_RE, correctCaps);
+    return text.replace(INCORRECT_CAPS_RE, correctCaps);
 }
 
 const VISUAL_LINE_BREAK_RE = /\\ *(\r?\n|\r)[ \t]*/g
 
 export function replaceVisualLineBreaks(text: string): string {
-  return text.replace(VISUAL_LINE_BREAK_RE, ' ');
+    return text.replace(VISUAL_LINE_BREAK_RE, ' ');
 }
 
 const INDEFINITE_ARTICLE_RE = /\b(a|an) ([\p{L}0-9]+)\b/igu
@@ -117,27 +121,27 @@ const INDEFINITE_ARTICLE_RE = /\b(a|an) ([\p{L}0-9]+)\b/igu
  * Attempt to correct English indefinite articles (a / an)
  */
 export function correctIndefiniteArticles(text: string) {
-  function upcaseFirstLetter(s: string): string {
-    if (s.length === 0) {
-      return s;
-    } else if (s.length === 1) {
-      return s.toUpperCase();
-    } else {
-      return s[0].toUpperCase() + s.slice(1);
+    function upcaseFirstLetter(s: string): string {
+        if (s.length === 0) {
+            return s;
+        } else if (s.length === 1) {
+            return s.toUpperCase();
+        } else {
+            return s[0].toUpperCase() + s.slice(1);
+        }
     }
-  }
-  // Conforms to `text.replace` replacer function interface
-  function correctArticle(_match: string, originalArticle: string, word: string) {
-    let article = indefinite(word, { articleOnly: true });
-    if (originalArticle === 'a' || originalArticle === 'an') {
-      return article + ' ' + word;
-    } else if (originalArticle === 'A' || originalArticle === 'An') {
-      return upcaseFirstLetter(article) + ' ' + word;
-    } else {
-      // All caps
-      return article.toUpperCase() + ' ' + word;
+    // Conforms to `text.replace` replacer function interface
+    function correctArticle(_match: string, originalArticle: string, word: string) {
+        let article = indefinite(word, { articleOnly: true });
+        if (originalArticle === 'a' || originalArticle === 'an') {
+            return article + ' ' + word;
+        } else if (originalArticle === 'A' || originalArticle === 'An') {
+            return upcaseFirstLetter(article) + ' ' + word;
+        } else {
+            // All caps
+            return article.toUpperCase() + ' ' + word;
+        }
     }
-  }
 
-  return text.replace(INDEFINITE_ARTICLE_RE, correctArticle);
+    return text.replace(INDEFINITE_ARTICLE_RE, correctArticle);
 }
diff --git a/test/testPostprocessing.ts b/test/testPostprocessing.ts
index 076a032..7116256 100644
--- a/test/testPostprocessing.ts
+++ b/test/testPostprocessing.ts
@@ -3,155 +3,161 @@ import * as postprocessing from '../src/postprocessing';
 
 
 describe('replaceVisualLineBreaks', function() {
-  it('works in a basic case', function() {
-    expect(postprocessing.replaceVisualLineBreaks('foo\\\nbar')).toBe('foo bar');
-  });
+    it('works in a basic case', function() {
+        expect(postprocessing.replaceVisualLineBreaks('foo\\\nbar')).toBe('foo bar');
+    });
 
-  it('works backslash is followed by whitespace before line break', function() {
-    expect(postprocessing.replaceVisualLineBreaks('foo\\ \nbar')).toBe('foo bar');
-  });
+    it('works backslash is followed by whitespace before line break', function() {
+        expect(postprocessing.replaceVisualLineBreaks('foo\\ \nbar')).toBe('foo bar');
+    });
 });
 
 describe('whitespaceCleanup', function() {
-  it('removes blank lines at start and end of string', function() {
-    expect(postprocessing.whitespaceCleanup('\n\n\n\nfoo\n   \n')).toBe('foo\n');
-  });
+    it('removes blank lines at start and end of string', function() {
+        expect(postprocessing.whitespaceCleanup('\n\n\n\nfoo\n   \n')).toBe('foo\n');
+    });
 
-  it('collapses runs of more than 1 blank line into 1', function() {
-    let input = 'foo\n\nbar\n\n\n\n\nbiz';
-    let expectedOutput = 'foo\n\nbar\n\nbiz\n';
-    expect(postprocessing.whitespaceCleanup(input)).toBe(expectedOutput);
+    it('collapses runs of more than 1 blank line into 1', function() {
+        let input = 'foo\n\nbar\n\n\n\n\nbiz';
+        let expectedOutput = 'foo\n\nbar\n\nbiz\n';
+        expect(postprocessing.whitespaceCleanup(input)).toBe(expectedOutput);
 
-  });
+    });
 
-  it('removes trailing whitespace on every line', function() {
-    expect(postprocessing.whitespaceCleanup('foo\n bar  \n    \n')).toBe('foo\n bar\n');
-  });
+    it('removes trailing whitespace on every line', function() {
+        expect(postprocessing.whitespaceCleanup('foo\n bar  \n    \n')).toBe('foo\n bar\n');
+    });
 
-  it('automatically inserts an EOF line break', function() {
-    expect(postprocessing.whitespaceCleanup('foo')).toBe('foo\n');
-  });
+    it('automatically inserts an EOF line break', function() {
+        expect(postprocessing.whitespaceCleanup('foo')).toBe('foo\n');
+    });
 
-  it('doesnt insert a redundant EOF line break when one already exists', function() {
-    expect(postprocessing.whitespaceCleanup('foo\n')).toBe('foo\n');
-  });
+    it('doesnt insert a redundant EOF line break when one already exists', function() {
+        expect(postprocessing.whitespaceCleanup('foo\n')).toBe('foo\n');
+    });
 
-  it('preserves leading whitespace on every line', function() {
-    expect(postprocessing.whitespaceCleanup(' foo')).toBe(' foo\n');
-  });
+    it('preserves leading whitespace on every line', function() {
+        expect(postprocessing.whitespaceCleanup(' foo')).toBe(' foo\n');
+    });
 
-  it('collapses runs of more than 1 whitespace in the middle of a line', function() {
-    expect(postprocessing.whitespaceCleanup('  foo     bar')).toBe('  foo bar\n');
-  });
+    it('collapses runs of more than 1 whitespace in the middle of a line', function() {
+        expect(postprocessing.whitespaceCleanup('  foo     bar')).toBe('  foo bar\n');
+    });
 });
 
 describe('punctuationCleanup', function() {
-  it('snaps punctuation left', function() {
-    expect(postprocessing.punctuationCleanup('test . ')).toBe('test.  ');
-    expect(postprocessing.punctuationCleanup('test , ')).toBe('test,  ');
-    expect(postprocessing.punctuationCleanup('test : ')).toBe('test:  ');
-    expect(postprocessing.punctuationCleanup('test ; ')).toBe('test;  ');
-    expect(postprocessing.punctuationCleanup('test ! ')).toBe('test!  ');
-    expect(postprocessing.punctuationCleanup('test ? ')).toBe('test?  ');
-    // Hyphen and multiple hyphens
-    expect(postprocessing.punctuationCleanup('test - ')).toBe('test-  ');
-    expect(postprocessing.punctuationCleanup('test --- ')).toBe('test---  ');
-    // En dash
-    expect(postprocessing.punctuationCleanup('test – ')).toBe('test–  ');
-    // Em dash
-    expect(postprocessing.punctuationCleanup('test — ')).toBe('test—  ');
-  });
-
-  it('snaps punctuation left with Chinese characters', function() {
-    expect(postprocessing.punctuationCleanup('道 . ')).toBe('道.  ');
-  });
-
-  it('snaps groups of punctuation left together', function() {
-    expect(postprocessing.punctuationCleanup('test ?! ')).toBe('test?!  ');
-  });
-
-  it('preserves whatever whitespace comes before', function() {
-    expect(postprocessing.punctuationCleanup('test  \t?! ')).toBe('test?!  \t ');
-  });
-
-  it('corrects across newlines too', function() {
-    expect(postprocessing.punctuationCleanup('test  \n\n. ')).toBe('test.  \n\n ');
-  });
-
-  it('does nothing on correctly written text', function() {
-    let src = 'test, test:  test; test! test? ';
-    expect(postprocessing.punctuationCleanup(src)).toBe(src);
-  });
+    it('snaps punctuation left', function() {
+        expect(postprocessing.punctuationCleanup('test . ')).toBe('test.  ');
+        expect(postprocessing.punctuationCleanup('test , ')).toBe('test,  ');
+        expect(postprocessing.punctuationCleanup('test : ')).toBe('test:  ');
+        expect(postprocessing.punctuationCleanup('test ; ')).toBe('test;  ');
+        expect(postprocessing.punctuationCleanup('test ! ')).toBe('test!  ');
+        expect(postprocessing.punctuationCleanup('test ? ')).toBe('test?  ');
+        // Hyphen and multiple hyphens
+        expect(postprocessing.punctuationCleanup('test - ')).toBe('test-  ');
+        expect(postprocessing.punctuationCleanup('test --- ')).toBe('test---  ');
+        // En dash
+        expect(postprocessing.punctuationCleanup('test – ')).toBe('test–  ');
+        // Em dash
+        expect(postprocessing.punctuationCleanup('test — ')).toBe('test—  ');
+    });
+
+    it('snaps punctuation left with Chinese characters', function() {
+        expect(postprocessing.punctuationCleanup('道 . ')).toBe('道.  ');
+    });
+
+    it('snaps groups of punctuation left together', function() {
+        expect(postprocessing.punctuationCleanup('test ?! ')).toBe('test?!  ');
+    });
+
+    it('preserves whatever whitespace comes before', function() {
+        expect(postprocessing.punctuationCleanup('test  \t?! ')).toBe('test?!  \t ');
+    });
+
+    it('corrects across newlines too', function() {
+        expect(postprocessing.punctuationCleanup('test  \n\n. ')).toBe('test.  \n\n ');
+    });
+
+    it('corrects after quotes', function() {
+        expect(postprocessing.punctuationCleanup('"test"  .')).toBe('"test".  ');
+        expect(postprocessing.punctuationCleanup('\'test\'  .')).toBe('\'test\'.  ');
+        expect(postprocessing.punctuationCleanup('_test_  .')).toBe('_test_.  ');
+    });
+
+    it('does nothing on correctly written text', function() {
+        let src = 'test, test:  test; test! test? ';
+        expect(postprocessing.punctuationCleanup(src)).toBe(src);
+    });
 });
 
 describe('capitalizationCleanup', function() {
-  it('Does nothing on well-capitalized text', function() {
-    let src = 'Test. Test 2! 123 test? Test';
-    expect(postprocessing.capitalizationCleanup(src)).toBe(src);
-  });
-
-  it('Capitalizes plain ASCII characters', function() {
-    let src = 'test. test.';
-    expect(postprocessing.capitalizationCleanup(src)).toBe('Test. Test.');
-  });
-
-  it('Capitalizes extended latin characters', function() {
-    let src = 'test! ä';
-    expect(postprocessing.capitalizationCleanup(src)).toBe('Test! Ä');
-  });
-
-  it('Works across line breaks', function() {
-    let src = 'test. \ntest.';
-    expect(postprocessing.capitalizationCleanup(src)).toBe('Test. \nTest.');
-  });
-
-  it('Works across quotation marks', function() {
-    let src = '"Test." test.';
-    expect(postprocessing.capitalizationCleanup(src)).toBe('"Test." Test.');
-  });
+    it('Does nothing on well-capitalized text', function() {
+        let src = 'Test. Test 2! 123 test? Test';
+        expect(postprocessing.capitalizationCleanup(src)).toBe(src);
+    });
+
+    it('Capitalizes plain ASCII characters', function() {
+        let src = 'test. test.';
+        expect(postprocessing.capitalizationCleanup(src)).toBe('Test. Test.');
+    });
+
+    it('Capitalizes extended latin characters', function() {
+        let src = 'test! ä';
+        expect(postprocessing.capitalizationCleanup(src)).toBe('Test! Ä');
+    });
+
+    it('Works across line breaks', function() {
+        let src = 'test. \ntest.';
+        expect(postprocessing.capitalizationCleanup(src)).toBe('Test. \nTest.');
+    });
+
+    it('Works across quotation marks', function() {
+        let src = '"Test." test.';
+        expect(postprocessing.capitalizationCleanup(src)).toBe('"Test." Test.');
+    });
 });
 
 describe('correctIndefiniteArticles', function() {
-  function testCase(input: string, output: string) {
-    expect(postprocessing.correctIndefiniteArticles(input)).toBe(output);
-  }
-
-  it('Leaves correct cases intact', function() {
-    testCase('a dog', 'a dog');
-    testCase('an apple', 'an apple');
-    testCase('a union', 'a union');
-    testCase('a 10', 'a 10');
-    testCase('an 8', 'an 8');
-    testCase('a UFO', 'a UFO');
-  });
-
-  it('Corrects incorrect cases', function() {
-    testCase('an dog', 'a dog');
-    testCase('a apple', 'an apple');
-    testCase('an union', 'a union');
-    testCase('an 10', 'a 10');
-    testCase('a 8', 'an 8');
-    testCase('an UFO', 'a UFO');
-  });
-
-  it('Corrects multiple cases in a string', function() {
-    testCase('an dog\nand a apple', 'a dog\nand an apple');
-  });
-
-  it('Preserves capitalization schemes', function() {
-    testCase('An dog', 'A dog');
-    testCase('AN dog', 'A dog');
-    testCase('A apple', 'An apple');
-    testCase('a apple', 'an apple');
-    testCase('AN apple', 'AN apple');
-  });
-
-  it('Works on words with diacritics', function() {
-    testCase('an jalapeño', 'a jalapeño');
-  })
-
-  it('Doesnt act on words spelled with article-like endings', function() {
-    // Regression test
-    testCase('can dog', 'can dog');
-  });
+    function testCase(input: string, output: string) {
+        expect(postprocessing.correctIndefiniteArticles(input)).toBe(output);
+    }
+
+    it('Leaves correct cases intact', function() {
+        testCase('a dog', 'a dog');
+        testCase('an apple', 'an apple');
+        testCase('a union', 'a union');
+        testCase('a 10', 'a 10');
+        testCase('an 8', 'an 8');
+        testCase('a UFO', 'a UFO');
+    });
+
+    it('Corrects incorrect cases', function() {
+        testCase('an dog', 'a dog');
+        testCase('a apple', 'an apple');
+        testCase('an union', 'a union');
+        testCase('an 10', 'a 10');
+        testCase('a 8', 'an 8');
+        testCase('an UFO', 'a UFO');
+    });
+
+    it('Corrects multiple cases in a string', function() {
+        testCase('an dog\nand a apple', 'a dog\nand an apple');
+    });
+
+    it('Preserves capitalization schemes', function() {
+        testCase('An dog', 'A dog');
+        testCase('AN dog', 'A dog');
+        testCase('A apple', 'An apple');
+        testCase('a apple', 'an apple');
+        testCase('AN apple', 'AN apple');
+    });
+
+    it('Works on words with diacritics', function() {
+        testCase('an jalapeño', 'a jalapeño');
+    })
+
+    it('Doesnt act on words spelled with article-like endings', function() {
+        // Regression test
+        testCase('can dog', 'can dog');
+    });
 });