Skip to content

Commit

Permalink
Support shifting punctuation to the right of quotes
Browse files Browse the repository at this point in the history
  • Loading branch information
ajyoon committed Mar 10, 2024
1 parent 7750e26 commit 2bf73ce
Show file tree
Hide file tree
Showing 3 changed files with 225 additions and 211 deletions.
4 changes: 4 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,10 @@
ending before quotation marks. For instance, `"Test." test` will now
correct to `"Test." Test` where previously it would not.
* Fix visual line breaks when backslash is followed by whitespace
* Update punctuation cleanup postprocessing to shift punctuation to the
right of quote-like characters (single quotes, double quotes, or
underscores). For example, `"test" .` will now correct to `"test".`. Note
that this does not shift punctuation to the inside of quotes, as this is
context- and style-dependent.

### 0.1.9
* Update experimental in-eval ref lookup. Now exposed by two
Expand Down
164 changes: 84 additions & 80 deletions src/postprocessing.ts
Original file line number Diff line number Diff line change
Expand Up @@ -12,77 +12,81 @@ const TRAILING_WHITESPACE_RE = /\s+$/;
* 5. Ensuring the text ends with a single line break
*/
export function whitespaceCleanup(text: string): string {
  let out = '';
  // True until the first non-blank line has been seen.
  let atDocStart = true;
  // Tracks whether the previously emitted line was blank, so that runs of
  // blank lines collapse into a single one.
  let lastLineWasBlank = false;

  for (let line of text.split('\n')) {
    const isBlank = BLANK_LINE_RE.test(line);

    if (atDocStart) {
      if (isBlank) {
        // Skip blank lines at start of document
        continue;
      }
      atDocStart = false;
    }

    if (isBlank) {
      if (lastLineWasBlank) {
        // Skip runs of blank lines
        continue;
      }
      // Lines consisting of only whitespace should
      // become simply blank lines
      line = '';
    } else {
      // Intra-line cleanups: strip trailing whitespace, then collapse runs
      // of spaces that occur after the first non-space character. Leading
      // spaces (indentation) are kept untouched because `atLineStart` only
      // flips once a non-space character has been seen.
      line = line.replace(TRAILING_WHITESPACE_RE, '');
      let rewrittenLine = '';
      let atLineStart = true;
      let lastCharWasSpace = false;
      for (const char of line) {
        const charIsSpace = char === ' ';
        if (!atLineStart && lastCharWasSpace && charIsSpace) {
          // Second or later space in an interior run; drop it.
          continue;
        }
        if (!charIsSpace) {
          atLineStart = false;
        }
        rewrittenLine += char;
        lastCharWasSpace = charIsSpace;
      }
      line = rewrittenLine;
    }

    lastLineWasBlank = isBlank;
    out += line + '\n';
  }

  // Edge case: if input ended with a line break already, above code
  // will result in \n\n ending the output. Correct this so output
  // always terminates with a single \n
  if (out.endsWith('\n\n')) {
    out = out.substring(0, out.length - 1);
  }

  return out;
}

// Note the 3 dashes here are the different kinds (hyphen-minus, en dash,
// em dash), not the same character.
//
// Capture groups:
//   $1 - the character the punctuation should attach to: a letter, digit,
//        any character in the \xA0-\uFFFF range, or a quote-like character
//        (" ' _)
//   $2 - the whitespace (including line breaks) separating them
//   $3 - the run of misplaced punctuation
const MISPLACED_WORD_ENDING_PUNC_RE = /([a-zA-Z0-9\xA0-\uFFFF"'_])(\s+)([.,:;!?\-\–\—]+)/g;

/**
 * Performs simple English-like correction of whitespace around
 * punctuation marks.
 *
 * Snap [, . : ; ! ?] and dashes to the end of preceding words, quotes, or
 * underscores, when separated by whitespace (including line breaks.) The
 * whitespace is not removed; it is moved to directly after the punctuation.
 *
 * Note that this will shift punctuation to the *right* of quotes ([" ' _]),
 * but will not shift punctuation to the inside of quotes, as this is often
 * dependent on style and context.
 *
 * @param text - the text to correct
 * @returns the text with misplaced punctuation moved before the whitespace
 *   that preceded it
 */
export function punctuationCleanup(text: string): string {
  // Swap groups 2 and 3: punctuation first, then the original whitespace.
  return text.replace(MISPLACED_WORD_ENDING_PUNC_RE, '$1$3$2');
}


Expand All @@ -97,18 +101,18 @@ const INCORRECT_CAPS_RE = /([.!?]["'_]?\s+|^\s*)(\p{Ll})/gu;
* following a sentence-ending punctuation mark.
*/
export function capitalizationCleanup(text: string): string {
  // Conforms to the `text.replace` replacer function interface. `prefix` is
  // the first capture group of INCORRECT_CAPS_RE (whatever precedes the
  // letter) and `letter` is the second (the letter to be capitalized).
  function correctCaps(_match: string, prefix: string, letter: string): string {
    return prefix + letter.toUpperCase();
  }

  return text.replace(INCORRECT_CAPS_RE, correctCaps);
}

// A backslash, optional trailing spaces/tabs, a line break (\n, \r\n, or \r),
// and any spaces/tabs indenting the following line.
const VISUAL_LINE_BREAK_RE = /\\[ \t]*(?:\r?\n|\r)[ \t]*/g;

/**
 * Replaces "visual" line breaks - a backslash at the end of a line - with a
 * single space, joining the two lines.
 *
 * Spaces and tabs between the backslash and the line break, and spaces and
 * tabs at the start of the following line, are consumed along with the
 * line break.
 *
 * @param text - the text to process
 * @returns the text with every visual line break replaced by one space
 */
export function replaceVisualLineBreaks(text: string): string {
  return text.replace(VISUAL_LINE_BREAK_RE, ' ');
}

const INDEFINITE_ARTICLE_RE = /\b(a|an) ([\p{L}0-9]+)\b/igu
Expand All @@ -117,27 +121,27 @@ const INDEFINITE_ARTICLE_RE = /\b(a|an) ([\p{L}0-9]+)\b/igu
* Attempt to correct English indefinite articles (a / an)
*/
export function correctIndefiniteArticles(text: string): string {
  // Upper-cases the first character of `s`; an empty string is returned as is.
  function upcaseFirstLetter(s: string): string {
    return s.charAt(0).toUpperCase() + s.slice(1);
  }

  // Conforms to `text.replace` replacer function interface.
  // `originalArticle` is the article as written in the text and `word` is
  // the word that follows it.
  function correctArticle(_match: string, originalArticle: string, word: string): string {
    const article = indefinite(word, { articleOnly: true });
    if (originalArticle === 'a' || originalArticle === 'an') {
      // Lowercase article: use the computed article unchanged.
      return article + ' ' + word;
    } else if (originalArticle === 'A' || originalArticle === 'An') {
      // Capitalized article: keep the leading capital.
      return upcaseFirstLetter(article) + ' ' + word;
    } else {
      // All caps (any other casing also ends up here)
      return article.toUpperCase() + ' ' + word;
    }
  }

  return text.replace(INDEFINITE_ARTICLE_RE, correctArticle);
}
Loading

0 comments on commit 2bf73ce

Please sign in to comment.