Skip to content

Commit

Permalink
Fix autocaps around brackets and other delimiters
Browse files Browse the repository at this point in the history
  • Loading branch information
ajyoon committed May 12, 2024
1 parent dec25b8 commit 6c32031
Show file tree
Hide file tree
Showing 3 changed files with 239 additions and 217 deletions.
5 changes: 3 additions & 2 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,9 @@
### 0.1.10 (unreleased)
* Improve interactive view error/warning capture
* Update capitalization cleanup postprocessing to support sentences
ending before quotation marks. For instance, `"Test." test` will now
correct to `"Test." Test` where previously it would not.
ending before quotation marks and other matching delimiters like brackets.
For instance, `"Test." test` will now correct to `"Test." Test` where
previously it would not.
* Fix visual line breaks when backslash is followed by whitespace
* Update punctuation cleanup postprocessing to shift punctuation to the
right of quotes (single, double, or underscores). For example `"test" .`
Expand Down
156 changes: 78 additions & 78 deletions src/postprocessing.ts
Original file line number Diff line number Diff line change
Expand Up @@ -12,63 +12,63 @@ const TRAILING_WHITESPACE_RE = /\s+$/;
* 5. Ensuring the text ends with a single line break
*/
export function whitespaceCleanup(text: string): string {
  // Accumulates the cleaned document; every emitted line is terminated
  // with '\n'.
  let out = '';
  // True until the first non-blank line has been seen.
  let atDocStart = true;
  // True when the most recently emitted line was blank.
  let lastLineWasBlank = false;

  for (const rawLine of text.split('\n')) {
    // "Blank" is whatever BLANK_LINE_RE accepts; tested exactly once per line.
    const isBlank = BLANK_LINE_RE.test(rawLine);

    if (atDocStart) {
      if (isBlank) {
        // Skip blank lines at start of document
        continue;
      }
      atDocStart = false;
    }

    if (isBlank && lastLineWasBlank) {
      // Skip runs of blank lines: only the first of a run is emitted
      continue;
    }

    // Lines consisting of only whitespace become simply empty lines, so the
    // rewritten line starts out empty and is only filled for non-blank input.
    let line = '';
    if (!isBlank) {
      // Intra-line cleanups: drop trailing whitespace, then collapse runs of
      // spaces. Spaces before the first non-space character are kept as-is.
      let atLineStart = true;
      let lastCharWasSpace = false;
      for (const char of rawLine.replace(TRAILING_WHITESPACE_RE, '')) {
        const charIsSpace = char === ' ';
        if (!atLineStart && lastCharWasSpace && charIsSpace) {
          // Second or later space of an interior run
          continue;
        }
        if (!charIsSpace) {
          atLineStart = false;
        }
        line += char;
        lastCharWasSpace = charIsSpace;
      }
    }

    lastLineWasBlank = isBlank;
    out += line + '\n';
  }

  // Edge case: if input ended with a line break already, the loop above
  // leaves '\n\n' at the end of the output. Trim one so the output always
  // terminates with a single '\n'.
  if (out.endsWith('\n\n')) {
    out = out.substring(0, out.length - 1);
  }

  return out;
}

// Note that the 3 dashes here are different kinds of dash, not the same character
Expand All @@ -86,12 +86,12 @@ const MISPLACED_WORD_ENDING_PUNC_RE = /([a-zA-Z0-9\xA0-\uFFFF"'_])(\s+)([.,:;!?\
* dependent on style and context.
*/
export function punctuationCleanup(text: string): string {
  // The replacement keeps capture group 1 in place and swaps groups 2 and 3,
  // so the text matched by group 3 is emitted before the text matched by
  // group 2.
  // NOTE(review): MISPLACED_WORD_ENDING_PUNC_RE is defined outside this
  // block; its groups appear to be (word-ending char)(whitespace)(punctuation)
  // - confirm against the definition.
  return text.replace(MISPLACED_WORD_ENDING_PUNC_RE, '$1$3$2');
}


// Matches a lowercase letter that should start a sentence.
// Group 1 is the lead-in, which is either:
//   - a sentence-ending mark (. ! ?), then any mix of closing delimiters
//     (" ' _ ] ) }) and whitespace, ending in at least one whitespace
//     character; or
//   - the very start of the text plus any leading whitespace (there is no
//     `m` flag, so `^` does not match at the start of later lines).
// Group 2 is the letter itself. \p{Ll} is the Unicode "Lowercase_Letter"
// general category, which needs the `u` flag.
const INCORRECT_CAPS_RE = /([.!?]["'_\]\)\}\s]*?\s+|^\s*)(\p{Ll})/gu;


/**
Expand All @@ -101,18 +101,18 @@ const INCORRECT_CAPS_RE = /([.!?]["'_]?\s+|^\s*)(\p{Ll})/gu;
* following a sentence-ending punctuation mark.
*/
export function capitalizationCleanup(text: string): string {
  // For every INCORRECT_CAPS_RE match, keep the first capture group untouched
  // and upper-case the second one. The callback follows the `String.replace`
  // replacer interface: (whole match, group 1, group 2).
  return text.replace(
    INCORRECT_CAPS_RE,
    (_match: string, prefix: string, letter: string) => prefix + letter.toUpperCase(),
  );
}

// A "visual" line break: a backslash, optional spaces/tabs, one line break
// (\r\n, \n, or \r), and any spaces/tabs that start the following line.
const VISUAL_LINE_BREAK_RE = /\\[ \t]*(?:\r?\n|\r)[ \t]*/g;

/**
 * Join lines that were split with a backslash line continuation.
 *
 * Each backslash + line break sequence, together with horizontal whitespace
 * directly after the backslash and at the start of the next line, is
 * replaced by a single space. Whitespace before the backslash is left alone.
 *
 * @param text - Text that may contain backslash line continuations.
 * @returns The text with every continuation replaced by one space.
 */
export function replaceVisualLineBreaks(text: string): string {
  return text.replace(VISUAL_LINE_BREAK_RE, ' ');
}

// Matches "a" or "an" in any letter case (group 1), followed by exactly one
// space and a run of Unicode letters and/or ASCII digits (group 2).
const INDEFINITE_ARTICLE_RE = /\b(a|an) ([\p{L}0-9]+)\b/igu
Expand All @@ -121,27 +121,27 @@ const INDEFINITE_ARTICLE_RE = /\b(a|an) ([\p{L}0-9]+)\b/igu
* Attempt to correct English indefinite articles (a / an)
*/
export function correctIndefiniteArticles(text: string): string {
  // Upper-case the first character of `s` and leave the rest unchanged.
  // `charAt(0)` yields '' for an empty string, so empty and one-character
  // inputs need no special cases.
  function upcaseFirstLetter(s: string): string {
    return s.charAt(0).toUpperCase() + s.slice(1);
  }

  // Conforms to the `text.replace` replacer function interface:
  // (whole match, group 1 = article as written, group 2 = following word).
  function correctArticle(_match: string, originalArticle: string, word: string): string {
    // Ask `indefinite` for the article only, then re-apply the casing the
    // author used for the original article.
    const article = indefinite(word, { articleOnly: true });
    if (originalArticle === 'a' || originalArticle === 'an') {
      return article + ' ' + word;
    }
    if (originalArticle === 'A' || originalArticle === 'An') {
      return upcaseFirstLetter(article) + ' ' + word;
    }
    // Any other casing the case-insensitive pattern can match (e.g. 'AN')
    // is treated as all caps.
    return article.toUpperCase() + ' ' + word;
  }

  return text.replace(INDEFINITE_ARTICLE_RE, correctArticle);
}
Loading

0 comments on commit 6c32031

Please sign in to comment.