From feaf5cffd19435e554535705dbdcea5bf8fd7884 Mon Sep 17 00:00:00 2001
From: digitalMoksha <brett@digitalmoksha.com>
Date: Tue, 13 Aug 2024 17:30:42 -0500
Subject: [PATCH] Use a line_offset vector to track offsets for inline
 sourcepos

---
 src/cm.rs              |  14 +++---
 src/nodes.rs           |   2 +
 src/parser/autolink.rs |  13 ++----
 src/parser/inlines.rs  |  20 +++++---
 src/parser/mod.rs      |   8 +++-
 src/parser/table.rs    |  24 ++++++----
 src/tests/core.rs      | 104 +++++++++++++++++++++++++++++++++++++++--
 src/tests/table.rs     |   8 +---
 8 files changed, 149 insertions(+), 44 deletions(-)
diff --git a/src/cm.rs b/src/cm.rs
index 7e1da213..fc15a1ad 100644
--- a/src/cm.rs
+++ b/src/cm.rs
@@ -544,13 +544,13 @@ impl<'a, 'o, 'c> CommonMarkFormatter<'a, 'o, 'c> {
             let info = ncb.info.as_bytes();
             let literal = ncb.literal.as_bytes();
 
-            if info.is_empty()
-                && (literal.len() > 2
-                    && !isspace(literal[0])
-                    && !(isspace(literal[literal.len() - 1])
-                        && isspace(literal[literal.len() - 2])))
-                && !first_in_list_item
-                && !self.options.render.prefer_fenced
+            #[allow(clippy::len_zero)]
+            if !(info.len() > 0
+                || literal.len() <= 2
+                || isspace(literal[0])
+                || first_in_list_item
+                || self.options.render.prefer_fenced
+                || isspace(literal[literal.len() - 1]) && isspace(literal[literal.len() - 2]))
             {
                 write!(self, "    ").unwrap();
                 write!(self.prefix, "    ").unwrap();
diff --git a/src/nodes.rs b/src/nodes.rs
index c04f244c..95b58a59 100644
--- a/src/nodes.rs
+++ b/src/nodes.rs
@@ -534,6 +534,7 @@ pub struct Ast {
     pub(crate) open: bool,
     pub(crate) last_line_blank: bool,
     pub(crate) table_visited: bool,
+    pub(crate) line_offsets: Vec<usize>,
 }
 
 /// Represents the position in the source Markdown this node was rendered from.
@@ -609,6 +610,7 @@ impl Ast {
             open: true,
             last_line_blank: false,
             table_visited: false,
+            line_offsets: Vec::with_capacity(0),
         }
     }
 }
diff --git a/src/parser/autolink.rs b/src/parser/autolink.rs
index a4361da6..15e8e315 100644
--- a/src/parser/autolink.rs
+++ b/src/parser/autolink.rs
@@ -41,14 +41,11 @@ pub(crate) fn process_autolinks<'a>(
                 }
             }
 
-            match contents[i] {
-                b'@' => {
-                    post_org = email_match(arena, contents, i, relaxed_autolinks);
-                    if post_org.is_some() {
-                        break;
-                    }
+            if contents[i] == b'@' {
+                post_org = email_match(arena, contents, i, relaxed_autolinks);
+                if post_org.is_some() {
+                    break;
                 }
-                _ => (),
             }
             i += 1;
         }
@@ -161,7 +158,7 @@ fn check_domain(data: &[u8], allow_short: bool) -> Option<usize> {
 }
 
 fn is_valid_hostchar(ch: char) -> bool {
-    !ch.is_whitespace() && !(ch.is_punctuation() || ch.is_symbol())
+    !(ch.is_whitespace() || ch.is_punctuation() || ch.is_symbol()) // valid unless whitespace, punctuation, or a symbol
 }
 
 fn autolink_delim(data: &[u8], mut link_end: usize, relaxed_autolinks: bool) -> usize {
diff --git a/src/parser/inlines.rs b/src/parser/inlines.rs
index 33407e4e..8b255b73 100644
--- a/src/parser/inlines.rs
+++ b/src/parser/inlines.rs
@@ -31,8 +31,8 @@ pub struct Subject<'a: 'd, 'r, 'o, 'c, 'd, 'i> {
     pub input: &'i [u8],
     line: usize,
     pub pos: usize,
-    block_offset: usize,
     column_offset: isize,
+    line_offset: usize,
     flags: Flags,
     pub refmap: &'r mut RefMap,
     delimiter_arena: &'d Arena<Delimiter<'a, 'd>>,
@@ -116,7 +116,6 @@ impl<'a, 'r, 'o, 'c, 'd, 'i> Subject<'a, 'r, 'o, 'c, 'd, 'i> {
         options: &'o Options<'c>,
         input: &'i [u8],
         line: usize,
-        block_offset: usize,
         refmap: &'r mut RefMap,
         delimiter_arena: &'d Arena<Delimiter<'a, 'd>>,
     ) -> Self {
@@ -126,8 +125,8 @@ impl<'a, 'r, 'o, 'c, 'd, 'i> Subject<'a, 'r, 'o, 'c, 'd, 'i> {
             input,
             line,
             pos: 0,
-            block_offset,
             column_offset: 0,
+            line_offset: 0,
             flags: Flags::default(),
             refmap,
             delimiter_arena,
@@ -182,6 +181,11 @@ impl<'a, 'r, 'o, 'c, 'd, 'i> Subject<'a, 'r, 'o, 'c, 'd, 'i> {
             None => return false,
             Some(ch) => *ch as char,
         };
+
+        let node_ast = node.data.borrow();
+        let adjusted_line = self.line - node_ast.sourcepos.start.line;
+        self.line_offset = node_ast.line_offsets[adjusted_line];
+
         let new_inl: Option<&'a AstNode<'a>> = match c {
             '\0' => return false,
             '\r' | '\n' => Some(self.handle_newline()),
@@ -1604,7 +1608,7 @@ impl<'a, 'r, 'o, 'c, 'd, 'i> Subject<'a, 'r, 'o, 'c, 'd, 'i> {
                 inl.data.borrow_mut().sourcepos.start.column =
                     bracket_inl_text.data.borrow().sourcepos.start.column;
                 inl.data.borrow_mut().sourcepos.end.column = usize::try_from(
-                    self.pos as isize + self.column_offset + self.block_offset as isize,
+                    self.pos as isize + self.column_offset + self.line_offset as isize,
                 )
                 .unwrap();
                 bracket_inl_text.insert_before(inl);
@@ -1655,7 +1659,7 @@ impl<'a, 'r, 'o, 'c, 'd, 'i> Subject<'a, 'r, 'o, 'c, 'd, 'i> {
             .sourcepos
             .start;
         inl.data.borrow_mut().sourcepos.end.column =
-            usize::try_from(self.pos as isize + self.column_offset + self.block_offset as isize)
+            usize::try_from(self.pos as isize + self.column_offset + self.line_offset as isize)
                 .unwrap();
 
         self.brackets[brackets_len - 1].inl_text.insert_before(inl);
@@ -1847,8 +1851,8 @@ impl<'a, 'r, 'o, 'c, 'd, 'i> Subject<'a, 'r, 'o, 'c, 'd, 'i> {
         end_column: usize,
     ) -> &'a AstNode<'a> {
         let start_column =
-            start_column as isize + 1 + self.column_offset + self.block_offset as isize;
-        let end_column = end_column as isize + 1 + self.column_offset + self.block_offset as isize;
+            start_column as isize + 1 + self.column_offset + self.line_offset as isize;
+        let end_column = end_column as isize + 1 + self.column_offset + self.line_offset as isize;
 
         let ast = Ast {
             value,
@@ -1864,6 +1868,7 @@ impl<'a, 'r, 'o, 'c, 'd, 'i> Subject<'a, 'r, 'o, 'c, 'd, 'i> {
             open: false,
             last_line_blank: false,
             table_visited: false,
+            line_offsets: Vec::with_capacity(0),
         };
         self.arena.alloc(Node::new(RefCell::new(ast)))
     }
@@ -1972,6 +1977,7 @@ pub fn make_inline<'a>(
         open: false,
         last_line_blank: false,
         table_visited: false,
+        line_offsets: Vec::with_capacity(0),
     };
     arena.alloc(Node::new(RefCell::new(ast)))
 }
diff --git a/src/parser/mod.rs b/src/parser/mod.rs
index fed397bc..b506b502 100644
--- a/src/parser/mod.rs
+++ b/src/parser/mod.rs
@@ -67,6 +67,7 @@ pub fn parse_document<'a>(
         open: true,
         last_line_blank: false,
         table_visited: false,
+        line_offsets: Vec::with_capacity(0),
     })));
     let mut parser = Parser::new(arena, root, options);
     let mut linebuf = Vec::with_capacity(buffer.len());
@@ -1998,6 +1999,11 @@ impl<'a, 'o, 'c: 'o> Parser<'a, 'o, 'c> {
             }
         }
         if self.offset < line.len() {
+            // Leading whitespace and container markers are skipped before a line is
+            // appended to `content`, so record how many bytes were skipped for this
+            // line. Inline processing uses this to compute correct inline sourcepos.
+            ast.line_offsets.push(self.offset);
+
             ast.content
                 .push_str(str::from_utf8(&line[self.offset..]).unwrap());
         }
@@ -2185,7 +2191,6 @@ impl<'a, 'o, 'c: 'o> Parser<'a, 'o, 'c> {
             self.options,
             content,
             node_data.sourcepos.start.line,
-            node_data.sourcepos.start.column - 1 + node_data.internal_offset,
             &mut self.refmap,
             &delimiter_arena,
         );
@@ -2439,7 +2444,6 @@ impl<'a, 'o, 'c: 'o> Parser<'a, 'o, 'c> {
             self.options,
             content,
             0, // XXX -1 in upstream; never used?
-            0,
             &mut self.refmap,
             &delimiter_arena,
         );
diff --git a/src/parser/table.rs b/src/parser/table.rs
index 651810bd..4d6f8610 100644
--- a/src/parser/table.rs
+++ b/src/parser/table.rs
@@ -116,6 +116,10 @@ fn try_opening_header<'a>(
             start.column_add((cell.end_offset - header_row.paragraph_offset) as isize);
         ast.internal_offset = cell.internal_offset;
         ast.content.clone_from(&cell.content);
+        ast.line_offsets.push(
+            start.column + cell.start_offset - 1 + cell.internal_offset
+                - header_row.paragraph_offset,
+        );
 
         i += 1;
     }
@@ -172,6 +176,9 @@ fn try_opening_row<'a>(
         cell_ast.internal_offset = cell.internal_offset;
         cell_ast.sourcepos.end.column = sourcepos.start.column + cell.end_offset;
         cell_ast.content.clone_from(&cell.content);
+        cell_ast
+            .line_offsets
+            .push(sourcepos.start.column + cell.start_offset - 1 + cell.internal_offset);
 
         last_column = cell_ast.sourcepos.end.column;
 
@@ -295,16 +302,13 @@ fn try_inserting_table_header_paragraph<'a>(
     let mut paragraph = Ast::new(NodeValue::Paragraph, start);
     paragraph.sourcepos.end.line = start.line + newlines - 1;
 
-    // XXX We don't have the last_line_length to go on by this point,
-    // so we have no idea what the end column should be.
-    // We can't track it in row() like we do paragraph_offset, because
-    // we've already discarded the leading whitespace for that line.
-    // This is hard to avoid with this backtracking approach to
-    // creating the pre-table paragraph — we're doing the work of
-    // finalize() here, but without the parser state at that time.
-    // Approximate by just counting the line length as it is and adding
-    // to the start column.
-    paragraph.sourcepos.end.column = start.column - 1
+    // copy over the line offsets related to the paragraph
+    for n in 0..newlines {
+        paragraph.line_offsets.push(container_ast.line_offsets[n]);
+    }
+
+    let last_line_offset = *paragraph.line_offsets.last().unwrap_or(&0);
+    paragraph.sourcepos.end.column = last_line_offset
         + preface
             .iter()
             .rev()
diff --git a/src/tests/core.rs b/src/tests/core.rs
index 0ed9419b..f6a3eb51 100644
--- a/src/tests/core.rs
+++ b/src/tests/core.rs
@@ -529,8 +529,6 @@ fn link_sourcepos_newline() {
     );
 }
 
-// Ignored per https://github.com/kivikakk/comrak/pull/439#issuecomment-2225129960.
-#[ignore]
 #[test]
 fn link_sourcepos_truffle() {
     assert_ast_match!(
@@ -577,8 +575,6 @@ fn link_sourcepos_truffle_twist() {
     );
 }
 
-// Ignored per https://github.com/kivikakk/comrak/pull/439#issuecomment-2225129960.
-#[ignore]
 #[test]
 fn link_sourcepos_truffle_bergamot() {
     assert_ast_match!(
@@ -601,3 +597,103 @@ fn link_sourcepos_truffle_bergamot() {
         ])
     );
 }
+
+#[test] // Each paragraph line has its own leading indentation; inline columns must follow it per line.
+fn link_sourcepos_inline_paragraph_multiline() {
+    assert_ast_match!(
+        [],
+        "  A\n" // line 1: two leading spaces, so "A" sits at column 3
+        "   B\n", // line 2: three leading spaces, so "B" sits at column 4
+        (document (1:1-2:4) [
+            (paragraph (1:3-2:4) [
+                (text (1:3-1:3) "A")
+                (softbreak (1:4-1:4)) // the newline that ends line 1
+                (text (2:4-2:4) "B")
+            ])
+        ])
+    );
+}
+
+#[test] // A list-item paragraph whose second line has no indentation at all.
+fn link_sourcepos_inline_listitem_multiline() {
+    assert_ast_match!(
+        [],
+        "- A\n" // line 1: "- " marker precedes the text, so "A" sits at column 3
+        "B\n", // line 2: unindented continuation of the same paragraph, so "B" sits at column 1
+        (document (1:1-2:1) [
+            (list (1:1-2:1) [
+                (item (1:1-2:1) [
+                    (paragraph (1:3-2:1) [
+                        (text (1:3-1:3) "A")
+                        (softbreak (1:4-1:4))
+                        (text (2:1-2:1) "B") // column is per-line, not inherited from line 1
+                    ])
+                ])
+            ])
+        ])
+    );
+}
+
+#[test] // Two list items whose first and continuation lines all start their text at different columns.
+fn link_sourcepos_inline_listitem_multiline_2() {
+    assert_ast_match!(
+        [],
+        "- A\n" // item 1, line 1: text starts at column 3
+        "   B\n" // item 1, line 2: three leading spaces, text at column 4
+        "-  C\n" // item 2, line 1: marker plus two spaces, text at column 4
+        " D", // item 2, line 2: one leading space, text at column 2 (no trailing newline)
+        (document (1:1-4:2) [
+            (list (1:1-4:2) [
+                (item (1:1-2:4) [
+                    (paragraph (1:3-2:4) [
+                        (text (1:3-1:3) "A")
+                        (softbreak (1:4-1:4))
+                        (text (2:4-2:4) "B")
+                    ])
+                ])
+                (item (3:1-4:2) [
+                    (paragraph (3:4-4:2) [
+                        (text (3:4-3:4) "C")
+                        (softbreak (3:5-3:5))
+                        (text (4:2-4:2) "D")
+                    ])
+                ])
+            ])
+        ])
+    );
+}
+
+#[test] // Nested emphasis written with distinct delimiters: `_` outside, `**` inside.
+fn link_sourcepos_inline_double_emphasis_1() {
+    assert_ast_match!(
+        [],
+        "_**this**_\n", // 10 columns: `_` at 1 and 10, `**` at 2-3 and 8-9, "this" at 4-7
+        (document (1:1-1:10) [
+            (paragraph (1:1-1:10) [
+                (emph (1:1-1:10) [ // spans the outer `_` delimiters
+                    (strong (1:2-1:9) [ // spans the inner `**` delimiters
+                        (text (1:4-1:7) "this")
+                    ])
+                ])
+            ])
+        ])
+    );
+}
+
+#[ignore] // NOTE(review): no reason is recorded for ignoring this; presumably `___` sourcepos is not yet correct — confirm
+#[test] // Nested emphasis written with a single run of three underscores on each side.
+fn link_sourcepos_inline_double_emphasis_2() {
+    assert_ast_match!(
+        [],
+        "___this___\n", // 10 columns: `___` at 1-3 and 8-10, "this" at 4-7
+        (document (1:1-1:10) [
+            (paragraph (1:1-1:10) [
+                (emph (1:1-1:10) [ // expected to span the outermost underscores
+                    (strong (1:2-1:9) [ // expected to span the inner two underscores on each side
+                        (text (1:4-1:7) "this")
+                    ])
+                ])
+            ])
+        ])
+    );
+}
diff --git a/src/tests/table.rs b/src/tests/table.rs
index 3f95cdd2..32ec8ce8 100644
--- a/src/tests/table.rs
+++ b/src/tests/table.rs
@@ -192,14 +192,10 @@ fn sourcepos_with_preceding_para_offset() {
         " | c | d |\n"
         ,
         (document (1:1-5:10) [
-
-            // XXX This should be 1:2-2:5; see
-            // crate::parser::table::try_inserting_table_header_paragraph.
-            (paragraph (1:2-2:4) [
-
+            (paragraph (1:2-2:5) [
                 (text (1:2-1:4) "123")
                 (softbreak (1:5-1:5))
-                (text (2:2-2:4) "456")
+                (text (2:3-2:5) "456")
             ])
             (table (3:2-5:10) [
                 (table_row (3:2-3:10) [