From feaf5cffd19435e554535705dbdcea5bf8fd7884 Mon Sep 17 00:00:00 2001 From: digitalMoksha Date: Tue, 13 Aug 2024 17:30:42 -0500 Subject: [PATCH] Use a line_offset vector to track offsets for inline sourcepos --- src/cm.rs | 14 +++--- src/nodes.rs | 2 + src/parser/autolink.rs | 13 ++---- src/parser/inlines.rs | 20 +++++--- src/parser/mod.rs | 8 +++- src/parser/table.rs | 24 ++++++---- src/tests/core.rs | 104 +++++++++++++++++++++++++++++++++++++++-- src/tests/table.rs | 8 +--- 8 files changed, 149 insertions(+), 44 deletions(-) diff --git a/src/cm.rs b/src/cm.rs index 7e1da213..fc15a1ad 100644 --- a/src/cm.rs +++ b/src/cm.rs @@ -544,13 +544,13 @@ impl<'a, 'o, 'c> CommonMarkFormatter<'a, 'o, 'c> { let info = ncb.info.as_bytes(); let literal = ncb.literal.as_bytes(); - if info.is_empty() - && (literal.len() > 2 - && !isspace(literal[0]) - && !(isspace(literal[literal.len() - 1]) - && isspace(literal[literal.len() - 2]))) - && !first_in_list_item - && !self.options.render.prefer_fenced + #[allow(clippy::len_zero)] + if !(info.len() > 0 + || literal.len() <= 2 + || isspace(literal[0]) + || first_in_list_item + || self.options.render.prefer_fenced + || isspace(literal[literal.len() - 1]) && isspace(literal[literal.len() - 2])) { write!(self, "    ").unwrap(); write!(self.prefix, "    ").unwrap(); diff --git a/src/nodes.rs b/src/nodes.rs index c04f244c..95b58a59 100644 --- a/src/nodes.rs +++ b/src/nodes.rs @@ -534,6 +534,7 @@ pub struct Ast { pub(crate) open: bool, pub(crate) last_line_blank: bool, pub(crate) table_visited: bool, + pub(crate) line_offsets: Vec<usize>, } /// Represents the position in the source Markdown this node was rendered from.
@@ -609,6 +610,7 @@ impl Ast { open: true, last_line_blank: false, table_visited: false, + line_offsets: Vec::with_capacity(0), } } } diff --git a/src/parser/autolink.rs b/src/parser/autolink.rs index a4361da6..15e8e315 100644 --- a/src/parser/autolink.rs +++ b/src/parser/autolink.rs @@ -41,14 +41,11 @@ pub(crate) fn process_autolinks<'a>( } } - match contents[i] { - b'@' => { - post_org = email_match(arena, contents, i, relaxed_autolinks); - if post_org.is_some() { - break; - } + if contents[i] == b'@' { + post_org = email_match(arena, contents, i, relaxed_autolinks); + if post_org.is_some() { + break; } - _ => (), } i += 1; } @@ -161,7 +158,7 @@ fn check_domain(data: &[u8], allow_short: bool) -> Option<usize> { } fn is_valid_hostchar(ch: char) -> bool { - !ch.is_whitespace() && !(ch.is_punctuation() || ch.is_symbol()) + !(ch.is_whitespace() || ch.is_punctuation() || ch.is_symbol()) } fn autolink_delim(data: &[u8], mut link_end: usize, relaxed_autolinks: bool) -> usize { diff --git a/src/parser/inlines.rs b/src/parser/inlines.rs index 33407e4e..8b255b73 100644 --- a/src/parser/inlines.rs +++ b/src/parser/inlines.rs @@ -31,8 +31,8 @@ pub struct Subject<'a: 'd, 'r, 'o, 'c, 'd, 'i> { pub input: &'i [u8], line: usize, pub pos: usize, - block_offset: usize, column_offset: isize, + line_offset: usize, flags: Flags, pub refmap: &'r mut RefMap, delimiter_arena: &'d Arena<Delimiter<'a, 'd>>, @@ -116,7 +116,6 @@ impl<'a, 'r, 'o, 'c, 'd, 'i> Subject<'a, 'r, 'o, 'c, 'd, 'i> { options: &'o Options<'c>, input: &'i [u8], line: usize, - block_offset: usize, refmap: &'r mut RefMap, delimiter_arena: &'d Arena<Delimiter<'a, 'd>>, ) -> Self { @@ -126,8 +125,8 @@ impl<'a, 'r, 'o, 'c, 'd, 'i> Subject<'a, 'r, 'o, 'c, 'd, 'i> { input, line, pos: 0, - block_offset, column_offset: 0, + line_offset: 0, flags: Flags::default(), refmap, delimiter_arena, @@ -182,6 +181,11 @@ impl<'a, 'r, 'o, 'c, 'd, 'i> Subject<'a, 'r, 'o, 'c, 'd, 'i> { None => return false, Some(ch) => *ch as char, }; + + let node_ast = node.data.borrow(); + let
adjusted_line = self.line - node_ast.sourcepos.start.line; + self.line_offset = node_ast.line_offsets[adjusted_line]; + let new_inl: Option<&'a AstNode<'a>> = match c { '\0' => return false, '\r' | '\n' => Some(self.handle_newline()), @@ -1604,7 +1608,7 @@ impl<'a, 'r, 'o, 'c, 'd, 'i> Subject<'a, 'r, 'o, 'c, 'd, 'i> { inl.data.borrow_mut().sourcepos.start.column = bracket_inl_text.data.borrow().sourcepos.start.column; inl.data.borrow_mut().sourcepos.end.column = usize::try_from( - self.pos as isize + self.column_offset + self.block_offset as isize, + self.pos as isize + self.column_offset + self.line_offset as isize, ) .unwrap(); bracket_inl_text.insert_before(inl); @@ -1655,7 +1659,7 @@ impl<'a, 'r, 'o, 'c, 'd, 'i> Subject<'a, 'r, 'o, 'c, 'd, 'i> { .sourcepos .start; inl.data.borrow_mut().sourcepos.end.column = - usize::try_from(self.pos as isize + self.column_offset + self.block_offset as isize) + usize::try_from(self.pos as isize + self.column_offset + self.line_offset as isize) .unwrap(); self.brackets[brackets_len - 1].inl_text.insert_before(inl); @@ -1847,8 +1851,8 @@ impl<'a, 'r, 'o, 'c, 'd, 'i> Subject<'a, 'r, 'o, 'c, 'd, 'i> { end_column: usize, ) -> &'a AstNode<'a> { let start_column = - start_column as isize + 1 + self.column_offset + self.block_offset as isize; - let end_column = end_column as isize + 1 + self.column_offset + self.block_offset as isize; + start_column as isize + 1 + self.column_offset + self.line_offset as isize; + let end_column = end_column as isize + 1 + self.column_offset + self.line_offset as isize; let ast = Ast { value, @@ -1864,6 +1868,7 @@ impl<'a, 'r, 'o, 'c, 'd, 'i> Subject<'a, 'r, 'o, 'c, 'd, 'i> { open: false, last_line_blank: false, table_visited: false, + line_offsets: Vec::with_capacity(0), }; self.arena.alloc(Node::new(RefCell::new(ast))) } @@ -1972,6 +1977,7 @@ pub fn make_inline<'a>( open: false, last_line_blank: false, table_visited: false, + line_offsets: Vec::with_capacity(0), }; 
arena.alloc(Node::new(RefCell::new(ast))) } diff --git a/src/parser/mod.rs b/src/parser/mod.rs index fed397bc..b506b502 100644 --- a/src/parser/mod.rs +++ b/src/parser/mod.rs @@ -67,6 +67,7 @@ pub fn parse_document<'a>( open: true, last_line_blank: false, table_visited: false, + line_offsets: Vec::with_capacity(0), }))); let mut parser = Parser::new(arena, root, options); let mut linebuf = Vec::with_capacity(buffer.len()); @@ -1998,6 +1999,11 @@ impl<'a, 'o, 'c: 'o> Parser<'a, 'o, 'c> { } } if self.offset < line.len() { + // since whitespace is stripped off the beginning of lines, we need to keep + // track of how much was stripped off. This allows us to properly calculate + // inline sourcepos during inline processing. + ast.line_offsets.push(self.offset); + ast.content .push_str(str::from_utf8(&line[self.offset..]).unwrap()); } @@ -2185,7 +2191,6 @@ impl<'a, 'o, 'c: 'o> Parser<'a, 'o, 'c> { self.options, content, node_data.sourcepos.start.line, - node_data.sourcepos.start.column - 1 + node_data.internal_offset, &mut self.refmap, &delimiter_arena, ); @@ -2439,7 +2444,6 @@ impl<'a, 'o, 'c: 'o> Parser<'a, 'o, 'c> { self.options, content, 0, // XXX -1 in upstream; never used? 
- 0, &mut self.refmap, &delimiter_arena, ); diff --git a/src/parser/table.rs b/src/parser/table.rs index 651810bd..4d6f8610 100644 --- a/src/parser/table.rs +++ b/src/parser/table.rs @@ -116,6 +116,10 @@ fn try_opening_header<'a>( start.column_add((cell.end_offset - header_row.paragraph_offset) as isize); ast.internal_offset = cell.internal_offset; ast.content.clone_from(&cell.content); + ast.line_offsets.push( + start.column + cell.start_offset - 1 + cell.internal_offset + - header_row.paragraph_offset, + ); i += 1; } @@ -172,6 +176,9 @@ fn try_opening_row<'a>( cell_ast.internal_offset = cell.internal_offset; cell_ast.sourcepos.end.column = sourcepos.start.column + cell.end_offset; cell_ast.content.clone_from(&cell.content); + cell_ast + .line_offsets + .push(sourcepos.start.column + cell.start_offset - 1 + cell.internal_offset); last_column = cell_ast.sourcepos.end.column; @@ -295,16 +302,13 @@ fn try_inserting_table_header_paragraph<'a>( let mut paragraph = Ast::new(NodeValue::Paragraph, start); paragraph.sourcepos.end.line = start.line + newlines - 1; - // XXX We don't have the last_line_length to go on by this point, - // so we have no idea what the end column should be. - // We can't track it in row() like we do paragraph_offset, because - // we've already discarded the leading whitespace for that line. - // This is hard to avoid with this backtracking approach to - // creating the pre-table paragraph — we're doing the work of - // finalize() here, but without the parser state at that time. - // Approximate by just counting the line length as it is and adding - // to the start column. 
- paragraph.sourcepos.end.column = start.column - 1 + // copy over the line offsets related to the paragraph + for n in 0..newlines { + paragraph.line_offsets.push(container_ast.line_offsets[n]); + } + + let last_line_offset = *paragraph.line_offsets.last().unwrap_or(&0); + paragraph.sourcepos.end.column = last_line_offset + preface .iter() .rev() diff --git a/src/tests/core.rs b/src/tests/core.rs index 0ed9419b..f6a3eb51 100644 --- a/src/tests/core.rs +++ b/src/tests/core.rs @@ -529,8 +529,6 @@ fn link_sourcepos_newline() { ); } -// Ignored per https://github.com/kivikakk/comrak/pull/439#issuecomment-2225129960. -#[ignore] #[test] fn link_sourcepos_truffle() { assert_ast_match!( @@ -577,8 +575,6 @@ fn link_sourcepos_truffle_twist() { ); } -// Ignored per https://github.com/kivikakk/comrak/pull/439#issuecomment-2225129960. -#[ignore] #[test] fn link_sourcepos_truffle_bergamot() { assert_ast_match!( @@ -601,3 +597,103 @@ fn link_sourcepos_truffle_bergamot() { ]) ); } + +#[test] +fn link_sourcepos_inline_paragraph_multiline() { + assert_ast_match!( + [], + "  A\n" + "   B\n", + (document (1:1-2:4) [ + (paragraph (1:3-2:4) [ + (text (1:3-1:3) "A") + (softbreak (1:4-1:4)) + (text (2:4-2:4) "B") + ]) + ]) + ); +} + +#[test] +fn link_sourcepos_inline_listitem_multiline() { + assert_ast_match!( + [], + "- A\n" + "B\n", + (document (1:1-2:1) [ + (list (1:1-2:1) [ + (item (1:1-2:1) [ + (paragraph (1:3-2:1) [ + (text (1:3-1:3) "A") + (softbreak (1:4-1:4)) + (text (2:1-2:1) "B") + ]) + ]) + ]) + ]) + ); +} + +#[test] +fn link_sourcepos_inline_listitem_multiline_2() { + assert_ast_match!( + [], + "- A\n" + "   B\n" + "-  C\n" + " D", + (document (1:1-4:2) [ + (list (1:1-4:2) [ + (item (1:1-2:4) [ + (paragraph (1:3-2:4) [ + (text (1:3-1:3) "A") + (softbreak (1:4-1:4)) + (text (2:4-2:4) "B") + ]) + ]) + (item (3:1-4:2) [ + (paragraph (3:4-4:2) [ + (text (3:4-3:4) "C") + (softbreak (3:5-3:5)) + (text (4:2-4:2) "D") + ]) + ]) + ]) + ]) + ); +} + +#[test] +fn
link_sourcepos_inline_double_emphasis_1() { + assert_ast_match!( + [], + "_**this**_\n", + (document (1:1-1:10) [ + (paragraph (1:1-1:10) [ + (emph (1:1-1:10) [ + (strong (1:2-1:9) [ + (text (1:4-1:7) "this") + ]) + ]) + ]) + ]) + ); +} + +#[ignore] +#[test] +fn link_sourcepos_inline_double_emphasis_2() { + assert_ast_match!( + [], + "___this___\n", + (document (1:1-1:10) [ + (paragraph (1:1-1:10) [ + (emph (1:1-1:10) [ + (strong (1:2-1:9) [ + (text (1:4-1:7) "this") + ]) + ]) + ]) + ]) + ); +} diff --git a/src/tests/table.rs b/src/tests/table.rs index 3f95cdd2..32ec8ce8 100644 --- a/src/tests/table.rs +++ b/src/tests/table.rs @@ -192,14 +192,10 @@ fn sourcepos_with_preceding_para_offset() { " | c | d |\n" , (document (1:1-5:10) [ - - // XXX This should be 1:2-2:5; see - // crate::parser::table::try_inserting_table_header_paragraph. - (paragraph (1:2-2:4) [ - + (paragraph (1:2-2:5) [ (text (1:2-1:4) "123") (softbreak (1:5-1:5)) - (text (2:2-2:4) "456") + (text (2:3-2:5) "456") ]) (table (3:2-5:10) [ (table_row (3:2-3:10) [