-
Notifications
You must be signed in to change notification settings - Fork 1
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
use tree-sitter to parse everything #4
Comments
This uses Davis Vaughan's implementation of tree-sitter in r, based on tree-walking code I've already implemented in This code includes debugging lines switched off with NA_to_null <- function (i) ifelse (is.na (i), "", i)
walk_one_tree <- function (tree) {
cli_out <- FALSE
it <- tree_walk (tree)
reached_foot <- FALSE
first_identifier <- TRUE
get_next_open <- FALSE
grammar_types <- node_text <- next_open <- fn_name <- character (0L)
while (!reached_foot) {
field_name <- NA_to_null (it$field_name ())
grammar_type <- NA_to_null (node_grammar_type (it$node ()))
if (field_name == "function" && grammar_type != "extract_operator") {
if (cli_out) {
cli::cli_h1 ("function")
print (it$node ())
cli::cli_alert_info (cli::col_green ("field name: {it$field_name()}"))
cli::cli_alert_info (cli::col_green ("grammar type: {grammar_type}"))
cli::cli_alert_info (cli::col_green ("grammar symbol: {node_grammar_symbol(it$node())}"))
cli::cli_alert_info (cli::col_green ("node is named: {node_is_named(it$node())}"))
cli::cli_alert_info (cli::col_green ("node text: {node_text(it$node())}"))
cli::cli_alert_info (cli::col_green ("node symbol: {node_symbol(it$node())}"))
}
grammar_types <- c (grammar_types, grammar_type)
node_text <- c (node_text, node_text (it$node ()))
get_next_open <- TRUE
} else if (grammar_type == "identifier" && first_identifier) {
if (cli_out) {
cli::cli_h1 ("identifier")
print (it$node ())
cli::cli_alert_info (cli::col_green ("field name: {it$field_name()}"))
cli::cli_alert_info (cli::col_green ("grammar type: {grammar_type}"))
cli::cli_alert_info (cli::col_green ("grammar symbol: {node_grammar_symbol(it$node())}"))
cli::cli_alert_info (cli::col_green ("node is named: {node_is_named(it$node())}"))
cli::cli_alert_info (cli::col_green ("node text: {node_text(it$node())}"))
cli::cli_alert_info (cli::col_green ("node symbol: {node_symbol(it$node())}"))
}
fn_name <- node_text (it$node ())
first_identifier <- FALSE
} else if (get_next_open && field_name == "open") {
if (cli_out) {
cli::cli_h1 ("next open")
}
next_open <- c (next_open, grammar_type)
get_next_open <- FALSE
}
if (it$goto_first_child ()) next
if (it$goto_next_sibling ()) next
retracing <- TRUE
while (retracing) {
if (!it$goto_parent ()) {
retracing <- FALSE
reached_foot <- TRUE
}
if (it$goto_next_sibling ()) {
retracing <- FALSE
}
}
}
data.frame (
fn_name = fn_name,
grammar_type = grammar_types,
node_text = node_text
) [which (next_open != "["), ]
} That library (treesitter)
#>
#> Attaching package: 'treesitter'
#> The following object is masked from 'package:base':
#>
#> range
language <- treesitter.r::language()
parser <- parser(language)
f <- "/<local>/<path>/<to>/teal/R/include_css_js.R"
parse_list <- parse (f)
fn_calls <- lapply (parse_list, function (p) {
txt <- paste0 (as.character (p), collapse = "\n")
tree <- parser_parse(parser, txt)
walk_one_tree (tree)
})
do.call (rbind, fn_calls)
#> fn_name grammar_type node_text
#> 1 include_css_files identifier list.files
#> 2 include_css_files identifier system.file
#> 3 include_css_files identifier singleton
#> 4 include_css_files identifier lapply
#> 11 include_js_files namespace_operator checkmate::assert_character
#> 21 include_js_files identifier list.files
#> 31 include_js_files identifier system.file
#> 5 include_js_files identifier basename
#> 6 include_js_files identifier singleton
#> 7 include_js_files identifier lapply
#> 12 run_js_files namespace_operator checkmate::assert_character
#> 22 run_js_files identifier lapply
#> 32 run_js_files namespace_operator shinyjs::runjs
#> 41 run_js_files identifier paste0
#> 51 run_js_files identifier readLines
#> 61 run_js_files identifier system.file
#> 71 run_js_files identifier invisible
#> 13 include_teal_css_js identifier tagList
#> 23 include_teal_css_js namespace_operator shinyjs::useShinyjs
#> 33 include_teal_css_js identifier include_css_files
#> 42 include_teal_css_js identifier include_js_files
#> 52 include_teal_css_js namespace_operator shinyjs::hidden
#> 62 include_teal_css_js identifier icon Created on 2024-09-11 with reprex v2.1.1 |
Here's an improved version of the function, minus the debugging lines: walk_one_tree <- function (tree) {
it <- treesitter::tree_walk (tree)
reached_foot <- FALSE
first_identifier <- TRUE
get_next_open <- FALSE
grammar_types <- node_text <- next_open <- fn_name <- character (0L)
while (!reached_foot) {
field_name <- NA_to_null (it$field_name ())
grammar_type <- NA_to_null (treesitter::node_grammar_type (it$node ()))
if (field_name == "function" && !grammar_type %in% c ("call", "extract_operator")) {
grammar_types <- c (grammar_types, grammar_type)
node_text <- c (node_text, treesitter::node_text (it$node ()))
get_next_open <- TRUE
} else if (grammar_type == "identifier" && first_identifier) {
fn_name <- treesitter::node_text (it$node ())
first_identifier <- FALSE
} else if (get_next_open && field_name == "open") {
next_open <- c (next_open, grammar_type)
get_next_open <- FALSE
}
if (it$goto_first_child ()) next
if (it$goto_next_sibling ()) next
retracing <- TRUE
while (retracing) {
if (!it$goto_parent ()) {
retracing <- FALSE
reached_foot <- TRUE
}
if (it$goto_next_sibling ()) {
retracing <- FALSE
}
}
}
# This line ensures fn_name is also length 0 when no data are parsed:
fn_name <- rep (fn_name, length (grammar_types))
data.frame (
fn_name = fn_name,
grammar_type = grammar_types,
node_text = node_text
) [which (next_open != "["), ]
} |
And here is a comparison with current Function to use language <- treesitter.r::language()
parser <- treesitter::parser(language)
path <- "/<local>/<path>/<to>/teal"
tree_sitter_calls <- function (path) {
flist <- fs::dir_ls (path, pattern = "\\.R$")
fn_calls <- lapply (flist, function (f) {
parse_list <- pkgstats:::control_parse (f)
fn_calls <- lapply (parse_list, function (p) {
txt <- paste0 (as.character (p), collapse = "\n")
tree <- treesitter::parser_parse(parser, txt)
walk_one_tree (tree)
})
res <- do.call (rbind, fn_calls)
cbind (file = rep (f, nrow (res)), res)
})
fn_calls <- do.call (rbind, fn_calls)
} Then code to compare the three approaches: ctags_calls <- function (path) {
withr::with_dir (path, pkgstats:::get_ctags ("R", has_tabs = FALSE))
}
t0 <- system.time (
tags_ctags <- ctags_calls (path)
)
t1 <- system.time (
tags_tree_sitter <- tree_sitter_calls (file.path (path, "R"))
)
t2 <- system.time (
tags_checkglobals <- as.data.frame (checkglobals::check_pkg(path))
) Then comparison of calculation times: message (
"times (ctags; ts; checkglobals) = (",
round (t0 [3], digits = 2),
"; ",
round (t1 [3], digits = 2),
"; ",
round (t2 [3], digits = 2),
"); ratios to ctags = ",
round (t1 [3] / t0 [3], digits = 1),
"; ",
round (t2 [3] / t0 [3], digits = 1)
)
#> times (ctags; ts; checkglobals) = (0.34; 1.19; 0.22); ratios to ctags = 3.5; 0.6 And finally numbers of identified calls: message (
"numbers of tags (ctags; ts; checkglobals) = (",
format (nrow (tags_ctags), big.mark = ","),
"; ",
format (nrow (tags_tree_sitter), big.mark = ","),
"; ",
format (nrow (tags_checkglobals), big.mark = ","),
")"
)
#> numbers of tags (ctags; ts; checkglobals) = (890; 2,214; 170) Created on 2024-09-11 with reprex v2.1.1 |
This issue now has to be paused because of DavisVaughan/r-tree-sitter#21 (comment). The bundled versions of all relevant C libraries are currently in https://github.com/ropensci-review-tools/pkgsimil, but that makes for a 24MB |
The playground on the main docs site demonstrates that tree-sitter indeed generates everything that is currently extracted by combining
ctags
+gtags
.The text was updated successfully, but these errors were encountered: