Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

use tree-sitter to parse everything #4

Open
mpadge opened this issue Apr 20, 2021 · 4 comments
Open

use tree-sitter to parse everything #4

mpadge opened this issue Apr 20, 2021 · 4 comments

Comments

@mpadge
Copy link
Member

mpadge commented Apr 20, 2021

The playground on the main docs site demonstrates that tree-sitter indeed generates everything that is currently extracted by combining ctags + gtags.

@mpadge
Copy link
Member Author

mpadge commented Sep 11, 2024

This uses Davis Vaughan's implementation of tree-sitter in r, based on tree-walking code I've already implemented in pkgsimil. Davis's version exposes the tree cursor directly in R, and does not expose the tree-sitter api.h file, so all of these loops must be done in R rather than C. That makes this proof-of-principle only; any real implementations would have to be done in C like in pkgsimil.

This code includes debugging lines switched off with cli_out <- FALSE at the outset.

NA_to_null <- function (i) ifelse (is.na (i), "", i)
walk_one_tree <- function (tree) {

    cli_out <- FALSE
    it <- tree_walk (tree)

    reached_foot <- FALSE
    first_identifier <- TRUE
    get_next_open <- FALSE
    grammar_types <- node_text <- next_open <- fn_name <- character (0L)
    while (!reached_foot) {
        field_name <- NA_to_null (it$field_name ())
        grammar_type <- NA_to_null (node_grammar_type (it$node ()))
        if (field_name == "function" && grammar_type != "extract_operator") {
            if (cli_out) {
                cli::cli_h1 ("function")
                print (it$node ())
                cli::cli_alert_info (cli::col_green ("field name: {it$field_name()}"))
                cli::cli_alert_info (cli::col_green ("grammar type: {grammar_type}"))
                cli::cli_alert_info (cli::col_green ("grammar symbol: {node_grammar_symbol(it$node())}"))
                cli::cli_alert_info (cli::col_green ("node is named: {node_is_named(it$node())}"))
                cli::cli_alert_info (cli::col_green ("node text: {node_text(it$node())}"))
                cli::cli_alert_info (cli::col_green ("node symbol: {node_symbol(it$node())}"))
            }
            grammar_types <- c (grammar_types, grammar_type)
            node_text <- c (node_text, node_text (it$node ()))
            get_next_open <- TRUE
        } else if (grammar_type == "identifier" && first_identifier) {
            if (cli_out) {
                cli::cli_h1 ("identifier")
                print (it$node ())
                cli::cli_alert_info (cli::col_green ("field name: {it$field_name()}"))
                cli::cli_alert_info (cli::col_green ("grammar type: {grammar_type}"))
                cli::cli_alert_info (cli::col_green ("grammar symbol: {node_grammar_symbol(it$node())}"))
                cli::cli_alert_info (cli::col_green ("node is named: {node_is_named(it$node())}"))
                cli::cli_alert_info (cli::col_green ("node text: {node_text(it$node())}"))
                cli::cli_alert_info (cli::col_green ("node symbol: {node_symbol(it$node())}"))
            }
            fn_name <- node_text (it$node ())
            first_identifier <- FALSE
        } else if (get_next_open && field_name == "open") {
            if (cli_out) {
                cli::cli_h1 ("next open")
            }
            next_open <- c (next_open, grammar_type)
            get_next_open <- FALSE
        }

        if (it$goto_first_child ()) next
        if (it$goto_next_sibling ()) next

        retracing <- TRUE
        while (retracing) {
            if (!it$goto_parent ()) {
                retracing <- FALSE
                reached_foot <- TRUE
            }
            if (it$goto_next_sibling ()) {
                retracing <- FALSE
            }
        }
    }

    data.frame (
        fn_name = fn_name,
        grammar_type = grammar_types,
        node_text = node_text
    ) [which (next_open != "["), ]
}

That walk_one_tree() function can then be used to extract all function calls like this, using one R file from the code base discussed in #61.

library (treesitter)
#> 
#> Attaching package: 'treesitter'
#> The following object is masked from 'package:base':
#> 
#>     range
language <- treesitter.r::language()
parser <- parser(language)
f <- "/<local>/<path>/<to>/teal/R/include_css_js.R"
parse_list <- parse (f)

fn_calls <- lapply (parse_list, function (p) {
    txt <- paste0 (as.character (p), collapse = "\n")
    tree <- parser_parse(parser, txt)
    walk_one_tree (tree)
})
do.call (rbind, fn_calls)
#>                fn_name       grammar_type                   node_text
#> 1    include_css_files         identifier                  list.files
#> 2    include_css_files         identifier                 system.file
#> 3    include_css_files         identifier                   singleton
#> 4    include_css_files         identifier                      lapply
#> 11    include_js_files namespace_operator checkmate::assert_character
#> 21    include_js_files         identifier                  list.files
#> 31    include_js_files         identifier                 system.file
#> 5     include_js_files         identifier                    basename
#> 6     include_js_files         identifier                   singleton
#> 7     include_js_files         identifier                      lapply
#> 12        run_js_files namespace_operator checkmate::assert_character
#> 22        run_js_files         identifier                      lapply
#> 32        run_js_files namespace_operator              shinyjs::runjs
#> 41        run_js_files         identifier                      paste0
#> 51        run_js_files         identifier                   readLines
#> 61        run_js_files         identifier                 system.file
#> 71        run_js_files         identifier                   invisible
#> 13 include_teal_css_js         identifier                     tagList
#> 23 include_teal_css_js namespace_operator         shinyjs::useShinyjs
#> 33 include_teal_css_js         identifier           include_css_files
#> 42 include_teal_css_js         identifier            include_js_files
#> 52 include_teal_css_js namespace_operator             shinyjs::hidden
#> 62 include_teal_css_js         identifier                        icon

Created on 2024-09-11 with reprex v2.1.1

@mpadge
Copy link
Member Author

mpadge commented Sep 11, 2024

Here's an improved version of the function, minus the debugging lines:

walk_one_tree <- function (tree) {

    it <- treesitter::tree_walk (tree)

    reached_foot <- FALSE
    first_identifier <- TRUE
    get_next_open <- FALSE
    grammar_types <- node_text <- next_open <- fn_name <- character (0L)
    while (!reached_foot) {
        field_name <- NA_to_null (it$field_name ())
        grammar_type <- NA_to_null (treesitter::node_grammar_type (it$node ()))
        if (field_name == "function" && !grammar_type %in% c ("call", "extract_operator")) {
            grammar_types <- c (grammar_types, grammar_type)
            node_text <- c (node_text, treesitter::node_text (it$node ()))
            get_next_open <- TRUE
        } else if (grammar_type == "identifier" && first_identifier) {
            fn_name <- treesitter::node_text (it$node ())
            first_identifier <- FALSE
        } else if (get_next_open && field_name == "open") {
            next_open <- c (next_open, grammar_type)
            get_next_open <- FALSE
        }

        if (it$goto_first_child ()) next
        if (it$goto_next_sibling ()) next

        retracing <- TRUE
        while (retracing) {
            if (!it$goto_parent ()) {
                retracing <- FALSE
                reached_foot <- TRUE
            }
            if (it$goto_next_sibling ()) {
                retracing <- FALSE
            }
        }
    }

    # This line ensures fn_name is also length 0 when no data are parsed:
    fn_name <- rep (fn_name, length (grammar_types))
    data.frame (
        fn_name = fn_name,
        grammar_type = grammar_types,
        node_text = node_text
    ) [which (next_open != "["), ]
}

@mpadge
Copy link
Member Author

mpadge commented Sep 11, 2024

And here is a comparison with current ctags approach, as well as with checkglobals as suggested by @pawelru in #61. Note that checkglobals does not trace all function calls, rather it only identifies single unique function calls, and so returns only unique names of functions.

Function to use treesitter to trace all calls in R/ directory of package, via the walk_one_tree() function defined above:

language <- treesitter.r::language()
parser <- treesitter::parser(language)
path <- "/<local>/<path>/<to>/teal"

tree_sitter_calls <- function (path) {
    flist <- fs::dir_ls (path, pattern = "\\.R$")
    fn_calls <- lapply (flist, function (f) {
        parse_list <- pkgstats:::control_parse (f)
        fn_calls <- lapply (parse_list, function (p) {
            txt <- paste0 (as.character (p), collapse = "\n")
            tree <- treesitter::parser_parse(parser, txt)
            walk_one_tree (tree)
        })
        res <- do.call (rbind, fn_calls)
        cbind (file = rep (f, nrow (res)), res)
    })
    fn_calls <- do.call (rbind, fn_calls)
}

Then code to compare the three approaches:

ctags_calls <- function (path) {
    withr::with_dir (path, pkgstats:::get_ctags ("R", has_tabs = FALSE))
}

t0 <- system.time (
    tags_ctags <- ctags_calls (path)
)
t1 <- system.time (
    tags_tree_sitter <- tree_sitter_calls (file.path (path, "R"))
)
t2 <- system.time (
    tags_checkglobals <- as.data.frame (checkglobals::check_pkg(path))
)

Then comparison of calculation times:

message (
    "times (ctags; ts; checkglobals) = (", 
    round (t0 [3], digits = 2),
    "; ", 
    round (t1 [3], digits = 2),
    "; ", 
    round (t2 [3], digits = 2),
    "); ratios to ctags = ", 
    round (t1 [3] / t0 [3], digits = 1),
    "; ", 
    round (t2 [3] / t0 [3], digits = 1)
)
#> times (ctags; ts; checkglobals) = (0.34; 1.19; 0.22); ratios to ctags = 3.5; 0.6

And finally numbers of identified calls:

message (
    "numbers of tags (ctags; ts; checkglobals) = (", 
    format (nrow (tags_ctags), big.mark = ","),
    "; ", 
    format (nrow (tags_tree_sitter), big.mark = ","),
    "; ", 
    format (nrow (tags_checkglobals), big.mark = ","),
    ")"
)
#> numbers of tags (ctags; ts; checkglobals) = (890; 2,214; 170)

Created on 2024-09-11 with reprex v2.1.1

@mpadge
Copy link
Member Author

mpadge commented Sep 11, 2024

This issue now has to be paused because of DavisVaughan/r-tree-sitter#21 (comment). The bundled versions of all relevant C libraries are currently in https://github.com/ropensci-review-tools/pkgsimil, but that makes for a 24MB src/ directory, so is not a viable approach for any package intended for CRAN.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
None yet
Projects
None yet
Development

No branches or pull requests

1 participant