Skip to content

Instantly share code, notes, and snippets.

Revisions

  1. arizvisa created this gist Jul 18, 2024.
    607 changes: 607 additions & 0 deletions weggli.dc73a25ec28ca42d65517042988fc95076d851e7.patch
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,607 @@
    From b322ecac9992c8bd8c3dfeaf2579a0b2b4b81336 Mon Sep 17 00:00:00 2001
    From: Ali Rizvi-Santiago <arizvisa@gmail.com>
    Date: Sat, 28 May 2022 11:49:00 -0500
    Subject: [PATCH 1/2] saving my game prior to modifying the tree-sitter-c
    grammar


    100.0% src/
    diff --git a/src/builder.rs b/src/builder.rs
    index 13908b0..f600032 100644
    --- a/src/builder.rs
    +++ b/src/builder.rs
    @@ -125,6 +125,129 @@ fn _build_query_tree(

    debug!("tree_sitter query {}: {}", id, sexp);

    + match crate::ts_query(&sexp, is_cpp) {
    + Ok(qr) => {
    + return QueryTree::new(qr, b.captures, variables, b.negations, id);
    + }
    + Err(e) => {
    + eprintln!(
    + "Tree sitter query generation failed: {:?}\n {}",
    + e.kind, e.message
    + );
    + eprintln!("sexpr: {}", sexp);
    + eprintln!("This is a bug! Can't recover :/");
    + std::process::exit(1);
    + }
    + }
    + /*
    + QueryTree::new(
    + crate::ts_query(&sexp, is_cpp),
    + b.captures,
    + variables,
    + b.negations,
    + id,
    + )
    + */
    +}
    +
    +pub fn check_query_tree (
    + source: &str,
    + c: &mut TreeCursor,
    + id: usize,
    + is_cpp: bool,
    + is_multi_pattern: bool,
    + strict_mode: bool,
    + regex_constraints: Option<RegexMap>,
    +) -> Result<QueryTree, tree_sitter::QueryError> {
    + let mut b = QueryBuilder {
    + query_source: source.to_string(),
    + captures: Vec::new(),
    + negations: Vec::new(),
    + id,
    + cpp: is_cpp,
    + regex_constraints: match regex_constraints {
    + Some(r) => r,
    + None => RegexMap::new(HashMap::new()),
    + },
    + };
    +
    + // Skip the root node if it's a translation_unit.
    + if c.node().kind() == "translation_unit" {
    + debug!("query cursor specifies translation_unit");
    + c.goto_first_child();
    + }
    +
    + let mut variables = HashSet::new();
    +
    + let sexp = if !is_multi_pattern {
    + // We want to wrap queries into a function_definition so we can easily
    + // extract the function that contains a match. Of course we should not do that
    + // if the user specifies a function_definition as part of the query.
    + let needs_anchor = c.node().kind() == "compound_statement" && id == 0;
    + debug!("query needs anchor: {}", needs_anchor);
    +
    + // The main work happens here. Iterate through the AST and create a tree-sitter query
    + let mut s = b.build(c, 0, strict_mode);
    +
    + // Make sure user supplied function headers are displayed by adding a Capture
    + if !needs_anchor {
    + s += "@";
    + s += &add_capture(&mut b.captures, Capture::Display);
    + }
    +
    + // Iterate through all captures, add their constraints to the query and extract used variables
    + s += &process_captures(&b.captures, 0, &mut variables);
    +
    + // Optionally anchor query with a function_definition
    + if needs_anchor {
    + let capture = Capture::Display;
    + format!(
    + "(function_definition body: {}) @{}",
    + s,
    + &add_capture(&mut b.captures, capture)
    + )
    + } else {
    + "(".to_string() + &s + ")"
    + }
    + } else {
    + // When building a QueryTree for a compound statement, we create a tree-sitter
    + // query with multiple root patterns for efficient searching.
    + // This code is only executed when creating sub queries so we can skip
    + // the whole anchoring logic needed for the single pattern case.
    +
    + assert!(c.goto_first_child());
    + assert!(c.goto_next_sibling());
    +
    + let mut s = String::new();
    + loop {
    + let child = c.node();
    + if !c.goto_next_sibling() {
    + break;
    + }
    +
    + let before = b.captures.len();
    + let mut cursor = child.walk();
    +
    + let child_sexp = b.build(&mut cursor, 0, strict_mode);
    +
    + let captures = &process_captures(&b.captures, before, &mut variables);
    +
    + if !child_sexp.is_empty() {
    + s += &format!("({} {})", child_sexp, captures);
    + }
    + }
    + s
    + };
    +
    + debug!("tree_sitter query {}: {}", id, sexp);
    +
    + match crate::ts_query(&sexp, is_cpp) {
    + Ok(qr) => {
    + Ok(QueryTree::new(qr, b.captures, variables, b.negations, id))
    + }
    + Err(e) => Err(e)
    + }
    + /*
    QueryTree::new(
    crate::ts_query(&sexp, is_cpp),
    b.captures,
    @@ -132,6 +255,7 @@ fn _build_query_tree(
    b.negations,
    id,
    )
    + */
    }

    /// Iterates through `captures` starting at `offset` and returns the necessary query predicates as a string.
    diff --git a/src/lib.rs b/src/lib.rs
    index b58d861..997516a 100644
    --- a/src/lib.rs
    +++ b/src/lib.rs
    @@ -56,13 +56,16 @@ pub fn parse(source: &str, cpp: bool) -> Tree {
    }

    // Internal helper function to create a new tree-sitter query.
    -fn ts_query(sexpr: &str, cpp: bool) -> tree_sitter::Query {
    +fn ts_query(sexpr: &str, cpp: bool) -> Result<tree_sitter::Query, tree_sitter::QueryError> {
    let language = if !cpp {
    unsafe { tree_sitter_c() }
    } else {
    unsafe { tree_sitter_cpp() }
    };

    + return Query::new(language, sexpr);
    +
    + /*
    match Query::new(language, sexpr) {
    Ok(q) => q,
    Err(e) => {
    @@ -75,6 +78,7 @@ fn ts_query(sexpr: &str, cpp: bool) -> tree_sitter::Query {
    std::process::exit(1);
    }
    }
    + */
    }

    /// Map from variable names to a positive/negative regex constraint
    diff --git a/src/python.rs b/src/python.rs
    index d841c8f..ccaf0df 100644
    --- a/src/python.rs
    +++ b/src/python.rs
    @@ -15,10 +15,16 @@
    */

    use pyo3::prelude::*;
    +use pyo3::types::*;
    use pyo3::wrap_pyfunction;
    +use pyo3::exceptions::PyRuntimeError;

    use crate::query::QueryTree;
    use crate::result::QueryResult;
    +use crate::RegexMap;
    +
    +use regex::Regex;
    +use std::collections::HashMap;

    #[pyclass]
    struct QueryTreePy {
    @@ -30,14 +36,223 @@ struct QueryResultPy {
    qr: QueryResult,
    }

    -#[pyfunction(cpp = "false")]
    -#[text_signature = "(query, cpp)"]
    -fn parse_query(q: &str, cpp: bool) -> PyResult<QueryTreePy> {
    - let tree = crate::parse(q, cpp);
    - let mut c = tree.walk();
    +/// Node kinds that `validate_query` accepts for the node a query is rooted at.
    +const VALID_NODE_KINDS: &[&str] = &[
    + "compound_statement",
    + "function_definition",
    + "struct_specifier",
    + "enum_specifier",
    + "union_specifier",
    + "class_specifier",
    + /* because we're searching for both declarations and expressions */
    + "expression_statement",
    + "declaration",
    +];

    +#[pyfunction(cpp = "false", force_query = "false")]
    +#[text_signature = "(query, cpp, force_query, regexes)"]
    +fn parse_query(pattern: &str, cpp: bool, force_query: bool, regexes: Option<HashMap::<&str, &str>>) -> PyResult<QueryTreePy> {
    +
    + let mut tree = crate::parse(pattern, cpp);
    + //let mut cur = tree.walk();
    + let mut p = pattern;
    +
    + /*
    + let temp_pattern;
    +
    + // we need to filter the cursor using validate_query
    + if tree.root_node().has_error() {
    + if !pattern.ends_with(';') {
    + temp_pattern = format!("{};", &p);
    + let fixed_tree = crate::parse(&temp_pattern, cpp);
    + if !fixed_tree.root_node().has_error() {
    + info!("normalizing query: add missing ;");
    + tree = fixed_tree;
    + p = &temp_pattern;
    + }
    + }
    + }
    +
    + // Try to do query normalization to support missing { }
    + // 'memcpy(_);' -> {memcpy(_);}
    + let temp_pattern2;
    + if !tree.root_node().has_error() {
    + let c = tree.root_node().child(0);
    + if let Some(n) = c {
    + if !VALID_NODE_KINDS.contains(&n.kind()) {
    + temp_pattern2 = format!("{{{}}}", &p);
    + let fixed_tree = crate::parse(&temp_pattern2, cpp);
    + if !fixed_tree.root_node().has_error() {
    + info!("normalizing query: add {}", "{}");
    + tree = fixed_tree;
    + p = &temp_pattern2;
    + }
    + }
    + }
    + }
    + */
    +
    + //let mut cur = tree.walk();
    + //let mut cur = validate_query(&tree, p, force_query)?;
    + let mut cur = validate_query(&tree, p, force_query);
    +
    + let constraints = match regexes {
    + Some(regexes) => {
    + match process_regexes(regexes) {
    + Ok(regexconstraints) => {
    + Some(regexconstraints)
    + }
    + Err(err) => {
    + return Err(PyRuntimeError::new_err(err))
    + }
    + }
    + }
    + None => None
    + };
    +
    + match &mut cur {
    + Err(e) => {
    + Err(PyRuntimeError::new_err(format!("Tree sitter query validation failed: {}", e)))
    + }
    + Ok(cursor) => {
    +
    + // guard build_query_tree for python so that we can avoid an exit
    + match crate::builder::check_query_tree(p, cursor, 0, cpp, false, false, constraints /*None*/) {
    + Ok(qt) => {
    + //let qt = crate::builder::build_query_tree(q, &mut c, cpp, None);
    + Ok(QueryTreePy { qt })
    + }
    + Err(e) => {
    + Err(PyRuntimeError::new_err(format!("Tree sitter query generation failed: {:?}\n {}",
    + e.kind, e.message
    + )))
    + }
    + }
    + }
    + }
    + /*
    let qt = crate::builder::build_query_tree(q, &mut c, cpp, None);
    Ok(QueryTreePy { qt })
    + */
    +}
    +
    +/// Error cases for regex constraints. NOTE(review): unused by the code in view.
    +enum RegexError {
    + InvalidArg(String), // presumably the rejected argument text — confirm
    + InvalidRegex(regex::Error), // compile error reported by the `regex` crate
    +}
    +impl From<regex::Error> for RegexError { // enables `?` on `regex::Error` results
    + fn from(err: regex::Error) -> RegexError {
    + RegexError::InvalidRegex(err)
    + }
    +}
    +
    +/// Compiles per-variable regex constraints into a `RegexMap`.
    +///
    +/// Each key of `regexes` is a query variable name, with or without its leading
    +/// `$`. A trailing `!` on the name marks the constraint as negative; the `!` is
    +/// removed before the name is stored. Each value is the regex source to compile
    +/// for that variable.
    +///
    +/// For example, the key `"buf!"` yields a negative constraint stored as `$buf`.
    +///
    +/// # Errors
    +///
    +/// Returns `"Regex error …"`, carrying the `regex` crate's message, as soon as a
    +/// pattern fails to compile; no map is produced in that case. When several
    +/// patterns are invalid, the one reported depends on `HashMap` iteration order.
    +fn process_regexes(regexes: HashMap<&str, &str>) -> Result<RegexMap, String> {
    +    let mut compiled = HashMap::new();
    +
    +    for (name, pattern) in regexes {
    +        // Constraints are keyed by the `$`-prefixed name, whether or not the
    +        // caller supplied the sigil.
    +        let mut key = String::with_capacity(name.len() + 1);
    +        if !name.starts_with('$') {
    +            key.push('$');
    +        }
    +        key.push_str(name);
    +
    +        // Detect the negation marker and strip it from the key.
    +        let negative = key.ends_with('!');
    +        if negative {
    +            key.pop();
    +        }
    +
    +        // Give up on the first pattern that does not compile.
    +        let regex = match Regex::new(pattern) {
    +            Ok(regex) => regex,
    +            Err(err) => return Err(format!("Regex error {}", err)),
    +        };
    +        compiled.insert(key, (negative, regex));
    +    }
    +
    +    Ok(RegexMap::new(compiled))
    +}
    +
    +/// Validates a parsed query and returns a cursor positioned on its root node.
    +///
    +/// Unless `force` is set, a tree containing parse errors is rejected with a
    +/// message that echoes `query`, marking a MISSING node where one was detected.
    +/// The tree may have at most one named top-level node, and the node the cursor
    +/// ends up on must be one of `VALID_NODE_KINDS`.
    +fn validate_query<'a>(
    +    tree: &'a tree_sitter::Tree,
    +    query: &str,
    +    force: bool,
    +) -> Result<tree_sitter::TreeCursor<'a>, String> {
    +    let root = tree.root_node();
    +    if !force && root.has_error() {
    +        // Find the first ERROR/MISSING node, descending only into erroneous nodes.
    +        let mut cursor = root.walk();
    +        let first_error = loop {
    +            let node = cursor.node();
    +            let moved = if !node.has_error() {
    +                cursor.goto_next_sibling()
    +            } else if node.is_error() || node.is_missing() {
    +                break Some(node)
    +            } else {
    +                cursor.goto_first_child()
    +            };
    +            if !moved {
    +                break None;
    +            }
    +        };
    +
    +        // Re-emit the query text, with a marker in front of a MISSING node.
    +        let mut errmsg = String::from("Error! Query parsing failed:");
    +        if let Some(node) = first_error {
    +            let (start, end) = (node.start_byte(), node.end_byte());
    +            errmsg.push_str(&format!(" {}", &query[0..start]));
    +            if node.is_missing() {
    +                errmsg.push_str(&format!("{}{}{}", " [MISSING ", node.kind(), " ] "));
    +            }
    +            errmsg.push_str(&format!("{}{}", &query[start..end], &query[end..]));
    +        }
    +        return Err(errmsg);
    +    }
    +
    +    info!("query sexp: {}", root.to_sexp());
    +
    +    // Reject trees with more than one named top-level node.
    +    let mut c = tree.walk();
    +    if c.node().named_child_count() > 1 {
    +        return Err(format!(
    +            "{}'{}' query contains multiple root nodes",
    +            "Error: ", query
    +        ));
    +    }
    +
    +    // Step onto the first child; the result of the move is not checked.
    +    c.goto_first_child();
    +    let kind = c.node().kind();
    +    if !VALID_NODE_KINDS.contains(&kind) {
    +        return Err(format!(
    +            "{}'{}' ({}) is not a supported query root node.",
    +            "Error: ", query, kind
    +        ));
    +    }
    +    Ok(c)
    }

    #[pyfunction]
    @@ -58,17 +273,59 @@ fn matches(p: &QueryTreePy, source: &str, cpp: bool) -> PyResult<Vec<QueryResult
    Ok(r)
    }

    -#[pyfunction(color = "None")]
    -#[text_signature = "(q, source, color)"]
    -fn display(p: &QueryResultPy, source: &str, color: Option<bool>) -> PyResult<String> {
    +#[pyfunction(color = "None", before = 10, after = 10)]
    +#[text_signature = "(q, source, color, before, after)"]
    +fn display(p: &QueryResultPy, source: &str, color: Option<bool>, before: usize, after: usize) -> PyResult<String> {
    if let Some(color_override) = color {
    colored::control::set_override(color_override);
    }
    - let r = p.qr.display(source, 10, 10);
    + let r = p.qr.display(source, before, after);
    colored::control::unset_override();
    Ok(r)
    }

    +#[pyfunction]
    +#[text_signature = "(q)"]
    +fn bounds(p: &QueryResultPy) -> (usize, usize) {
    +    (p.qr.function.start, p.qr.function.end) // range of the result's outermost node
    +}
    +
    +#[pyfunction]
    +#[text_signature = "(q)"]
    +fn spans(p: &QueryResultPy) -> PyResult<Vec<(String, usize, usize)>> {
    +    // Produces one `(variable name, start, end)` tuple per reported capture range,
    +    // in order of range start. The name is empty when no variable is associated
    +    // with the capture.
    +
    +    // Visit captures in order of where their range starts.
    +    let mut sorted = p.qr.captures.clone();
    +    sorted.sort_by_key(|c| c.range.start);
    +
    +    // `vars` maps name -> index; invert it (borrowing the names) for lookup by index.
    +    let varlookup: HashMap<usize, &str> =
    +        p.qr.vars.iter().map(|(name, &index)| (index, name.as_str())).collect();
    +
    +    // The first capture in start order is never reported; the rest are filtered
    +    // and converted in a single pass.
    +    let mut result: Vec<(String, usize, usize)> = Vec::with_capacity(sorted.len());
    +    for c in sorted.into_iter().skip(1) {
    +        // Skip a capture that starts inside the range reported just before it.
    +        if let Some(&(_, start, end)) = result.last() {
    +            if (start..end).contains(&c.range.start) {
    +                continue;
    +            }
    +        }
    +
    +        // NOTE(review): the `+ 1` offset between `capture_idx` and the indices stored
    +        // in `vars` is unverified — confirm against how `vars` is populated.
    +        let idx = (c.capture_idx as usize) + 1;
    +        let name = varlookup.get(&idx).copied().unwrap_or_default().to_string();
    +        result.push((name, c.range.start, c.range.end));
    +    }
    +
    +    Ok(result)
    +}
    +
    #[pymodule]
    fn weggli(_py: Python, m: &PyModule) -> PyResult<()> {
    m.add_class::<QueryTreePy>()?;
    @@ -76,6 +333,8 @@ fn weggli(_py: Python, m: &PyModule) -> PyResult<()> {
    m.add_function(wrap_pyfunction!(identifiers, m)?)?;
    m.add_function(wrap_pyfunction!(matches, m)?)?;
    m.add_function(wrap_pyfunction!(display, m)?)?;
    + m.add_function(wrap_pyfunction!(bounds, m)?)?;
    + m.add_function(wrap_pyfunction!(spans, m)?)?;

    Ok(())
    }
    diff --git a/src/result.rs b/src/result.rs
    index 1d32094..83183b4 100644
    --- a/src/result.rs
    +++ b/src/result.rs
    @@ -31,7 +31,8 @@ pub struct QueryResult {
    pub vars: FxHashMap<String, usize>,
    // Range of the outermost node. This is badly named as it does not have to be a
    // function definition, but for final query results it normally is.
    - function: std::ops::Range<usize>,
    + pub function: std::ops::Range<usize>,
    + //function: std::ops::Range<usize>,
    }

    /// Stores the result (== source range) for a single capture.
    --
    2.43.0


    From dc73a25ec28ca42d65517042988fc95076d851e7 Mon Sep 17 00:00:00 2001
    From: Ali Rizvi-Santiago <arizvisa@gmail.com>
    Date: Fri, 26 Aug 2022 10:53:12 -0500
    Subject: [PATCH 2/2] looks like i modified parse_query or something


    100.0% src/
    diff --git a/src/python.rs b/src/python.rs
    index ccaf0df..a53d1c9 100644
    --- a/src/python.rs
    +++ b/src/python.rs
    @@ -51,50 +51,10 @@ const VALID_NODE_KINDS: &[&str] = &[

    #[pyfunction(cpp = "false", force_query = "false")]
    #[text_signature = "(query, cpp, force_query, regexes)"]
    -fn parse_query(pattern: &str, cpp: bool, force_query: bool, regexes: Option<HashMap::<&str, &str>>) -> PyResult<QueryTreePy> {
    -
    - let mut tree = crate::parse(pattern, cpp);
    - //let mut cur = tree.walk();
    - let mut p = pattern;
    -
    - /*
    - let temp_pattern;
    -
    - // we need to filter the cursor using validate_query
    - if tree.root_node().has_error() {
    - if !pattern.ends_with(';') {
    - temp_pattern = format!("{};", &p);
    - let fixed_tree = crate::parse(&temp_pattern, cpp);
    - if !fixed_tree.root_node().has_error() {
    - info!("normalizing query: add missing ;");
    - tree = fixed_tree;
    - p = &temp_pattern;
    - }
    - }
    - }
    -
    - // Try to do query normalization to support missing { }
    - // 'memcpy(_);' -> {memcpy(_);}
    - let temp_pattern2;
    - if !tree.root_node().has_error() {
    - let c = tree.root_node().child(0);
    - if let Some(n) = c {
    - if !VALID_NODE_KINDS.contains(&n.kind()) {
    - temp_pattern2 = format!("{{{}}}", &p);
    - let fixed_tree = crate::parse(&temp_pattern2, cpp);
    - if !fixed_tree.root_node().has_error() {
    - info!("normalizing query: add {}", "{}");
    - tree = fixed_tree;
    - p = &temp_pattern2;
    - }
    - }
    - }
    - }
    - */
    -
    - //let mut cur = tree.walk();
    - //let mut cur = validate_query(&tree, p, force_query)?;
    - let mut cur = validate_query(&tree, p, force_query);
    +fn parse_query(pattern: &str, cpp: bool, force_query: bool, regexes: Option<HashMap::<&str, &str>>) -> PyResult<QueryTreePy>
    +{
    + let tree = crate::parse(pattern, cpp);
    + let mut cursor = validate_query(&tree, pattern, force_query);

    let constraints = match regexes {
    Some(regexes) => {
    @@ -110,14 +70,14 @@ fn parse_query(pattern: &str, cpp: bool, force_query: bool, regexes: Option<Hash
    None => None
    };

    - match &mut cur {
    + match &mut cursor {
    Err(e) => {
    Err(PyRuntimeError::new_err(format!("Tree sitter query validation failed: {}", e)))
    }
    - Ok(cursor) => {
    + Ok(c) => {

    // guard build_query_tree for python so that we can avoid an exit
    - match crate::builder::check_query_tree(p, cursor, 0, cpp, false, false, constraints /*None*/) {
    + match crate::builder::check_query_tree(pattern, c, 0, cpp, false, false, constraints /*None*/) {
    Ok(qt) => {
    //let qt = crate::builder::build_query_tree(q, &mut c, cpp, None);
    Ok(QueryTreePy { qt })
    --
    2.43.0