Created
July 18, 2024 18:16
-
-
Save arizvisa/ba180b27fa21a3a65ab12f534be19d27 to your computer and use it in GitHub Desktop.
Patch to weggli-rs/weggli@dc73a25 to add support for spans and bounds to the Python module.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
From b322ecac9992c8bd8c3dfeaf2579a0b2b4b81336 Mon Sep 17 00:00:00 2001 | |
From: Ali Rizvi-Santiago <[email protected]> | |
Date: Sat, 28 May 2022 11:49:00 -0500 | |
Subject: [PATCH 1/2] saving my game prior to modifying the tree-sitter-c | |
grammar | |
100.0% src/ | |
diff --git a/src/builder.rs b/src/builder.rs | |
index 13908b0..f600032 100644 | |
--- a/src/builder.rs | |
+++ b/src/builder.rs | |
@@ -125,6 +125,129 @@ fn _build_query_tree( | |
debug!("tree_sitter query {}: {}", id, sexp); | |
+ match crate::ts_query(&sexp, is_cpp) { | |
+ Ok(qr) => { | |
+ return QueryTree::new(qr, b.captures, variables, b.negations, id); | |
+ } | |
+ Err(e) => { | |
+ eprintln!( | |
+ "Tree sitter query generation failed: {:?}\n {}", | |
+ e.kind, e.message | |
+ ); | |
+ eprintln!("sexpr: {}", sexp); | |
+ eprintln!("This is a bug! Can't recover :/"); | |
+ std::process::exit(1); | |
+ } | |
+ } | |
+ /* | |
+ QueryTree::new( | |
+ crate::ts_query(&sexp, is_cpp), | |
+ b.captures, | |
+ variables, | |
+ b.negations, | |
+ id, | |
+ ) | |
+ */ | |
+} | |
+ | |
+pub fn check_query_tree ( | |
+ source: &str, | |
+ c: &mut TreeCursor, | |
+ id: usize, | |
+ is_cpp: bool, | |
+ is_multi_pattern: bool, | |
+ strict_mode: bool, | |
+ regex_constraints: Option<RegexMap>, | |
+) -> Result<QueryTree, tree_sitter::QueryError> { | |
+ let mut b = QueryBuilder { | |
+ query_source: source.to_string(), | |
+ captures: Vec::new(), | |
+ negations: Vec::new(), | |
+ id, | |
+ cpp: is_cpp, | |
+ regex_constraints: match regex_constraints { | |
+ Some(r) => r, | |
+ None => RegexMap::new(HashMap::new()), | |
+ }, | |
+ }; | |
+ | |
+ // Skip the root node if it's a translation_unit. | |
+ if c.node().kind() == "translation_unit" { | |
+ debug!("query cursor specifies translation_unit"); | |
+ c.goto_first_child(); | |
+ } | |
+ | |
+ let mut variables = HashSet::new(); | |
+ | |
+ let sexp = if !is_multi_pattern { | |
+ // We want to wrap queries into a function_definition so we can easily | |
+ // extract the function that contains a match. Of course we should not do that | |
+ // if the user specifies a function_definition as part of the query. | |
+ let needs_anchor = c.node().kind() == "compound_statement" && id == 0; | |
+ debug!("query needs anchor: {}", needs_anchor); | |
+ | |
+ // The main work happens here. Iterate through the AST and create a tree-sitter query | |
+ let mut s = b.build(c, 0, strict_mode); | |
+ | |
+ // Make sure user supplied function headers are displayed by adding a Capture | |
+ if !needs_anchor { | |
+ s += "@"; | |
+ s += &add_capture(&mut b.captures, Capture::Display); | |
+ } | |
+ | |
+ // Iterate through all captures, add their constraints to the query and extract used variables | |
+ s += &process_captures(&b.captures, 0, &mut variables); | |
+ | |
+ // Optionally anchor query with a function_definition | |
+ if needs_anchor { | |
+ let capture = Capture::Display; | |
+ format!( | |
+ "(function_definition body: {}) @{}", | |
+ s, | |
+ &add_capture(&mut b.captures, capture) | |
+ ) | |
+ } else { | |
+ "(".to_string() + &s + ")" | |
+ } | |
+ } else { | |
+ // When building a QueryTree for a compound statement, we create a tree-sitter | |
+ // query with multiple root patterns for efficient searching. | |
+ // This code is only executed when creating sub queries so we can skip | |
+ // the whole anchoring logic needed for the single pattern case. | |
+ | |
+ assert!(c.goto_first_child()); | |
+ assert!(c.goto_next_sibling()); | |
+ | |
+ let mut s = String::new(); | |
+ loop { | |
+ let child = c.node(); | |
+ if !c.goto_next_sibling() { | |
+ break; | |
+ } | |
+ | |
+ let before = b.captures.len(); | |
+ let mut cursor = child.walk(); | |
+ | |
+ let child_sexp = b.build(&mut cursor, 0, strict_mode); | |
+ | |
+ let captures = &process_captures(&b.captures, before, &mut variables); | |
+ | |
+ if !child_sexp.is_empty() { | |
+ s += &format!("({} {})", child_sexp, captures); | |
+ } | |
+ } | |
+ s | |
+ }; | |
+ | |
+ debug!("tree_sitter query {}: {}", id, sexp); | |
+ | |
+ match crate::ts_query(&sexp, is_cpp) { | |
+ Ok(qr) => { | |
+ Ok(QueryTree::new(qr, b.captures, variables, b.negations, id)) | |
+ } | |
+ Err(e) => Err(e) | |
+ } | |
+ /* | |
QueryTree::new( | |
crate::ts_query(&sexp, is_cpp), | |
b.captures, | |
@@ -132,6 +255,7 @@ fn _build_query_tree( | |
b.negations, | |
id, | |
) | |
+ */ | |
} | |
/// Iterates through `captures` starting at `offset` and returns the necessary query predicates as a string. | |
diff --git a/src/lib.rs b/src/lib.rs | |
index b58d861..997516a 100644 | |
--- a/src/lib.rs | |
+++ b/src/lib.rs | |
@@ -56,13 +56,16 @@ pub fn parse(source: &str, cpp: bool) -> Tree { | |
} | |
// Internal helper function to create a new tree-sitter query. | |
-fn ts_query(sexpr: &str, cpp: bool) -> tree_sitter::Query { | |
+fn ts_query(sexpr: &str, cpp: bool) -> Result<tree_sitter::Query, tree_sitter::QueryError> { | |
let language = if !cpp { | |
unsafe { tree_sitter_c() } | |
} else { | |
unsafe { tree_sitter_cpp() } | |
}; | |
+ return Query::new(language, sexpr); | |
+ | |
+ /* | |
match Query::new(language, sexpr) { | |
Ok(q) => q, | |
Err(e) => { | |
@@ -75,6 +78,7 @@ fn ts_query(sexpr: &str, cpp: bool) -> tree_sitter::Query { | |
std::process::exit(1); | |
} | |
} | |
+ */ | |
} | |
/// Map from variable names to a positive/negative regex constraint | |
diff --git a/src/python.rs b/src/python.rs | |
index d841c8f..ccaf0df 100644 | |
--- a/src/python.rs | |
+++ b/src/python.rs | |
@@ -15,10 +15,16 @@ | |
*/ | |
use pyo3::prelude::*; | |
+use pyo3::types::*; | |
use pyo3::wrap_pyfunction; | |
+use pyo3::exceptions::PyRuntimeError; | |
use crate::query::QueryTree; | |
use crate::result::QueryResult; | |
+use crate::RegexMap; | |
+ | |
+use regex::Regex; | |
+use std::collections::HashMap; | |
#[pyclass] | |
struct QueryTreePy { | |
@@ -30,14 +36,223 @@ struct QueryResultPy { | |
qr: QueryResult, | |
} | |
-#[pyfunction(cpp = "false")] | |
-#[text_signature = "(query, cpp)"] | |
-fn parse_query(q: &str, cpp: bool) -> PyResult<QueryTreePy> { | |
- let tree = crate::parse(q, cpp); | |
- let mut c = tree.walk(); | |
+const VALID_NODE_KINDS: &[&str] = &[ | |
+ "compound_statement", | |
+ "function_definition", | |
+ "struct_specifier", | |
+ "enum_specifier", | |
+ "union_specifier", | |
+ "class_specifier", | |
+ | |
+ /* because we're searching for both declarations and expressions */ | |
+ "expression_statement", | |
+ "declaration", | |
+]; | |
+#[pyfunction(cpp = "false", force_query = "false")] | |
+#[text_signature = "(query, cpp, force_query, regexes)"] | |
+fn parse_query(pattern: &str, cpp: bool, force_query: bool, regexes: Option<HashMap::<&str, &str>>) -> PyResult<QueryTreePy> { | |
+ | |
+ let mut tree = crate::parse(pattern, cpp); | |
+ //let mut cur = tree.walk(); | |
+ let mut p = pattern; | |
+ | |
+ /* | |
+ let temp_pattern; | |
+ | |
+ // we need to filter the cursor using validate_query | |
+ if tree.root_node().has_error() { | |
+ if !pattern.ends_with(';') { | |
+ temp_pattern = format!("{};", &p); | |
+ let fixed_tree = crate::parse(&temp_pattern, cpp); | |
+ if !fixed_tree.root_node().has_error() { | |
+ info!("normalizing query: add missing ;"); | |
+ tree = fixed_tree; | |
+ p = &temp_pattern; | |
+ } | |
+ } | |
+ } | |
+ | |
+ // Try to do query normalization to support missing { } | |
+ // 'memcpy(_);' -> {memcpy(_);} | |
+ let temp_pattern2; | |
+ if !tree.root_node().has_error() { | |
+ let c = tree.root_node().child(0); | |
+ if let Some(n) = c { | |
+ if !VALID_NODE_KINDS.contains(&n.kind()) { | |
+ temp_pattern2 = format!("{{{}}}", &p); | |
+ let fixed_tree = crate::parse(&temp_pattern2, cpp); | |
+ if !fixed_tree.root_node().has_error() { | |
+ info!("normalizing query: add {}", "{}"); | |
+ tree = fixed_tree; | |
+ p = &temp_pattern2; | |
+ } | |
+ } | |
+ } | |
+ } | |
+ */ | |
+ | |
+ //let mut cur = tree.walk(); | |
+ //let mut cur = validate_query(&tree, p, force_query)?; | |
+ let mut cur = validate_query(&tree, p, force_query); | |
+ | |
+ let constraints = match regexes { | |
+ Some(regexes) => { | |
+ match process_regexes(regexes) { | |
+ Ok(regexconstraints) => { | |
+ Some(regexconstraints) | |
+ } | |
+ Err(err) => { | |
+ return Err(PyRuntimeError::new_err(err)) | |
+ } | |
+ } | |
+ } | |
+ None => None | |
+ }; | |
+ | |
+ match &mut cur { | |
+ Err(e) => { | |
+ Err(PyRuntimeError::new_err(format!("Tree sitter query validation failed: {}", e))) | |
+ } | |
+ Ok(cursor) => { | |
+ | |
+ // guard build_query_tree for python so that we can avoid an exit | |
+ match crate::builder::check_query_tree(p, cursor, 0, cpp, false, false, constraints /*None*/) { | |
+ Ok(qt) => { | |
+ //let qt = crate::builder::build_query_tree(q, &mut c, cpp, None); | |
+ Ok(QueryTreePy { qt }) | |
+ } | |
+ Err(e) => { | |
+ Err(PyRuntimeError::new_err(format!("Tree sitter query generation failed: {:?}\n {}", | |
+ e.kind, e.message | |
+ ))) | |
+ } | |
+ } | |
+ } | |
+ } | |
+ /* | |
let qt = crate::builder::build_query_tree(q, &mut c, cpp, None); | |
Ok(QueryTreePy { qt }) | |
+ */ | |
+} | |
+ | |
+enum RegexError { | |
+ InvalidArg(String), | |
+ InvalidRegex(regex::Error), | |
+} | |
+ | |
+impl From<regex::Error> for RegexError { | |
+ fn from(err: regex::Error) -> RegexError { | |
+ RegexError::InvalidRegex(err) | |
+ } | |
+} | |
+ | |
+fn process_regexes(regexes: HashMap::<&str, &str>) -> Result<RegexMap, String> { | |
+ let mut result = HashMap::new(); | |
+ let mut error = None; | |
+ | |
+ for (var, raw_regex) in regexes { | |
+ //let mut s = r.splitn(2, '='); | |
+ //let var = s.next().ok_or_else(|| RegexError::InvalidArg(r.clone()))?; | |
+ //let raw_regex = s.next().ok_or_else(|| RegexError::InvalidArg(r.clone()))?; | |
+ | |
+ let mut normalized_var = if var.starts_with('$') { | |
+ var.to_string() | |
+ } else { | |
+ "$".to_string() + var | |
+ }; | |
+ let negative = normalized_var.ends_with('!'); | |
+ | |
+ if negative { | |
+ normalized_var.pop(); // remove ! | |
+ } | |
+ | |
+ //let regex = Regex::new(raw_regex); | |
+ match Regex::new(raw_regex) { | |
+ Ok(regex) => { | |
+ error = None; | |
+ result.insert(normalized_var, (negative, regex)); | |
+ } | |
+ Err(msg) => { | |
+ error = Some(msg); | |
+ break | |
+ } | |
+ } | |
+ } | |
+ | |
+ match error { | |
+ Some(regex) => { | |
+ Err(format!("Regex error {}", regex)) | |
+ } | |
+ None => { | |
+ Ok(RegexMap::new(result)) | |
+ } | |
+ } | |
+} | |
+ | |
+fn validate_query<'a>( | |
+ tree: &'a tree_sitter::Tree, | |
+ query: &str, | |
+ force: bool, | |
+) -> Result<tree_sitter::TreeCursor<'a>, String> { | |
+ if tree.root_node().has_error() && !force { | |
+ let mut errmsg = format!("{}", "Error! Query parsing failed:"); | |
+ let mut cursor = tree.root_node().walk(); | |
+ | |
+ let mut first_error = None; | |
+ loop { | |
+ let node = cursor.node(); | |
+ if node.has_error() { | |
+ if node.is_error() || node.is_missing() { | |
+ first_error = Some(node); | |
+ break; | |
+ } else if !cursor.goto_first_child() { | |
+ break; | |
+ } | |
+ } else if !cursor.goto_next_sibling() { | |
+ break; | |
+ } | |
+ } | |
+ | |
+ if let Some(node) = first_error { | |
+ errmsg.push_str(&format!(" {}", &query[0..node.start_byte()])); | |
+ if node.is_missing() { | |
+ errmsg.push_str(&format!( | |
+ "{}{}{}", | |
+ " [MISSING ", node.kind(), " ] " | |
+ )); | |
+ } | |
+ errmsg.push_str(&format!( | |
+ "{}{}", | |
+ &query[node.start_byte()..node.end_byte()], | |
+ &query[node.end_byte()..] | |
+ )); | |
+ } | |
+ | |
+ return Err(errmsg); | |
+ } | |
+ | |
+ info!("query sexp: {}", tree.root_node().to_sexp()); | |
+ | |
+ let mut c = tree.walk(); | |
+ | |
+ if c.node().named_child_count() > 1 { | |
+ return Err(format!( | |
+ "{}'{}' query contains multiple root nodes", | |
+ "Error: ", query | |
+ )); | |
+ } | |
+ | |
+ c.goto_first_child(); | |
+ | |
+ if !VALID_NODE_KINDS.contains(&c.node().kind()) { | |
+ return Err(format!( | |
+ "{}'{}' ({}) is not a supported query root node.", | |
+ "Error: ", query, c.node().kind() | |
+ )); | |
+ } | |
+ | |
+ Ok(c) | |
} | |
#[pyfunction] | |
@@ -58,17 +273,59 @@ fn matches(p: &QueryTreePy, source: &str, cpp: bool) -> PyResult<Vec<QueryResult | |
Ok(r) | |
} | |
-#[pyfunction(color = "None")] | |
-#[text_signature = "(q, source, color)"] | |
-fn display(p: &QueryResultPy, source: &str, color: Option<bool>) -> PyResult<String> { | |
+#[pyfunction(color = "None", before = 10, after = 10)] | |
+#[text_signature = "(q, source, color, before, after)"] | |
+fn display(p: &QueryResultPy, source: &str, color: Option<bool>, before: usize, after: usize) -> PyResult<String> { | |
if let Some(color_override) = color { | |
colored::control::set_override(color_override); | |
} | |
- let r = p.qr.display(source, 10, 10); | |
+ let r = p.qr.display(source, before, after); | |
colored::control::unset_override(); | |
Ok(r) | |
} | |
+#[pyfunction] | |
+#[text_signature = "(q)"] | |
+fn bounds(p: &QueryResultPy) -> (usize, usize) { | |
+ return (p.qr.function.start, p.qr.function.end); | |
+} | |
+ | |
+#[pyfunction] | |
+#[text_signature = "(q)"] | |
+fn spans(p: &QueryResultPy) -> PyResult<Vec<(String, usize, usize)>> { | |
+ let mut sorted = p.qr.captures.clone(); | |
+ sorted.sort_by(|a, b| a.range.start.cmp(&b.range.start)); | |
+ | |
+ let mut varlookup: HashMap::<usize, String> = HashMap::with_capacity(p.qr.vars.len()); | |
+ for (index, name) in p.qr.vars.clone().into_iter().map(|(name, index)| (index, name)) { | |
+ varlookup.insert(index, name); | |
+ } | |
+ | |
+ let mut parallel_vartable: Vec<String> = Vec::with_capacity(p.qr.captures.len()); | |
+ let mut clean_ranges: Vec<std::ops::Range<usize>> = Vec::with_capacity(p.qr.captures.len()); | |
+ //for r in sorted.into_iter().skip(1).map(|c| c.range) { | |
+ for r in sorted.into_iter().skip(1) { | |
+ if !clean_ranges.is_empty() && clean_ranges.last().unwrap().contains(&r.range.start) { | |
+ continue; | |
+ } | |
+ clean_ranges.push(r.range.clone()); | |
+ let idx: usize = (r.capture_idx as usize) + 1; /* this +1 is probably broken */ | |
+ match varlookup.get(&idx) { | |
+ Some(s) => parallel_vartable.push(s.to_string()), | |
+ None => parallel_vartable.push("".to_string()), | |
+ } | |
+ } | |
+ | |
+ let mut result: Vec<(String, usize, usize)> = Vec::with_capacity(clean_ranges.len()); | |
+ for (i, r) in clean_ranges.iter().enumerate() { | |
+ let s = ¶llel_vartable[i]; | |
+ result.push((s.to_string(), r.start, r.end)) | |
+ } | |
+ Ok(result.into_iter().collect()) | |
+ | |
+ //Ok(clean_ranges.into_iter().map(|r| (r.start, r.end)).collect()) | |
+} | |
+ | |
#[pymodule] | |
fn weggli(_py: Python, m: &PyModule) -> PyResult<()> { | |
m.add_class::<QueryTreePy>()?; | |
@@ -76,6 +333,8 @@ fn weggli(_py: Python, m: &PyModule) -> PyResult<()> { | |
m.add_function(wrap_pyfunction!(identifiers, m)?)?; | |
m.add_function(wrap_pyfunction!(matches, m)?)?; | |
m.add_function(wrap_pyfunction!(display, m)?)?; | |
+ m.add_function(wrap_pyfunction!(bounds, m)?)?; | |
+ m.add_function(wrap_pyfunction!(spans, m)?)?; | |
Ok(()) | |
} | |
diff --git a/src/result.rs b/src/result.rs | |
index 1d32094..83183b4 100644 | |
--- a/src/result.rs | |
+++ b/src/result.rs | |
@@ -31,7 +31,8 @@ pub struct QueryResult { | |
pub vars: FxHashMap<String, usize>, | |
// Range of the outermost node. This is badly named as it does not have to be a | |
// function definition, but for final query results it normally is. | |
- function: std::ops::Range<usize>, | |
+ pub function: std::ops::Range<usize>, | |
+ //function: std::ops::Range<usize>, | |
} | |
/// Stores the result (== source range) for a single capture. | |
-- | |
2.43.0 | |
From dc73a25ec28ca42d65517042988fc95076d851e7 Mon Sep 17 00:00:00 2001 | |
From: Ali Rizvi-Santiago <[email protected]> | |
Date: Fri, 26 Aug 2022 10:53:12 -0500 | |
Subject: [PATCH 2/2] looks like i modified parse_query or something | |
100.0% src/ | |
diff --git a/src/python.rs b/src/python.rs | |
index ccaf0df..a53d1c9 100644 | |
--- a/src/python.rs | |
+++ b/src/python.rs | |
@@ -51,50 +51,10 @@ const VALID_NODE_KINDS: &[&str] = &[ | |
#[pyfunction(cpp = "false", force_query = "false")] | |
#[text_signature = "(query, cpp, force_query, regexes)"] | |
-fn parse_query(pattern: &str, cpp: bool, force_query: bool, regexes: Option<HashMap::<&str, &str>>) -> PyResult<QueryTreePy> { | |
- | |
- let mut tree = crate::parse(pattern, cpp); | |
- //let mut cur = tree.walk(); | |
- let mut p = pattern; | |
- | |
- /* | |
- let temp_pattern; | |
- | |
- // we need to filter the cursor using validate_query | |
- if tree.root_node().has_error() { | |
- if !pattern.ends_with(';') { | |
- temp_pattern = format!("{};", &p); | |
- let fixed_tree = crate::parse(&temp_pattern, cpp); | |
- if !fixed_tree.root_node().has_error() { | |
- info!("normalizing query: add missing ;"); | |
- tree = fixed_tree; | |
- p = &temp_pattern; | |
- } | |
- } | |
- } | |
- | |
- // Try to do query normalization to support missing { } | |
- // 'memcpy(_);' -> {memcpy(_);} | |
- let temp_pattern2; | |
- if !tree.root_node().has_error() { | |
- let c = tree.root_node().child(0); | |
- if let Some(n) = c { | |
- if !VALID_NODE_KINDS.contains(&n.kind()) { | |
- temp_pattern2 = format!("{{{}}}", &p); | |
- let fixed_tree = crate::parse(&temp_pattern2, cpp); | |
- if !fixed_tree.root_node().has_error() { | |
- info!("normalizing query: add {}", "{}"); | |
- tree = fixed_tree; | |
- p = &temp_pattern2; | |
- } | |
- } | |
- } | |
- } | |
- */ | |
- | |
- //let mut cur = tree.walk(); | |
- //let mut cur = validate_query(&tree, p, force_query)?; | |
- let mut cur = validate_query(&tree, p, force_query); | |
+fn parse_query(pattern: &str, cpp: bool, force_query: bool, regexes: Option<HashMap::<&str, &str>>) -> PyResult<QueryTreePy> | |
+{ | |
+ let tree = crate::parse(pattern, cpp); | |
+ let mut cursor = validate_query(&tree, pattern, force_query); | |
let constraints = match regexes { | |
Some(regexes) => { | |
@@ -110,14 +70,14 @@ fn parse_query(pattern: &str, cpp: bool, force_query: bool, regexes: Option<Hash | |
None => None | |
}; | |
- match &mut cur { | |
+ match &mut cursor { | |
Err(e) => { | |
Err(PyRuntimeError::new_err(format!("Tree sitter query validation failed: {}", e))) | |
} | |
- Ok(cursor) => { | |
+ Ok(c) => { | |
// guard build_query_tree for python so that we can avoid an exit | |
- match crate::builder::check_query_tree(p, cursor, 0, cpp, false, false, constraints /*None*/) { | |
+ match crate::builder::check_query_tree(pattern, c, 0, cpp, false, false, constraints /*None*/) { | |
Ok(qt) => { | |
//let qt = crate::builder::build_query_tree(q, &mut c, cpp, None); | |
Ok(QueryTreePy { qt }) | |
-- | |
2.43.0 |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment