Created
July 18, 2017 11:12
-
-
Save zikani03/afd28b6c6b80432ab69c6b1a11c637f4 to your computer and use it in GitHub Desktop.
Extract SQL queries from Pentaho files with Rust
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
extern crate zip; | |
extern crate quick_xml; | |
extern crate html_entities; | |
use std::io::BufReader; | |
use std::fs::File; | |
use std::io::Read; | |
use std::ops::Deref; | |
use std::collections::BTreeMap; | |
use zip::read::ZipArchive; | |
use zip::read::ZipFile; | |
use quick_xml::reader::Reader; | |
use quick_xml::events::Event; | |
use quick_xml::events::attributes::Attribute; | |
use html_entities::decode_html_entities; | |
fn main() { | |
let arg = std::env::args().nth(1); | |
let arg_dest = std::env::args().nth(2); | |
read_pentaho_file(arg.unwrap()); | |
} | |
fn read_pentaho_file(filename: String) -> zip::result::ZipResult<()> { | |
let mut reader = BufReader::new(File::open(filename).unwrap()); | |
let mut zip = try!(ZipArchive::new(reader)); | |
// let mut files: Vec<String> = vec![]; | |
let mut data: String = String::new(); | |
for i in 0..zip.len() { | |
let mut file = zip.by_index(i).unwrap(); | |
if file.name().ends_with("sql-ds.xml") { | |
file.read_to_string (&mut data); | |
extract_sql_queries(data.clone()); | |
} | |
} | |
Ok(()) | |
} | |
/// | |
/// The SQL Queries in Pentaho report files are kept in a file named sql-ds.xml | |
/// the structure of the xml file contents is as follows: | |
/// data:sql-datasource | |
/// > data:query-definitions | |
/// > data:query name="QUERY_NAME_HERE" | |
/// > data:static-query > TEXT | |
fn extract_sql_queries(contents: String) { | |
let mut reader = Reader::from_str(contents.as_str()); | |
reader.trim_text(true); | |
let count = 0; | |
let mut buf = Vec::new(); | |
let mut queries = Vec::<String>::new(); | |
let mut query_name = String::new(); | |
let mut is_query_data = false; | |
loop { | |
match reader.read_namespaced_event(&mut buf) { | |
Ok((ref namespace_value, Event::Start(ref e))) => { | |
match e.name() { | |
// We don't care to match these other queries | |
//b"data:sql-datasource" | |
//b"data:query-definitions" | |
//b"data:query" => e.attributes("name") | |
b"data:query" => { | |
let value = e.attributes() | |
.map(|a| match a { Ok(a) => a.value, Err(_) => &[] } ) | |
.find(|val| | |
match std::str::from_utf8(val) { | |
Ok(v) => "name" == v, | |
_ => false, | |
}); | |
match value { | |
Some(bytes) => { | |
match std::str::from_utf8(bytes) { | |
Ok(v) => query_name = v.to_string(), | |
Err(e) => (), | |
} | |
}, | |
None => (), | |
} | |
is_query_data = false; | |
}, | |
// b"data:static-query > TEXT" | |
b"data:static-query" => is_query_data = true, | |
// Indicate we're processing tags that don't contain actual query content. | |
_ => is_query_data = false, | |
} | |
}, | |
Ok((ref namespace_value, Event::Text(ref e))) => { | |
if is_query_data { | |
// queries.insert(query_name.clone(), std::str::from_utf8(e.deref()).unwrap().to_string()); | |
match decode_html_entities(std::str::from_utf8(e.deref()).unwrap()) { | |
Ok(value) => { | |
queries.push(value); | |
}, | |
Err(_) => (), | |
} | |
} | |
}, | |
Ok((ref namespace_value, Event::Eof)) => break, | |
Err(e) => panic!("Error {:?}", e), | |
_ => (), | |
} | |
// from the docs: "if we don't keep a borrow elsewhere, we can clear the buffer to keep memory usage low" | |
buf.clear(); | |
} | |
for entry in &queries { | |
println!("Query sql={:?}", entry); | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment