Skip to content

Instantly share code, notes, and snippets.

@cholcombe973
Created August 22, 2016 16:03
Show Gist options
  • Save cholcombe973/000d6e059b75b4a05349a1b691169c98 to your computer and use it in GitHub Desktop.
Save cholcombe973/000d6e059b75b4a05349a1b691169c98 to your computer and use it in GitHub Desktop.
//! This program has been created to handle an incompatibility between how mysqldump escapes
//! some characters and how hive interprets those escaped chars. It does the following:
//! If you see a 0x5c30 in the input sequence:
//! a. if it is preceded by zero or an even number of 0x5c bytes, translate this 0x5c30 to 0x00;
//! b. if it is preceded by an odd number of 0x5c bytes, leave it unchanged.
//! Some sample transforms:
//! 0x5c30 => 0x00
//! 0x5c5c30 => 0x5c5c30
//! 0x5c5c5c30 => 0x5c5c00
//! 0x5c5c5c5c30 => 0x5c5c5c5c30
//! 0x5c5c5c3030 => 0x5c5c0030
//! 0x5c5c5c5c3030 => 0x5c5c5c5c3030
//! 0x5c5c5c40 => 0x5c5c5c40
//! 0x5c5c5c5c40 => 0x5c5c5c5c40
//! Here is another way to test:
//! - Create table with blob content: create table MyTest (id integer, value1 varchar(20),
//! content blob, value2 double, primary key(id));
//! - Insert into blob content: insert into MyTest (id, value1, content, value2)
//! values (1, "data1", 0x3020090d0a2227005c30, 2.2);
//! - checking content: select hex(content) from MyTest;
//! - chmod a+rw /tmp/dump
//! - mysqldump -u root --tab=/tmp/dump --single-transaction --create-options test
//! - see content: hexdump /tmp/dump/MyTest.txt
//! hexdump of original dump file:
//! 0000000 31 09 64 61 74 61 31 09 30 20 5c 09 0d 5c 0a 22
//! 0000010 27 5c 30 5c 5c 30 09 32 2e 32 0a
//! 000001b
//! hexdump after passing through this program:
//! 0000000 31 09 64 61 74 61 31 09 30 20 5c 09 0d 5c 0a 22
//! 0000010 27 00 5c 5c 30 09 32 2e 32 0a
//! 000001a
//! Author : vamsi Nov 2015
#[macro_use]
extern crate log;
extern crate simplelog;
use simplelog::SimpleLogger;
use simplelog::LogLevelFilter;
use std::io::{BufReader, BufWriter};
use std::io;
use std::io::ErrorKind;
use std::io::prelude::*;
/// Checks `process_input` against the sample transforms listed in the module documentation.
#[test]
fn test_transform() {
    /// Feeds `input` through `process_input` and asserts the bytes written equal `expected`.
    fn check(input: &[u8], expected: &[u8]) {
        let mut reader = std::io::Cursor::new(input);
        let mut output: Vec<u8> = Vec::new();
        // Surface I/O failures instead of silently discarding the returned `Result`.
        process_input(&mut reader, &mut output).expect("process_input returned an error");
        assert_eq!(&output[..], expected, "unexpected output for input {:?}", input);
    }

    // 0x5c30 => 0x00
    check(&[0x5c, 0x30], &[0x00]);
    // 0x5c5c30 => 0x5c5c30
    check(&[0x5c, 0x5c, 0x30], &[0x5c, 0x5c, 0x30]);
    // 0x5c5c5c30 => 0x5c5c00
    check(&[0x5c, 0x5c, 0x5c, 0x30], &[0x5c, 0x5c, 0x00]);
    // 0x5c5c5c5c30 => 0x5c5c5c5c30
    check(&[0x5c, 0x5c, 0x5c, 0x5c, 0x30],
          &[0x5c, 0x5c, 0x5c, 0x5c, 0x30]);
    // 0x5c5c5c3030 => 0x5c5c0030
    check(&[0x5c, 0x5c, 0x5c, 0x30, 0x30], &[0x5c, 0x5c, 0x00, 0x30]);
    // 0x5c5c5c5c3030 => 0x5c5c5c5c3030
    check(&[0x5c, 0x5c, 0x5c, 0x5c, 0x30, 0x30],
          &[0x5c, 0x5c, 0x5c, 0x5c, 0x30, 0x30]);
    // 0x5c5c5c40 => 0x5c5c5c40
    check(&[0x5c, 0x5c, 0x5c, 0x40], &[0x5c, 0x5c, 0x5c, 0x40]);
    // 0x5c5c5c5c40 => 0x5c5c5c5c40
    check(&[0x5c, 0x5c, 0x5c, 0x5c, 0x40],
          &[0x5c, 0x5c, 0x5c, 0x5c, 0x40]);
}
/// Writes exactly one byte to `writer`.
///
/// Uses `write_all` rather than `write`: a bare `write` is allowed to accept zero bytes or to
/// fail with `ErrorKind::Interrupted`, and either case would otherwise lose the byte.
///
/// Returns the number of bytes written, which is always `1` on success.
///
/// # Errors
///
/// Returns the underlying I/O error if the byte could not be written, including
/// `ErrorKind::WriteZero` when the writer refuses to accept any data.
fn write_byte<W>(byte: u8, writer: &mut W) -> Result<usize, std::io::Error>
    where W: Write
{
    writer.write_all(&[byte])?;
    Ok(1)
}
/// Copies `reader` to `writer`, rewriting mysqldump's `\0` escape (0x5c 0x30) to a NUL byte.
///
/// A backslash (0x5c) escapes the byte that follows it, so in a run of backslashes every
/// pair is an escaped backslash and is copied through untouched. Only when an unpaired
/// backslash is immediately followed by 0x30 are the two bytes replaced by a single 0x00.
/// Every other byte, including a backslash left over at end of input, is copied verbatim.
///
/// The escape state is a single flag that lives across chunk boundaries, so the result does
/// not depend on how the reader happens to split the input.
///
/// # Errors
///
/// Returns the first I/O error reported by `reader` or `writer`, including a failure to
/// flush `writer` at the end. Reads that fail with `ErrorKind::Interrupted` are retried.
fn process_input<R, W>(mut reader: R, mut writer: W) -> Result<(), std::io::Error>
    where R: BufRead,
          W: Write
{
    const BACKSLASH: u8 = 0x5c;
    const ASCII_ZERO: u8 = 0x30;

    // True while an unpaired backslash has been read but not yet written, because what to
    // emit for it depends on the next byte.
    let mut pending_backslash = false;
    // Scratch buffer reused for every chunk so output goes out in large writes.
    let mut translated: Vec<u8> = Vec::new();

    loop {
        let consumed = {
            let chunk = match reader.fill_buf() {
                Ok(chunk) => chunk,
                Err(ref e) if e.kind() == ErrorKind::Interrupted => continue,
                Err(e) => return Err(e),
            };
            if chunk.is_empty() {
                // End of input.
                break;
            }

            translated.clear();
            // At most one extra byte: a backslash carried over from the previous chunk.
            translated.reserve(chunk.len() + 1);

            for &byte in chunk {
                if pending_backslash {
                    pending_backslash = false;
                    match byte {
                        // Unescaped `\0`: collapse the pair into a NUL byte.
                        ASCII_ZERO => translated.push(0x00),
                        // Any other escaped byte (including a second backslash) is kept
                        // together with the backslash that was held back.
                        other => {
                            translated.push(BACKSLASH);
                            translated.push(other);
                        }
                    }
                } else if byte == BACKSLASH {
                    pending_backslash = true;
                } else {
                    translated.push(byte);
                }
            }
            chunk.len()
        };

        writer.write_all(&translated)?;
        reader.consume(consumed);
    }

    // A backslash with nothing after it must not be lost.
    if pending_backslash {
        writer.write_all(&[BACKSLASH])?;
    }

    // Flush explicitly: a buffered writer that flushes on drop cannot report the error.
    writer.flush()?;
    Ok(())
}
fn main() {
let _ = SimpleLogger::init(LogLevelFilter::Trace);
// let stdin = io::stdin();
// let mut stdin = stdin.lock();
let mut stdin = BufReader::with_capacity(256 * 1024, io::stdin());
// BufWriter with 128K capacity. Try to make our writes large for efficient
// downstream consumption
let mut writer = BufWriter::with_capacity(256 * 1024, io::stdout());
match process_input(&mut stdin, &mut writer) {
Ok(_) => {}
Err(e) => {
error!("Failed with error: {}", e);
}
};
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment