Created
June 23, 2022 19:22
-
-
Save tallpeak/d9e2c1c67315e62b3df02afda65cd047 to your computer and use it in GitHub Desktop.
superfile to tab-separated-values
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// fixedWidthToCSV | |
// output is short by 545 lines | |
// wc c:\Users\tallp\Downloads\SFA8 c:\temp\sfa8.txt | |
// 1167801 12474922 411065952 c:\Users\tallp\Downloads\SFA8 | |
// 1167256 40205289 449393560 c:\temp\sfa8.txt | |
// 2335057 52680211 860459512 total | |
// https://www.youtube.com/watch?v=lLWchWTUFOQ | |
// Ryan Levick | |
// oxide.computer | |
extern crate flate2; | |
use std::env; | |
use std::fs::File; | |
use std::io::{self, BufRead}; | |
use std::path::Path; | |
use std::io::LineWriter; | |
use std::io::Write; | |
// use encoding::{DecoderTrap};//,Encoding | |
// use encoding::all::ISO_8859_1; | |
// use encoding_rs::*; | |
use flate2::read::GzDecoder; | |
//use bstr::io::BufReadExt; | |
// use std::fmt::Write; | |
// the following assumes utf8 | |
// The output is wrapped in a Result to allow matching on errors | |
// Returns an Iterator to the Reader of the lines of the file. | |
// fn read_lines<P>(filename: P) -> io::Result<io::Lines<io::BufReader<File>>> | |
// where P: AsRef<Path>, { | |
// let file = File::open(filename)?; | |
// Ok(io::BufReader::with_capacity(65536,file).lines()) | |
// } | |
// This program reads the gzip-compressed fixed-width "Superfile" (SFA8.gz)
// and converts it to a tab-delimited text file (sfa8.txt) suitable for
// import into PostgreSQL with the COPY FROM statement.
fn main() -> std::io::Result<()> { | |
let sftxt_path = Path::new("c:\\temp\\sfa8.txt"); | |
// Open a file in write-only mode, returns `io::Result<File>` | |
let sftxt_file = match File::create(&sftxt_path) { | |
Err(why) => panic!("couldn't create sfa8.txt: {}", why), | |
Ok(file) => file, | |
}; | |
let mut sftxt = LineWriter::with_capacity(65536, sftxt_file); | |
let sflens: [u8; 34] = [ | |
2, 8, 6, 5, 7, 1,30,30,30,30, | |
13, 5, 4,30,10, 9, 9, 4, 4, 8, | |
8, 8, 8, 8, 5, 3, 3, 5,12, 9, | |
11, 3,17, 5]; | |
let homedir = dirs::home_dir().unwrap().to_str().unwrap().to_string(); | |
let filename = homedir.clone() + "\\Downloads\\SFA8.gz"; | |
let file = File::open(filename)?; | |
let gz = GzDecoder::new(file);//.expect("couldn't decode gzip stream"); | |
let mut rdr = io::BufReader::with_capacity(65536,gz); | |
let mut buf:Vec<u8> = Vec::with_capacity(500); | |
//IBM437 does not exist in encoding_rs | |
// let enc = WINDOWS_1252 | |
let enc = encoding_rs::Encoding::for_label(b"l1").expect("failed to find encoding"); //ISO_8859_1 | |
println!("using encoding: {}", enc.name()); | |
//if let Ok(lines) = read_lines("C:\\Users\\tallp\\Downloads\\SFA8") { | |
while let Ok(bytesread) = rdr.read_until(0x0A as u8, &mut buf) { | |
if bytesread == 0 { | |
break; | |
} | |
let mut s = String::with_capacity(400); | |
let mut p : usize = 0; | |
for l in sflens { | |
//let t: String = ln.drain(..l as usize).collect(); | |
//s.push_str(ln.substring(p,l)) | |
let slice = &buf[p..(p + l as usize)]; | |
//let utf = enc.decode(slice, DecoderTrap::Replace).unwrap(); | |
let (cow, _encoding_used, _had_errors) = enc.decode(slice); | |
s.push_str(cow.trim()); | |
s.push_str("\t"); | |
p += l as usize; | |
} | |
s.replace_range(s.len()-1..,"\n"); | |
match sftxt.write_all(s.as_bytes()) { | |
Err(why) => panic!("couldn't write to sfa8.txt: {}", why), | |
Ok(_) => () | |
} | |
buf.clear(); | |
} | |
sftxt.flush()?; | |
Ok(()) | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment