Last active
December 5, 2023 23:31
-
-
Save GuillaumePressiat/76b58f48ef9739107d8cb37f1d38ac63 to your computer and use it in GitHub Desktop.
Rust Polars (pola.rs) example: parsing a fixed-width (FWF) file
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
use polars::prelude::*; | |
use std::{ | |
env | |
}; | |
pub fn main(){ | |
configure_the_environment(); | |
use std::time::Instant; | |
let now = Instant::now(); | |
{ | |
let data_ = parse_pmsi_fwf2(); | |
// let data_ = stat_ghm(); | |
// let data_ = build_parsing_frame(b"22", b"mco", b"rsa"); | |
println!("{:?}", data_); | |
} | |
let elapsed = now.elapsed(); | |
println!("Elapsed: {:.2?}", elapsed); | |
} | |
pub fn parse_pmsi_fwf_test() -> Result<polars::prelude::DataFrame, PolarsError> { | |
let path = "/Users/guillaumepressiat/Documents/data/mco/290000017.2022.12.rsa"; | |
let data_ = LazyCsvReader::new(path) | |
.has_header(false) | |
.finish() | |
.unwrap() | |
.with_columns( | |
vec![col("column_1").str().slice(0, Some(9)).alias("nofiness"), | |
col("column_1").str().slice(9, Some(3)).alias("format"), | |
col("column_1").str().slice(12, Some(10)).alias("cle_rsa"), | |
col("column_1").str().slice(22, Some(3)).alias("novrss"), | |
col("column_1").str().slice(25, Some(3)).alias("noseqta"), | |
col("column_1").str().slice(28, Some(2)).alias("gpvclass"), | |
col("column_1").str().slice(30, Some(2)).alias("gpcmd"), | |
col("column_1").str().slice(32, Some(1)).alias("gptype"), | |
col("column_1").str().slice(33, Some(2)).alias("gpnum"), | |
col("column_1").str().slice(30, Some(6)).alias("ghm") | |
]) | |
.collect() | |
; | |
// println!("{:?}", data_); | |
// ?.drop("column_1") | |
return data_; | |
} | |
pub fn get_formats(annee: &[u8], champ: &[u8], table: &[u8]) -> Result<polars::prelude::DataFrame, PolarsError> { | |
use std::fs::File; | |
let r = File::open("/Users/guillaumepressiat/Documents/GitHub/pypmsi/pypmsi/formats/pmeasyr_formats.parquet").unwrap(); | |
let reader = ParquetReader::new(r); | |
let data_ = reader | |
.finish() | |
.unwrap() | |
.lazy() | |
.filter(col("champ").eq(lit(champ))) | |
.filter(col("table").eq(lit(table))) | |
.filter(col("an").eq(lit(annee))) | |
.with_column(col("nom").str().to_lowercase()) | |
.with_column(col("longueur").cast(DataType::Int32)) | |
.with_column((col("position")-lit(1)).alias("position_0")) | |
// .select(col("nom")) | |
.with_row_count("Id", None) | |
.collect() | |
; | |
return data_; | |
} | |
pub fn build_parsing_frame(annee: &[u8], champ: &[u8], table: &[u8]) -> Result<(), PolarsError> { | |
// use std::str; | |
let data_ = get_formats(annee, champ, table); | |
let data_ok = match data_ { | |
Ok(pl_df) => pl_df, | |
Err(error) => panic!("Problem avec les formats: {:?}", error), | |
}; | |
let _i = 0; | |
let _vec_expr:Vec<Expr> = vec![]; | |
// println!("{:?}", data_ok.clone()); | |
let columns: &[Series] = data_ok.get_columns(); | |
let _as_vec_longueurs: Vec<i32> = columns[2].clone().i32()?.into_no_null_iter().collect(); | |
let as_vec_positions: Vec<i32> = columns[15].clone().i32()?.into_no_null_iter().collect(); | |
// let as_vec_colonnes = data_ok.clone().select([col("nom") | |
// .apply(|s| s, GetOutput::from_type(DataType::List(Box::new(DataType::Utf8))))]).unwrap(); | |
println!("{:?}", as_vec_positions); | |
println!("{:?}", columns[6].clone().utf8()?); | |
// while i < as_vec_longueurs.len() { | |
// println!("{:?}", as_vec_longueurs[i]); | |
// vec_expr.append(&mut | |
// vec![col("column_1").str(). | |
// slice(row["position"].get(0), Some(row["longueur"].get(0))) | |
// .alias(row["noms"].get(0))]); | |
// i +=1; | |
// } | |
// println!("{:?}", vec_expr); | |
return Ok(()); | |
// return Ok(); | |
} | |
/// Sets the environment variables that control how Polars formats DataFrames
/// when they are printed.
pub fn configure_the_environment() {
    let settings = [
        // Apply rounded corners to UTF8-styled tables.
        ("POLARS_FMT_TABLE_ROUNDED_CORNERS", "1"),
        // Disabled: ("POLARS_FMT_MAX_COLS", "20") — maximum number of columns shown.
        // Maximum number of rows shown when formatting DataFrames.
        ("POLARS_FMT_MAX_ROWS", "10"),
        // Maximum number of characters printed per string value.
        ("POLARS_FMT_STR_LEN", "100"),
    ];

    for &(key, value) in &settings {
        env::set_var(key, value);
    }
}
pub fn parse_pmsi_fwf2() -> Result<polars::prelude::DataFrame, PolarsError> { | |
let path = "/Users/guillaumepressiat/Documents/data/mco/290000017.2022.12.rsa"; | |
let noms = vec!["NOFINESS", "NOVRSA", "CLE_RSA", "NOVRSS", "NOSEQTA", "GPVCLASS", "GPCMD", "GPTYPE", "GPNUM", "GPCOMPX", "GPCDRETR", "RSAVCLASS", "RSACMD", "RSATYPE", "RSANUM", "RSACOMPX", "RSACDRETR", "NBRUM", "AGEAN", "AGEJR", "SEXE", "ECHPMSI", "PROV", "MOISSOR", "ANSOR", "SCHPMSI", "DEST", "TYPESEJ", "DUREE", "CDGEO", "CDPOSTAL", "POIDS", "AGEGEST", "DELAIREG", "NBSEANCE", "NOGHS", "NBJRBS", "SEJINFBI", "NBJREXB", "TOPADMNAIS", "TOPRADAVASTIN", "FILLER4", "UHCD", "GHSMINORE", "CONFCDSEJ", "NBAUTPGV", "NBSUPHS", "NBSUPAHS", "NBSUPCHS", "NBSUPEHS", "NBACTE9615", "NBSUPREAPED", "NBSUPATPART", "NB_RDTH", "VALVAORT", "GENAUTORSA", "GHSHORSINNO", "FILLER1", "NBSUPCAISSON", "TYPRESTPO", "NBSUPREA", "NBSUPSI", "NBSUPSTF", "NBSUPSRC", "NBSUPNN1", "NBSUPNN2", "NBSUPNN3", "NBSUPREP", "PASLITSP", "TYPMACHRADIO", "TYPEDOSIM", "NUMINNO", "SUPPDEFCARD", "CONVERSION_HC", "PC_RAAC", "ADMISSION_MAISON_NAISSANCE", "ELL_GRADATION", "SURVEILLANCE_PARTICULIERE", "RESERERVE_HOSP", "RESCRIT_TARIFAIRE", "CAT_NB_INTERVENANTS", "ADNP75", "FILLER5", "NOSEQRUM", "DP", "DR", "NDAS", "NA", "FILLER6", "ZA"]; | |
// ["nofiness", "cle_rsa", "ghm"]; | |
let position = vec![0, 9, 12, 22, 25, 28, 30, 32, 33, 35, 36, 39, 41, 43, 44, 46, 47, 50, 52, 55, 58, 59, 60, 61, 63, 67, 68, 69, 70, 74, 79, 84, 88, 90, 93, 95, 99, 103, 104, 107, 108, 109, 110, 111, 112, 113, 114, 117, 120, 123, 126, 129, 132, 135, 136, 137, 138, 142, 150, 153, 154, 157, 160, 163, 166, 169, 172, 175, 178, 179, 180, 181, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 209, 211, 217, 223, 227, 232, 247]; | |
// [0, 12, 30]; | |
let longueur = vec![9, 3, 10, 3, 3, 2, 2, 1, 2, 1, 3, 2, 2, 1, 2, 1, 3, 2, 3, 3, 1, 1, 1, 2, 4, 1, 1, 1, 4, 5, 5, 4, 2, 3, 2, 4, 4, 1, 3, 1, 1, 1, 1, 1, 1, 1, 3, 3, 3, 3, 3, 3, 3, 1, 1, 1, 4, 8, 3, 1, 3, 3, 3, 3, 3, 3, 3, 3, 1, 1, 1, 15, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3, 2, 6, 6, 4, 5, 15, 1000000]; | |
// [9, 10, 6]; | |
let mut _vec_expr:Vec<Expr> = vec![]; | |
let mut _i = 0; | |
while _i < noms.len() { | |
_vec_expr.append(&mut | |
vec![col("column_1").str(). | |
slice(position[_i], Some(longueur[_i])) | |
.alias(noms[_i])]); | |
_i +=1; | |
} | |
// println!("{:?}", vec_expr); | |
let data_ = LazyCsvReader::new(path) | |
.has_header(false) | |
.finish() | |
.unwrap() | |
.with_columns( | |
_vec_expr | |
) | |
.collect()? | |
.drop("column_1") | |
; | |
return data_; | |
// return get_formats(b"22", b"mco", b"rsa"); | |
} | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment