Skip to content

Instantly share code, notes, and snippets.

@GuillaumePressiat
Last active December 5, 2023 23:31
Show Gist options
  • Save GuillaumePressiat/76b58f48ef9739107d8cb37f1d38ac63 to your computer and use it in GitHub Desktop.
Save GuillaumePressiat/76b58f48ef9739107d8cb37f1d38ac63 to your computer and use it in GitHub Desktop.
Rust pola.rs (Polars) example for parsing a fixed-width (fwf) file
use polars::prelude::*;
use std::{
env
};
/// Program entry point.
///
/// Applies the Polars display settings, runs the fixed-width parser, prints
/// its `Result` with the debug formatter and finally prints the elapsed time.
pub fn main() {
    use std::time::Instant;

    configure_the_environment();

    let started = Instant::now();
    // The parse result is a temporary that is dropped at the end of this
    // statement, i.e. before the elapsed time is read below.
    // Other entry points that were tried here:
    //   stat_ghm()
    //   build_parsing_frame(b"22", b"mco", b"rsa")
    println!("{:?}", parse_pmsi_fwf2());

    println!("Elapsed: {:.2?}", started.elapsed());
}
pub fn parse_pmsi_fwf_test() -> Result<polars::prelude::DataFrame, PolarsError> {
let path = "/Users/guillaumepressiat/Documents/data/mco/290000017.2022.12.rsa";
let data_ = LazyCsvReader::new(path)
.has_header(false)
.finish()
.unwrap()
.with_columns(
vec![col("column_1").str().slice(0, Some(9)).alias("nofiness"),
col("column_1").str().slice(9, Some(3)).alias("format"),
col("column_1").str().slice(12, Some(10)).alias("cle_rsa"),
col("column_1").str().slice(22, Some(3)).alias("novrss"),
col("column_1").str().slice(25, Some(3)).alias("noseqta"),
col("column_1").str().slice(28, Some(2)).alias("gpvclass"),
col("column_1").str().slice(30, Some(2)).alias("gpcmd"),
col("column_1").str().slice(32, Some(1)).alias("gptype"),
col("column_1").str().slice(33, Some(2)).alias("gpnum"),
col("column_1").str().slice(30, Some(6)).alias("ghm")
])
.collect()
;
// println!("{:?}", data_);
// ?.drop("column_1")
return data_;
}
/// Loads the format description table from a hard-coded parquet file and
/// keeps the rows matching the requested year, field and table.
///
/// After filtering, the query:
/// - lower-cases the `nom` column,
/// - casts `longueur` to `Int32`,
/// - adds `position_0`, computed as `position - 1`,
/// - adds a row-count column named `Id`.
///
/// # Arguments
///
/// * `annee` - value compared for equality with the `an` column.
/// * `champ` - value compared for equality with the `champ` column.
/// * `table` - value compared for equality with the `table` column.
///
/// NOTE(review): the three filter values are passed to `lit` as byte slices;
/// confirm that comparing them with these columns behaves as intended.
///
/// # Errors
///
/// Returns a `PolarsError` when the parquet file cannot be opened or read, or
/// when the lazy query fails to collect. These failures are propagated
/// instead of panicking.
pub fn get_formats(annee: &[u8], champ: &[u8], table: &[u8]) -> Result<polars::prelude::DataFrame, PolarsError> {
    use std::fs::File;

    let file = File::open("/Users/guillaumepressiat/Documents/GitHub/pypmsi/pypmsi/formats/pmeasyr_formats.parquet")?;

    ParquetReader::new(file)
        .finish()?
        .lazy()
        .filter(col("champ").eq(lit(champ)))
        .filter(col("table").eq(lit(table)))
        .filter(col("an").eq(lit(annee)))
        .with_column(col("nom").str().to_lowercase())
        .with_column(col("longueur").cast(DataType::Int32))
        .with_column((col("position") - lit(1)).alias("position_0"))
        .with_row_count("Id", None)
        .collect()
}
/// Work-in-progress helper: loads the format table via `get_formats` and
/// prints the data needed to build the slicing expressions.
///
/// It currently prints one column as a vector of `i32` and another column as
/// a string array; it does not build or return any expression yet.
///
/// # Arguments
///
/// Forwarded unchanged to `get_formats`.
///
/// # Errors
///
/// Returns the `PolarsError` from `get_formats` (propagated rather than
/// turned into a panic), or the error raised when one of the inspected
/// columns does not have the expected type.
///
/// # Panics
///
/// Panics if the format table has fewer than 16 columns, because the columns
/// are accessed by position.
pub fn build_parsing_frame(annee: &[u8], champ: &[u8], table: &[u8]) -> Result<(), PolarsError> {
    let formats = get_formats(annee, champ, table)?;
    let columns: &[Series] = formats.get_columns();

    // NOTE(review): columns are addressed by position. Judging by the variable
    // names, index 2 is presumably `longueur`, index 15 `position_0` and
    // index 6 `nom` - confirm against the parquet schema.
    // The lengths are not used yet but are still extracted, so a column-type
    // mismatch is reported exactly as before.
    let _longueurs: Vec<i32> = columns[2].i32()?.into_no_null_iter().collect();
    let positions: Vec<i32> = columns[15].i32()?.into_no_null_iter().collect();

    println!("{:?}", positions);
    println!("{:?}", columns[6].utf8()?);

    // TODO: build one `col("column_1").str().slice(position, Some(longueur)).alias(nom)`
    // expression per row of the format table and return the resulting Vec<Expr>.
    Ok(())
}
/// Sets the process environment variables that control how Polars formats
/// DataFrames when they are printed.
pub fn configure_the_environment() {
    // (variable name, value) pairs, applied in order.
    let display_settings = [
        // Apply rounded corners to UTF8-styled tables.
        ("POLARS_FMT_TABLE_ROUNDED_CORNERS", "1"),
        // Maximum number of rows shown when formatting DataFrames.
        ("POLARS_FMT_MAX_ROWS", "10"),
        // Maximum number of characters printed per string value.
        ("POLARS_FMT_STR_LEN", "100"),
    ];
    // Intentionally left unset: POLARS_FMT_MAX_COLS ("20"), the maximum number
    // of columns shown when formatting DataFrames.

    for &(name, value) in display_settings.iter() {
        env::set_var(name, value);
    }
}
/// Parses a hard-coded RSA file into one column per fixed-width field.
///
/// The file is scanned without a header; every field is cut out of the column
/// named `column_1` using the matching entries of the three parallel tables
/// below (name, 0-based start offset, length). The raw `column_1` is dropped
/// from the result.
///
/// # Errors
///
/// Returns the `PolarsError` produced while opening/scanning the file, while
/// collecting the lazy query, or while dropping `column_1`. Scan failures are
/// propagated instead of panicking.
pub fn parse_pmsi_fwf2() -> Result<polars::prelude::DataFrame, PolarsError> {
    let path = "/Users/guillaumepressiat/Documents/data/mco/290000017.2022.12.rsa";

    // Output column names, one per field.
    // A minimal variant for quick experiments: ["nofiness", "cle_rsa", "ghm"].
    let noms = [
        "NOFINESS", "NOVRSA", "CLE_RSA", "NOVRSS", "NOSEQTA", "GPVCLASS", "GPCMD", "GPTYPE", "GPNUM", "GPCOMPX",
        "GPCDRETR", "RSAVCLASS", "RSACMD", "RSATYPE", "RSANUM", "RSACOMPX", "RSACDRETR", "NBRUM", "AGEAN", "AGEJR",
        "SEXE", "ECHPMSI", "PROV", "MOISSOR", "ANSOR", "SCHPMSI", "DEST", "TYPESEJ", "DUREE", "CDGEO",
        "CDPOSTAL", "POIDS", "AGEGEST", "DELAIREG", "NBSEANCE", "NOGHS", "NBJRBS", "SEJINFBI", "NBJREXB", "TOPADMNAIS",
        "TOPRADAVASTIN", "FILLER4", "UHCD", "GHSMINORE", "CONFCDSEJ", "NBAUTPGV", "NBSUPHS", "NBSUPAHS", "NBSUPCHS", "NBSUPEHS",
        "NBACTE9615", "NBSUPREAPED", "NBSUPATPART", "NB_RDTH", "VALVAORT", "GENAUTORSA", "GHSHORSINNO", "FILLER1", "NBSUPCAISSON", "TYPRESTPO",
        "NBSUPREA", "NBSUPSI", "NBSUPSTF", "NBSUPSRC", "NBSUPNN1", "NBSUPNN2", "NBSUPNN3", "NBSUPREP", "PASLITSP", "TYPMACHRADIO",
        "TYPEDOSIM", "NUMINNO", "SUPPDEFCARD", "CONVERSION_HC", "PC_RAAC", "ADMISSION_MAISON_NAISSANCE", "ELL_GRADATION", "SURVEILLANCE_PARTICULIERE", "RESERERVE_HOSP", "RESCRIT_TARIFAIRE",
        "CAT_NB_INTERVENANTS", "ADNP75", "FILLER5", "NOSEQRUM", "DP", "DR", "NDAS", "NA", "FILLER6", "ZA",
    ];

    // 0-based start offset of each field within the line.
    // Minimal variant: [0, 12, 30].
    let position = [
        0, 9, 12, 22, 25, 28, 30, 32, 33, 35,
        36, 39, 41, 43, 44, 46, 47, 50, 52, 55,
        58, 59, 60, 61, 63, 67, 68, 69, 70, 74,
        79, 84, 88, 90, 93, 95, 99, 103, 104, 107,
        108, 109, 110, 111, 112, 113, 114, 117, 120, 123,
        126, 129, 132, 135, 136, 137, 138, 142, 150, 153,
        154, 157, 160, 163, 166, 169, 172, 175, 178, 179,
        180, 181, 196, 197, 198, 199, 200, 201, 202, 203,
        204, 205, 206, 209, 211, 217, 223, 227, 232, 247,
    ];

    // Length of each field. The final entry is a very large value, presumably
    // so that the last field captures the remainder of the line.
    // Minimal variant: [9, 10, 6].
    let longueur = [
        9, 3, 10, 3, 3, 2, 2, 1, 2, 1,
        3, 2, 2, 1, 2, 1, 3, 2, 3, 3,
        1, 1, 1, 2, 4, 1, 1, 1, 4, 5,
        5, 4, 2, 3, 2, 4, 4, 1, 3, 1,
        1, 1, 1, 1, 1, 1, 3, 3, 3, 3,
        3, 3, 3, 1, 1, 1, 4, 8, 3, 1,
        3, 3, 3, 3, 3, 3, 3, 3, 1, 1,
        1, 15, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 3, 2, 6, 6, 4, 5, 15, 1000000,
    ];

    // The three tables are parallel; zipping would silently truncate to the
    // shortest one, so catch accidental misalignment in debug builds.
    debug_assert_eq!(noms.len(), position.len());
    debug_assert_eq!(noms.len(), longueur.len());

    // One slice expression per field, named after the field.
    let field_exprs: Vec<Expr> = noms
        .iter()
        .zip(position.iter())
        .zip(longueur.iter())
        .map(|((&nom, &start), &len)| col("column_1").str().slice(start, Some(len)).alias(nom))
        .collect();

    LazyCsvReader::new(path)
        .has_header(false)
        .finish()?
        .with_columns(field_exprs)
        .collect()?
        .drop("column_1")
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment