-
-
Save markdanese/e53dcbfbb0c00f109e6bd65712d07cfa to your computer and use it in GitHub Desktop.
# this loads the 2016 NIS fixed width (asc) files into R | |
# it also saves the result as an fst file for much faster re-reading into R | |
library(data.table) | |
library(readr) | |
library(fst) | |
# load core data -------------------------------------------------------------------- | |
nis_specs <- fread("./docs/nis_specs_core.csv") | |
nis_specs[, width := end - start + 1] | |
nis_specs[, varname := tolower(varname)] | |
nis_specs[, type := ifelse(type == "int", "i", ifelse(type %in% c("byte", "double", "long"), "d", "c"))] | |
missing_values <- as.character(quote(c(-99, -88, -66, -99.9999999, -88.8888888, -66.6666666, -9, -8, -6, -5, -9999, -8888, -6666, -999999999, -888888888, -666666666,-999, -888, -666))) | |
dt <- | |
read_fwf( | |
"~/Dropbox (Outcomes Insights)/Data repository/HCUP/NIS_2016/NIS_2016_Core.ASC", | |
col_positions = fwf_widths(nis_specs$width), | |
col_types = paste0(nis_specs$type, collapse = ""), | |
trim_ws = TRUE, | |
na = missing_values | |
) | |
setDT(dt) | |
setnames(dt, names(dt), nis_specs$varname) | |
write_fst(dt, "./data/analysis/hospital.fst", compress = 75) | |
dt <- read_fst("./data/analysis/hospital.fst", as.data.table = TRUE) | |
write_fst(dt, "./data/analysis/core.fst", compress = 100) | |
# load hospital data ---------------------------------------------------------------- | |
nis_specs <- fread("./docs/nis_specs_hospital.csv") | |
nis_specs[, width := end - start + 1] | |
nis_specs[, varname := tolower(varname)] | |
nis_specs[, type := ifelse(type == "int", "i", ifelse(type %in% c("byte", "double", "long"), "d", "c"))] | |
missing_values <- as.character(quote(c(-99, -88, -66, -99.9999999, -88.8888888, -66.6666666, -9, -8, -6, -5, -9999, -8888, -6666, -999999999, -888888888, -666666666,-999, -888, -666))) | |
dt <- | |
read_fwf( | |
"~/Dropbox (Outcomes Insights)/Data repository/HCUP/NIS_2016/NIS_2016_Hospital.ASC", | |
col_positions = fwf_widths(nis_specs$width), | |
col_types = paste0(nis_specs$type, collapse = ""), | |
trim_ws = TRUE, | |
# n_max = 1000, | |
na = missing_values | |
) | |
setDT(dt) | |
setnames(dt, names(dt), nis_specs$varname) | |
write_fst(dt, "./data/analysis/hospital.fst", compress = 100) | |
# load severity data ---------------------------------------------------------------- | |
nis_specs <- fread("./docs/nis_specs_severity.csv") | |
nis_specs[, width := end - start + 1] | |
nis_specs[, varname := tolower(varname)] | |
nis_specs[, type := ifelse(type == "int", "i", ifelse(type %in% c("byte", "double", "long"), "d", "c"))] | |
missing_values <- as.character(quote(c(-99, -88, -66, -99.9999999, -88.8888888, -66.6666666, -9, -8, -6, -5, -9999, -8888, -6666, -999999999, -888888888, -666666666,-999, -888, -666))) | |
dt <- | |
read_fwf( | |
"~/Dropbox (Outcomes Insights)/Data repository/HCUP/NIS_2016/NIS_2016_Severity.ASC", | |
col_positions = fwf_widths(nis_specs$width), | |
col_types = paste0(nis_specs$type, collapse = ""), | |
trim_ws = TRUE, | |
# n_max = 1000, | |
na = missing_values | |
) | |
setDT(dt) | |
setnames(dt, names(dt), nis_specs$varname) | |
write_fst(dt, "./data/analysis/severity.fst", compress = 100) | |
# copy the information below into a text editor and save as "nis_specs_xxx.csv" where "xxx" is | |
# "core", "hospital", or "severity" (see above) | |
# the information is taken from the Stata load programs | |
# white space is stripped when read into R, but kept below for legibility | |
# core data csv | |
type, varname, start, end | |
int, AGE, 1, 3 | |
byte, AGE_NEONATE, 4, 5 | |
byte, AMONTH, 6, 7 | |
byte, AWEEKEND, 8, 9 | |
byte, DIED, 10, 11 | |
double, DISCWT, 12, 22 | |
byte, DISPUNIFORM, 23, 24 | |
byte, DQTR, 25, 26 | |
int, DRG, 27, 29 | |
byte, DRGVER, 30, 31 | |
int, DRG_NoPOA, 32, 34 | |
byte, DXVER, 35, 36 | |
byte, ELECTIVE, 37, 38 | |
byte, FEMALE, 39, 40 | |
int, HCUP_ED, 41, 43 | |
byte, HOSP_DIVISION, 44, 45 | |
long, HOSP_NIS, 46, 50 | |
str, I10_DX1, 51, 57 | |
str, I10_DX2, 58, 64 | |
str, I10_DX3, 65, 71 | |
str, I10_DX4, 72, 78 | |
str, I10_DX5, 79, 85 | |
str, I10_DX6, 86, 92 | |
str, I10_DX7, 93, 99 | |
str, I10_DX8, 100, 106 | |
str, I10_DX9, 107, 113 | |
str, I10_DX10, 114, 120 | |
str, I10_DX11, 121, 127 | |
str, I10_DX12, 128, 134 | |
str, I10_DX13, 135, 141 | |
str, I10_DX14, 142, 148 | |
str, I10_DX15, 149, 155 | |
str, I10_DX16, 156, 162 | |
str, I10_DX17, 163, 169 | |
str, I10_DX18, 170, 176 | |
str, I10_DX19, 177, 183 | |
str, I10_DX20, 184, 190 | |
str, I10_DX21, 191, 197 | |
str, I10_DX22, 198, 204 | |
str, I10_DX23, 205, 211 | |
str, I10_DX24, 212, 218 | |
str, I10_DX25, 219, 225 | |
str, I10_DX26, 226, 232 | |
str, I10_DX27, 233, 239 | |
str, I10_DX28, 240, 246 | |
str, I10_DX29, 247, 253 | |
str, I10_DX30, 254, 260 | |
str, I10_ECAUSE1, 261, 267 | |
str, I10_ECAUSE2, 268, 274 | |
str, I10_ECAUSE3, 275, 281 | |
str, I10_ECAUSE4, 282, 288 | |
byte, I10_NDX, 289, 290 | |
int, I10_NECAUSE, 291, 293 | |
byte, I10_NPR, 294, 295 | |
str, I10_PR1, 296, 302 | |
str, I10_PR2, 303, 309 | |
str, I10_PR3, 310, 316 | |
str, I10_PR4, 317, 323 | |
str, I10_PR5, 324, 330 | |
str, I10_PR6, 331, 337 | |
str, I10_PR7, 338, 344 | |
str, I10_PR8, 345, 351 | |
str, I10_PR9, 352, 358 | |
str, I10_PR10, 359, 365 | |
str, I10_PR11, 366, 372 | |
str, I10_PR12, 373, 379 | |
str, I10_PR13, 380, 386 | |
str, I10_PR14, 387, 393 | |
str, I10_PR15, 394, 400 | |
double, KEY_NIS, 401, 410 | |
long, LOS, 411, 415 | |
byte, MDC, 416, 417 | |
byte, MDC_NoPOA, 418, 419 | |
int, NIS_STRATUM, 420, 423 | |
byte, PAY1, 424, 425 | |
int, PL_NCHS, 426, 428 | |
int, PRDAY1, 429, 431 | |
int, PRDAY2, 432, 434 | |
int, PRDAY3, 435, 437 | |
int, PRDAY4, 438, 440 | |
int, PRDAY5, 441, 443 | |
int, PRDAY6, 444, 446 | |
int, PRDAY7, 447, 449 | |
int, PRDAY8, 450, 452 | |
int, PRDAY9, 453, 455 | |
int, PRDAY10, 456, 458 | |
int, PRDAY11, 459, 461 | |
int, PRDAY12, 462, 464 | |
int, PRDAY13, 465, 467 | |
int, PRDAY14, 468, 470 | |
int, PRDAY15, 471, 473 | |
byte, PRVER, 474, 475 | |
byte, RACE, 476, 477 | |
double, TOTCHG, 478, 487 | |
byte, TRAN_IN, 488, 489 | |
byte, TRAN_OUT, 490, 491 | |
int, YEAR, 492, 495 | |
byte, ZIPINC_QRTL, 496, 497 | |
# hospital csv | |
type,varname,start,end | |
double, DISCWT, 1,11 | |
byte, HOSP_BEDSIZE, 12,13 | |
byte, HOSP_DIVISION,14,15 | |
byte, HOSP_LOCTEACH,16,17 | |
long, HOSP_NIS, 18,22 | |
byte, HOSP_REGION, 23,24 | |
byte, H_CONTRL, 25,26 | |
int, NIS_STRATUM, 27,30 | |
long, N_DISC_U, 31,38 | |
int, N_HOSP_U, 39,42 | |
long, S_DISC_U, 43,50 | |
int, S_HOSP_U, 51,54 | |
long, TOTAL_DISC, 55,60 | |
int, YEAR, 61,64 | |
# severity csv | |
type,varname,start,end | |
long, HOSP_NIS, 1, 5 | |
double, KEY_NIS, 6,15 | |
int, APRDRG, 16,19 | |
byte, APRDRG_Risk_Mortality, 20,21 | |
byte, APRDRG_Severity, 22,23 |
Sorry, I don't have any recent experience with those. I typically read things in by adapting the SAS or Stata load programs, if they are available. I do it manually or with a script depending on the complexity.
How did you get your specs data in a .csv format? Did you convert it from the NIS File Specifications website? https://hcup-us.ahrq.gov/db/nation/nis/nisfilespecs.jsp#2016NIS
As i mentioned above, I simply edit the available SAS or Stata load programs using a text editor like Sublime text that allows you to edit multiple rows simultaneously.
Thanks for sharing this! I adapted your code with a function to read specs from the stata load files online. I've tested 2016-2020 and it seems to work well https://gist.github.com/stephenrho/63f114281ee124cc6eec1db2433c9be5
@stephenrho -- that sounds great! Thanks for doing that.
Do you have a load program for the state data (e.g., SID, SEDD)? Or have tips on how to create a load program for those datasets?