Last active
December 2, 2021 16:34
-
-
Save gweissman/5965ef96d4216142aa8176ef0cb6d63f to your computer and use it in GitHub Desktop.
Code to scrape the ICD 10 cardiac procedure codes from the AHRQ PQI v6.0 Appendix B 50+ page pdf file
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Extract the ICD-10 cardiac procedure codes from a PDF:
# Appendix B of AHRQ PQI v6.0.
library(pdftools)

# Fetch the appendix PDF. mode = 'wb' requests a binary transfer so the file
# is not altered on platforms that distinguish text from binary writes.
download.file(
  url = 'http://www.qualityindicators.ahrq.gov/Downloads/Modules/PQI/V60/TechSpecs/PQI_Appendix_B.pdf',
  destfile = 'PQI_Appendix_B.pdf',
  mode = 'wb'
)

# pdf_text() yields one character string per page.
raw_text_pages <- pdf_text('PQI_Appendix_B.pdf')

# Join the pages with a newline (rather than an empty string) so the last
# token of one page cannot fuse with the first token of the next.
raw_text <- paste(raw_text_pages, collapse = '\n')

# Split on single whitespace characters; runs of whitespace produce empty
# tokens, which are harmless because they never match the pattern below.
tokens <- unlist(strsplit(raw_text, split = "\\s"))

# Drop header tokens that would otherwise match the code pattern.
tokens_filt <- tokens[!tokens %in% c('APPENDIX', '(ACSCARP)')]

# Notice any patterns we can use to our advantage here?
# Each code is a 7-character string of capital letters and digits, followed by
# multiple whitespace on the right and preceded by a newline or whitespace.
rgx <- "[0-9A-Z]{7}"
proc_codes <- grep(rgx, tokens_filt, value = TRUE)

# Here they are.
print(proc_codes)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment