Created
October 14, 2023 06:36
-
-
Save technocrat/9203735a3ad95cf5c49d096fa56c403a to your computer and use it in GitHub Desktop.
Script for discussion of R programming style focused on the what of each step
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# aristotle.R | |
# author: Richard Careaga | |
# problem to illustrate analytic programming | |
# Date: 2023-10-14 | |
# what we have: a text copy of an old fixed-length format tape | |
# intended to record numeric blocks of data in 20 character increments | |
# without delimiters; the objective is to divide the text into | |
# 20 character blocks, find the index of the last "0" in each block | |
# and record the block sequence number of the block together with the | |
# numeric characters preceding it back to the next "0" reading right to | |
# left. example for the first block | |
# "62034706309085592380" --> 1 8559238 | |
# strategy | |
# 1. divide the source data into 20-character | |
# strings | |
# 2. write functions to parse the strings | |
# according to the criteria for extracting | |
# digits that appear between "0" characters | |
# 3. classify strings into categories and | |
# obtain location indexes for each category | |
# 4. subsets strings by category and apply | |
# the appropriate function | |
# key step is to parse each string according to | |
# the index location of the "0" chracters | |
# contained--example "62034706309085592380" | |
# has "0" at | |
# find_zero("62034706309085592380") | |
# [1] 3 7 10 12 20 | |
# functions | |
double_zero <- function(x) x[length(x)] - x[length(x)-1] == 1 | |
find_zero <- function(x) as.numeric(gregexpr("0", x)[[1]]) | |
first_zero <- function(x) ifelse(x == 1 & length(x) == 1,TRUE,FALSE) | |
has_zero <- function(x) grep("0",x[,2]) | |
lacks_zero <- function(x) grep("0",x[,2], invert = TRUE) | |
multi_zero <- function(x) ifelse(length(x) >= 2,TRUE,FALSE) | |
parse_multi <- function(x) { | |
pull_multi = function(x) gsub(".*0(.*)0.*", "\\1", x) | |
ifelse(pull_multi(x) == "",NA,pull_multi(x)) |> | |
gsub("^0|0$","",x=_) | |
} | |
parse_single <- function(x) gsub("0.*$","",x) | |
single_zero <- function(x) ifelse(x != 1 & length(x) == 1,TRUE,FALSE) | |
# data | |
x <- paste(readLines("https://gist.githubusercontent.com/technocrat/50e92332adacc7f25b10c272f512e0f4/raw/67e1fe4db09cdafecfb7907c863987e1b0444dd1/legacy%2520tape"), collapse = '\n') | |
# checks: divisible into 80-character blocks, | |
# only digits 0-9 and show beginning | |
nchar(x) %% 20 | |
grep("[^0-9]",x) | |
paste(substr(x,1,160),"...") | |
# example: first 20 character block | |
substr(x, 1, 20) | |
# what we want | |
# y example: first row of target matrix y | |
yex <- matrix(c(1,8559238),1,2) | |
colnames(yex) <- c("id","extract") | |
yex | |
# y = f(x): f is to be composed | |
# preprocessing: divide into blocks of 20 | |
# characters, keeping track of block sequence | |
char_list <- strsplit(x, "")[[1]] | |
chunk_size <- 20 | |
num_chunks <- ceiling(length(char_list) / chunk_size) | |
chunks <- split(char_list, ceiling(seq_along(char_list) / chunk_size)) | |
x <- matrix(sapply(chunks, paste, collapse = ""),ncol = 1) | |
# add a record identifier | |
id <- row(x) | |
x <- cbind(id,x) | |
colnames(x) <- c("id","extract") | |
# list of positions of zero in each block of 20 | |
# applied to x, the data | |
lp <- lapply(x[,2],find_zero) | |
# index of rows of each type | |
the_all_zeros <- which(lapply(lp,length) == 20) | |
the_double_zeros <- which(lapply(lp,double_zero) == TRUE) | |
the_first_zeros <- which(lapply(lapply(lp,first_zero),isTRUE) == TRUE) | |
the_lacks_zeros <- which(lapply(lp,sum) == -1) | |
the_multi_zeros <- which(lapply(lp,length) >= 2) | |
the_single_zeros <- which(lapply(lp,length) == 1) | |
the_discards <- c(the_all_zeros,the_double_zeros,the_first_zeros,the_lacks_zeros) | |
# receiver object | |
y <- matrix(rep("blank",640)) | |
# there should be no values "blank" remaining after assigning | |
# values to x[index] where index is the_multi_zeros, | |
# the_single_zeros, the_discards | |
# main | |
y[the_multi_zeros] <- parse_multi(x[the_multi_zeros,2]) | |
y[the_single_zeros] <- parse_single(x[the_single_zeros,2]) | |
y[the_discards] <- NA | |
cbind(x[,2],y) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment