Created
February 12, 2021 07:59
-
-
Save thedivtagguy/74de5ae9d853828530f3414e6b52df95 to your computer and use it in GitHub Desktop.
Library Parser
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
library(shiny)
library(tidyverse)
library(gdata)
library(stringr)
library(readr)
library(gsubfn)
library(waiter)
# Loading overlay ----
# `gif` is the animation URL, assembled from two string pieces; it is used as
# the image source in `loading_screen` below.
gif <- paste0(
  "https://media1.tenor.com/images",
  "/cb27704982766b4f02691ea975d9a259/tenor.gif?itemid=11365139"
)

# Tag list holding a gray heading and the animation at a height of 200px.
loading_screen <- tagList(
  h3("Bear with me a second.", style = "color:gray;"),
  img(src = gif, height = "200px")
)
# Define UI for data upload app ----
# Page with a title, a sidebar of inputs (`file1`, `checkbox`, `skiplines`)
# and a main panel holding the `myTable` output. `loading_screen` is handed to
# waiter_on_busy() as the HTML of the busy overlay.
ui <- fluidPage(
  use_waiter(),
  waiter_on_busy(html = loading_screen, color = "white"),

  # App title ----
  titlePanel("Srishti Library Database"),

  # Sidebar layout with input and output definitions ----
  # sidebarPanel()/mainPanel() are wrapped in sidebarLayout() so both columns
  # sit inside one layout row.
  sidebarLayout(
    # Sidebar panel for inputs ----
    sidebarPanel(
      # Input: Select a file ----
      fileInput(
        "file1", "Choose Library File",
        multiple = TRUE,
        accept = c(
          "text/csv",
          "text/comma-separated-values,text/plain",
          ".csv"
        )
      ),

      # Input: switch to the bundled default file instead of an upload ----
      checkboxInput("checkbox", label = "Or Use default file", value = FALSE),

      # Horizontal line ----
      tags$hr(),

      # Input: number of leading lines to skip in an uploaded file ----
      numericInput(
        "skiplines", "Number of Lines to Skip:", 10,
        min = 1, max = 100
      ),

      # Horizontal line ----
      # No trailing comma after the last child: it would pass an empty
      # argument to sidebarPanel().
      tags$hr()
    ),

    # Main panel for displaying outputs ----
    mainPanel(
      # Output: Data file ----
      dataTableOutput("myTable")
    )
  )
)
# Define server logic to read selected file ----

# Location of the library export used when "Or Use default file" is ticked.
default_library_url <- paste0(
  "https://raw.githubusercontent.com/thedivtagguy/srishtilibrary/",
  "main/books_raw.txt"
)

# Book_Type labels, keyed by the first two digits of the Book_ID.
dewey_divisions <- c(
  "00" = "Computer science, knowledge & systems",
  "01" = "Bibliographies",
  "02" = "Library & Information Sciences",
  "03" = "Encyclopedias & books of facts",
  "04" = "Unassigned",
  "05" = "Magazines, journals & serials",
  "06" = "Associations, organizations & museums",
  "07" = "News media, journalism & publishing",
  "08" = "Quotations",
  "09" = "Manuscripts & rare books",
  "10" = "Philosophy",
  "11" = "Metaphysics",
  "12" = "Epistemology",
  "13" = "Parapsychology & occultism",
  "14" = "Philosophical schools of thought",
  "15" = "Psychology",
  "16" = "Philosophical logic",
  "17" = "Ethics",
  "18" = "Ancient, medieval, eastern philosophy",
  "19" = "Modern Western philosophy",
  "20" = "Religion",
  "21" = "Philosophy & theory of religion",
  "22" = "Bible",
  "23" = "Christianity",
  "24" = "Christian practice & observance",
  "25" = "Christian orders & local church",
  "26" = "Social & ecclesiastical theology",
  "27" = "History of Christianity",
  "28" = "Christian denominations",
  "29" = "Other religions",
  "30" = "Social sciences, sociology & anthropology",
  "31" = "Statistics",
  "32" = "Political science",
  "33" = "Economics",
  "34" = "Law",
  "35" = "Public administration & military science",
  "36" = "Social problems & services",
  "37" = "Education",
  "38" = "Commerce, communications & transportation",
  "39" = "Customs, etiquette, folklore",
  "40" = "Language",
  "41" = "Linguistics",
  "42" = "English & Old English languages",
  "43" = "German and related languages",
  "44" = "French & related languages",
  "45" = "Italian, Romanian & related languages",
  "46" = "Spanish, Portuguese, Galician",
  "47" = "Latin & related Italic languages",
  "48" = "Classical & modern Greek languages",
  "49" = "Other languages",
  "50" = "Science",
  "51" = "Mathematics",
  "52" = "Astronomy",
  "53" = "Physics",
  "54" = "Chemistry",
  "55" = "Earth sciences",
  "56" = "Fossils & prehistoric life",
  "57" = "Biology",
  "58" = "Plants",
  "59" = "Animals",
  "60" = "Technology",
  "61" = "Medicine",
  "62" = "Engineering",
  "63" = "Agriculture",
  "64" = "Home & family management",
  "65" = "Management & Public Relations",
  "66" = "Chemical Engineering",
  "67" = "Manufacturing",
  "68" = "Manufacture for specific uses",
  "69" = "Construction of buildings",
  "70" = "Arts",
  "71" = "Area planning & landscape architecture",
  "72" = "Architecture",
  "73" = "Sculpture, ceramics & metalwork",
  "74" = "Graphic arts & decorative arts",
  "75" = "Painting",
  "76" = "Printmaking & prints",
  "77" = "Photography, computer art, film, video",
  "78" = "Music",
  "79" = "Outline of sports, games & entertainment",
  "80" = "Literature, rhetoric & criticism",
  "81" = "American literature in English",
  "82" = "English & Old English literatures",
  "83" = "German & related literatures",
  "84" = "French & related literatures",
  "85" = "Italian, Romanian & related literatures",
  "86" = "Spanish, Portuguese, Galician literatures",
  "87" = "Latin & Italic literatures",
  "88" = "Classical & modern Greek literatures",
  "89" = "Other literatures",
  "90" = "History",
  "91" = "Geography & travel",
  "92" = "Biography & genealogy",
  "93" = "History of ancient world",
  "94" = "History of Europe",
  "95" = "History of Asia",
  "96" = "History of Africa",
  "97" = "History of North America",
  "98" = "History of South America",
  "99" = "History of other areas"
)

# Category labels, keyed by the first digit of the Book_ID.
dewey_classes <- c(
  "0" = "Computer science, information & general works",
  "1" = "Philosophy & psychology",
  "2" = "Religion",
  "3" = "Social Sciences",
  "4" = "Language",
  "5" = "Science",
  "6" = "Technology",
  "7" = "Arts & recreation",
  "8" = "Literature",
  "9" = "History & geography"
)

# Look up a label for each id by its leading characters.
#
# ids      Character vector of Book_IDs (may contain NA).
# labels   Named character vector; all names must have the same width.
# fallback Label returned for NA ids and ids whose prefix is not in `labels`.
#
# Returns an unnamed character vector the same length as `ids`.
label_by_prefix <- function(ids, labels, fallback = "Uncategorized") {
  width <- nchar(names(labels)[1])
  hit <- match(substr(as.character(ids), 1L, width), names(labels))
  out <- unname(labels[hit])
  out[is.na(out)] <- fallback
  out
}

# Fold wrapped text back into the row it belongs to.
#
# A row whose key column (`col_numb - 1`) is NA but whose text column
# (`col_numb`) is filled is treated as the continuation of the row above: its
# text is appended to that row and then blanked. Rows are walked bottom-up so
# multi-line wraps collapse one step at a time.
#
# df       Data frame / tibble.
# col_numb Position of the text column; the key column is the one before it.
#
# Returns `df` with the same dimensions; emptied rows are left for the caller
# to drop.
merge_wrapped_rows <- function(df, col_numb) {
  key_col <- col_numb - 1
  # seq_len(n)[-1] is empty for n < 2, so short tables skip the loop instead
  # of counting upward the way `nrow(df):2` would.
  for (i in rev(seq_len(nrow(df))[-1])) {
    text <- df[[col_numb]][i]
    if (is.na(df[[key_col]][i]) && !is.na(text)) {
      above <- df[[col_numb]][i - 1]
      # Move the text up unchanged when the row above is blank, rather than
      # pasting the literal string "NA" in front of it.
      df[[col_numb]][i - 1] <- if (is.na(above)) {
        text
      } else {
        paste(above, text, sep = " ")
      }
      df[[col_numb]][i] <- NA
    }
  }
  df
}

# Turn the raw one-column library listing into a tidy table.
#
# data  Data frame whose FIRST column holds the raw text lines.
#
# Returns a tibble with columns Account_No, Title, Date, Author, Book_ID,
# Book_Type and Category (the last two as factors).
parse_library_records <- function(data) {
  # The upload branch is renamed from `X1`, but the default-file branch is read
  # with col_names = "Title"; naming by position works for both.
  names(data)[1] <- "Title"

  # Time to separate those columns.
  # Move the date component to its own column, then delete it from the text.
  data <- data %>% mutate(Date = str_extract(Title, "\\d+/\\d+/\\d+"))
  data$Title <- gsub("\\d+/\\d+/\\d+", "", data$Title)

  # Extract Account No.
  data <- data %>% mutate(Account_No = str_extract(Title, "\\d{4,5}\\s"))
  data$Title <- gsub("\\d{4,5}\\s", "", data$Title) %>% trimws()

  # Rearrange columns in the order Account_No, Title, Date. Selecting by name
  # keeps this correct even if the reader produced extra columns.
  data <- data[, c("Account_No", "Title", "Date")]

  # Fix rows: blank strings become NA, wrapped lines are merged into the row
  # above (Title is column 2, Account_No column 1), incomplete rows dropped.
  data <- data %>%
    as_tibble() %>%
    mutate(across(everything(), ~ na_if(.x, ""))) %>%
    merge_wrapped_rows(2) %>%
    drop_na()

  # Separate authors from the Title column: everything after the last "/".
  data <- data %>%
    dplyr::mutate(Author = str_extract(Title, "[^/]+$"))

  # Extract Book ID (first number found in the author part).
  data <- data %>% mutate(Book_ID = str_extract(Author, "\\d+\\.*\\d*"))

  # Delete Book ID digits and trailing fragments from the Title.
  # NOTE(review): as written, " .*" removes everything from the first space
  # onward; confirm the pattern was not meant to match a wider run of
  # whitespace that was collapsed when this file was copied.
  for (pattern in c("\\d+\\.*\\d*", " .*")) {
    data$Title <- gsub(pattern, "", data$Title)
  }

  # Delete Book ID, author abbreviations and "Et al"-style noise from Author.
  # Patterns are applied in this exact order.
  author_noise <- c(
    "\\d+\\.*\\d*", " .*", "Et al", "ED", "\\(", "\\)", "Et. al.,",
    "ET AL", "& Et", "\\s\\.", "Fic", "\\s([A-Z]+[A-Za-z]{2})\\s"
  )
  for (pattern in author_noise) {
    data$Author <- gsub(pattern, "", data$Author)
  }

  # Blank the author where it merely repeats the title. Vectorised over every
  # row; which() ignores comparisons that are NA.
  data$Author[which(data$Title == data$Author)] <- NA

  # Remove extra punctuation in author column
  data$Author <- gsub("\\.$|\\,$", "", data$Author, ignore.case = TRUE)

  # Add space after comma for authors
  data$Author <- textclean::add_comma_space(data$Author) %>% trimws()

  # Classify books: two-digit prefix -> Book_Type, one-digit -> Category.
  data <- data %>%
    mutate(
      Book_Type = factor(label_by_prefix(Book_ID, dewey_divisions)),
      Category = factor(label_by_prefix(Book_ID, dewey_classes))
    )

  # Clean book title: keep the part before the first "/", then title-case.
  data$Title <- sub("/.*", "", data$Title)
  data$Title <- str_to_title(data$Title)
  data$Author <- str_to_title(data$Author)

  data
}

# Shiny server: reads the uploaded (or default) listing and renders the parsed
# table into `output$myTable`.
server <- function(input, output) {
  output$myTable <- renderDataTable(
    {
      if (!isTRUE(input$checkbox)) {
        # input$file1 is NULL until a file is uploaded, and the numeric input
        # is NA when cleared; wait for both before reading.
        req(input$file1)
        req(input$skiplines)
        raw <- readr::read_fwf(
          input$file1$datapath,
          fwf_empty(input$file1$datapath),
          skip = input$skiplines
        )
      } else {
        raw <- readr::read_fwf(
          default_library_url,
          fwf_empty(default_library_url, col_names = c("Title")),
          skip = 10
        )
      }
      parse_library_records(raw)
    },
    # NOTE(review): `extensions`/`buttons` look like DT-package options;
    # confirm they take effect with this renderDataTable().
    options = list(
      extensions = "Buttons",
      buttons = list(
        "copy",
        list(
          extend = "collection",
          buttons = c("csv", "excel", "pdf"),
          text = "Download"
        )
      ),
      pageLength = 10,
      info = FALSE,
      searchHighlight = TRUE
    )
  )
}
# Run the app ----
# Launch the application from the UI definition and server function above.
shinyApp(ui = ui, server = server)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment