Created
March 15, 2018 04:35
-
-
Save soumyaray/fb9704d3f167e6a85a5384817f778738 to your computer and use it in GitHub Desktop.
Suggestions for Travis — privacy document analysis ETL flow
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Short names of the documents to process; each is later passed to
# clean_document().
docs <- c("OECD", "GDPR", "FTC")
# Functions to create:
#' Load one document into a data frame (stub).
#'
#' @param doc_name Short document name, e.g. "OECD". The file name is built
#'   by appending ".txt" to it.
#' @return A data frame. Currently an empty placeholder; `filename` is not
#'   read yet.
pdf_to_df <- function(doc_name) {
  # paste0() joins without a separator; paste() would insert a space and
  # produce "OECD .txt".
  # NOTE(review): the function name says PDF but the extension is ".txt" --
  # confirm which input format is intended.
  filename <- paste0(doc_name, ".txt")
  # ...return a dataframe
  data.frame()
}
#' Placeholder for the bigram-creation step.
#'
#' @param doc_df Document data frame (currently unused).
#' @return An empty data frame until the step is implemented.
create_bigrams <- function(doc_df) {
  data.frame()
}
#' Placeholder for the stop-word removal step.
#'
#' @param doc_df Document data frame (currently unused).
#' @return An empty data frame until the step is implemented.
remove_stopwords <- function(doc_df) {
  data.frame()
}
# ...more document cleaning functions
# document cleaning flow
# returns: named list with "name" and cleaned "df"
# $name
# [1] "OECD"
# $df
#' Run the cleaning steps for a single document.
#'
#' @param doc_name Short document name, e.g. "OECD".
#' @return A named list with `name` (the input `doc_name`) and `df` (the
#'   result of `create_bigrams(pdf_to_df(doc_name))`).
clean_document <- function(doc_name) {
  # Plain function calls instead of `%>%`: no library(magrittr) / dplyr call
  # is visible in this file, so the pipe operator may be undefined here.
  raw_df <- pdf_to_df(doc_name)
  cleaned_df <- create_bigrams(raw_df) # further cleaning steps go here
  list(name = doc_name, df = cleaned_df) # could add more document metadata
}
# Clean ALL documents: replaces the character vector `docs` with a list
# holding one clean_document() result per document name.
docs <- lapply(docs, clean_document)
docs[[1]]$name # [1] "OECD"
#' Plot one column of a document's data frame against another.
#'
#' @param named_doc List with elements `name` and `df`.
#' @param x_col,y_col Column (name or index) of `named_doc$df` for each axis.
#' @param ... Further arguments forwarded to `plot()`.
#' @return The value of the `plot()` call.
plot_document <- function(named_doc, x_col, y_col, ...) {
  df <- named_doc$df
  # `[[` extracts the column vectors; `[` would pass one-column data frames,
  # which plot() does not treat as x/y coordinates.
  plot(df[[x_col]], df[[y_col]], ..., xlab = named_doc$name)
}
# could potentially map a plotting function on named docs list
# NOTE(review): plot_document() declares x_col and y_col without defaults, so
# they have to be supplied through lapply()'s extra arguments, e.g.
# lapply(docs, plot_document, x_col = <column>, y_col = <column>).
lapply(docs, plot_document)
# FIND FILES
# Lists entries in the current working directory whose names end in ".PDF"
# or ".pdf" (mixed-case extensions such as ".Pdf" are not matched).
list.files(pattern = "\\.(PDF|pdf)$")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment