Last active
July 7, 2020 16:12
-
-
Save turbomam/f620e9e9f042b643e47de730328e3e83 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
options(java.parameters = "-Xmx6g") | |
# see also https://jangorecki.gitlab.io/data.cube/library/data.table/html/dcast.data.table.html | |
library(config) | |
# library(dplyr) | |
# library(ggplot2) | |
library(httr) | |
# library(igraph) | |
library(jsonlite) | |
# library(randomForest) | |
library(rdflib) | |
# library(readr) | |
# library(readxl) | |
# library(reshape2) | |
# library(RJDBC) | |
library(solrium) | |
# library(stringdist) | |
# library(stringr) | |
# library(tm) | |
# library(uuid) | |
# # train | |
# library(splitstackshape) | |
# | |
# ### validation | |
# library(ROCR) | |
# library(caret) | |
# library(xgboost) | |
# # also try party and xgboot | |
# Print large integers in fixed notation rather than scientific notation,
# e.g. when they are pasted into the text of a query.
options(scipen = 999)

# Report the working directory so relative paths can be interpreted.
print("Default file path set to:")
print(getwd())

# Load connection settings (GraphDB base URL, repository, credentials, ...)
# from a YAML file in the user's home directory.
config.file <- "~/graph_db_common.yaml"
config <- config::get(file = config.file)
# Split a vector into at most `chunk.count` contiguous chunks.
#
# Args:
#   vec:         vector (or list) to partition; element order is preserved.
#   chunk.count: requested number of chunks; a single number >= 1.
#
# Returns:
#   A list of chunks named by chunk index. Chunk boundaries come from
#   cut() applied to the element positions, so chunks that would receive no
#   elements (more chunks requested than elements) are simply absent.
#   An empty `vec` yields an empty list.
chunk.vec <- function(vec, chunk.count) {
  stopifnot(
    is.numeric(chunk.count),
    length(chunk.count) == 1L,
    !is.na(chunk.count),
    chunk.count >= 1
  )

  # cut() cannot derive breaks from an empty range.
  if (length(vec) == 0L) {
    return(list())
  }

  # cut() rejects fewer than two intervals, so a single chunk is handled
  # here; the name "1" matches what split() would produce.
  if (chunk.count < 2) {
    return(list("1" = vec))
  }

  split(vec, cut(seq_along(vec), chunk.count, labels = FALSE))
}
# Tabulate the distinct values of a vector as a two-column data frame.
#
# Args:
#   my.vector: vector whose values are counted with table(); with table()'s
#              defaults, NA entries are not counted.
#
# Returns:
#   A data frame with columns `value` (character) and `count` (numeric),
#   one row per distinct value, in the order table() reports them.
#   Empty input gives a 0-row data frame with the same two columns.
make.table.frame <- function(my.vector) {
  tally <- table(my.vector)

  # Build the columns explicitly so the result does not depend on the
  # stringsAsFactors default and is well-formed when `tally` is empty.
  data.frame(
    value = as.character(names(tally)),
    count = as.numeric(tally),
    stringsAsFactors = FALSE
  )
}
# Upload a local RDF file into a named graph of the configured repository.
#
# Args:
#   some.graph.name: value appended to the `graph=` query parameter.
#   some.local.file: path of the file sent as the request body.
#   some.rdf.format: value sent as the request's Content-Type.
#
# The arguments, the target URL and the server's reply are printed.
# Connection details are read from the global `config` object.
import.from.local.file <- function(some.graph.name,
                                   some.local.file,
                                   some.rdf.format) {
  print(some.graph.name)
  print(some.local.file)
  print(some.rdf.format)

  # <base>/repositories/<repo>/rdf-graphs/service?graph=<graph name>
  repo.url <- paste0(
    config$my.graphdb.base,
    '/repositories/',
    config$my.selected.repo
  )
  post.dest <- paste0(
    repo.url,
    '/rdf-graphs/service?graph=',
    some.graph.name
  )
  print(post.dest)

  basic.auth <- httr::authenticate(
    config$my.graphdb.username,
    config$my.graphdb.pw,
    type = 'basic'
  )
  post.resp <- httr::POST(
    url = post.dest,
    body = httr::upload_file(some.local.file),
    httr::content_type(some.rdf.format),
    basic.auth
  )

  # The raw response body is shown verbatim, whatever it contains.
  print('Errors will be listed below:')
  response.text <- rawToChar(post.resp$content)
  print(response.text)
}
# Ask the server to import RDF from a URL into a named graph.
#
# Args:
#   some.graph.name:   sent as the "context" field of the JSON request.
#   some.ontology.url: sent as the "data" field of the JSON request.
#   some.rdf.format:   sent as the "format" field; the field is omitted
#                      when this is NULL, NA or the empty string.
#
# The request is POSTed to the global `url.post.endpoint` using the global
# `saved.authentication`. The arguments, the JSON body and the raw server
# reply are written to the console.
import.from.url <- function(some.graph.name,
                            some.ontology.url,
                            some.rdf.format) {
  print(some.graph.name)
  print(some.ontology.url)
  print(some.rdf.format)

  payload <- list(context = some.graph.name, data = some.ontology.url)

  # Length/NA-safe test: a bare `nchar(x) > 0` breaks `if` when x is NULL.
  has.format <- length(some.rdf.format) == 1L &&
    !is.na(some.rdf.format) &&
    nzchar(some.rdf.format)
  if (has.format) {
    payload$format <- some.rdf.format
  }

  # Serialise with jsonlite so quotes, backslashes and control characters
  # in any field are escaped instead of producing invalid JSON.
  update.body <- as.character(
    jsonlite::toJSON(payload, auto_unbox = TRUE, pretty = TRUE)
  )

  cat("\n")
  cat(update.body)
  cat("\n\n")

  post.res <- httr::POST(
    url.post.endpoint,
    body = update.body,
    httr::content_type("application/json"),
    httr::accept("application/json"),
    saved.authentication
  )
  cat(rawToChar(post.res$content))
}
# Fetch the repository's `/contexts` listing.
#
# Reads the server location from the global `config` and authenticates with
# the global `saved.authentication`.
#
# Returns:
#   The `results$bindings$contextID$value` element of the parsed JSON reply
#   (presumably the identifiers of the named graphs currently present), or
#   NULL when that element is absent.
get.context.report <- function() {
  contexts.url <- paste0(
    config$my.graphdb.base,
    "/repositories/",
    config$my.selected.repo,
    "/contexts"
  )
  reply <- GET(url = contexts.url, saved.authentication)
  parsed <- jsonlite::fromJSON(rawToChar(reply$content))
  parsed$results$bindings$contextID$value
}
# Poll the repository until every expected named graph is present.
#
# Relies on globals defined elsewhere: `expectation` (the graph names to
# wait for), `last.post.status` and `last.post.time` (echoed in each status
# line) and `config$monitor.pause.seconds` (delay between polls).
# There is no timeout: the loop ends only when nothing is pending.
monitor.named.graphs <- function() {
  repeat {
    status.line <- paste0(
      Sys.time(),
      ": '",
      last.post.status,
      "' submitted at ",
      last.post.time
    )
    print(status.line)

    # setdiff() against a NULL report returns `expectation` unchanged, so
    # an empty report just means every expected graph is still pending.
    still.missing <- sort(setdiff(expectation, get.context.report()))

    if (length(still.missing) == 0) {
      print("Update complete")
      break
    }

    # One "still waiting" line is printed per pending graph.
    print(paste0("still waiting for: ", still.missing))
    pause.note <- paste0(
      "Next check in ",
      config$monitor.pause.seconds,
      " seconds."
    )
    print(pause.note)
    Sys.sleep(config$monitor.pause.seconds)
  }
}
# Run a query against a repository and return the bindings as a data frame.
#
# Args:
#   query:    query text, sent as the `query` URL parameter.
#   endpoint: server base URL (defaults to the configured GraphDB base).
#   repo:     repository id (defaults to the configured repository).
#   auth:     httr authentication object (defaults to the saved credentials).
#
# Returns:
#   A data frame with one column per bound variable, holding each binding's
#   `value` field; columns are named after the variables. The result is
#   always a data frame, including when only one variable is bound.
q2j2df <- function(query,
                   endpoint = config$my.graphdb.base,
                   repo = config$my.selected.repo,
                   auth = saved.authentication) {
  # Collapse runs of spaces to keep the request URL short.
  minquery <- gsub(pattern = " +", replacement = " ", x = query)

  rdfres <- httr::GET(
    url = paste0(endpoint, "/repositories/", repo),
    query = list(query = minquery),
    auth
  )

  # Parse the JSON reply and flatten the per-variable binding frames into
  # columns named "<variable>.<field>".
  rdfres <- jsonlite::fromJSON(rawToChar(rdfres$content))
  rdfres <- rdfres$results$bindings
  rdfres <- do.call(what = cbind.data.frame, args = rdfres)

  # Keep only the "...value" columns. drop = FALSE is essential: with a
  # single matching column the subset would otherwise collapse to a bare
  # vector and the colnames<- assignment below would fail.
  keepers <- colnames(rdfres)
  keepers <- keepers[grepl(pattern = "value$", x = keepers)]
  rdfres <- rdfres[, keepers, drop = FALSE]

  # Strip the ".value" suffix so columns carry the plain variable names.
  colnames(rdfres) <- gsub(
    pattern = '\\.value$',
    replacement = '',
    x = colnames(rdfres)
  )
  rdfres
}
# Endpoint that import.from.url() POSTs its JSON import request to.
url.post.endpoint <- paste0(
  config$my.graphdb.base,
  "/rest/data/import/upload/",
  config$my.selected.repo,
  "/url"
)

# Repository URL; the same address q2j2df() builds for its GET requests.
select.endpoint <- paste0(
  config$my.graphdb.base,
  "/repositories/",
  config$my.selected.repo
)

# The repository URL with "/statements" appended (presumably the target for
# update requests).
update.endpoint <- paste0(select.endpoint, "/statements")

# Basic-auth credentials shared by the HTTP helpers in this file.
saved.authentication <- authenticate(
  config$my.graphdb.username,
  config$my.graphdb.pw,
  type = "basic"
)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment