|
#!/bin/bash |
|
# bash-refine v1.3.4: templates.sh, Felix Lohmeier, 2020-11-04 |
|
# https://gist.github.com/felixlohmeier/d76bd27fbc4b8ab6d683822cdf61f81d |
|
# license: MIT License https://choosealicense.com/licenses/mit/ |
|
|
|
# TODO: example for setting metadata |
|
# TODO: example for engine config (facets) |
|
|
|
# ======================= TEMPLATES FOR YOUR WORKFLOW ======================== # |
|
|
|
# The following code shows several options for import, transform and export |
|
# use the templates to write your own scripts or execute this file for a demo |
|
|
|
# =============================== ENVIRONMENT ================================ # |
|
|
|
# resolve the absolute directory of this script; "${BASH_SOURCE%/*}" returns
# the script name itself when it is invoked without a path (bash templates.sh)
script_dir="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" && pwd)" || exit 1

# source the main script
source "${script_dir}/bash-refine.sh" || exit 1

# make script executable from another directory
cd "${script_dir}" || exit 1
|
|
|
### override default config? |
|
#endpoint="http://localhost:3333" |
|
#memory="1400M" # increase to available RAM |
|
#csrf=true # set to false for OpenRefine < 3.3 |
|
#date="$(date +%Y%m%d_%H%M%S)" |
|
#workdir="${BASH_SOURCE%/*}/output/${date}" |
|
#logfile="${BASH_SOURCE%/*}/log/${date}.log" |
|
#jq="${BASH_SOURCE%/*}/lib/jq" # path to executable |
|
#refine="${BASH_SOURCE%/*}/lib/openrefine/refine" # path to executable |
|
|
|
# check requirements, set trap, create workdir and tee to logfile
init

# ================================= STARTUP ================================== #

# mark the beginning of the startup phase
checkpoint "Startup"
echo

# start OpenRefine server
refine_start
echo
|
|
|
# ============================= MOCKUP TEST DATA ============================= # |
|
|
|
# create the input directory and three small fixture files used by the demo
mkdir -p input

# CSV fixture; quoted heredoc ("DATA") keeps $, \ and ' literal
cat << "DATA" > "input/example1.csv"
a,b,c
1,2,3
0,0,0
$,\,'
DATA

# TSV fixture; printf writes explicit tab separators (\t) so the file does not
# depend on invisible literal tabs surviving editors or copy & paste.
# The format string is reused for every group of three arguments (one row).
printf '%s\t%s\t%s\n' \
  "a" "b" "c" \
  "'" '\' '$' \
  "0" "0" "0" \
  "3" "2" "1" \
  > "input/example2.tsv"

# operations history fixture, applied later to a project from file
cat << "DATA" > "input/example-operations-history.json"
[
{
"op": "core/column-addition",
"engineConfig": {
"mode": "row-based"
},
"newColumnName": "apply-from-file",
"columnInsertIndex": 2,
"baseColumnName": "b",
"expression": "grel:value.replace('2','TEST')",
"onError": "set-to-blank"
}
]
DATA
|
|
|
# ================================== IMPORT ================================== # |
|
|
|
# mark the beginning of the import phase
checkpoint "Import"
echo

# declare input: project name => input file (empty for the heredoc import)
projects+=(
  ["from heredoc"]=""
  ["csv file example"]="input/example1.csv"
  ["tsv file example"]="input/example2.tsv"
  ["another csv example"]="input/example1.csv"
  ["yet another csv example"]="input/example1.csv"
)
|
|
|
# --------------------------- IMPORT FROM HEREDOC ---------------------------- # |
|
|
|
# quoted heredoc ("DATA") will not be expanded by bash (no escaping needed)
# project id will be stored in ${projects[from heredoc]}
p="from heredoc"
f="" # optional filename, will be stored in OpenRefine project metadata
echo "import heredoc..."
# "@-" uploads stdin (the heredoc below); ${f:+...} appends ";filename=..."
# only if f is non-empty; curl prints the redirect URL into the .id file
if curl -fs --write-out "%{redirect_url}\n" \
  --form project-file="@-${f:+;filename=${f}}" \
  --form project-name="${p}" \
  --form format="text/line-based/*sv" \
  --form options='{
"encoding": "UTF-8",
"separator": " "
}' \
  "${endpoint}/command/core/create-project-from-upload$(refine_csrf)" \
  > "${workdir}/${p}.id" \
  << "DATA"
a b c
1 2 3
0 0 0
$ \ '
DATA
then
  log "imported heredoc as ${p}"
else
  error "import of ${p} failed!"
fi
# hand the .id file to refine_store (defined in bash-refine.sh)
refine_store "${p}" "${workdir}/${p}.id" || error "import of ${p} failed!"
echo
|
|
|
# ---------------------------- IMPORT FROM FILE ------------------------------ # |
|
|
|
# project id will be stored in ${projects[tsv file example]}
p="tsv file example"
echo "import file ${projects[$p]} ..."
# importer options (JSON) passed as a form field
import_options='{
"encoding": "UTF-8",
"separator": "\t"
}'
# collect the curl arguments in an array, one request part per line
import_args=(
  -fs --write-out "%{redirect_url}\n"
  --form project-file="@${projects[$p]}"
  --form project-name="${p}"
  --form format="text/line-based/*sv"
  --form options="${import_options}"
  "${endpoint}/command/core/create-project-from-upload$(refine_csrf)"
)
# the redirect URL printed by curl is written to the .id file
if ! curl "${import_args[@]}" > "${workdir}/${p}.id"; then
  error "import of ${projects[$p]} failed!"
else
  log "imported ${projects[$p]} as ${p}"
fi
refine_store "${p}" "${workdir}/${p}.id" || error "import of ${p} failed!"
echo
|
|
|
# -------------------- IMPORT MULTIPLE FILES (PARALLEL) ---------------------- # |
|
|
|
# project ids will be stored in ${projects[another csv example]} etc.
ps=( "csv file example" "another csv example" "yet another csv example" )
echo "import files" \
  "$(for p in "${ps[@]}"; do printf "%s" "${projects[$p]} "; done)..."
for p in "${ps[@]}"; do
  # each upload runs in a background subshell; monitor (from bash-refine.sh)
  # is called right after the job is started
  # format must be the separator-based importer (same as the other imports),
  # otherwise the "separator" option is not what decides the column split
  (if curl -fs --write-out "%{redirect_url}\n" \
    --form project-file="@${projects[$p]}" \
    --form project-name="${p}" \
    --form format="text/line-based/*sv" \
    --form options='{
"encoding": "UTF-8",
"separator": ","
}' \
    "${endpoint}/command/core/create-project-from-upload$(refine_csrf)" \
    > "${workdir}/${p}.id"
  then
    log "imported ${projects[$p]} as ${p}"
  else
    error "import of ${projects[$p]} failed!"
  fi) &
  monitor "${p}"
done
# presumably waits for all monitored jobs - see bash-refine.sh
monitoring
# store the project ids only after all uploads have finished
for p in "${ps[@]}"; do
  refine_store "${p}" "${workdir}/${p}.id" || error "import of ${p} failed!"
done
echo
|
|
|
# ================================ TRANSFORM ================================= # |
|
|
|
# mark the beginning of the transform phase
checkpoint "Transform"
echo

# ------------------------ APPLY OPERATIONS FROM FILE ------------------------ #

p="csv file example"
f="input/example-operations-history.json"
echo "apply ${f} to ${p}..."
# request parts: project id and url-encoded content of the operations file
apply_args=(
  -fs
  --data project="${projects[$p]}"
  --data-urlencode operations@"${f}"
  "${endpoint}/command/core/apply-operations$(refine_csrf)"
)
if ! curl "${apply_args[@]}" > /dev/null; then
  error "transform ${p} (${projects[$p]}) failed!"
else
  log "transformed ${p} (${projects[$p]})"
fi
echo
|
|
|
# ---------------------- APPLY OPERATIONS FROM HEREDOC ----------------------- # |
|
|
|
# quoted heredoc ("JSON") will not be expanded by bash (no escaping needed)
p="csv file example"
echo "add column apply-from-heredoc to ${p}..."
apply_url="${endpoint}/command/core/apply-operations$(refine_csrf)"
# "operations@-" makes curl read the operations from stdin (the heredoc)
if ! curl -fs \
  --data project="${projects[$p]}" \
  --data-urlencode "operations@-" \
  "${apply_url}" > /dev/null \
  << "JSON"
[
{
"op": "core/column-addition",
"engineConfig": {
"mode": "row-based"
},
"newColumnName": "apply-from-heredoc",
"columnInsertIndex": 2,
"baseColumnName": "b",
"expression": "grel:value.replace('2','TEST')",
"onError": "set-to-blank"
}
]
JSON
then
  error "transform ${p} (${projects[$p]}) failed!"
else
  log "transformed ${p} (${projects[$p]})"
fi
echo
|
|
|
# ---------------- APPLY OPERATIONS FROM HEREDOC AND VARIABLES --------------- # |
|
|
|
# unquoted heredocs with variable and multi-line expression (requires jq)
# \ must be used to quote the characters \, $, and `.
p="csv file example"
replace='TEST'
column="apply with variables"
echo "add column ${column} to ${p}..."
# read returns non-zero here (EOF is reached before a NUL delimiter);
# that is expected, the variable is filled anyway
read -r -d '' expression << EXPRESSION
grel:value.replace(
'2',
'${replace}'
)
EXPRESSION
# encode the multi-line expression as a JSON string (adds quotes, escapes
# newlines); jq is quoted so that a path containing spaces keeps working
expression_json="$(printf '%s\n' "${expression}" | "${jq}" -s -R '.')" \
  || error "encoding expression for ${p} failed!"
if curl -fs \
  --data project="${projects[$p]}" \
  --data-urlencode "operations@-" \
  "${endpoint}/command/core/apply-operations$(refine_csrf)" > /dev/null \
  << JSON
[
{
"op": "core/column-addition",
"engineConfig": {
"mode": "row-based"
},
"newColumnName": "${column}",
"columnInsertIndex": 2,
"baseColumnName": "b",
"expression": ${expression_json},
"onError": "set-to-blank"
}
]
JSON
then
  log "transformed ${p} (${projects[$p]})"
else
  error "transform ${p} (${projects[$p]}) failed!"
fi
echo
|
|
|
# ------ APPLY OPERATIONS FROM HEREDOC TO MULTIPLE PROJECTS (PARALLEL) ------ # |
|
|
|
# quoted heredoc ("JSON") will not be expanded by bash (no escaping needed)
ps=( "another csv example" "yet another csv example" )
echo "add column apply-from-heredoc to" "${ps[@]}" "..."
for p in "${ps[@]}"; do
  # one background subshell per project, each posting the same operations
  (
    if ! curl -fs \
      --data project="${projects[$p]}" \
      --data-urlencode "operations@-" \
      "${endpoint}/command/core/apply-operations$(refine_csrf)" > /dev/null \
      << "JSON"
[
{
"op": "core/column-addition",
"engineConfig": {
"mode": "row-based"
},
"newColumnName": "apply-from-heredoc",
"columnInsertIndex": 2,
"baseColumnName": "b",
"expression": "grel:value.replace('2','TEST')",
"onError": "set-to-blank"
}
]
JSON
    then
      error "transform ${p} (${projects[$p]}) failed!"
    else
      log "transformed ${p} (${projects[$p]})"
    fi
  ) &
  monitor "${p}"
done
monitoring
echo
|
|
|
# ------------- APPLY MULTIPLE OPERATIONS GENERATED FROM HEREDOC ------------- # |
|
|
|
# unquoted heredoc (JSON) with variables and multiplied (requires jq)
# \ must be used to quote the characters \, $, and `.
p="csv file example"
columns=( "apply-from-file" "apply-from-heredoc" )
echo "delete columns" "${columns[@]}" "in ${p}..."
# start from an empty file: the loop appends, and the file is only removed
# after a successful run, so leftovers would otherwise be applied again
: > "${workdir}/${p}.tmp"
# write one single-element JSON array per column
for column in "${columns[@]}"; do
  cat << JSON >> "${workdir}/${p}.tmp"
[
{
"op": "core/column-removal",
"columnName": "${column}"
}
]
JSON
done
# jq -s add concatenates the arrays into one list of operations
if "${jq}" -s add "${workdir}/${p}.tmp" | curl -fs \
  --data project="${projects[$p]}" \
  --data-urlencode operations@- \
  "${endpoint}/command/core/apply-operations$(refine_csrf)" > /dev/null
then
  log "transformed ${p} (${projects[$p]})"
  rm "${workdir}/${p}.tmp"
else
  error "transform ${p} (${projects[$p]}) failed!"
fi
echo
|
|
|
# ================================== EXPORT ================================== # |
|
|
|
# mark the beginning of the export phase
checkpoint "Export"; echo

# ----------------------------- EXPORT TO STDOUT ----------------------------- #

p="csv file example"
format="tsv"
echo "export ${p} in ${format} format..."
# use ${format} in the request too, so that changing the variable above
# changes the actual export and not only the message
if curl -fs \
  --data project="${projects[$p]}" \
  --data format="${format}" \
  --data engine='{"facets":[],"mode":"row-based"}' \
  "${endpoint}/command/core/export-rows"
then
  log "exported ${p} (${projects[$p]})"
else
  error "export of ${p} (${projects[$p]}) failed!"
fi
echo
|
|
|
# ------------------------------ EXPORT TO FILE ------------------------------ # |
|
|
|
p="csv file example"
format="csv"
echo "export ${p} to ${format} file..."
# target file is named after project and format
export_file="${workdir}/${p}.${format}"
export_args=(
  -fs
  --data project="${projects[$p]}"
  --data format="${format}"
  --data engine='{"facets":[],"mode":"row-based"}'
  "${endpoint}/command/core/export-rows"
)
if ! curl "${export_args[@]}" > "${export_file}"; then
  error "export of ${p} (${projects[$p]}) failed!"
else
  log "exported ${p} (${projects[$p]}) to ${export_file}"
fi
echo
|
|
|
# ------------------------- TEMPLATING EXPORT TO FILE ------------------------ # |
|
|
|
p="csv file example"
format="json"
echo "export ${p} to ${format} file using template..."
# IFS= keeps the text unmodified, including the trailing newline;
# read returns non-zero at EOF, which is expected here
IFS= read -r -d '' template << "TEMPLATE"
{
"a": {{cells['a'].value.jsonize()}},
"b": {{cells['b'].value.jsonize()}},
"c": {{cells['c'].value.jsonize()}}
}
TEMPLATE
# send the template without its trailing newline; parameter expansion is
# used instead of "head -c -2", which is only supported by GNU head
# prefix, suffix and separator contain a literal newline ($'...\n')
if printf '%s' "${template%$'\n'}" | curl -fs \
  --data project="${projects[$p]}" \
  --data format="template" \
  --data prefix=$'[\n' \
  --data suffix=$'\n]' \
  --data separator=$',\n' \
  --data engine='{"facets":[],"mode":"row-based"}' \
  --data-urlencode template@- \
  "${endpoint}/command/core/export-rows" \
  > "${workdir}/${p}.${format}"
then
  log "exported ${p} (${projects[$p]}) to ${workdir}/${p}.${format}"
else
  error "export of ${p} (${projects[$p]}) failed!"
fi
echo
|
|
|
# ------------------- EXPORT TO MULTIPLE FILES (PARALLEL) -------------------- # |
|
|
|
ps=( "another csv example" "yet another csv example" )
format="tsv"
echo "export" "${ps[@]}" "to ${format} files..."
for p in "${ps[@]}"; do
  # one background subshell per project, each writing its own file
  (
    export_file="${workdir}/${p}.${format}"
    export_args=(
      -fs
      --data project="${projects[$p]}"
      --data format="${format}"
      --data engine='{"facets":[],"mode":"row-based"}'
      "${endpoint}/command/core/export-rows"
    )
    if ! curl "${export_args[@]}" > "${export_file}"; then
      error "export of ${p} (${projects[$p]}) failed!"
    else
      log "exported ${p} (${projects[$p]}) to ${export_file}"
    fi
  ) &
  monitor "${p}"
done
monitoring
echo
|
|
|
# ------------------------------ EXPORT PROJECT ------------------------------ # |
|
|
|
p="csv file example"
format="openrefine.tar.gz"
echo "export ${p} to ${format} file..."
# the response of export-project is saved as-is to the archive file
export_file="${workdir}/${p}.${format}"
if ! curl -fs \
  --data project="${projects[$p]}" \
  "${endpoint}/command/core/export-project" \
  > "${export_file}"
then
  error "export of ${p} (${projects[$p]}) failed!"
else
  log "exported ${p} (${projects[$p]}) to ${export_file}"
fi
echo
|
|
|
# ================================ UTILITIES ================================= # |
|
|
|
# mark the beginning of the utilities phase
checkpoint "Utilities"; echo

# ------------------------------ LIST PROJECTS ------------------------------- #

# get all project metadata and reshape json to print a list (requires jq)
echo "list projects..."
# pipefail (limited to the subshell) lets a failed curl fail the pipeline;
# otherwise only the exit status of jq would be checked
if (
  set -o pipefail
  curl -fs --get \
    "${endpoint}/command/core/get-all-project-metadata" \
    | "${jq}" -r '.projects | keys[] as $k | "\($k): \(.[$k] | .name)"'
)
then
  : #log "printed list of projects"
else
  error "getting list of projects failed!"
fi
echo
|
|
|
# ------------------------------- GET METADATA ------------------------------- # |
|
|
|
# get project metadata and reshape json to include project id (requires jq)
p="csv file example"
echo "metadata for ${p}..."
# pipefail (limited to the subshell) lets a failed curl fail the pipeline;
# the project id is passed with --arg instead of being spliced into the jq
# program, so an empty or malformed id gives a clear jq error
if (
  set -o pipefail
  curl -fs --get \
    --data project="${projects[$p]}" \
    "${endpoint}/command/core/get-project-metadata" \
    | "${jq}" --arg id "${projects[$p]}" '{ id: ($id | tonumber) } + .'
)
then
  : #log "printed metadata of ${p} (${projects[$p]})"
else
  error "getting metadata of ${p} (${projects[$p]}) failed!"
fi
echo
|
|
|
# ------------------------------ GET ROW COUNT ------------------------------- # |
|
|
|
# get total number of rows (requires jq)
p="csv file example"
echo "total number of rows in ${p}..."
# read the "total" field with jq instead of splitting the JSON text with
# tr/grep/cut; pipefail (limited to the subshell) lets a failed curl fail
# the pipeline, so the error branch can actually be reached
if (
  set -o pipefail
  curl -fs --get \
    --data project="${projects[$p]}" \
    --data limit=0 \
    "${endpoint}/command/core/get-rows" \
    | "${jq}" -r '.total'
)
then
  : #log "printed row count of ${p} (${projects[$p]})"
else
  error "getting row count of ${p} (${projects[$p]}) failed!"
fi
echo
|
|
|
# ------------------------------- GET COLUMNS -------------------------------- # |
|
|
|
# get column names from project model (requires jq)
p="csv file example"
echo "column names of ${p}..."
# pipefail (limited to the subshell) lets a failed curl fail the pipeline;
# otherwise only the exit status of jq would be checked
if (
  set -o pipefail
  curl -fs --get \
    --data project="${projects[$p]}" \
    "${endpoint}/command/core/get-models" \
    | "${jq}" -r '.columnModel | .columns[] | .name'
)
then
  : #log "printed column names of ${p} (${projects[$p]})"
else
  error "getting column names of ${p} (${projects[$p]}) failed!"
fi
echo
|
|
|
# -------------------------- GET OPERATIONS HISTORY -------------------------- # |
|
|
|
# get operations history and reshape json to make it applicable (requires jq)
p="csv file example"
f="${workdir}/${p}_history.json"
echo "history of operations for ${p}..."
# pipefail (limited to the subshell) lets a failed curl fail the pipeline;
# otherwise a failed request would be logged as saved
if (
  set -o pipefail
  curl -fs --get \
    --data project="${projects[$p]}" \
    "${endpoint}/command/core/get-operations" \
    | "${jq}" '[ .entries[] | .operation ]' \
    > "${f}"
)
then
  log "saved ops history of ${p} (${projects[$p]}) to ${f}"
else
  error "getting ops history of ${p} (${projects[$p]}) failed!"
fi
echo
|
|
|
# ---------------------------- GET IMPORT HISTORY ---------------------------- # |
|
|
|
# get project metadata and filter import options history (requires jq)
p="csv file example"
echo "history of import for ${p}..."
# pipefail (limited to the subshell) lets a failed curl fail the pipeline;
# otherwise only the exit status of jq would be checked
if (
  set -o pipefail
  curl -fs --get \
    --data project="${projects[$p]}" \
    "${endpoint}/command/core/get-project-metadata" \
    | "${jq}" ".importOptionMetadata[0]"
)
then
  : #log "printed import history of ${p} (${projects[$p]})"
else
  error "getting import history of ${p} (${projects[$p]}) failed!"
fi
echo
|
|
|
# ------------------------------ DELETE PROJECT ------------------------------ # |
|
|
|
# delete a project (rarely needed for batch processing)
p="yet another csv example"
echo "delete project ${p}..."
# request parts: project id and the delete-project command URL
delete_args=(
  -fs
  --data project="${projects[$p]}"
  "${endpoint}/command/core/delete-project$(refine_csrf)"
)
if ! curl "${delete_args[@]}" > /dev/null; then
  error "deletion of ${p} (${projects[$p]}) failed!"
else
  log "deleted ${p} (${projects[$p]})"
fi
echo
|
|
|
# ================================== FINISH ================================== # |
|
|
|
# mark the beginning of the final phase
checkpoint "Finish"
echo

# stop OpenRefine server
refine_stop
echo

# calculate run time based on checkpoints
checkpoint_stats
echo

# word count on all files in workdir
count_output