Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Save bonnebulle/7bae067eb8c2fcdcc8505e03b447e049 to your computer and use it in GitHub Desktop.
Save bonnebulle/7bae067eb8c2fcdcc8505e03b447e049 to your computer and use it in GitHub Desktop.
pdfannots_auto.sh
## QUICKSTART --> search :
##### GO --- EDIT ME :
# V1.3 - add cleanup of JSON files + create md + add contents
## WHAT
# - Extracts highlight marks in PDFs (input)
# - Merge them by page
# - Filter/select by color, or keep any/all
# --> Output JSON data, then Markdown (readable)
#
# SOURCE -- https://softwarerecs.stackexchange.com/a/84569
#
# ALT (opt.) use papis-extract <3
######
### IN USE --- source (zsh loading)
######
## DEP / INSTALL
# pdfannots + jq + sponge (the code below calls `jq`, not `yq`; `fd` is only needed for the "." shortcut)
### SOURCES :
# https://github.com/0xabu/pdfannots
# https://github.com/mikefarah/yq/#install
# https://command-not-found.com/sponge
#################
### USAGE EXAMPLES
## NO PARAMS --> AUTO ==
#
# $ pdfannots_auto path_to_file.pdf
# --> pdfannots_auto all nocolor path_to_file.pdf
#
#######
## PARAMS / OPTIONS
# $ pdfannots_auto color_pls merge_mod path_to_file.pdf
# EX/ $ pdfannots_auto (vert|orange|rouge|jaune|any) (nocolor|merge|colors|any) ~/dir/path/filename.pdf
####
####
##
#################
# ALT (opt.) using PAPIS <3
## papis-extract
## https://github.com/marty-oehme/papis-extract
## INSTALL
## $ python3 -m pipx install git+https://git.martyoeh.me/Marty/papis-extract.git --include-deps
## HELP
## $ papis extract --help
## USE
## $ papis extract --input pdf --doc-folder /path/papis/a5967030ebcebc5f558ae76137f8d5f7 --manual --write "author:Prenom_NOM"
#################
##### GO --- EDIT ME :
######
## CONFIG / EDIT
# Command used to open the generated Markdown file.
editor_open="codium-insiders"
# "true" -> open the Markdown file in $editor_open once processing is done.
editor_open_end="true"
# Any value other than "false" makes _pdfannots_auto stop right after the
# raw pdfannots export (the raw JSON is then left on disk).
debug="false"
# + COLORS --> colors_pls() --> SET PDF colors
# Translate the colour name held in the global $color into the hex code
# to filter on.
# Globals read:    color
# Globals written: vert, orange, rouge, jaune (palette),
#                  color_cible (hex code, or "all" when $color is not a known name),
#                  color_is    (true when $color named a palette entry, false otherwise)
colors_pls() {
  ## Palette: hex codes matched against each annotation's "color" field
  vert="#33cc00"
  orange="#ffbe6f"
  rouge="#fc9494"
  jaune="#ffff01"

  ## Default: no specific colour requested, keep every colour
  color_cible="all"
  color_is=false

  case "$color" in
    vert)   color_cible="$vert" ;;
    orange) color_cible="$orange" ;;
    rouge)  color_cible="$rouge" ;;
    jaune)  color_cible="$jaune" ;;
    *)      return 0 ;;
  esac
  color_is=true
}
# Extract the annotations of a PDF with pdfannots, filter them by highlight
# colour, optionally group them by page, then render the result as Markdown.
#
# Usage: _pdfannots_auto [color merge] file.pdf
#   1-2 args   $1 is the PDF; color defaults to "all", merge to "nocolor"
#              (a second argument is ignored).
#   3 args     $1 = colour name (see colors_pls; any other word = no filter),
#              $2 = merge mode, $3 = PDF.
#   > 3 args   a warning is printed, then all arguments are joined with spaces
#              and treated as ONE (unquoted) file path; defaults apply.
#   $1 = "."   the PDF is looked up with fd below the current directory.
# Merge modes:
#   "colors"   group by page; keep text + colour + note when no colour filter
#              is active, text only when one is.
#   "merge"    group by page, text only.
#   other      flat list of annotations.
# Globals read: editor_open, editor_open_end, debug.
# Calls:        colors_pls, json_to_md; external pdfannots, jq, sponge (and fd).
# Side effects: changes the working directory to the PDF's directory, writes
#               <name>___<color>.md next to the PDF, removes the intermediate
#               JSON files.
# Returns:      1 on bad arguments, missing file, failing cd / pdfannots /
#               conversion, or when $debug is not "false" (raw JSON is kept).
_pdfannots_auto() {
  # Working variables are local so nothing leaks into the sourcing shell.
  # color_cible / color_is are assigned by colors_pls (dynamic scoping).
  local fname color merge err_tag
  local color_cible color_is
  local filename filenoext parent_path forma
  local file_name file_final_name file_final_makd
  local annotations_expr marker
  local -a meta

  ## ARGUMENTS
  if [ $# -eq 0 ]; then
    echo
    echo "Erreur: Pas de paramètres. Minimum : indiquer le path d'un fichier PDF"
    echo
    return 1
  fi

  color="all"
  merge="nocolor"
  if [ $# -gt 3 ]; then
    echo
    echo "Erreur: !!! Trop de paramètres. Maximum 3 paramètres autorisés."
    echo "--> Params default"
    echo "Usage: pdfannots_auto [couleur] [fusion] fichier.pdf"
    echo
    # Most likely an unquoted path containing spaces: glue it back together.
    fname="$*"
    err_tag="pas d'arguments"
  else
    err_tag="default"
    if [ -n "${3:-}" ]; then
      color="$1"
      merge="$2"
      fname="$3"
    else
      fname="$1"
    fi
  fi

  # "." shortcut: search for a PDF with fd. The glob is quoted so fd receives
  # it (instead of the shell expanding it first) and only the first hit is
  # kept, since the rest of the function handles a single file.
  if [[ "$1" == "." ]]; then
    fname="$(fd --type f --glob '*.pdf' | head -n 1)"
  fi

  if [ ! -f "$fname" ]; then
    echo "Erreur: Le fichier $fname n'existe pas. ($err_tag)"
    return 1
  fi
  echo "ok"

  ### COLORS
  colors_pls

  ## FILE
  filename=$(basename -- "$fname")
  # Strip only the final ".ext" (a sed with an unescaped "." also matched
  # earlier characters, e.g. "mypdf_doc.pdf" -> "m_doc.pdf").
  filenoext="${filename%.*}"
  forma="json"

  # Enter the PDF's directory first, then rebuild every path from $PWD so a
  # relative input path stays valid after the cd.
  cd -- "$(dirname -- "$fname")" || {
    echo "Erreur: Impossible d'entrer dans le dossier de $fname" >&2
    return 1
  }
  parent_path="${PWD%/}/"
  fname="$parent_path$filename"

  echo
  echo "PATH :"
  echo "parent_path -- $parent_path"
  echo "filename -- $filename"
  echo
  echo "PARAMS :"
  echo "color -- $color"
  echo "merge -- $merge"
  echo

  ## PATHS
  file_name="${parent_path}${filenoext}.${forma}"
  file_final_name="${parent_path}${filenoext}___${color}.${forma}"
  file_final_makd="${parent_path}${filenoext}___${color}.md"

  ##### COMMAND
  ## Dump the PDF annotations as JSON
  pdfannots "$fname" --format json -o "$file_name" || {
    echo "Erreur: pdfannots a échoué sur $fname" >&2
    return 1
  }

  ## Debug mode: stop here and keep the raw pdfannots JSON for inspection.
  if [[ $debug != false ]]; then
    return 1
  fi

  ## Metadata shared by every jq call below
  meta=(
    --arg filename "$filename"
    --arg parent_path "$parent_path"
    --arg color "$color"
    --arg date "$(date +%Y-%m-%d__%H:%M:%S)"
  )

  ## ADD + CLEANUP: wrap the annotations with file info + date,
  ## keeping only page + text + color + contents
  jq "${meta[@]}" \
    '{"filename": $filename, "parent_path": $parent_path, "color": $color, "date": $date, "annotations": [.[] | {page, text, color, contents}]}' \
    "$file_name" | sponge "$file_name"

  ## COLOUR FILTER: the target is passed with --arg rather than spliced
  ## into the jq program text. With color_cible="all" the "!=" form keeps
  ## every annotation.
  if [[ $color_is == true ]]; then
    jq -s --arg cible "$color_cible" \
      '.[].annotations[] | select(.color == $cible) | {page, text, color, contents}' \
      "$file_name" > "$file_final_name"
  else
    jq -s --arg cible "$color_cible" \
      '.[].annotations[] | select(.color != $cible) | {page, text, color, contents}' \
      "$file_name" > "$file_final_name"
  fi

  ## MERGE: pick the jq expression that builds the "annotations" value
  if [[ $merge == "colors" && $color_is != true ]]; then
    # Grouped by page, each entry keeps text + colour + note.
    annotations_expr='[{"pages": ([.[] | {page, text, color, contents}] | group_by(.page) | map({page: .[0].page, textes: (to_entries | map({(.key|tostring): {text: .value.text, color: .value.color, contents: .value.contents}}) | add)}))}]'
    marker="000"
  elif [[ $merge == "merge" || $merge == "colors" ]]; then
    # Grouped by page, text only (colour is redundant once a filter is active).
    annotations_expr='[{"pages": ([.[] | {page, text, color, contents}] | group_by(.page) | map({page: .[0].page, textes: (to_entries | map({(.key|tostring): .value.text}) | add)}))}]'
    marker="001"
  else
    # No merge: flat list, one entry per annotation.
    annotations_expr='[.[] | {page, text, color, contents}]'
    marker="002"
  fi
  jq -s "${meta[@]}" \
    '{"filename": $filename, "parent_path": $parent_path, "color": $color, "date": $date, "annotations": '"$annotations_expr"'}' \
    "$file_final_name" | sponge "$file_final_name"
  echo "$marker"

  ## MARKDOWN: convert before opening the editor so the file exists.
  ## On failure the JSON files are kept for inspection.
  json_to_md "$file_final_name" || {
    echo "Erreur: conversion Markdown échouée pour $file_final_name" >&2
    return 1
  }

  ## EDITOR (opt.)
  if [[ $editor_open_end == true ]]; then
    # Left unquoted on purpose so $editor_open may carry extra flags under bash.
    # shellcheck disable=SC2086
    $editor_open "$file_final_makd"
  else
    echo "editor_open_end FALSE"
  fi

  ### CLEANUP of the intermediate JSON files
  rm -f -- "$file_name" "$file_final_name"

  echo SOURCE_FILE / PWD
  echo /var/www/cms/grav_dn/user/snippets/md_tools/pdfannots_auto.sh
}
# Render a JSON file with filename / parent_path / date / color /
# annotations keys as Markdown.
# Arguments: $1 - path to the JSON file.
# Output:    writes "<$1 without its extension>.md" (overwritten if present).
# Returns:   1 when $1 is not a file or the header cannot be written,
#            otherwise the status of the last jq call.
json_to_md() {
  local input_file="$1"
  local output_file="${input_file%.*}.md"

  if [ ! -f "$input_file" ]; then
    echo "Erreur: Le fichier $input_file n'existe pas. (json_to_md)" >&2
    return 1
  fi

  # Header. Built by jq so the line breaks do not depend on whether the
  # shell's echo interprets "\n" (bash does not, zsh does), and so no
  # shell variables of the caller are overwritten.
  jq -r '"# Annotations PDF \nNAME : \(.filename) \nPATH : \(.parent_path)\nDATE : \(.date)\nCOLOR : \(.color)"' \
    "$input_file" > "$output_file" || return 1

  # Detect the layout: merged files carry .annotations[0].pages
  if jq -e '.annotations[0].pages' "$input_file" > /dev/null 2>&1; then
    # Merged layout: one section per page; an entry is either a plain string
    # or an object with text / color / contents. A missing note prints as
    # nothing rather than the literal "null".
    jq -r '.annotations[0].pages[] | "\n\n\n## Page \(.page)\n" + (.textes | to_entries | map(
      if (.value | type) == "object" then
        "- \(.value.color)\(.value.contents // "") --- \(.value.text)"
      else
        "- \(.value)"
      end
    ) | join("\n\n"))' "$input_file" >> "$output_file"
  else
    # Flat layout: group by page here. The NOTE block is emitted only when
    # the annotation has contents; this replaces a GNU-only "sed -i"
    # post-processing pass that could also rewrite annotation text.
    jq -r '.annotations | group_by(.page) | .[] | "\n\n\n## Page \(.[0].page)\n" + (map(
      "\n- \(.color) "
      + (if .contents == null then " " else "\nNOTE :\n**[\(.contents)]** " end)
      + "\n\(.text)"
    ) | join("\n"))' "$input_file" >> "$output_file"
  fi
  ### COLORS (opt.)
  # colors_pls_replace
}
# colors_pls_replace() {
# # Remplacer les codes de couleur par leurs noms
# sed -i 's/#00ff00/[vert]/g' "$output_file"
# sed -i 's/#ffbe6f/[orange]/g' "$output_file"
# sed -i 's/#fc9494/[rouge]/g' "$output_file"
# sed -i 's/#ffff00/[jaune]/g' "$output_file"
# }
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment