Last active
April 4, 2025 13:54
-
-
Save bonnebulle/7bae067eb8c2fcdcc8505e03b447e049 to your computer and use it in GitHub Desktop.
pdfannots_auto.sh
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
## QUICKSTART --> search : | |
##### GO --- EDIT ME : | |
# V1.3 - add cleanup of JSON files + create md + add contents
## WHAT | |
# - Extracts highlight marks in PDFs (input) | |
# - Merge them by page | |
# - Filter/select by color, or keep any/all
# --> Output JSON data, then Markdown (readable)
# | |
# SOURCE -- https://softwarerecs.stackexchange.com/a/84569 | |
# | |
# ALT (opt.) use papis-extract <3 | |
###### | |
### IN USE --- source (zsh loading) | |
###### | |
## DEP / INSTALL | |
# pdfannots + jq (the code below calls jq; yq is what SOURCES links to) + sponge + fd (used by the "." shortcut)
### SOURCES : | |
# https://github.com/0xabu/pdfannots | |
# https://github.com/mikefarah/yq/#install | |
# https://command-not-found.com/sponge | |
################# | |
### USAGE EXAMPLES
## NO PARAMS --> AUTO == | |
# | |
# $ pdfannots_auto path_to_file.pdf | |
# --> pdfannots_auto all nocolor path_to_file.pdf | |
# | |
####### | |
## PARAMS / OPTIONS | |
# $ pdfannots_auto color_pls merge_mod path_to_file.pdf | |
# EX/ $ pdfannots_auto (vert|orange|rouge|jaune|any) (nocolor|merge|colors|any) ~/dir/path/filename.pdf
#### | |
#### | |
## | |
################# | |
# ALT (opt.) using PAPIS <3 | |
## papis-extract | |
## https://github.com/marty-oehme/papis-extract | |
## INSTALL | |
## $ python3 -m pipx install git+https://git.martyoeh.me/Marty/papis-extract.git --include-deps | |
## HELP | |
## $ papis extract --help | |
## USE | |
## $ papis extract --input pdf --doc-folder /path/papis/a5967030ebcebc5f558ae76137f8d5f7 --manual --write "author:Prenom_NOM" | |
################# | |
##### GO --- EDIT ME : | |
###### | |
## CONFIG / EDIT | |
# Command used to open the generated Markdown report.
editor_open=codium-insiders
# true: open the Markdown report with $editor_open; any other value only
# prints "editor_open_end FALSE".
editor_open_end=true
# false: normal run. Any other value makes _pdfannots_auto stop (return 1)
# right after the raw pdfannots JSON export.
debug=false
#######################################
# + COLORS --> colors_pls() --> SET PDF colors
# Translate the colour keyword held in $color into the hex code to filter on.
# Globals:
#   color        (read)    vert | orange | rouge | jaune | anything else
#   vert, orange, rouge, jaune
#                (written) hex code of each highlight colour -- EDIT ME
#   color_cible  (written) hex code to keep, or "all" when no colour matched
#   color_is     (written) true when a known colour was requested, else false
# Arguments: none
#######################################
colors_pls() {
  ## Colors
  vert="#33cc00"
  orange="#ffbe6f"
  rouge="#fc9494"
  jaune="#ffff01"

  case "$color" in
    vert)
      color_cible=$vert
      color_is=true
      ;;
    orange)
      color_cible=$orange
      color_is=true
      ;;
    rouge)
      color_cible=$rouge
      color_is=true
      ;;
    jaune)
      color_cible=$jaune
      color_is=true
      ;;
    *)
      # Default: unknown or missing keyword, keep every colour.
      color_cible="all"
      color_is=false
      ;;
  esac
}
#######################################
# Export the highlight annotations of a PDF to JSON with pdfannots, filter
# them by colour, optionally merge them per page, and render a Markdown
# report next to the PDF.
#
# Usage:
#   _pdfannots_auto file.pdf                  -> colour "all", merge "nocolor"
#   _pdfannots_auto COLOR MERGE file.pdf
#   _pdfannots_auto .                         -> PDF located with fd
#   More than 3 words: all words are glued back together and treated as one
#   (unquoted) path containing spaces; colour/merge fall back to defaults.
#
# COLOR: vert | orange | rouge | jaune, anything else keeps every colour.
# MERGE: "merge"  -> texts grouped per page;
#        "colors" -> grouped per page, keeping colour + note when no colour
#                    filter is active (otherwise same as "merge");
#        anything else -> flat list of annotations.
#
# Globals read: editor_open, editor_open_end, debug.
# Calls:        colors_pls, json_to_md; external pdfannots, jq, sponge, fd.
# Side effects: changes the current directory to the PDF's directory;
#               writes <name>___<color>.md there; the intermediate JSON
#               files are removed at the end.
# Returns:      1 on bad arguments, missing file, pdfannots failure, or when
#               debug is not "false" (stops after the raw JSON export).
#######################################
_pdfannots_auto() {
  local fname color merge err_ctx
  local filename filenoext parent_path forma
  local file_name file_final_name file_final_makd
  local color_cible color_is   # filled in by colors_pls (dynamic scoping)
  local stamp
  local -a meta

  # ---- arguments ---------------------------------------------------------
  if [[ $# -eq 0 ]]; then
    echo
    echo "Erreur: Pas de paramètres. Minimum : indiquer le path d'un fichier PDF"
    echo
    return 1
  fi

  if [[ $# -gt 3 ]]; then
    echo
    echo "Erreur: !!! Trop de paramètres. Maximum 3 paramètres autorisés."
    echo "--> Params default"
    echo "Usage: pdfannots_auto [couleur] [fusion] fichier.pdf"
    echo
    # Too many words usually means an unquoted path with spaces:
    # rebuild it as a single string and use the default options.
    fname="$*"
    color="all"
    merge="nocolor"
    err_ctx="pas d'arguments"
  elif [[ -z "$3" ]]; then
    # Fewer than three parameters: the first one is the file.
    fname="$1"
    color="all"
    merge="nocolor"
    err_ctx="default"
  else
    color="$1"
    merge="$2"
    fname="$3"
    err_ctx="default"
  fi

  # "." shortcut: look the PDF up with fd. The pattern is quoted and passed
  # as a glob so neither the shell nor fd's regex engine mangles it.
  if [[ "$1" == "." ]]; then
    fname="$(fd --type f --glob '*.pdf')"
  fi

  # Validate in every case; an empty or multi-line fd result fails here too.
  if [[ ! -f "$fname" ]]; then
    echo "Erreur: Le fichier $fname n'existe pas. ($err_ctx)"
    return 1
  fi
  echo "ok"

  ### COLORS -- sets color_cible / color_is from $color
  colors_pls

  # ---- paths -------------------------------------------------------------
  filename=$(basename -- "$fname")
  # Strip only the last extension (the former sed pattern could eat text in
  # the middle of the name, e.g. "my_pdf_file.pdf").
  filenoext="${filename%.*}"
  # Absolute directory, so every path stays valid after the cd below.
  parent_path="$(cd -- "$(dirname -- "$fname")" && pwd)" || return 1
  parent_path="${parent_path%/}/"
  fname="$parent_path$filename"
  forma="json"

  echo
  echo "PATH :"
  echo "parent_path -- $parent_path"
  echo "filename -- $filename"
  echo
  echo "PARAMS :"
  echo "color -- $color"
  echo "merge -- $merge"
  echo

  # Kept from the original: the calling shell ends up in the PDF's directory.
  cd -- "$parent_path" || return 1

  file_name="$parent_path$filenoext.$forma"
  file_final_name="$parent_path${filenoext}___$color.$forma"
  file_final_makd="$parent_path${filenoext}___$color.md"

  # ---- extraction: PDF annotations -> JSON -------------------------------
  if ! pdfannots "$fname" --format json -o "$file_name"; then
    echo "Erreur: pdfannots a échoué sur $fname"
    return 1
  fi

  # Debug mode stops here and keeps the raw JSON export.
  if [[ "$debug" != false ]]; then
    return 1
  fi

  stamp="$(date +%Y-%m-%d__%H:%M:%S)"
  meta=(
    --arg filename "$filename"
    --arg parent_path "$parent_path"
    --arg color "$color"
    --arg date "$stamp"
  )

  ## ADD + CLEANUP: add file info + date, keep page/text/color/contents only.
  jq "${meta[@]}" '{
    "filename": $filename,
    "parent_path": $parent_path,
    "color": $color,
    "date": $date,
    "annotations": [.[] | {page, text, color, contents}]
  }' "$file_name" | sponge "$file_name"

  ## Colour filter. With color_is=false, color_cible is "all", which no
  ## annotation colour equals, so "!=" keeps everything.
  if [[ "$color_is" == true ]]; then
    jq -s --arg cible "$color_cible" \
      '.[].annotations[] | select(.color == $cible) | {page, text, color, contents}' \
      "$file_name" > "$file_final_name"
  else
    jq -s --arg cible "$color_cible" \
      '.[].annotations[] | select(.color != $cible) | {page, text, color, contents}' \
      "$file_name" > "$file_final_name"
  fi

  ### FINAL -- optional merge per page
  if [[ "$merge" == "colors" && "$color_is" != true ]]; then
    # Merge per page, keeping colour and note of every entry.
    jq -s "${meta[@]}" '{
      "filename": $filename,
      "parent_path": $parent_path,
      "color": $color,
      "date": $date,
      "annotations": [{
        "pages": ([.[] | {page, text, color, contents}]
          | group_by(.page)
          | map({
              page: .[0].page,
              textes: (to_entries
                | map({(.key|tostring): {
                    text: .value.text,
                    color: .value.color,
                    contents: .value.contents
                  }})
                | add)
            }))
      }]
    }' "$file_final_name" | sponge "$file_final_name"
    echo 000
  elif [[ "$merge" == "merge" || "$merge" == "colors" ]]; then
    # Merge per page, text only ("colors" lands here only when a single
    # colour is already selected, so repeating it is unnecessary).
    jq -s "${meta[@]}" '{
      "filename": $filename,
      "parent_path": $parent_path,
      "color": $color,
      "date": $date,
      "annotations": [{
        "pages": ([.[] | {page, text, color, contents}]
          | group_by(.page)
          | map({
              page: .[0].page,
              textes: (to_entries | map({(.key|tostring): .value.text}) | add)
            }))
      }]
    }' "$file_final_name" | sponge "$file_final_name"
    echo 001
  else
    # No merge: flat list, one entry per annotation.
    jq -s "${meta[@]}" '{
      "filename": $filename,
      "parent_path": $parent_path,
      "color": $color,
      "date": $date,
      "annotations": [.[] | {page, text, color, contents}]
    }' "$file_final_name" | sponge "$file_final_name"
    echo 002
  fi

  # ---- Markdown conversion (before opening it in the editor) -------------
  json_to_md "$file_final_name"

  ## EDITOR (opt.)
  if [[ "$editor_open_end" == true ]]; then
    # $editor_open is left unquoted on purpose (command name from config).
    $editor_open "$file_final_makd"
  else
    echo "editor_open_end FALSE"
  fi

  ### CLEANUP JSON data (opt.)
  if [[ "$debug" == false ]]; then
    rm -- "$file_name"
    rm -- "$file_final_name"
  fi

  echo "SOURCE_FILE / PWD"
  echo "/var/www/cms/grav_dn/user/snippets/md_tools/pdfannots_auto.sh"
}
#######################################
# Render the annotation JSON produced by the extraction step as Markdown.
# The report is written next to the input, with the extension replaced by
# ".md" (an existing file is overwritten).
# Arguments:
#   $1 - path to the JSON file (keys: filename, parent_path, date, color,
#        annotations)
# Outputs:   the Markdown file; an error message on stderr if $1 is missing
# Returns:   1 when the input file does not exist, otherwise the status of
#            the final sed
# Requires:  jq, and a sed that supports -i and "\n" in replacements (GNU)
#######################################
json_to_md() {
  local input_file="$1"
  local output_file="${input_file%.*}.md"
  # Local so the caller's variables of the same name are not overwritten.
  local filename parent_path date color

  if [[ ! -f "$input_file" ]]; then
    echo "Erreur: Le fichier $input_file n'existe pas." >&2
    return 1
  fi

  # Metadata
  filename=$(jq -r .filename "$input_file")
  parent_path=$(jq -r .parent_path "$input_file")
  date=$(jq -r .date "$input_file")
  color=$(jq -r .color "$input_file")

  # Header. printf is used because echo only turns "\n" into a newline in
  # some shells (zsh does, bash does not).
  {
    printf '# Annotations PDF \nNAME : %s \nPATH : %s\n' "$filename" "$parent_path"
    printf 'DATE : %s\n' "$date"
    printf 'COLOR : %s\n' "$color"
  } > "$output_file"

  # The JSON comes in two shapes: merged per page (.annotations[0].pages
  # exists) or a flat list of annotations.
  if jq -e '.annotations[0].pages' "$input_file" > /dev/null 2>&1; then
    # Merged format: entries are either plain strings (text only) or
    # objects carrying text + color + contents.
    jq -r '.annotations[0].pages[] | "\n\n\n## Page \(.page)\n" + (.textes | to_entries | map(
      if (.value | type) == "object" then
        "- \(.value.color)\(.value.contents) --- \(.value.text)"
      else
        "- \(.value)"
      end
    ) | join("\n\n"))' "$input_file" >> "$output_file"
  else
    # Flat format: group by page here.
    jq -r '.annotations | group_by(.page) | .[] | "\n\n\n## Page \(.[0].page)\n" + (map("\n- \(.color) --note : **[\(.contents)]** \n\(.text)") | join("\n"))' "$input_file" >> "$output_file"
  fi

  # Drop the note marker of annotations without a note, then turn the
  # remaining markers into a "NOTE :" heading on its own line.
  sed -i \
    -e 's/--note : \*\*\[null\]\*\*//g' \
    -e 's/--note : /\nNOTE :\n/g' \
    "$output_file"

  ### COLORS (opt.)
  # colors_pls_replace
}
# colors_pls_replace() { | |
# # Remplacer les codes de couleur par leurs noms | |
# sed -i 's/#00ff00/[vert]/g' "$output_file" | |
# sed -i 's/#ffbe6f/[orange]/g' "$output_file" | |
# sed -i 's/#fc9494/[rouge]/g' "$output_file" | |
# sed -i 's/#ffff00/[jaune]/g' "$output_file" | |
# } | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment