```bash
urlencode() {
    # urlencode <string>
    old_lc_collate=$LC_COLLATE
    LC_COLLATE=C

    local length="${#1}"
    for (( i = 0; i < length; i++ )); do
        local c="${1:$i:1}"
        case $c in
            [a-zA-Z0-9.~_-]) printf '%s' "$c" ;;
            *) printf '%%%02X' "'$c" ;;
        esac
    done

    LC_COLLATE=$old_lc_collate
}

urldecode() {
    # urldecode <string>
    local url_encoded="${1//+/ }"
    printf '%b' "${url_encoded//%/\\x}"
}
```
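For example, with the functions above loaded in a shell, a simple round trip:

```bash
$ urlencode 'foo bar/baz?'
foo%20bar%2Fbaz%3F
$ urldecode 'foo%20bar%2Fbaz%3F'
foo bar/baz?
```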
This works for me.
https://stackoverflow.com/questions/296536/how-to-urlencode-data-for-curl-command
```bash
rawurlencode() {
  local string="${1}"
  local strlen=${#string}
  local encoded=""
  local pos c o

  for (( pos=0 ; pos<strlen ; pos++ )); do
    c=${string:$pos:1}
    case "$c" in
      [-_.~a-zA-Z0-9] ) o="${c}" ;;
      * )               printf -v o '%%%02x' "'$c"
    esac
    encoded+="${o}"
  done
  echo "${encoded}"    # You can either set a return variable (FASTER)
  REPLY="${encoded}"   #+or echo the result (EASIER)... or both... :p
}
```
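A usage sketch, showing both return styles the trailing comments mention:

```bash
$ rawurlencode 'a b/c'   # prints the encoded string...
a%20b%2fc
$ echo "$REPLY"          # ...which is also left in $REPLY
a%20b%2fc
```

Note that this variant emits lowercase hex (`%2f`) where the gist's `%02X` format emits uppercase (`%2F`); RFC 3986 treats the two as equivalent.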
@ThePredators this breaks on unicode:

```
input:    a:/b c?d=e&f#g-+-`-´-ä-€
input hex:
\x61\x3A\x2F\x62\x20\x63\x3F\x64\x3D\x65\x26\x66\x23\x67\x2D\x2B\x2D\x60\x2D\xC2\xB4\x2D\xC3\xA4\x2D\xE2\x82\xAC
input hexdump:
00000000  61 3a 2f 62 20 63 3f 64  3d 65 26 66 23 67 2d 2b  |a:/b c?d=e&f#g-+|
00000010  2d 60 2d c2 b4 2d c3 a4  2d e2 82 ac              |-`-..-..-...|
0000001c

actual:   a%3A%2Fb%20c%3Fd%3De%26f%23g-%2B-%60-%B4-%E4-%20AC
expected: a%3A%2Fb%20c%3Fd%3De%26f%23g-%2B-%60-%C2%B4-%C3%A4-%E2%82%AC
```
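The likely culprit, assuming a UTF-8 locale: `"'$c"` gives printf the Unicode codepoint rather than a byte, and `%02X` prints however many hex digits that codepoint needs:

```bash
$ printf '%%%02X\n' "'´"   # U+00B4: one byte of hex, missing the C2 lead byte
%B4
$ printf '%%%02X\n' "'€"   # U+20AC: four hex digits that merely resemble "%20AC"
%20AC
```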
@ThePredators works like a charm 👍
Hi,
Characters used in France (é è à ù ê â û ...) are not handled correctly if you work in the fr_FR locale. You need to convert your data source from Windows-1252 to UTF-8 before passing it to the function:

```bash
data_utf8=$(echo "$data_ISO" | iconv -f iso8859-1 -t utf-8)
```
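A minimal sketch of that conversion, using a hypothetical `data_ISO` holding a Latin-1 "é" (single byte `0xE9`):

```bash
data_ISO=$(printf '\xe9')                                           # "é" as one Latin-1 byte
data_utf8=$(printf '%s' "$data_ISO" | iconv -f iso8859-1 -t utf-8)
printf '%s' "$data_utf8" | od -An -t x1                             # c3 a9, the UTF-8 bytes
```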
```bash
#!/bin/bash
## Written by Adam Danischewski 08/04/2024
declare CURR_ORD
str="${1:-😄.mp4}"

## Set CURR_ORD to the numeric codepoint of the given character.
function ord() {
  printf -v CURR_ORD "%d" "\"$1"
}

## Succeeds when the input contains multi-byte (non-ASCII) characters.
function has_unicode() {
  local input="$1"
  local -i charcnt=$(wc -m <<<"$input")
  local -i bytecnt=$(wc -c <<<"$input")
  ((charcnt != bytecnt))
}

## Percent-encode every byte of each non-ASCII character.
function encode_unicode() {
  local i char
  for ((i = 0; i < ${#str}; i++)); do
    char=${str:i:1}
    ord "$char"
    if ((CURR_ORD > 127)); then  # non-ASCII: emit each UTF-8 byte as %XX
      od -t x1 <<<"$char" | awk '{$1="";gsub("^[[:space:]]*","");for(i=1;i<NF;i++) printf "%%" toupper($i);}'
    else
      printf "%s" "$char"
    fi
  done
}

## Tokenize percents before encoding unicode
function tokenize_orig_pcts() {
  sed 's/%/\x01/g'
}

## Tokenize percents after encoding unicode, since those are already urlencoded
function tokenize_pcts() {
  sed 's/%/\x02/g'
}

function detokenize_orig_pcts() {
  sed 's/\x01/%/g'
}

function detokenize_pcts() {
  sed 's/\x02/%/g'
}

## Percent-encode reserved ASCII characters ("\x5c\x24" and "\x5c\x5B" expand
## to the regex-escaped "$" and "[").
function urlencode() {
  sed "s/\x25/%25/g;s/\x20/%20/g;s/\x21/%21/g;s/\x22/%22/g;s/\x23/%23/g;s/\x5c\x24/%24/g;\
s/\x26/%26/g;s/\x27/%27/g;s/\x28/%28/g;s/\x29/%29/g;s/\x2A/%2A/g;s/\x2B/%2B/g;\
s/\x2C/%2C/g;s/\x3A/%3A/g;s/\x3F/%3F/g;s/\x7C/%7C/g;s/\x5c\x5B/%5B/g"
}

function main() {
  if has_unicode "$str"; then
    str=$(tokenize_orig_pcts <<<"$str")    # protect literal % from the input
    str=$(encode_unicode)                  # non-ASCII chars -> %XX%XX...
    str=$(tokenize_pcts <<<"$str")         # protect the % emitted above
    str=$(detokenize_orig_pcts <<<"$str")  # restore input % so it gets encoded
    str=$(urlencode <<<"$str")             # encode reserved ASCII (incl. %)
    detokenize_pcts <<<"$str"              # restore the protected unicode %
  else
    urlencode <<<"$str"
  fi
}

main
```
This matches (according to my tests) the output of `jq -jRr '@uri'`.
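For instance, saving the script as `urlencode.sh` (a hypothetical filename) and comparing with jq:

```bash
$ ./urlencode.sh '😄 file (1).mp4'
%F0%9F%98%84%20file%20%281%29.mp4
$ jq -jRr '@uri' <<< '😄 file (1).mp4'
%F0%9F%98%84%20file%20%281%29.mp4
```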
Great functions 👍
Unfortunately none of the decode options work with German umlauts:
Example:

```
Encode: Günther   -> G%FCnther
Decode: G%FCnther -> G�nther
```

It seems to be an encoding issue. I tried adding `| iconv -f iso8859-1 -t utf-8` to the decode function from the start post, as @Twibow says, but it changes nothing.
Any help appreciated 😄
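One likely cause, sketched under the assumption that `G%FCnther` was produced from Latin-1 input: `%FC` decodes to the single Latin-1 byte for "ü", which is not valid UTF-8 on its own, so the terminal shows `�`. The iconv conversion has to happen after decoding, not before:

```bash
$ urldecode 'G%FCnther' | iconv -f iso8859-1 -t utf-8
Günther
```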
`LC_ALL=C` is needed to support unicode, i.e. to loop over bytes, not characters. `LC_COLLATE=C` or `LANG=C` do not work. It also must be set before `${#1}`, so that the length of `$1` is measured in bytes.
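A sketch of the gist's function with that change applied (same logic, only the locale handling differs):

```bash
urlencode() {
    # urlencode <string> -- byte-wise, so multi-byte UTF-8 characters survive
    local old_lc_all=$LC_ALL
    LC_ALL=C  # set before ${#1} so lengths and slices are in bytes

    local length="${#1}"
    local i c
    for (( i = 0; i < length; i++ )); do
        c="${1:$i:1}"
        case $c in
            [a-zA-Z0-9.~_-]) printf '%s' "$c" ;;
            *) printf '%%%02X' "'$c" ;;
        esac
    done

    LC_ALL=$old_lc_all
}
```

With this version, `urlencode 'ä'` yields `%C3%A4` (both UTF-8 bytes) rather than the codepoint-based `%E4`.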