-
-
Save cdown/1163649 to your computer and use it in GitHub Desktop.
urlencode() {
  # urlencode <string>
  #
  # Percent-encode <string> and write the result to stdout (no trailing
  # newline). Every byte outside the unreserved set [A-Za-z0-9.~_-] is
  # written as %XX with uppercase hex digits.

  # With LC_ALL=C, ${#1} and ${1:i:1} operate on bytes instead of
  # characters, so a multi-byte UTF-8 character is encoded one byte at a
  # time, and the a-z / A-Z ranges below cover ASCII letters only.
  # Declaring it local limits the locale change to this call.
  local LC_ALL=C
  local i c code
  local length=${#1}
  for (( i = 0; i < length; i++ )); do
    c=${1:i:1}
    case $c in
      [a-zA-Z0-9.~_-])
        printf '%s' "$c"
        ;;
      *)
        # A leading quote makes printf yield the numeric value of the
        # character. The mask keeps the low byte only, in case the C
        # library reports a high byte as a wider value.
        printf -v code '%d' "'$c"
        printf '%%%02X' "$(( code & 0xFF ))"
        ;;
    esac
  done
}
urldecode() {
  # urldecode <string>
  #
  # Decode a percent-encoded <string> and write the result to stdout (no
  # trailing newline). A '+' is treated as an encoded space.
  local url_encoded=${1//+/ }
  # Double every literal backslash first: printf %b would otherwise expand
  # sequences such as \n or \t that were already present in the input.
  url_encoded=${url_encoded//\\/\\\\}
  # Rewrite each %XX as \xXX and let printf %b emit the corresponding byte.
  printf '%b' "${url_encoded//%/\\x}"
}
Good call @cdown. I had the urldecode call in a command substitution - urldecoded=$(urldecode 's3://...')
. Once I removed the function call from the command substitution, the spaces were retained from the encoding. Now I just need to find a better way to declare the result as a variable...
Edit: double-quoting the variable wherever it is expanded in downstream commands fixed my issue, i.e. echo "$varname"
just a brief nod to mawk
which is five times faster in my tests
(indeed, often faster than sed)
I know it's not a de facto standard like bash
i.e. installed by default on so many systems
but it should be and it is on my systems
I also notice that bash seems to be catching up with ksh93
One line implementation, suitable for storing in .bashrc
# urle <string>: percent-encode <string> byte by byte (LC_ALL=C, so a multi-byte character becomes one %XX per byte); prints the result plus a newline, returns 1 for an empty argument.
urle () { [[ "${1}" ]] || return 1; local LC_ALL=C i x n; for (( i = 0; i < ${#1}; i++ )); do x="${1:i:1}"; if [[ "${x}" == [a-zA-Z0-9.~_-] ]]; then printf '%s' "${x}"; else printf -v n '%d' "'${x}"; printf '%%%02X' "$(( n & 255 ))"; fi; done; echo; }
# urld <string>: decode a percent-encoded <string> ('+' becomes a space); backslashes in the input are kept literally; prints the result plus a newline, returns 1 for an empty argument.
urld () { [[ "${1}" ]] || return 1; local s="${1//+/ }"; s=${s//\\/\\\\}; printf '%b\n' "${s//%/\\x}"; }
Thanks for it!
Thanks for this.
Could you please also license this code of yours?
Thanks for the script, but I don't know why, when calling urlencode, I get a % at the end of the encoded data!
I had to add a check for systems where LC_COLLATE is not set:
if [ -n "$old_lc_collate" ] ; then LC_COLLATE=$old_lc_collate ; fi
LC_ALL=C is needed to support Unicode, so that the loop iterates over bytes, not characters.
LC_COLLATE=C or LANG=C do not work.
It must also be set before ${#1} is evaluated, so that the length of $1 is measured in bytes.
#!/usr/bin/env bash
# MIT License
# encode special characters per RFC 3986
urlencode() {
  # Percent-encode $1 and print the result followed by a newline.
  # LC_ALL=C switches the length and substring expansions below to bytes,
  # so a multi-byte character is escaped one byte at a time.
  local LC_ALL=C
  local text=$1 byte
  local -i pos=0 total=${#text}
  while (( pos < total )); do
    byte=${text:pos:1}
    case "$byte" in
      # Only the unreserved characters pass through; ;,/?:@&=+$!*'()# are
      # escaped too, like encodeURIComponent in javascript.
      # To leave ;,/?:@&=+$!*'()# alone (like encodeURI), use this pattern instead:
      #[-_.~A-Za-z0-9\;,/?:@\&=+\$!*\'\(\)#])
      [-_.~A-Za-z0-9]) printf '%s' "$byte" ;;
      *) printf '%%%02X' "'$byte" ;;
    esac
    (( pos += 1 ))
  done
  echo
}
_test_urlencode() {
  # Self-test: encode a fixed sample containing reserved ASCII characters
  # and multi-byte UTF-8 characters, then compare with the expected
  # encoding. On a mismatch, print diagnostics and exit with status 1.
  local fname=urlencode
  # Raw byte sequences for the non-trivial characters in the sample.
  local backtick=$'\x60'     # ` = %60
  local acute=$'\xC2\xB4'    # ´ = %C2%B4
  local auml=$'\xC3\xA4'     # ä = %C3%A4
  local euro=$'\xE2\x82\xAC' # € = %E2%82%AC
  local input="a:/b c?d=e&f#g-+-;-,-@-\$-!-*-'-(-)-#-$backtick-$acute-$auml-$euro"
  # The %02X format string yields uppercase hex digits.
  # This expectation assumes ;,/?:@&=+$!*'()# are encoded as well.
  local expected="a%3A%2Fb%20c%3Fd%3De%26f%23g-%2B-%3B-%2C-%40-%24-%21-%2A-%27-%28-%29-%23-%60-%C2%B4-%C3%A4-%E2%82%AC"
  # Expectation when ;,/?:@&=+$!*'()# are left unencoded:
  #local expected="a:/b%20c?d=e&f#g-+-;-,-@-\$-!-*-'-(-)-#-%60-%C2%B4-%C3%A4-%E2%82%AC"
  local actual="$($fname "$input")"
  if [[ "$actual" == "$expected" ]]; then
    return 0
  fi
  echo "error in $fname"
  # Debug output: the input as text, as \xHH escapes, and as a hexdump.
  echo "input: $input"
  echo "input hex:"
  printf '%s' "$input" | hexdump -v -e '/1 "%02X"' | sed 's/\(..\)/\\x\1/g'
  echo
  echo "input hexdump:"
  printf '%s' "$input" | hexdump -C
  printf "actual: "
  echo "$actual"
  printf "expected: "
  echo "$expected"
  exit 1
}
# Run the self-test; on a mismatch it prints diagnostics and exits with status 1.
_test_urlencode
This works for me.
https://stackoverflow.com/questions/296536/how-to-urlencode-data-for-curl-command
rawurlencode() {
  # rawurlencode <string>
  #
  # Percent-encode <string>: every byte outside [-_.~a-zA-Z0-9] becomes
  # %xx (lowercase hex). The result is printed to stdout followed by a
  # newline and is also stored in REPLY, so a caller can skip the command
  # substitution.

  # LC_ALL=C makes the length and substring expansions byte-based, so a
  # multi-byte UTF-8 character is encoded one byte at a time.
  local LC_ALL=C
  local string="${1}"
  local strlen=${#string}
  local encoded=""
  local pos c o code
  for (( pos = 0; pos < strlen; pos++ )); do
    c=${string:pos:1}
    case "$c" in
      [-_.~a-zA-Z0-9])
        o="${c}"
        ;;
      *)
        # A leading quote makes printf yield the numeric value of the
        # character; the mask keeps the low byte only.
        printf -v code '%d' "'$c"
        printf -v o '%%%02x' "$(( code & 0xFF ))"
        ;;
    esac
    encoded+="${o}"
  done
  # printf rather than echo: a result such as "-n" must be printed as-is.
  printf '%s\n' "${encoded}"
  REPLY="${encoded}"
}
@ThePredators this breaks on unicode
input: a:/b c?d=e&f#g-+-`-´-ä-€
input hex:
\x61\x3A\x2F\x62\x20\x63\x3F\x64\x3D\x65\x26\x66\x23\x67\x2D\x2B\x2D\x60\x2D\xC2\xB4\x2D\xC3\xA4\x2D\xE2\x82\xAC
input hexdump:
00000000 61 3a 2f 62 20 63 3f 64 3d 65 26 66 23 67 2d 2b |a:/b c?d=e&f#g-+|
00000010 2d 60 2d c2 b4 2d c3 a4 2d e2 82 ac |-`-..-..-...|
0000001c
actual: a%3A%2Fb%20c%3Fd%3De%26f%23g-%2B-%60-%B4-%E4-%20AC
expected: a%3A%2Fb%20c%3Fd%3De%26f%23g-%2B-%60-%C2%B4-%C3%A4-%E2%82%AC
@ThePredators works like a charm 👍
Hi,
Characters used in France are not taken into account: (é è à ù ê â û ...) if you work in fr_FR locale.
You need to convert your data source from its legacy encoding (Windows-1252 / ISO-8859-1) to UTF-8 before passing it to the function:
data_utf8=$(echo "$data_ISO" | iconv -f iso8859-1 -t utf-8)
#!/bin/bash
## Written by Adam Danischewski 08/04/2024
# Written by ord(): the decimal value printf reports for the character it was given.
declare CURR_ORD
# The string to encode: the first command-line argument, or a sample file
# name when no argument is given.
str="${1:-😄.mp4}"
# Store the numeric value of the first character of $1 in the global
# CURR_ORD (printf treats an argument that starts with a quote as
# "value of the next character").
function ord() {
  local glyph=$1
  printf -v CURR_ORD '%d' "'${glyph}"
}
# Succeed (status 0) when the character count and the byte count that wc
# reports for $1 differ; return 1 when the two counts are equal.
function has_unicode() {
  local text="$1"
  local -i n_chars n_bytes
  n_chars=$(wc -m <<<"$text")
  n_bytes=$(wc -c <<<"$text")
  if (( n_chars == n_bytes )); then
    return 1
  fi
  return 0
}
# Percent-encode selected ASCII characters in the text read from stdin and
# write the result to stdout. A single sed call applies one substitution
# per character, each character written as a \xHH hex escape. The rule for
# % (\x25) comes first, so the % signs produced by the later rules are not
# encoded again.
# NOTE(review): the \x5c\x24 and \x5c\x5B rules place a backslash escape
# (\x5c) in front of $ and [ - presumably to match those characters
# literally; confirm how the installed sed interprets that.
# NOTE(review): another urlencode is defined further down in this file;
# being defined later, it replaces this one. It has no \x2D (-) rule.
function urlencode() {
sed "s/\x25/%25/g;s/\x20/%20/g;s/\x21/%21/g;s/\x22/%22/g;s/\x23/%23/g;s/\x5c\x24/%24/g;\
s/\x26/%26/g;s/\x27/%27/g;s/\x28/%28/g;s/\x29/%29/g;s/\x2A/%2A/g;s/\x2B/%2B/g;\
s/\x2C/%2C/g;s/\x2D/%2D/g;s/\x3A/%3A/g;s/\x3F/%3F/g;s/\x7C/%7C/g;s/\x5c\x5B/%5B/g"
}
# Percent-encode the non-ASCII part of the global string `str` and write
# the result to stdout (no trailing newline). Every byte above 0x7F is
# written as %XX with uppercase hex digits; ASCII bytes, including the
# \x01 / \x02 placeholder bytes, pass through unchanged.
function encode_unicode() {
  # LC_ALL=C makes ${#str} and ${str:i:1} operate on bytes, so every byte
  # of a multi-byte character is seen individually, whatever the
  # character's code point is.
  local LC_ALL=C
  local i byte code
  for ((i = 0; i < ${#str}; i++)); do
    byte=${str:i:1}
    # A leading quote makes printf yield the numeric value of the
    # character; the mask keeps the low byte only.
    printf -v code '%d' "'$byte"
    code=$((code & 0xFF))
    if ((code > 127)); then
      printf '%%%02X' "$code"
    else
      printf '%s' "$byte"
    fi
  done
}
## Tokenize percents before encoding unicode
# Filter: copies stdin to stdout with every % replaced by the byte \x01.
function tokenize_orig_pcts() {
  sed -e 's|%|\x01|g'
}
## Tokenize percents after encoding unicode, since this is urlencoded..
# Filter: copies stdin to stdout with every % replaced by the byte \x02.
function tokenize_pcts() {
  sed -e 's|%|\x02|g'
}
# Filter: copies stdin to stdout with every \x01 byte turned back into %.
function detokenize_orig_pcts() {
  sed -e 's|\x01|%|g'
}
# Filter: copies stdin to stdout with every \x02 byte turned back into %.
function detokenize_pcts() {
  sed -e 's|\x02|%|g'
}
# Percent-encode selected ASCII characters in the text read from stdin and
# write the result to stdout. A single sed call applies one substitution
# per character, each character written as a \xHH hex escape. The rule for
# % (\x25) comes first, so the % signs produced by the later rules are not
# encoded again.
# NOTE(review): the \x5c\x24 and \x5c\x5B rules place a backslash escape
# (\x5c) in front of $ and [ - presumably to match those characters
# literally; confirm how the installed sed interprets that.
# NOTE(review): an earlier function of the same name in this file also has
# a \x2D (-) rule; this later definition is the one in effect when main runs.
function urlencode() {
sed "s/\x25/%25/g;s/\x20/%20/g;s/\x21/%21/g;s/\x22/%22/g;s/\x23/%23/g;s/\x5c\x24/%24/g;\
s/\x26/%26/g;s/\x27/%27/g;s/\x28/%28/g;s/\x29/%29/g;s/\x2A/%2A/g;s/\x2B/%2B/g;\
s/\x2C/%2C/g;s/\x3A/%3A/g;s/\x3F/%3F/g;s/\x7C/%7C/g;s/\x5c\x5B/%5B/g"
}
# Encode the global `str` and print the result.
# When has_unicode reports nothing for `str`, one urlencode pass is all
# that runs. Otherwise the % signs already in the input are parked as
# placeholders, encode_unicode runs, the % signs it produced are parked
# under a second placeholder, the original % signs are restored and passed
# through urlencode, and finally the second placeholder is turned back
# into %. `str` is overwritten along the way.
function main() {
  if ! has_unicode "$str"; then
    urlencode <<< "$str"
    return
  fi
  local stage
  str=$(tokenize_orig_pcts <<< "$str")
  str=$(encode_unicode)
  for stage in tokenize_pcts detokenize_orig_pcts urlencode; do
    str=$("$stage" <<< "$str")
  done
  detokenize_pcts <<< "$str"
}
# Entry point: encode $str and print the result.
main
This matches (according to my tests) the output from: jq -jRr '@uri'
Great functions 👍
Unfortunately none of the decode options work with German Umlauts:
Example:
Encode: Günther -> G%FCnther
Decode: G%FCnther -> G�nther
It seems to be something with the encoding. I tried to add "| iconv -f iso8859-1 -t utf-8" as @Twibow says to the decode function from start post but it changes nothing.
Any help appreciated 😄
@cjplay02 I'm pretty sure the issue is elsewhere.