cdown/gist:1163649

cdown · 2021-01-29T00:50:45Z

@cjplay02 I'm pretty sure the issue is elsewhere.

$ urldecode() {
    # urldecode <string>

    local url_encoded="${1//+/ }"
    printf '%b' "${url_encoded//%/\\x}"
}
$ urldecode 'foo%20%20%20%20%20bar'
foo     bar

cjplay02 · 2021-01-29T00:56:29Z

Good call @cdown. I had the urldecode call in a command substitution - urldecoded=$(urldecode 's3://...'). Once I removed the function call from the command substitution, the spaces were retained from the encoding. Now I just need to find a better way to declare the result as a variable...

Edit: Double-quoting the variable wherever it is expanded in downstream commands fixed my issue, i.e. echo "$varname"

dicktyr · 2021-01-31T06:53:14Z

just a brief nod to mawk
which is five times faster in my tests
(indeed, often faster than sed)

I know it's not a de facto standard like bash
i.e. installed by default on so many systems
but it should be and it is on my systems

I also notice that bash seems to be catching up with ksh93

GeekDuanLian · 2021-02-09T11:17:24Z

One line implementation, suitable for storing in .bashrc

# urle <string>: percent-encode <string>, keeping only A-Z a-z 0-9 . ~ _ - as-is.
# Prints the encoded string plus a newline; returns 1 (no output) on empty input.
# LC_ALL=C is used rather than LANG=C because LC_ALL/LC_* override LANG; it makes
# ${#1} and ${1:i:1} work on bytes so multibyte characters are encoded byte by byte.
urle () { [[ "${1}" ]] || return 1; local LC_ALL=C i x; for (( i = 0; i < ${#1}; i++ )); do x="${1:i:1}"; if [[ "${x}" == [a-zA-Z0-9.~_-] ]]; then printf '%s' "${x}"; else printf '%%%02X' "'${x}"; fi; done; echo; }
# urld <string>: decode a percent-encoded string ('+' -> space, %HH -> byte 0xHH).
# Prints the decoded string plus a newline; returns 1 (no output) on empty input.
# printf '%b' is used instead of `echo -e`, which treats inputs such as "-n" as options.
urld () { [[ "${1}" ]] || return 1; local s="${1//+/ }"; printf '%b\n' "${s//%/\\x}"; }

rojenzaman · 2021-06-13T21:48:09Z

Thanks for it!

SilviaIenciu · 2022-01-07T14:08:38Z

Thanks for this.
Could you please also license this code of yours?

ThePredators · 2022-02-06T09:22:10Z

Thanks for the script, but I don't know why, when calling urlencode, I get a stray % at the end of the encoded data!

ironbishop · 2022-04-04T10:04:19Z

i had to add a check for systems where collate is not set

# Put the saved collation locale back, but only when a non-empty value was saved.
if [[ -n "${old_lc_collate}" ]]; then
  LC_COLLATE="${old_lc_collate}"
fi

milahu · 2022-09-06T11:54:13Z

LC_ALL=C is needed to support Unicode: it makes the loop iterate over bytes, not characters.
LC_COLLATE=C and LANG=C do not work.
It must also be set before ${#1} is evaluated, so that the length of $1 is measured in bytes.

#!/usr/bin/env bash
# MIT License

# Percent-encode a string per RFC 3986.
# Arguments: $1 - string to encode
# Outputs:   the encoded string followed by a newline on stdout
urlencode() {
    # LC_ALL=C is set before the length is taken so that ${#...} and
    # ${...:pos:1} work on bytes, not characters (multibyte input is then
    # encoded one byte at a time).
    local LC_ALL=C
    local input=$1 encoded='' byte hex
    local -i pos=0 total=${#1}
    while (( pos < total )); do
        byte=${input:pos:1}
        # Only the unreserved set is kept, so ;,/?:@&=+$!*'()# are encoded too
        # (== encodeURIComponent in javascript). To keep them (== encodeURI in
        # javascript) match [-_.~A-Za-z0-9\;,/?:@\&=+\$!*\'\(\)#] instead.
        if [[ $byte == [-_.~A-Za-z0-9] ]]; then
            encoded+=$byte
        else
            printf -v hex '%%%02X' "'$byte"
            encoded+=$hex
        fi
        (( pos += 1 ))
    done
    printf '%s\n' "$encoded"
}

# Self-test for urlencode: encodes a fixed sample containing reserved ASCII
# characters and multibyte UTF-8 characters and compares the result with the
# expected encoding. Returns 0 on a match; otherwise prints the input (plain,
# as \xHH escapes and as a hexdump) plus the actual and expected values, then
# exits the shell with status 1.
_test_urlencode() {
  local fname=urlencode
  local auml=$'\xC3\xA4'     # ä = %C3%A4
  local euro=$'\xE2\x82\xAC' # € = %E2%82%AC
  local grave=$'\x60'        # ` = %60
  local acute=$'\xC2\xB4'    # ´ = %C2%B4
  local input="a:/b c?d=e&f#g-+-;-,-@-\$-!-*-'-(-)-#-$grave-$acute-$auml-$euro"
  # note: we expect uppercase hex codes from %02X format string
  local expected="a%3A%2Fb%20c%3Fd%3De%26f%23g-%2B-%3B-%2C-%40-%24-%21-%2A-%27-%28-%29-%23-%60-%C2%B4-%C3%A4-%E2%82%AC" # also encode ;,/?:@&=+$!*'()#
  #local expected="a:/b%20c?d=e&f#g-+-;-,-@-\$-!-*-'-(-)-#-%60-%C2%B4-%C3%A4-%E2%82%AC" # dont encode ;,/?:@&=+$!*'()#
  local actual
  actual="$("$fname" "$input")"

  if [[ "$actual" == "$expected" ]]; then
    return 0
  fi

  printf '%s\n' "error in $fname"
  # debug
  printf '%s\n' "input: $input"
  printf '%s\n' "input hex:"
  printf '%s' "$input" | hexdump -v -e '/1 "%02X"' | sed 's/\(..\)/\\x\1/g'
  printf '\n'
  printf '%s\n' "input hexdump:"
  printf '%s' "$input" | hexdump -C
  printf 'actual:   %s\n' "$actual"
  printf 'expected: %s\n' "$expected"
  exit 1
}
# Run the self-test right away; on a mismatch it prints diagnostics and exits with status 1.
_test_urlencode

ThePredators · 2022-09-07T07:33:33Z

This works for me.

https://stackoverflow.com/questions/296536/how-to-urlencode-data-for-curl-command

#######################################
# Percent-encode a string byte by byte. The characters - _ . ~ a-z A-Z 0-9 are
# kept; every other byte becomes %hh with lowercase hex digits.
# Globals:   REPLY (written) - receives the encoded string
# Arguments: $1 - string to encode
# Outputs:   the encoded string followed by a newline on stdout
#######################################
rawurlencode() {
  # Force the C locale *before* measuring the string so that ${#...} and
  # ${...:pos:1} work on bytes. Without it a multibyte character is encoded as
  # its code point (e.g. "ä" -> %e4) instead of its UTF-8 bytes (%c3%a4).
  local LC_ALL=C
  local string="${1}"
  local strlen=${#string}
  local encoded=""
  local pos c o

  for (( pos = 0; pos < strlen; pos++ )); do
    c=${string:pos:1}
    case "$c" in
      [-_.~a-zA-Z0-9]) o="${c}" ;;
      *) printf -v o '%%%02x' "'$c" ;;
    esac
    encoded+="${o}"
  done
  printf '%s\n' "${encoded}"  # result on stdout (easier for callers) ...
  REPLY="${encoded}"          # ... and in REPLY (faster, no subshell needed)
}

milahu · 2022-09-07T08:38:33Z

@ThePredators this breaks on unicode

input: a:/b c?d=e&f#g-+-`-´-ä-€
input hex:
\x61\x3A\x2F\x62\x20\x63\x3F\x64\x3D\x65\x26\x66\x23\x67\x2D\x2B\x2D\x60\x2D\xC2\xB4\x2D\xC3\xA4\x2D\xE2\x82\xAC
input hexdump:
00000000  61 3a 2f 62 20 63 3f 64  3d 65 26 66 23 67 2d 2b  |a:/b c?d=e&f#g-+|
00000010  2d 60 2d c2 b4 2d c3 a4  2d e2 82 ac              |-`-..-..-...|
0000001c
actual:   a%3A%2Fb%20c%3Fd%3De%26f%23g-%2B-%60-%B4-%E4-%20AC
expected: a%3A%2Fb%20c%3Fd%3De%26f%23g-%2B-%60-%C2%B4-%C3%A4-%E2%82%AC

see my updated answer

oijkn · 2022-10-03T13:20:17Z

@ThePredators works like a charm 👍

Twibow · 2022-11-07T21:44:42Z

Hi,

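Accented characters used in French (é è à ù ê â û ...) are not handled correctly if you work in a non-UTF-8 fr_FR locale.

You need to convert your source data from its legacy single-byte encoding to UTF-8 before passing it to the function (the example below assumes ISO-8859-1; use -f windows-1252 for Windows-1252 data):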

# Convert the ISO-8859-1 text in data_ISO to UTF-8. printf is used instead of
# echo so that a value such as "-n" or "-e" is passed through as data rather
# than being parsed as an echo option.
data_utf8=$(printf '%s\n' "$data_ISO" | iconv -f iso8859-1 -t utf-8)

AdamDanischewski · 2024-08-05T05:07:54Z

#!/bin/bash
 
 ## Written by Adam Danischewski 08/04/2024

# Global scratch variable: ord() stores its numeric result here.
declare CURR_ORD 

# String to encode: the first command-line argument, or a sample emoji
# filename when no argument is given.
str="${1:-😄.mp4}"

# Store the numeric code of the first character of $1 in the global CURR_ORD.
# Produces no output.
ord() {
  # A leading quote makes printf's %d yield the code of the character that
  # follows it instead of parsing the argument as a number.
  local quoted_char="'$1"
  printf -v CURR_ORD '%d' "$quoted_char"
}

# Succeed (status 0) when `wc` reports a different number of characters than
# bytes for $1, i.e. when the string contains multibyte characters in the
# locale wc runs under; fail (status 1) otherwise.
has_unicode() {
  local text=$1
  local -i n_chars n_bytes
  n_chars=$(wc -m <<<"${text}")
  n_bytes=$(wc -c <<<"${text}")
  # The arithmetic test's status is the function's return status.
  ((n_chars != n_bytes))
}

#######################################
# Percent-encode a fixed set of ASCII characters in a stream:
#   % space ! " # $ & ' ( ) * + , : ? | [
# "%" is handled first so the percent signs produced by the other rules are
# not encoded again. The unreserved "-" is left alone, matching the later
# urlencode definition in this script (which replaces this one at run time).
# Inputs:  text on stdin
# Outputs: encoded text on stdout
#######################################
function urlencode() {
  # Each character is spelled literally in a single-quoted script. The
  # previous \x5c\x24 and \x5c\x5B rules (GNU sed hex escapes) matched only a
  # backslash followed by "$" or "[", so a bare "$" or "[" was never encoded.
  sed -e 's/%/%25/g' -e 's/ /%20/g' -e 's/!/%21/g' -e 's/"/%22/g' \
    -e 's/#/%23/g' -e 's/\$/%24/g' -e 's/&/%26/g' -e "s/'/%27/g" \
    -e 's/(/%28/g' -e 's/)/%29/g' -e 's/\*/%2A/g' -e 's/+/%2B/g' \
    -e 's/,/%2C/g' -e 's/:/%3A/g' -e 's/?/%3F/g' -e 's/|/%7C/g' \
    -e 's/\[/%5B/g'
}

#######################################
# Percent-encode every non-ASCII character of the global string `str` as its
# raw bytes (%HH, uppercase hex); ASCII characters are copied through as-is.
# Globals:   str (read), CURR_ORD (written via ord)
# Outputs:   the partially encoded string on stdout, without a trailing newline
#######################################
function encode_unicode() {
  local i char
  for ((i = 0; i < ${#str}; i++)); do
    char=${str:i:1}
    ord "$char"
    # Anything above 127 is non-ASCII. The previous test on the number of
    # decimal digits (> 3) let code points 128-999, e.g. "é" (233), through
    # unencoded.
    if ((CURR_ORD > 127)); then
      # od prints an offset column followed by the character's bytes and the
      # newline byte appended by <<<; awk skips the offset (field 1) and that
      # last byte and prints the remaining fields as %HH.
      od -t x1 <<<"$char" \
        | awk '{for (i = 2; i < NF; i++) printf "%%%s", toupper($i)}'
    else
      printf '%s' "$char"
    fi
  done
}

## Tokenize percents before encoding unicode: every "%" read from stdin is
## written to stdout as the control byte 0x01.
tokenize_orig_pcts() {
  tr '%' '\001'
}

## Tokenize percents after encoding unicode, since this is urlencoded: every
## "%" read from stdin is written to stdout as the control byte 0x02.
tokenize_pcts() {
  tr '%' '\002'
}

## Turn every control byte 0x01 read from stdin back into "%" on stdout.
detokenize_orig_pcts() {
  tr '\001' '%'
}

## Turn every control byte 0x02 read from stdin back into "%" on stdout.
detokenize_pcts() {
  tr '\002' '%'
}

#######################################
# Percent-encode a fixed set of ASCII characters in a stream:
#   % space ! " # $ & ' ( ) * + , : ? | [
# "%" is handled first so the percent signs produced by the other rules are
# not encoded again.
# Inputs:  text on stdin
# Outputs: encoded text on stdout
#######################################
function urlencode() {
  # Each character is spelled literally in a single-quoted script. The
  # previous \x5c\x24 and \x5c\x5B rules (GNU sed hex escapes) matched only a
  # backslash followed by "$" or "[", so a bare "$" or "[" was never encoded.
  sed -e 's/%/%25/g' -e 's/ /%20/g' -e 's/!/%21/g' -e 's/"/%22/g' \
    -e 's/#/%23/g' -e 's/\$/%24/g' -e 's/&/%26/g' -e "s/'/%27/g" \
    -e 's/(/%28/g' -e 's/)/%29/g' -e 's/\*/%2A/g' -e 's/+/%2B/g' \
    -e 's/,/%2C/g' -e 's/:/%3A/g' -e 's/?/%3F/g' -e 's/|/%7C/g' \
    -e 's/\[/%5B/g'
}

# Encode the global `str` and print the result on stdout.
# Globals: str (read; overwritten with intermediate results when has_unicode
#          succeeds)
main() {
  # When has_unicode fails, the sed-based encoder alone is applied.
  if ! has_unicode "$str"; then
    urlencode <<<"$str"
    return
  fi

  # Percent signs already in the input are hidden behind one token and the
  # ones produced by encode_unicode behind another, so that urlencode only
  # re-encodes the former.
  str=$(tokenize_orig_pcts <<<"$str")
  str=$(encode_unicode)
  str=$(tokenize_pcts <<<"$str")
  str=$(detokenize_orig_pcts <<<"$str")
  str=$(urlencode <<<"$str")
  detokenize_pcts <<<"$str"
}

# Entry point: encode the global `str` and print the result.
main

This matches (according to my tests) the output from: jq -jRr '@uri'

bjoern-vh · 2024-08-18T22:38:32Z

Great functions 👍

Unfortunately none of the decode options work with German Umlauts:

Example:
Encode: Günther -> G%FCnther
Decode: G%FCnther -> G�nther

It seems to be something with the encoding. I tried to add "| iconv -f iso8859-1 -t utf-8" as @Twibow says to the decode function from start post but it changes nothing.

Any help appreciated 😄

cdown/gist:1163649

cdown commented Jan 29, 2021 •

edited

Loading

cjplay02 commented Jan 29, 2021 •

edited

Loading

dicktyr commented Jan 31, 2021

GeekDuanLian commented Feb 9, 2021

rojenzaman commented Jun 13, 2021

SilviaIenciu commented Jan 7, 2022

ThePredators commented Feb 6, 2022

ironbishop commented Apr 4, 2022

milahu commented Sep 6, 2022 •

edited

Loading

ThePredators commented Sep 7, 2022

milahu commented Sep 7, 2022

oijkn commented Oct 3, 2022

Twibow commented Nov 7, 2022

AdamDanischewski commented Aug 5, 2024 •

edited

Loading

bjoern-vh commented Aug 18, 2024

	urlencode() {
	# urlencode <string>
	#
	# Percent-encode <string> byte by byte; only a-z A-Z 0-9 . ~ _ - are kept.
	# Output: the encoded string on stdout, without a trailing newline.

	# LC_ALL=C, scoped to this call by `local`, makes ${#1} and ${1:i:1} work
	# on bytes so multibyte characters are encoded as their individual bytes.
	# It replaces the manual save/restore of LC_COLLATE, which left the
	# helper variable behind as a global and LC_COLLATE set to an empty
	# string when it had been unset before the call.
	local LC_ALL=C
	local length="${#1}"
	local i c

	for (( i = 0; i < length; i++ )); do
		c="${1:i:1}"
		case $c in
			[a-zA-Z0-9.~_-]) printf '%s' "$c" ;;
			*) printf '%%%02X' "'$c" ;;
		esac
	done
	}

	urldecode() {
	# urldecode <string>
	#
	# Decode a percent-encoded string: "+" becomes a space and %HH becomes the
	# byte 0xHH. Output: the decoded string on stdout, no trailing newline.

	local url_encoded="${1//+/ }"
	# Literal backslashes are rewritten as %5C first; otherwise printf '%b'
	# would interpret sequences such as \t or \n found in the input.
	url_encoded=${url_encoded//\\/%5C}
	printf '%b' "${url_encoded//%/\\x}"
	}

cdown/gist:1163649

cdown commented Jan 29, 2021 • edited Loading

cjplay02 commented Jan 29, 2021 • edited Loading

dicktyr commented Jan 31, 2021

GeekDuanLian commented Feb 9, 2021

rojenzaman commented Jun 13, 2021

SilviaIenciu commented Jan 7, 2022

ThePredators commented Feb 6, 2022

ironbishop commented Apr 4, 2022

milahu commented Sep 6, 2022 • edited Loading

ThePredators commented Sep 7, 2022

milahu commented Sep 7, 2022

oijkn commented Oct 3, 2022

Twibow commented Nov 7, 2022

AdamDanischewski commented Aug 5, 2024 • edited Loading

bjoern-vh commented Aug 18, 2024

cdown commented Jan 29, 2021 •

edited

Loading

cjplay02 commented Jan 29, 2021 •

edited

Loading

milahu commented Sep 6, 2022 •

edited

Loading

AdamDanischewski commented Aug 5, 2024 •

edited

Loading