```bash
urlencode() {
    # urlencode <string>
    old_lc_collate=$LC_COLLATE
    LC_COLLATE=C

    local length="${#1}"
    for (( i = 0; i < length; i++ )); do
        local c="${1:$i:1}"
        case $c in
            [a-zA-Z0-9.~_-]) printf '%s' "$c" ;;
            *) printf '%%%02X' "'$c" ;;
        esac
    done

    LC_COLLATE=$old_lc_collate
}

urldecode() {
    # urldecode <string>
    local url_encoded="${1//+/ }"
    printf '%b' "${url_encoded//%/\\x}"
}
```
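For example, with the functions above loaded in a shell, a simple round trip:

```bash
$ urlencode 'foo bar/baz?'
foo%20bar%2Fbaz%3F
$ urldecode 'foo%20bar%2Fbaz%3F'
foo bar/baz?
```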
This works for me.
https://stackoverflow.com/questions/296536/how-to-urlencode-data-for-curl-command
```bash
rawurlencode() {
  local string="${1}"
  local strlen=${#string}
  local encoded=""
  local pos c o

  for (( pos=0 ; pos<strlen ; pos++ )); do
    c=${string:$pos:1}
    case "$c" in
      [-_.~a-zA-Z0-9] ) o="${c}" ;;
      * )               printf -v o '%%%02x' "'$c"
    esac
    encoded+="${o}"
  done
  echo "${encoded}"    # You can either set a return variable (FASTER)
  REPLY="${encoded}"   #+or echo the result (EASIER)... or both... :p
}
```
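A usage sketch, showing both return styles the trailing comments mention:

```bash
$ rawurlencode 'a b/c'   # prints the encoded string...
a%20b%2fc
$ echo "$REPLY"          # ...which is also left in $REPLY
a%20b%2fc
```

Note that this variant emits lowercase hex (`%2f`) where the gist's `%02X` format emits uppercase (`%2F`); RFC 3986 treats the two as equivalent.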
@ThePredators this breaks on unicode:

```
input:    a:/b c?d=e&f#g-+-`-´-ä-€
input hex:
\x61\x3A\x2F\x62\x20\x63\x3F\x64\x3D\x65\x26\x66\x23\x67\x2D\x2B\x2D\x60\x2D\xC2\xB4\x2D\xC3\xA4\x2D\xE2\x82\xAC
input hexdump:
00000000  61 3a 2f 62 20 63 3f 64  3d 65 26 66 23 67 2d 2b  |a:/b c?d=e&f#g-+|
00000010  2d 60 2d c2 b4 2d c3 a4  2d e2 82 ac              |-`-..-..-...|
0000001c

actual:   a%3A%2Fb%20c%3Fd%3De%26f%23g-%2B-%60-%B4-%E4-%20AC
expected: a%3A%2Fb%20c%3Fd%3De%26f%23g-%2B-%60-%C2%B4-%C3%A4-%E2%82%AC
```
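The likely culprit, assuming a UTF-8 locale: `"'$c"` gives printf the Unicode codepoint rather than a byte, and `%02X` prints however many hex digits that codepoint needs:

```bash
$ printf '%%%02X\n' "'´"   # U+00B4: one byte of hex, missing the C2 lead byte
%B4
$ printf '%%%02X\n' "'€"   # U+20AC: four hex digits that merely resemble "%20AC"
%20AC
```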
@ThePredators works like a charm 👍
Hi,
Characters used in France (é è à ù ê â û ...) are not handled correctly if you work in the fr_FR locale. You need to convert your data source from Windows-1252 to UTF-8 before passing it to the function:

```bash
data_utf8=$(echo "$data_ISO" | iconv -f iso8859-1 -t utf-8)
```
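A minimal sketch of that conversion, using a hypothetical `data_ISO` holding a Latin-1 "é" (single byte `0xE9`):

```bash
data_ISO=$(printf '\xe9')                                           # "é" as one Latin-1 byte
data_utf8=$(printf '%s' "$data_ISO" | iconv -f iso8859-1 -t utf-8)
printf '%s' "$data_utf8" | od -An -t x1                             # c3 a9, the UTF-8 bytes
```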
```bash
#!/bin/bash
## Written by Adam Danischewski 08/04/2024
declare CURR_ORD
str="${1:-😄.mp4}"

## Set CURR_ORD to the numeric codepoint of the given character.
function ord() {
  printf -v CURR_ORD "%d" "\"$1"
}

## Succeeds when the input contains multi-byte (non-ASCII) characters.
function has_unicode() {
  local input="$1"
  local -i charcnt=$(wc -m <<<"$input")
  local -i bytecnt=$(wc -c <<<"$input")
  ((charcnt != bytecnt))
}

## Percent-encode every byte of each non-ASCII character.
function encode_unicode() {
  local i char
  for ((i = 0; i < ${#str}; i++)); do
    char=${str:i:1}
    ord "$char"
    if ((CURR_ORD > 127)); then  # non-ASCII: emit each UTF-8 byte as %XX
      od -t x1 <<<"$char" | awk '{$1="";gsub("^[[:space:]]*","");for(i=1;i<NF;i++) printf "%%" toupper($i);}'
    else
      printf "%s" "$char"
    fi
  done
}

## Tokenize percents before encoding unicode
function tokenize_orig_pcts() {
  sed 's/%/\x01/g'
}

## Tokenize percents after encoding unicode, since those are already urlencoded
function tokenize_pcts() {
  sed 's/%/\x02/g'
}

function detokenize_orig_pcts() {
  sed 's/\x01/%/g'
}

function detokenize_pcts() {
  sed 's/\x02/%/g'
}

## Percent-encode reserved ASCII characters ("\x5c\x24" and "\x5c\x5B" expand
## to the regex-escaped "$" and "[").
function urlencode() {
  sed "s/\x25/%25/g;s/\x20/%20/g;s/\x21/%21/g;s/\x22/%22/g;s/\x23/%23/g;s/\x5c\x24/%24/g;\
s/\x26/%26/g;s/\x27/%27/g;s/\x28/%28/g;s/\x29/%29/g;s/\x2A/%2A/g;s/\x2B/%2B/g;\
s/\x2C/%2C/g;s/\x3A/%3A/g;s/\x3F/%3F/g;s/\x7C/%7C/g;s/\x5c\x5B/%5B/g"
}

function main() {
  if has_unicode "$str"; then
    str=$(tokenize_orig_pcts <<<"$str")    # protect literal % from the input
    str=$(encode_unicode)                  # non-ASCII chars -> %XX%XX...
    str=$(tokenize_pcts <<<"$str")         # protect the % emitted above
    str=$(detokenize_orig_pcts <<<"$str")  # restore input % so it gets encoded
    str=$(urlencode <<<"$str")             # encode reserved ASCII (incl. %)
    detokenize_pcts <<<"$str"              # restore the protected unicode %
  else
    urlencode <<<"$str"
  fi
}

main
```
This matches (according to my tests) the output of `jq -jRr '@uri'`.
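For instance, saving the script as `urlencode.sh` (a hypothetical filename) and comparing with jq:

```bash
$ ./urlencode.sh '😄 file (1).mp4'
%F0%9F%98%84%20file%20%281%29.mp4
$ jq -jRr '@uri' <<< '😄 file (1).mp4'
%F0%9F%98%84%20file%20%281%29.mp4
```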
Great functions 👍
Unfortunately none of the decode options work with German umlauts:
Example:

```
Encode: Günther   -> G%FCnther
Decode: G%FCnther -> G�nther
```

It seems to be an encoding issue. I tried adding `| iconv -f iso8859-1 -t utf-8` to the decode function from the start post, as @Twibow says, but it changes nothing.
Any help appreciated 😄
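One likely cause, sketched under the assumption that `G%FCnther` was produced from Latin-1 input: `%FC` decodes to the single Latin-1 byte for "ü", which is not valid UTF-8 on its own, so the terminal shows `�`. The iconv conversion has to happen after decoding, not before:

```bash
$ urldecode 'G%FCnther' | iconv -f iso8859-1 -t utf-8
Günther
```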
`LC_ALL=C` is needed to support unicode, i.e. to loop over bytes, not characters. `LC_COLLATE=C` or `LANG=C` do not work. It also must be set before `${#1}`, so that the length of `$1` is measured in bytes.
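A sketch of the gist's function with that change applied (same logic, only the locale handling differs):

```bash
urlencode() {
    # urlencode <string> -- byte-wise, so multi-byte UTF-8 characters survive
    local old_lc_all=$LC_ALL
    LC_ALL=C  # set before ${#1} so lengths and slices are in bytes

    local length="${#1}"
    local i c
    for (( i = 0; i < length; i++ )); do
        c="${1:$i:1}"
        case $c in
            [a-zA-Z0-9.~_-]) printf '%s' "$c" ;;
            *) printf '%%%02X' "'$c" ;;
        esac
    done

    LC_ALL=$old_lc_all
}
```

With this version, `urlencode 'ä'` yields `%C3%A4` (both UTF-8 bytes) rather than the codepoint-based `%E4`.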