|
#!/usr/bin/env bash |
|
# sudo ln -s "$(pwd)/csv2datasette" /usr/bin/csv2datasette |
|
|
|
# csv2datasette is meant to explore CSV data. It is not meant to create a sustainable DB. |
|
# csv2datasette is a bash script which opens CSV files directly in Datasette. It offers
|
# a number of options for reading and exploring CSV files, such as --stats, inspired by WTFCsv. |
|
# |
|
# `--stats` option includes, for each column: the column name, the number of unique values, |
|
# the number of filled rows, the number of missing values, the minimum value, the maximum value,
|
# the average, the sum, the shortest string, the longest string, the number of numeric values, |
|
# the number of text values, the number of probable ISO dates, the top 3 frequent values. |
|
|
|
# csv2datasette is only tested under Linux. |
|
|
|
# Its default behavior is to: |
|
# * create a DB in a random temporary directory (under /tmp), each time the script is used |
|
# * normalize database name (avoid spaces, hyphens and dots), and use it for the table name |
|
# * rename columns to avoid spaces |
|
# * launch Datasette on a random unused port, and open the browser |
|
# * delete the temporary DB at the end of the script. |
|
# |
|
# -e | --generate-column-names > option allows to automatically add column names if the CSV doesn't have some. |
|
# -s "," | --separator "," > allows to change the CSV separator (aka delimiter) |
|
# -t | > CSV separator is a tab |
|
# -d "," | --decimal-separator "," > allows to specify the decimal separator used for numbers (typically "," instead of "." in France) |
|
# --stats > add table called "stats" with stats like WTFCsv: https://databasic.io/en/wtfcsv/ |
|
# |
|
# If you need to add different or more options to datasette command line, use ~/.csv2datasetterc file. |
|
# ~/.csv2datasetterc is "bash style" so don't forget "\" at the end of each line if you have multiple lines. Eg.:
|
# --setting truncate_cells_html 512 \ |
|
# --load-extension /usr/lib/sqlite3/pcre.so |
|
|
|
# Other features: |
|
# * verify if datasette is well installed |
|
# * stats: following Bertin's advice, treat zeros (0) differently than other values, to ease stats reading
|
# for the following columns: filled, missing, numericValues, textValues, probISODate |
|
# We have chosen to display zeros in a very light gray |
|
# * stats: round avg and sum to 5 decimals |
|
|
|
|
|
# Inspired from https://gist.github.com/ahebrank/752fdb77876ebe86276df100bca55e79 |
|
|
|
|
|
# ### |
|
# TODO (or not). (In approximate order of importance.) |
|
# * [DB] verify the CSV file is not too big ("... do you want to continue?") |
|
# * [DB] allows to import CSV files from http links? |
|
# * [Service] stats: metadata.yml to display information about the stats table: columns documentation, aggregated stats (average fillness...)...
|
# * https://docs.datasette.io/en/stable/metadata.html |
|
# * Description of the CSV |
|
# This table represents data imported from ___ CSV with '|' separator and ',' as the decimal separator.
|
# This table contains XX columns that are YY% complete. |
|
# NN columns are fully completed, ZZ almost completed, PP empty, and JJ almost empty. |
|
# You can get [stats] related to each columns. |
|
# * [Service] stats: when few values, display barchart as in https://databasic.io/en/wtfcsv/ |
|
# * [Service] stats: data quality stats: data type homogeneity (all NUMERIC?), weird values, null island, epoch time... |
|
# * [Service] save DB in its original form, without any transformation, in a dedicated table |
|
# * [Service] convert dates in ISO-8601 (well suited for sorting)? |
|
# * [Service] manage null value? |
|
# * [Service] manage monetary formats? |
|
# * [Service] detect UNIX timestamp format (eg. 1623855208) -- https://en.wikipedia.org/wiki/Unix_time |
|
# => integers between 0 and SELECT strftime('%s', 'now'); |
|
# or SELECT (julianday('now') - 2440587.5) * 86400.0; /* including the fractional part */ |
|
|
|
|
|
# ###
# Init
#
OPTIONS=""      # extra options forwarded to the sqlite-utils insert command line (none by default)
_V=0            # verbosity flag: 1 => log() prints its arguments, 0 => log() is silent; set by -v
SEPARATOR=","   # CSV field separator; overridden by -s/--separator or -t/--tsv
|
|
|
# Exit if datasette is not installed.
# 'command -v' is the idiomatic presence test: the previous
# "[[ ! $(datasette --version) ]]" actually executed datasette and let the
# shell print its own "command not found" noise before our message.
command -v datasette >/dev/null 2>&1 || { echo "Datasette is not installed or not reachable. See Datasette documentation: https://datasette.io/" >&2; exit 1; }
|
|
|
# Test if the sqlite3 REGEXP operator is available: the --stats queries below
# rely on it. Not fatal -- we only warn, everything but --stats still works.
# (The previous code stored the query output in an unused variable and then
# inspected "$?", a fragile pattern that breaks as soon as a statement is
# inserted in between; test the command's exit status directly instead.)
if ! sqlite3 -line <<< "SELECT 'Hello, world' REGEXP 'world';" >/dev/null; then
  echo -e "REGEXP module is not working. --stats option can't work.\n
For more information, see: https://gist.github.com/CharlesNepote/3fdb8982dc873f34aa7c320bef580fe3#file-readme-md"
fi
|
|
|
# Help text shown by usage(). Fixes: the -d short flag and the --tsv long
# flag were missing, and "[ --decimal-separator ","]" lacked a space
# before the closing bracket.
usage=$(cat <<EOF
Usage: csv2datasette [ -e | --generate-column-names ]
                     [ -s | --separator "," ]
                     [ -t | --tsv ]
                     [ -d | --decimal-separator "," ]
                     [ --stats ]
                     [ -v ]
                     filename(s)

       csv2datasette [-h | --help] # displays this help.

EOF
)
|
# Print the help text and terminate with status 2 (conventional for CLI misuse).
usage() {
  printf '%s\n' "${usage}"
  exit 2
}
|
|
|
# Verbose-mode logger: echo all arguments (with backslash escapes interpreted)
# only when -v set _V=1; otherwise print nothing (and return 1, harmless here).
log() { (( _V == 1 )) && echo -e "$*"; }
|
|
|
# getopt option spec: a trailing ':' means the option takes a value
# (e.g. 's:' => -s <separator>); '-a' also accepts long options introduced
# by a single dash.
# Only options actually handled by the case statement below are declared:
# 'interactive' and 'add-to-current-tables' used to be declared here with no
# handler, so they fell through to the "this should not happen" branch
# instead of being rejected cleanly by getopt.
PARSED_ARGUMENTS=$(getopt -a -n csv2datasette \
  -o hes:td:v \
  --long help,generate-column-names,stats,separator:,tsv,decimal-separator: -- "$@")
VALID_ARGUMENTS=$?
[[ "${VALID_ARGUMENTS}" != "0" ]] && usage
|
|
|
# Walk the canonicalised option list produced by getopt; positional
# arguments (the CSV file names) remain in "$@" after the loop.
eval set -- "${PARSED_ARGUMENTS}"
while :
do
  case "$1" in
    -h | --help) usage ;;
    # -e: the CSV has no header line; col_1..col_N names are generated later
    -e | --generate-column-names) COL_NAMES="generated" ; shift ;;
    -s | --separator) SEPARATOR="$2" ; shift 2 ;;
    # NB: SEPARATOR holds the two-character literal backslash-t here, not a
    # real tab; the "\t" literal is compared and expanded consistently below
    # (awk -F and the [[ $SEPARATOR = "\t" ]] test both understand it).
    -t | --tsv) SEPARATOR="\t" ; shift ;;
    -d | --decimal-separator) decimal="$2" ; shift 2 ;;
    --stats) STATS=1 ; shift ;;
    -v) _V=1 ; shift ;;
    # -- means the end of the arguments; drop this, and break out of the while loop
    --) shift; break ;;
    # If invalid options were passed, then getopt should have reported an error,
    # which we checked as VALID_ARGUMENTS when getopt was called...
    *) echo "Unexpected option: $1 - this should not happen."
       usage ;;
  esac
done
log "PARSED_ARGUMENTS is ${PARSED_ARGUMENTS}"
log "Parameters remaining are: $*"
|
|
|
# Test file arguments.
# NB: the old check was `[[ -f $@ ]]`, which is true(!) when no argument is
# given (a [[ expression with a single operand merely tests for a non-empty
# string) and breaks when several files are passed. Check each argument.
[[ $# -ge 1 ]] || { echo "No CSV file given..." >&2; usage; }
for f in "$@"; do
  [[ -f "${f}" ]] || { echo "${f} not existing..." >&2; usage; exit 1; }
done
|
|
|
# normalize file name to avoid creation of a table with dots (.), spaces ( )
# or hyphens (-). See csvs-to-sqlite issue: "option to lower case column names,
# replace space with underscore, and remove reserved character from column names #86"
# => https://github.com/simonw/csvs-to-sqlite/issues/86
#
# Plain parameter expansion replaces the two previous `sed` round-trips:
# no subprocesses, no SC2001 warnings, same results.
name="$*"                 # space-joined file name(s), like `echo "$@"` produced
name="${name%.*}"         # remove extension (my_data.csv => my_data)
name="${name//[. -]/_}"   # replace dots (.), spaces ( ) or hyphens (-) by underscore (_)
log "Table name: ${name}"
# Store all files in a temporary dir: /tmp/csv2datasette.name.jkfhkhfjuygh
tempdir=$(mktemp -d "${TMPDIR:-/tmp/}$(basename "$0").${name}.XXXXXXXXXXXX")
log "Temp dir: ${tempdir}"
database="${tempdir}/${name}.db"
|
|
|
# ###
# Column names
#
# Produce ${tempdir}/${name}.tmp.csv: the input CSV with a usable header line
# (generated with -e, space-cleaned otherwise, or just symlinked when fine).
if [[ ${COL_NAMES} == "generated" ]]; then
  # Guessing the number of columns can be tricky when the CSV file use double quotes.
  # Commons GNU tools can be fooled, that's why at first we try to use smarter tools
  # if they are already installed.
  #
  # Try to guess number of columns with csvkit
  # NOTE(review): `$(csvcut -h)` executes csvcut just to see whether it prints
  # anything; when csvkit is absent the shell emits "command not found" noise.
  # `command -v csvcut` would be a quieter presence test.
  if [[ $(csvcut -h) ]]; then
    NB_COLS=$(csvcut -n "$@" | wc -l)
    # TODO: exit if 0 column
    log "csvcut have guessed the number of columns: ${NB_COLS}"
    # Build "col_1,col_2,...,col_N" and prepend it to a temporary copy.
    header="col_1"
    for ((i=2; i <= NB_COLS; i++)); do header="${header},col_${i}"; done
    log "Adding header: ${header}\n"
    sed "1 i\\${header}" "$@" > "${tempdir}/${name}.tmp.csv"
  # Else add header thanks to awk
  # TODO: also try with other tools: https://www.unix.com/shell-programming-and-scripting/138223-how-find-count-commas-string-excluding-ones-double-quotes.html
  else
    # https://stackoverflow.com/questions/52044576/how-to-add-a-header-to-text-file-in-bash
    log "Adding header with awk...\n"
    # NOTE(review): `printf "col_%d%s", $i` interpolates the *content* of
    # field i coerced to a number (text fields become 0), so the generated
    # header can read "col_0,col_0,...". It should almost certainly use the
    # loop index instead: printf "col_%d%s", i, (i==NF?ORS:FS) -- TODO fix.
    awk -F "${SEPARATOR}" 'NR==1{for(i=1;i<=NF;i++) printf "col_%d%s", $i,(i==NF?ORS:FS)}1' \
      "$@" > "${tempdir}/${name}.tmp.csv"
  fi
else
  # The CSV already has a header: only rewrite it when it contains spaces.
  head -n 1 "$@" | grep -q " "
  if [[ $? != 0 ]]; then
    # If there is no space in the columns headers, just make link
    # shellcheck disable=SC2164
    CSV_FILE_FULL_PATH="$(cd "$(dirname "$@")"; pwd -P)/$(basename "$@")"
    log "Link ${CSV_FILE_FULL_PATH} to ${tempdir}/${name}.tmp.csv"
    ln -s "${CSV_FILE_FULL_PATH}" "${tempdir}/${name}.tmp.csv"
  else
    log "rename columns to avoid spaces..."
    # TODO: this is very long with a big file!
    # Replace spaces in the first (header) line only; data lines untouched.
    sed '1 s/ /_/g' "$@" > "${tempdir}/${name}.tmp.csv"
  fi
fi
|
|
|
# ###
# Creating database...
#
# Build the sqlite-utils command as an array so that paths containing
# whitespace survive (the previous scalar string relied on unquoted word
# splitting of the whole command line).
if [[ ${SEPARATOR} = "\t" ]]; then
  delimiter_opts=(--tsv)
else
  delimiter_opts=(--csv "--delimiter=${SEPARATOR}")
fi
# ${OPTIONS} stays unquoted on purpose: it may carry several extra options
# (and disappears entirely when empty instead of becoming an empty argument).
# shellcheck disable=SC2086
command=(sqlite-utils insert "${database}" "${name}" "${tempdir}/${name}.tmp.csv" ${OPTIONS} "${delimiter_opts[@]}")
log "\n\nsqlite-utils command: ${command[*]}"
# TODO: this can be very long for big files. Eg. 50+ minutes for Open Food Facts CSV as of 2023-12-14.
# Use something different than sqlite-utils?
"${command[@]}"
rv=$?
# NB: capture the exit code immediately -- the old `[[ $? != 0 ]] && log "Error $?"`
# logged the status of the [[ ]] test itself (always 0), not the failed command.
(( rv != 0 )) && { log "Error ${rv}. Exiting..."; rm -r "${tempdir}"; exit 1; }
|
|
|
# ###
# Improving database if necessary
#
# Convert decimal separator: when -d/--decimal-separator was given, rewrite
# every column so that e.g. "3,14" becomes the SQLite-numeric "3.14".
if [[ -n ${decimal} ]]; then
  readarray columns < <( sqlite3 "${database}" "SELECT name FROM PRAGMA_TABLE_INFO('${name}');" )
  for col in "${columns[@]}"; do
    col="${col//[[:space:]]/}"   # strip the trailing newline readarray keeps (and stray blanks)
    log "Column: ${col}"
    sqlite3 "${database}" \
      "UPDATE ${name} SET [${col}] = CAST(replace([${col}], '${decimal}', '.') AS NUMERIC) WHERE [${col}] LIKE '%${decimal}%';"
  done
fi
|
|
|
# ###
# Creating stats...
#
# Build a "stats" table (one row per data column) in the spirit of WTFCsv
# (https://databasic.io/en/wtfcsv/) -- only when --stats was requested.
if [[ ${STATS} == 1 ]]; then
  log "\nBuilding stats..."
  # Number of entries in the table
  nbOfEntries=$(sqlite3 "${database}" "select count(*) from [${name}];")
  # avg and sum are declared INT but effectively hold rounded reals (see the
  # round(x * 100000) / 100000.0 trick below: rounding to 5 decimals).
  sqlite3 "${database}" \
    "create table stats(column_name TEXT,
    uniq INT,
    filled INT,
    missing INT,
    min TEXT,
    max TEXT,
    avg INT,
    sum INT,
    shortestStr INT,
    longestStr INT,
    numericValues INT,
    textValues INT,
    probISODate INT,
    top_3_freq TEXT);"
  # Iterate over the data table's column names (via PRAGMA_TABLE_INFO);
  # the sed strips the trailing newline that readarray keeps on each entry.
  readarray columns < <( sqlite3 "${database}" "SELECT name FROM PRAGMA_TABLE_INFO('${name}');" )
  for col in "${columns[@]}"; do col=$(echo "${col}" | sed -e 's/[[:space:]]//g')
    log "\nColumn: ${col}"
    nbOfMissingValues=$(sqlite3 "${database}" "select count([${col}]) from [${name}] where [${col}] == '';")
    nbOfFilledValues=$(sqlite3 "${database}" "select count([${col}]) from [${name}] where [${col}] != '';")
    nbOfDistinctValues=$(sqlite3 "${database}" "select count(distinct [${col}]) from [${name}];")
    # Don't compute anything if the column is empty
    if (( nbOfEntries - nbOfMissingValues == 0 )); then
      log "column ${col} is empty..."
      sqlite3 "${database}" \
        "insert into stats (column_name, uniq, filled, missing) values ('${col}', ${nbOfDistinctValues}, ${nbOfFilledValues}, ${nbOfMissingValues});"
      continue
    fi

    # If a column is clearly text, do not compute sum and average.
    # NB: the REGEXP operator requires a loaded regexp extension; the '\d'
    # class is PCRE syntax, so this assumes a PCRE-based extension
    # (e.g. /usr/lib/sqlite3/pcre.so) -- see the check at the top of the file.
    nbOfTextValues=$(sqlite3 "${database}" "select count(*) from [${name}] where [${col}] != ''
      and not ( [${col}] REGEXP '^[-+]?([0-9]*[\.])?[0-9]+([eE][-+]?\d+)?$'
      and [${col}] not REGEXP '^[-+]?0[0-9]+' /* exclude 056 or -082 */ );")
    nbOfNumericValues=$(sqlite3 "${database}" "select count(*) from [${name}] where [${col}] != ''
      and [${col}] REGEXP '^[-+]?([0-9]*[\.])?[0-9]+([eE][-+]?\d+)?$'
      and [${col}] not REGEXP '^[-+]?0[0-9]+' /* exclude 056 or -082 */;")
    # NOTE(review): the label reads "nbOfEntries - nbOfTextValues" but the
    # arithmetic computes nbOfTextValues - nbOfEntries (sign flipped).
    log "nbOfEntries (${nbOfEntries}) - nbOfTextValues (${nbOfTextValues}) = $(( nbOfTextValues - nbOfEntries ))"

    # Percentage of text/numeric values among the filled ones (bc, 2 decimals).
    textValuesPercent=$(echo "scale=2; (${nbOfTextValues}/${nbOfFilledValues})*100"|bc)
    log "textValuesPercent: ${textValuesPercent}"
    numericValuesPercent=$(echo "scale=2; (${nbOfNumericValues}/${nbOfFilledValues})*100"|bc)
    log "numericValuesPercent: ${numericValuesPercent}"

    # TODO: isText=1 if 50% of text values?
    # TODO: isNumeric=1 if 90% of numeric values (test if this work for codes such as EAN)
    # bc prints 1/0 for the comparison; (( )) turns that into a shell boolean.
    (( $(bc <<<"${textValuesPercent} > 90") )) && isText=1 || isText=0
    log "isText: ${isText}"

    # The avg/sum slots are filled with a sub-select for numeric columns, or
    # the literal '' for text columns -- chosen at shell level by the embedded
    # $( (( isText == 1 )) && ... ) substitutions before sqlite3 ever runs.
    sqlite3 "${database}" \
      "insert into stats (column_name, uniq, filled, missing, min, max, avg, sum, shortestStr, longestStr, numericValues, textValues, probISODate, top_3_freq)
      values (
      '${col}',
      -- uniq
      ${nbOfDistinctValues},
      -- filled
      ${nbOfFilledValues},
      -- missing
      ${nbOfMissingValues},
      -- min
      (select min([${col}]) from [${name}]),
      -- max
      (select max([${col}]) from [${name}]),
      -- Compute average if it's a numeric column
      ( $( (( isText == 1 )) && echo "''" || echo "select round(avg([${col}]) * 100000) / 100000.0 from [${name}]") ),
      -- Compute sum if it's a numeric column
      ( $( (( isText == 1 )) && echo "''" || echo "select round(sum([${col}]) * 100000) / 100000.0 from [${name}]") ),
      -- shortestStr
      (select length([${col}]) as l from [${name}] order by l asc limit 1),
      -- longestStr
      (select length([${col}]) as l from [${name}] order by l desc limit 1),
      -- numericValues
      ${nbOfNumericValues},
      -- textValues
      ${nbOfTextValues},
      -- probISODate
      (select count(*) from [${name}] where [${col}] is not null
      and [${col}]
      REGEXP '^[1-9][0-9]?[0-9]?[0-9]?((-?(0[13578]|1[02])(-?(0[1-9]|[12][0-9]|3[01]))?|-?((0[469]|11)-?(0[1-9]|[12][0-9]|30))?|-?(02-?(0[1-9]|[12][0-9]))?)?((T| )(0[0-9]|1[0-9]|2[0-3]):(0[0-9]|[1-5][0-9])(:(0[0-9]|[1-5][0-9])((\.[0-9][0-9]?[0-9]?)?(Z|\+[0-2][0-9]:[0-5][0-9])?)?)?)?)?$'
      /**/),
      (select group_concat(n, char(10)) from (
      select [${col}] || ' (' || count([${col}]) || ')' as n from [${name}] GROUP BY [${col}] ORDER BY COUNT([${col}]) DESC limit 3)
      )
      );"
  done
fi
|
|
|
|
|
# ###
# Creating metadata file
#
# Datasette metadata: wire up the custom CSS/JS served from /static below.
# The delimiter is quoted ('EOF') since the content contains no expansions.
metadata=$(cat <<'EOF'
extra_css_urls:
- /static/datasette2csv.css
extra_js_urls:
- /static/datasette2csv.js
EOF
)
printf '%s\n' "${metadata}" >> "${tempdir}/metadata.yml"
|
|
|
# Static assets directory served by Datasette (--static static:...).
mkdir -v "${tempdir}/static"
# The delimiter is quoted ('EOF'): the JS must reach the file verbatim.
js=$(cat <<'EOF'
/* Add the "zero" CSS class when some stats table columns contains zero (to ease reading) */
window.addEventListener('load', function () {
  let classes = ["col-filled", "col-missing", "col-numericValues", "col-textValues", "col-probISODate"];
  for (const c of classes) {
    let x = document.getElementsByClassName(c);
    for (let i = 0; i < x.length; i++) {
      console.log(i); console.log(document.getElementsByClassName(c)[i].innerText);
      if (document.getElementsByClassName(c)[i].innerText == 0) { x[i].classList.add('zero'); }
    }
  }
});
EOF
)
printf '%s\n' "${js}" >> "${tempdir}/static/datasette2csv.js"
|
|
|
# Very light gray for zero values in the stats table (see "Other features").
css=$(cat <<'EOF'
.zero { color: RGBA(0,0,0,.2) !important; }
EOF
)
printf '%s\n' "${css}" >> "${tempdir}/static/datasette2csv.css"
|
|
|
#read -p "Are you sure? " -n 1 -r; echo |
|
|
|
|
|
# ###
# Datasette port
#
# Use a port which is not already used: bind to port 0 so the kernel picks a
# free ephemeral port, print it, and hand it to datasette just after.
port=$(python3 - <<'PYEOF'
import socket
with socket.socket() as s:
    s.bind(("", 0))
    print(s.getsockname()[1])
PYEOF
)
|
|
|
|
|
# ###
# Launch datasette
#
log "\nLaunching datasette... (Press CTRL+C to quit)"
# See: https://docs.datasette.io/en/stable/cli-reference.html
#  -h, --host TEXT           Host for server. Defaults to 127.0.0.1 which means only connections from the local machine
#                            will be allowed. Use 0.0.0.0 to listen to all IPs and allow access from other machines.
#  -p, --port INTEGER RANGE  Port for server, defaults to 8001. Use -p 0 to automatically assign an available port.
#                            [0<=x<=65535]
#  -o, --open                Open Datasette in your web browser
#  --metadata FILENAME       Path to JSON/YAML file containing license/source metadata
#
# NOTE: -h 0.0.0.0 exposes the server on every network interface -- handy on
# a LAN, but anyone who can reach the machine can browse the data.
#
# The unquoted $(...) below intentionally relies on word splitting to turn
# the content of ~/.csv2datasetterc into separate command-line arguments;
# `read p` (without -r) is also deliberate, so that trailing backslashes in
# the rc file act as line continuations, as documented at the top of the file.
# The --static argument is quoted so a TMPDIR containing spaces still works.
datasette "${database}" -h 0.0.0.0 -p "${port}" --open \
  --setting facet_time_limit_ms 20000 \
  --setting facet_suggest_time_limit_ms 10000 \
  --setting sql_time_limit_ms 20000 \
  --setting max_returned_rows 100000 \
  --setting truncate_cells_html 500 \
  $( [[ -f "${HOME}/.csv2datasetterc" ]] && { while read p; do echo $p; done < "${HOME}/.csv2datasetterc"; } ; ) \
  --metadata "${tempdir}/metadata.yml" \
  --static static:"${tempdir}/static/"
|
|
|
|
|
# ###
# Delete database
#
# Guard the recursive delete: if mktemp failed earlier, ${tempdir} could be
# empty; an unset or garbage value must never send `rm -r` somewhere
# unexpected. `--` also protects against a name starting with a dash.
if [[ -d "${tempdir:-}" ]]; then
  log "Deleting all files and directories which were created in ${tempdir}..."
  rm -r -- "${tempdir}"
fi
# TODO: a `trap 'rm -rf "${tempdir}"' EXIT` set right after mktemp would also
# clean up when the script is interrupted (CTRL+C) before reaching this point.