fabiolimace · April 5, 2025 21:23 · fabiolimace · Mar 23, 2025 · fabiolimace · Apr 4, 2025
diff --git a/csv-to-tsv.awk b/csv-to-tsv.awk
 #!/usr/bin/awk -f

 #
 # Reads a CSV file and prints a TSV file to standard output.
 #
 # Usage:
 #        ./csv-to-tsv.awk input.csv > output.tsv
 #        ./csv-to-tsv.awk -v SEP=";" input.csv > output.tsv
 #        ./csv-to-tsv.awk -v COLUMNS=10 input.csv > output.tsv
 #
 # You can specify another field separator by using the `SEP` parameter, for example `-v SEP=";"`. The default is a comma.
 #
 # You can specify the number of fields by using the `COLUMNS` parameter, for example, `-v COLUMNS=10`. The default is the number of fields found in the header.
 #
 # HORIZONTAL TABULATION, CARRIAGE RETURN and LINE FEED are replaced with, respectivelly: '\t', '\r' and '\n'.
 #
 # Escaped quotes ("") are replaced by a single quote (").
 #
 # CSV IETF Specification: https://datatracker.ietf.org/doc/html/rfc4180
 # TSV IANA Specification: https://www.iana.org/assignments/media-types/text/tab-separated-values
 #

 BEGIN {

    CR="\r";
    TAB="\t";
    COMMA=",";
    QUOTE="\"";
    SUBSTITUTE="\032";
    
    FS=COMMA;
    OFS=TAB;
    
    # set the separator
    if (SEP) FS = SEP;
    
    # set the fields number
    if (COLUMNS) FIELDS_NUMBER = COLUMNS;
    
    DISCARDED_FIELD=SUBSTITUTE;
    REGEX_ESC_QUOTE=QUOTE QUOTE;
    REGEX_BEG_QUOTE="^[ ]*" QUOTE;
    REGEX_END_QUOTE=QUOTE "[ ]*$";
    REGEX_ALONE_QUOTE="[^" QUOTE "]" QUOTE "[^" QUOTE "]";
    REGEX_FALSE_BEG_QUOTE="^[ ]*" "(" QUOTE QUOTE ")+" "[^" QUOTE "]";
    REGEX_FALSE_END_QUOTE="[^" QUOTE "]" "(" QUOTE QUOTE ")+" "[ ]*$";
    
    ERROR="";
 }

 function remove_field(col,    i) {
  for (i=col; i<NF; i++) {
    $i = $(i+1)
  }
  NF--
 }

 function discard_fields() {
    for (i = 1; i <= NF; i++) {
        # remove discarded fields
        if ($i == DISCARDED_FIELD) {
            remove_field(i--);
        }
    }
 }

 function error(message, line) {
    ERROR=message;
    delete lines;
    exit;
 }

 function is_beg_quote(v) {
    return (v ~ REGEX_BEG_QUOTE) && (v !~ REGEX_FALSE_BEG_QUOTE);
 }

 function is_end_quote(v) {
    return (v ~ REGEX_END_QUOTE) && (v !~ REGEX_FALSE_END_QUOTE);
 }

 {
    # fix CR LF
    sub(/\r$/,"");
    
    # escape CRs
    gsub(CR,"\\r");
    
    # escape TABs
    gsub(TAB,"\\t");
 }

 CONTINUE {
    # move on after line break
    gsub(TAB, COMMA, CONTINUE);
    $0 = CONTINUE "\\n" $0;
    CONTINUE="";
 }

 NF {
    
    # This forces awk to rebuild the record, using the values of the fields and OFS.
    # Read: https://www.gnu.org/software/gawk/manual/html_node/Changing-Fields.html
    $1 = $1;
    
    if (!FIELDS_NUMBER) FIELDS_NUMBER = NF;
    
    # parse all quoted fields
    for (i = 1; i <= NF; i++) {
    
        if (is_beg_quote($i) && !is_end_quote($i)) {
            for(; i < NF; i++) {
                old = i;
                new = i + 1;
                $new = $old COMMA $new;
                $old = DISCARDED_FIELD;
                if (is_end_quote($new)) break;
            }
            if (i == NF) {
                # read next line
                discard_fields();
                CONTINUE = $0;
                next;
            }
        }
        
        if (is_beg_quote($i) && is_end_quote($i)) {
            # remove extremity quotes
            sub(REGEX_BEG_QUOTE,"",$i);
            sub(REGEX_END_QUOTE,"",$i);
            if (match($i, REGEX_ALONE_QUOTE)) {
                error("near line " NR ", there is a quote that should be escaped.\n" $i);
            }
            # replece residual escaped quotes
            gsub(REGEX_ESC_QUOTE, QUOTE, $i);
        } else {
            if (index($i, QUOTE)) {
                error("near line " NR ", there is a quote that should be escaped within a quote-delimited field.\n" $i);
            }
        }
    }
    
    discard_fields();
    
    if (FIELDS_NUMBER != NF) {
        error("near line " NR ", there are " NF " fields, but there should be " FIELDS_NUMBER " fields.\n" $0);
    }
    
    lines[NR]=$0;
 }

 END {

    if (ERROR) {
        print "ERROR: " ERROR > "/dev/stderr";
        exit 1;
    }
    
    # print sorted lines
    for(i = 1; i <= NR; i++) {
        if (lines[i]) {
            print lines[i];
        }
    }
 }

diff --git a/tsv-to-csv.awk b/tsv-to-csv.awk
 #!/usr/bin/awk -f

 #
 # Reads a TSV file and prints a CSV file to standard output.
 #
 # Also reads DSV files by choosing a field separator other than a TAB.
 #
 # Usage:
 #        ./tsv-to-csv.awk input.tsv > output.csv
 #        ./tsv-to-csv.awk -v SEP=";" input.tsv > output.csv
 #        ./tsv-to-csv.awk -v COLUMNS=10 input.tsv > output.csv
 #        ./tsv-to-csv.awk input.tsv | sed -E 's/\r$//' > output.csv
 #
 # You can specify another field separator by using the `SEP` parameter, for example `-v SEP=":"`. However, If you change the field separator, the input file will be read as a generic DSV (delimited-separated value) file. The default is a TAB.
 #
 # You can specify the number of fields by using the `COLUMNS` parameter, for example, `-v COLUMNS=10`. The default is the number of fields found in the header.
 #
 # Escape sequences for HORIZONTAL TABULATION, CARRIAGE RETURN and LINE FEED, namelly '\t', '\r' and '\n', are replaced with their respective ASCII characters.
 #
 # All fields in the output file are delimited by quotes. Additionally, all output lines end with the CARRIAGE RETURN and LINE FEED pair, as specified in RFC 4180.
 #
 # CSV IETF Specification: https://datatracker.ietf.org/doc/html/rfc4180
 # TSV IANA Specification: https://www.iana.org/assignments/media-types/text/tab-separated-values
 #

 BEGIN {

    LF="\n";
    CR="\r";
    TAB="\t";
    COMMA=",";
    QUOTE="\"";
    
    FS=TAB;
    
    # set the separator
    if (SEP) FS = SEP;
    
    # set the fields number
    if (COLUMNS) FIELDS_NUMBER = COLUMNS;
    
    ERROR="";
 }

 function error(message, line) {
    ERROR=message;
    delete lines;
    exit;
 }

 NF {

    if (!FIELDS_NUMBER) FIELDS_NUMBER = NF;
    
    line = "";
    for (i = 1; i <= NF; i++) {
    
        # lets escape the quotes
        gsub(QUOTE, QUOTE QUOTE, $i);
        
        # putting delimiters
        if (line) line = line COMMA;
        line = line QUOTE $i QUOTE;
    }
    
    if (FIELDS_NUMBER != NF) {
        error("at line " NR ", there are " NF " fields, but there should be " FIELDS_NUMBER " fields.\n" $0);
    }

    gsub(/\\t/, TAB, line);
    gsub(/\\r/, CR, line);
    gsub(/\\n/, LF, line);
    gsub(/$/, CR, line);
    
    lines[NR]=line;
 }

 END {

    if (ERROR) {
        print "ERROR: " ERROR > "/dev/stderr";
        exit 1;
    }
    
    # print sorted lines
    for(i = 1; i <= NR; i++) {
        if (lines[i]) {
            print lines[i];
        }
    }
 }
	#!/usr/bin/awk -f

	#
	# Reads a CSV file and prints a TSV file to standard output.
	#
	# Usage:
	# ./csv-to-tsv.awk input.csv > output.tsv
	# ./csv-to-tsv.awk -v SEP=";" input.csv > output.tsv
	# ./csv-to-tsv.awk -v COLUMNS=10 input.csv > output.tsv
	#
	# You can specify another field separator by using the `SEP` parameter, for example `-v SEP=";"`. The default is a comma.
	#
	# You can specify the number of fields by using the `COLUMNS` parameter, for example, `-v COLUMNS=10`. The default is the number of fields found in the header.
	#
	# HORIZONTAL TABULATION, CARRIAGE RETURN and LINE FEED are replaced with, respectivelly: '\t', '\r' and '\n'.
	#
	# Escaped quotes ("") are replaced by a single quote (").
	#
	# CSV IETF Specification: https://datatracker.ietf.org/doc/html/rfc4180
	# TSV IANA Specification: https://www.iana.org/assignments/media-types/text/tab-separated-values
	#

	BEGIN {

	CR="\r";
	TAB="\t";
	COMMA=",";
	QUOTE="\"";
	SUBSTITUTE="\032";

	FS=COMMA;
	OFS=TAB;

	# set the separator
	if (SEP) FS = SEP;

	# set the fields number
	if (COLUMNS) FIELDS_NUMBER = COLUMNS;

	DISCARDED_FIELD=SUBSTITUTE;
	REGEX_ESC_QUOTE=QUOTE QUOTE;
	REGEX_BEG_QUOTE="^[ ]*" QUOTE;
	REGEX_END_QUOTE=QUOTE "[ ]*$";
	REGEX_ALONE_QUOTE="[^" QUOTE "]" QUOTE "[^" QUOTE "]";
	REGEX_FALSE_BEG_QUOTE="^[ ]*" "(" QUOTE QUOTE ")+" "[^" QUOTE "]";
	REGEX_FALSE_END_QUOTE="[^" QUOTE "]" "(" QUOTE QUOTE ")+" "[ ]*$";

	ERROR="";
	}

	function remove_field(col, i) {
	for (i=col; i<NF; i++) {
	$i = $(i+1)
	}
	NF--
	}

	function discard_fields() {
	for (i = 1; i <= NF; i++) {
	# remove discarded fields
	if ($i == DISCARDED_FIELD) {
	remove_field(i--);
	}
	}
	}

	function error(message, line) {
	ERROR=message;
	delete lines;
	exit;
	}

	function is_beg_quote(v) {
	return (v ~ REGEX_BEG_QUOTE) && (v !~ REGEX_FALSE_BEG_QUOTE);
	}

	function is_end_quote(v) {
	return (v ~ REGEX_END_QUOTE) && (v !~ REGEX_FALSE_END_QUOTE);
	}

	{
	# fix CR LF
	sub(/\r$/,"");

	# escape CRs
	gsub(CR,"\\r");

	# escape TABs
	gsub(TAB,"\\t");
	}

	CONTINUE {
	# move on after line break
	gsub(TAB, COMMA, CONTINUE);
	$0 = CONTINUE "\\n" $0;
	CONTINUE="";
	}

	NF {

	# This forces awk to rebuild the record, using the values of the fields and OFS.
	# Read: https://www.gnu.org/software/gawk/manual/html_node/Changing-Fields.html
	$1 = $1;

	if (!FIELDS_NUMBER) FIELDS_NUMBER = NF;

	# parse all quoted fields
	for (i = 1; i <= NF; i++) {

	if (is_beg_quote($i) && !is_end_quote($i)) {
	for(; i < NF; i++) {
	old = i;
	new = i + 1;
	$new = $old COMMA $new;
	$old = DISCARDED_FIELD;
	if (is_end_quote($new)) break;
	}
	if (i == NF) {
	# read next line
	discard_fields();
	CONTINUE = $0;
	next;
	}
	}

	if (is_beg_quote($i) && is_end_quote($i)) {
	# remove extremity quotes
	sub(REGEX_BEG_QUOTE,"",$i);
	sub(REGEX_END_QUOTE,"",$i);
	if (match($i, REGEX_ALONE_QUOTE)) {
	error("near line " NR ", there is a quote that should be escaped.\n" $i);
	}
	# replece residual escaped quotes
	gsub(REGEX_ESC_QUOTE, QUOTE, $i);
	} else {
	if (index($i, QUOTE)) {
	error("near line " NR ", there is a quote that should be escaped within a quote-delimited field.\n" $i);
	}
	}
	}

	discard_fields();

	if (FIELDS_NUMBER != NF) {
	error("near line " NR ", there are " NF " fields, but there should be " FIELDS_NUMBER " fields.\n" $0);
	}

	lines[NR]=$0;
	}

	END {

	if (ERROR) {
	print "ERROR: " ERROR > "/dev/stderr";
	exit 1;
	}

	# print sorted lines
	for(i = 1; i <= NR; i++) {
	if (lines[i]) {
	print lines[i];
	}
	}
	}
	#!/usr/bin/awk -f

	#
	# Reads a TSV file and prints a CSV file to standard output.
	#
	# Also reads DSV files by choosing a field separator other than a TAB.
	#
	# Usage:
	# ./tsv-to-csv.awk input.tsv > output.csv
	# ./tsv-to-csv.awk -v SEP=";" input.tsv > output.csv
	# ./tsv-to-csv.awk -v COLUMNS=10 input.tsv > output.csv
	# ./tsv-to-csv.awk input.tsv \| sed -E 's/\r$//' > output.csv
	#
	# You can specify another field separator by using the `SEP` parameter, for example `-v SEP=":"`. However, If you change the field separator, the input file will be read as a generic DSV (delimited-separated value) file. The default is a TAB.
	#
	# You can specify the number of fields by using the `COLUMNS` parameter, for example, `-v COLUMNS=10`. The default is the number of fields found in the header.
	#
	# Escape sequences for HORIZONTAL TABULATION, CARRIAGE RETURN and LINE FEED, namelly '\t', '\r' and '\n', are replaced with their respective ASCII characters.
	#
	# All fields in the output file are delimited by quotes. Additionally, all output lines end with the CARRIAGE RETURN and LINE FEED pair, as specified in RFC 4180.
	#
	# CSV IETF Specification: https://datatracker.ietf.org/doc/html/rfc4180
	# TSV IANA Specification: https://www.iana.org/assignments/media-types/text/tab-separated-values
	#

	BEGIN {

	LF="\n";
	CR="\r";
	TAB="\t";
	COMMA=",";
	QUOTE="\"";

	FS=TAB;

	# set the separator
	if (SEP) FS = SEP;

	# set the fields number
	if (COLUMNS) FIELDS_NUMBER = COLUMNS;

	ERROR="";
	}

	function error(message, line) {
	ERROR=message;
	delete lines;
	exit;
	}

	NF {

	if (!FIELDS_NUMBER) FIELDS_NUMBER = NF;

	line = "";
	for (i = 1; i <= NF; i++) {

	# lets escape the quotes
	gsub(QUOTE, QUOTE QUOTE, $i);

	# putting delimiters
	if (line) line = line COMMA;
	line = line QUOTE $i QUOTE;
	}

	if (FIELDS_NUMBER != NF) {
	error("at line " NR ", there are " NF " fields, but there should be " FIELDS_NUMBER " fields.\n" $0);
	}

	gsub(/\\t/, TAB, line);
	gsub(/\\r/, CR, line);
	gsub(/\\n/, LF, line);
	gsub(/$/, CR, line);

	lines[NR]=line;
	}

	END {

	if (ERROR) {
	print "ERROR: " ERROR > "/dev/stderr";
	exit 1;
	}

	# print sorted lines
	for(i = 1; i <= NR; i++) {
	if (lines[i]) {
	print lines[i];
	}
	}
	}
Year	Make	Model	Description	Price
1997	Ford	E350	ac, abs, moon	3000.00
1999	Chevy	Venture "Extended Edition"		4900.00
1999		Venture "Extended Edition", Very Large	n/a	5000.00
1999		Adventure "Extended Edition, Not Very Large"	n/a	5000.00
1996	Jeep	Grand Cherokee	MUST SELL! 1. air, 2. moon roof, 3. loaded	4799.00
	Fiat	Uno	"Way" 1.0	R$ 9999.00
1900	"Fiat"	Palio	1.6 "Essence"	R$ 9999,00
	Wolkswagen	Gol	"G3"	R$ 9999,00