dunn · January 4, 2015 23:24
diff --git a/csv-to-array.awk b/csv-to-array.awk
 #!/usr/local/bin/gawk -f

 # I can't get multiline regular expressions to work well outside of
 # TextMate, so before running this on tweets.csv I opened it in
 # TextMate and replaced:

 # ([^"])\n
 # with
 # $1\\n

 # I had to run it twice (!?) to turn all the multiline tweets into
 # single-line tweets with '\n' instead of actual line breaks.  Then
 # you can run this:

 BEGIN {
   # Emit the opening line of the JS module first, while ORS still has
   # its default value of "\n", so no comma follows the header.
   print "module.exports = ["
   # Split records on the three-character sequence "," (quote, comma,
   # quote); for fully double-quoted CSV this drops the quotes around
   # every interior field.
   FS = "\",\""
   # From now on each print ends with a comma and a newline, so every
   # printed record becomes one array element.
   ORS = ",\n"
 }
 # Per-record rule: for every record except the first (header) line and
 # except retweets, print field 6 (presumably the tweet text column of
 # tweets.csv) as a single-quoted JavaScript string.
 # Skipping the first line follows:
 # http://www.linuxquestions.org/questions/programming-9/removing-first-line-with-awk-470129/#post2362671
 #
 # awk strings are 1-indexed, so the "RT" prefix test must start at 1.
 # With a start index of 0 gawk returns only a single character, which
 # can never equal "RT", so retweets were not being filtered out.
 NR != 1 && substr($6, 1, 2) != "RT" {
   # Escape single quotes as \' so they cannot end the JS string early.
   # The four backslashes are halved by the string lexer and again by
   # gensub, leaving one literal backslash in front of the quote.
   tweet = gensub("'", "\\\\'", "g", $6);
   # Double every backslash that is followed by a character other than
   # ' or n, leaving the \' escapes added above and the literal \n
   # line-break markers untouched.  Ten backslashes in the source become
   # two literal backslashes followed by the captured character (\1).
   tweet = gensub(/\\([^'n])/, "\\\\\\\\\\1", "g", tweet);
   print "'" tweet "'";
 }
 END {
   # Restore the plain newline terminator so the closing bracket is not
   # followed by a comma.
   ORS = "\n"
   print "];"
 }

 # POST PROCESSING

 # Life is too short to figure out why awk sucks so much, so I just
 # went back to TextMate and substituted:

 # \\\',$
 # with
 # \'',

 # to close some strings.  And manually deleted the trailing comma on
 # the last array element.
	#!/usr/local/bin/gawk -f

	# I can't get multiline regular expressions to work well outside of
	# TextMate, so before running this on tweets.csv I opened it in
	# TextMate and replaced:

	# ([^"])\n
	# with
	# $1\\n

	# I had to run it twice (!?) to turn all the multiline tweets into
	# single-line tweets with '\n' instead of actual line breaks. Then
	# you can run this:

	BEGIN {
	  # Emit the opening line of the JS module first, while ORS still has
	  # its default value of "\n", so no comma follows the header.
	  print "module.exports = ["
	  # Split records on the three-character sequence "," (quote, comma,
	  # quote); for fully double-quoted CSV this drops the quotes around
	  # every interior field.
	  FS = "\",\""
	  # From now on each print ends with a comma and a newline, so every
	  # printed record becomes one array element.
	  ORS = ",\n"
	}
	# Per-record rule: for every record except the first (header) line and
	# except retweets, print field 6 (presumably the tweet text column of
	# tweets.csv) as a single-quoted JavaScript string.
	# Skipping the first line follows:
	# http://www.linuxquestions.org/questions/programming-9/removing-first-line-with-awk-470129/#post2362671
	#
	# awk strings are 1-indexed, so the "RT" prefix test must start at 1.
	# With a start index of 0 gawk returns only a single character, which
	# can never equal "RT", so retweets were not being filtered out.
	NR != 1 && substr($6, 1, 2) != "RT" {
	  # Escape single quotes as \' so they cannot end the JS string early.
	  # The four backslashes are halved by the string lexer and again by
	  # gensub, leaving one literal backslash in front of the quote.
	  tweet = gensub("'", "\\\\'", "g", $6);
	  # Double every backslash that is followed by a character other than
	  # ' or n, leaving the \' escapes added above and the literal \n
	  # line-break markers untouched.  Ten backslashes in the source become
	  # two literal backslashes followed by the captured character (\1).
	  tweet = gensub(/\\([^'n])/, "\\\\\\\\\\1", "g", tweet);
	  print "'" tweet "'";
	}
	END {
	  # Restore the plain newline terminator so the closing bracket is not
	  # followed by a comma.
	  ORS = "\n"
	  print "];"
	}

	# POST PROCESSING

	# Life is too short to figure out why awk sucks so much, so I just
	# went back to TextMate and substituted:

	# \\\',$
	# with
	# \'',

	# to close some strings. And manually deleted the trailing comma on
	# the last array element.