pkoppstein · December 31, 2018 23:24
diff --git a/fromcsvfile.jq b/fromcsvfile.jq
 # Copyright (C) 2018 [email protected]
 # License: Creative Commons Attribution-NonCommercial-ShareAlike 3.0 Unported (CC BY-NC-SA 3.0)
 # See http://creativecommons.org/licenses/by-nc/3.0/
 # Attribution shall include the copyright notice above.

 # fromcsv.jq version: 0.4 of 2018-12-30
 # Requires: jq with `inputs`
 # Source: https://gist.github.com/pkoppstein/bbbbdf7489c8c515680beb1c75fa59f2

 # A PEG-inspired parser for reading CSV files without the need to "slurp" them.

 # The parser is intended to handle a wide variety of "edge cases".
 # Note that both \r\n and \r\r\n are interpreted as end-of-record.

 # Error-reporting showing the relevant record number and troublesome string fragment is provided on STDERR.

 # Main jq filters: 
 #  fromcsv/0     # JSON string input
 #  fromcsvfile/0 # read raw lines via jq's `inputs` (STDIN or named files; run jq with -n -R)

 # Example usage: jq -nRc 'include "fromcsvfile"; fromcsvfile' MYFILE.csv

 ######### PEG machinery

 # consume a regular expression rooted at the start of .remainder
 def consume($re):
   # Input:  a parser state object whose .remainder is a string.
   # Output: the state with the text matched by $re removed from the front of
   #         .remainder; emits nothing (empty) when $re does not match there.
   # $re is wrapped in a non-capturing group so that "^" anchors the whole
   # pattern. Without the group, a top-level alternation such as "\n|\r\n"
   # would have only its first alternative anchored, and a later alternative
   # could match further along in .remainder while the slice below still
   # removed characters from the front.
   (.remainder | match("^(?:" + $re + ")")) as $m
   | .remainder |= .[$m.length:] ;

 def parse($re):
   # Input:  a parser state object whose .remainder is a string.
   # Output: the state with the text matched by $re removed from the front of
   #         .remainder and appended to the .result array, each doubled
   #         double-quote ("") in it collapsed to a single one.
   #         Emits nothing (empty) when $re does not match at the start.
   # The non-capturing group keeps "^" anchoring the whole of $re even when
   # $re contains a top-level alternation.
   (.remainder | match("^(?:" + $re + ")")) as $m
   | .remainder |= .[$m.length:]
   | .result += [$m.string | gsub("\"\""; "\"")] ;

 # Utility function as there is no EOF marker
 def at_eof:
   # True when nothing but an optional final record terminator is left in
   # .remainder. "\r\r\n" is included because it is accepted as an
   # end-of-record elsewhere in this file; without it a trailing "\r\r\n"
   # would be read as one more (empty) record.
   .remainder
   | (. == "" or . == "\n" or . == "\r\n" or . == "\r\r\n") ;

 ############ Grammar for CSV

 # end-of-record if end-of-record characters or else at eof
 def EOR:
   # Succeeds after removing one record terminator (LF, CR LF or CR CR LF)
   # from the front of .remainder, or, without consuming anything, when
   # .remainder is empty. Emits nothing (empty) otherwise.
   # The pattern "\r?\r?\n" accepts exactly those three terminators and has
   # no top-level "|": consume anchors its pattern by prefixing "^", and an
   # anchor written in front of an alternation applies to the first
   # alternative only, which let "\r\n" match in the middle of .remainder.
   consume("\r?\r?\n")
   // (if .remainder == "" then . else empty end) ;

 # Internal double-quotes must be doubled; 
 # CRs and LFs are allowed, as are empty quoted fields.
 def field_content_quoted:
   # Body of a quoted field: a possibly empty run of doubled double-quotes
   # and non-quote characters (CR and LF included), handed to parse.
   "((\"\")|([^\"]))*" as $quoted_body
   | parse($quoted_body) ;

 # EXTENSION: When reading an unquoted field, we ought to recognize CRLF as end-of-record,
 # i.e. only accept LF if it is NOT preceded by CR.
 # Reject unescaped double-quote
 def unquoted_field:
   # Reads one unquoted (possibly empty) field and appends it to .result.
   # Field text is any run of characters other than double-quote, comma and
   # LF; a CR is taken as field text only when it does not begin a
   # CR LF / CR CR LF record terminator (negative lookahead).
   # This fixes three cases of the earlier two-step version:
   #   - "x\r\r\n" left the two CRs inside the field;
   #   - an empty field directly before CR LF came out as "\r";
   #   - a field with an inner lone CR kept the terminator's CR.
   parse("(?:[^\",\r\n]|\r(?!\r?\n))*")
   # Drop the terminator's CR(s) but leave its LF in .remainder, as before,
   # so that end-of-record / end-of-input detection sees the LF.
   | (consume("\r\r?(?=\n)") // .)
   ;

 def quoted_field_continue:
   # Called when a quoted field is still open at the end of .remainder:
   # fetch the next input line, join it on with LF (trailing CR or CR CR
   # removed), and retry; recurse for as long as the closing quote is missing.
   def strip_cr: sub("(\r\r|\r)$";"");
   ("\n" + input | strip_cr) as $more
   | .remainder += $more
   | (field_content_quoted | consume("\" *")) // quoted_field_continue
   ;
     
 # Ignore blanks before and after the enclosing quotation marks # EXTENSION
 def quoted_field:
   # closed_body: the quoted text followed by its closing quote and any
   # trailing blanks.
   def closed_body: field_content_quoted | consume("\" *");
   # Opening quote, optionally preceded by blanks; if the field does not
   # close within .remainder, keep reading input lines.
   consume(" *\"")
   | (closed_body // quoted_field_continue) ;

 def field:
   # Last resort when neither kind of field can be read: nothing at end of
   # input, otherwise send the parser state to STDERR.
   def complain: if at_eof then empty else stderr end;
   # Try a quoted field first, then an unquoted one.
   quoted_field // (unquoted_field // complain) ;

 # ("," field)+
 def fields:
   # One comma-prefixed field, followed by as many more as are present.
   def more: fields // .;
   consume(",") | field | more ;
  
 # field ("," field)*
 # i.e. field fields
 def record:
   # A first field, then any number of comma-separated further fields.
   def rest: fields // .;
   field | rest ;

 def fromcsv:
   # Input: a string of CSV text. Output: a stream with one .result array
   # per record. The parser state carries the unread text in .remainder,
   # the fields of the current record in .result, and a count in .record.

   # Reset the state for the next record.
   def advance: .result = null | .record += 1;

   # Emit the current record, then carry on with what is left.
   def _fromcsv:
     if (at_eof | not)
     then (EOR // .)
       | record
       | select(.result)
       | (.result, (advance | _fromcsv))
     else empty
     end ;

   {record:0, remainder: .} | _fromcsv ;

 def fromcsvfile:
   # Feed each remaining input, minus a trailing CR or CR CR, to fromcsv.
   def strip_cr: sub("(\r\r|\r)$";"");
   inputs
   | strip_cr
   | fromcsv;
	# Copyright (C) 2018 [email protected]
	# License: Creative Commons Attribution-NonCommercial-ShareAlike 3.0 Unported (CC BY-NC-SA 3.0)
	# See http://creativecommons.org/licenses/by-nc/3.0/
	# Attribution shall include the copyright notice above.

	# fromcsv.jq version: 0.4 of 2018-12-30
	# Requires: jq with `inputs`
	# Source: https://gist.github.com/pkoppstein/bbbbdf7489c8c515680beb1c75fa59f2

	# A PEG-inspired parser for reading CSV files without the need to "slurp" them.

	# The parser is intended to handle a wide variety of "edge cases".
	# Note that both \r\n and \r\r\n are interpreted as end-of-record.

	# Error-reporting showing the relevant record number and troublesome string fragment is provided on STDERR.

	# Main jq filters:
	# fromcsv/0 # JSON string input
	# fromcsvfile/0 # read raw lines via jq's `inputs` (STDIN or named files; run jq with -n -R)

	# Example usage: jq -nRc 'include "fromcsvfile"; fromcsvfile' MYFILE.csv

	######### PEG machinery

	# consume a regular expression rooted at the start of .remainder
	def consume($re):
	  # Input:  a parser state object whose .remainder is a string.
	  # Output: the state with the text matched by $re removed from the front of
	  #         .remainder; emits nothing (empty) when $re does not match there.
	  # Pipes are written as plain "|" (a backslash before a pipe is not valid jq).
	  # $re is wrapped in a non-capturing group so that "^" anchors the whole
	  # pattern, including every alternative of a top-level alternation.
	  (.remainder | match("^(?:" + $re + ")")) as $m
	  | .remainder |= .[$m.length:] ;

	def parse($re):
	  # Input:  a parser state object whose .remainder is a string.
	  # Output: the state with the text matched by $re removed from the front of
	  #         .remainder and appended to the .result array, each doubled
	  #         double-quote ("") in it collapsed to a single one.
	  #         Emits nothing (empty) when $re does not match at the start.
	  # Pipes are written as plain "|" (a backslash before a pipe is not valid jq).
	  # The non-capturing group keeps "^" anchoring the whole of $re even when
	  # $re contains a top-level alternation.
	  (.remainder | match("^(?:" + $re + ")")) as $m
	  | .remainder |= .[$m.length:]
	  | .result += [$m.string | gsub("\"\""; "\"")] ;

	# Utility function as there is no EOF marker
	def at_eof:
	  # True when nothing but an optional final record terminator is left in
	  # .remainder. The pipe is written as plain "|" (a backslash before it is
	  # not valid jq). "\r\r\n" is included because it is accepted as an
	  # end-of-record elsewhere in this file.
	  .remainder
	  | (. == "" or . == "\n" or . == "\r\n" or . == "\r\r\n") ;

	############ Grammar for CSV

	# end-of-record if end-of-record characters or else at eof
	def EOR:
	  # Succeeds after removing one record terminator (LF, CR LF or CR CR LF)
	  # from the front of .remainder, or, without consuming anything, when
	  # .remainder is empty. Emits nothing (empty) otherwise.
	  # The previous pattern contained "\|", which is not a valid escape in a
	  # jq string. "\r?\r?\n" accepts the same three terminators without any
	  # alternation, so the "^" that consume puts in front of the pattern
	  # anchors every accepted terminator at the start of .remainder.
	  consume("\r?\r?\n")
	  // (if .remainder == "" then . else empty end) ;

	# Internal double-quotes must be doubled;
	# CRs and LFs are allowed, as are empty quoted fields.
	def field_content_quoted:
	  # Body of a quoted field: a possibly empty run of doubled double-quotes
	  # and non-quote characters (CR and LF included), handed to parse.
	  # The alternation is written with a plain "|"; "\|" is not a valid
	  # escape sequence inside a jq string.
	  parse("((\"\")|([^\"]))*") ;

	# EXTENSION: When reading an unquoted field, we ought to recognize CRLF as end-of-record,
	# i.e. only accept LF if it is NOT preceded by CR.
	# Reject unescaped double-quote
	def unquoted_field:
	  # Reads one unquoted (possibly empty) field and appends it to .result.
	  # Field text is any run of characters other than double-quote, comma and
	  # LF; a CR is taken as field text only when it does not begin a
	  # CR LF / CR CR LF record terminator (negative lookahead).
	  # Replaces a version whose pipes were backslash-escaped (not valid jq)
	  # and which left CRs in the field for "x\r\r\n", for an empty field
	  # directly before CR LF, and for a field with an inner lone CR.
	  parse("(?:[^\",\r\n]|\r(?!\r?\n))*")
	  # Drop the terminator's CR(s) but leave its LF in .remainder so that
	  # end-of-record / end-of-input detection sees the LF.
	  | (consume("\r\r?(?=\n)") // .)
	  ;

	def quoted_field_continue:
	  # Called when a quoted field is still open at the end of .remainder:
	  # fetch the next input line, join it on with LF (trailing CR or CR CR
	  # removed), and retry; recurse for as long as the closing quote is missing.
	  # Pipes and the regex alternation are written with plain "|"; the
	  # backslash-escaped form is not valid jq.
	  def trim: sub("(\r\r|\r)$";"");
	  .remainder += ("\n" + input | trim)
	  | (field_content_quoted | consume("\" *"))
	    // quoted_field_continue
	  ;

	# Ignore blanks before and after the enclosing quotation marks # EXTENSION
	def quoted_field:
	  # Opening quote (optionally preceded by blanks), the quoted text, then
	  # the closing quote and any trailing blanks. If the field does not close
	  # within .remainder, keep reading input lines.
	  # Pipes are written as plain "|" (a backslash before a pipe is not valid jq).
	  consume(" *\"")
	  | ( (field_content_quoted | consume("\" *"))
	      // quoted_field_continue ) ;

	def field:
	  # Last resort when neither kind of field can be read: nothing at end of
	  # input, otherwise send the parser state to STDERR.
	  def complain: if at_eof then empty else stderr end;
	  # Try a quoted field first, then an unquoted one.
	  quoted_field // (unquoted_field // complain) ;

	# ("," field)+
	def fields:
	  # One comma-prefixed field, followed by as many more as are present.
	  # Pipes are written as plain "|" (a backslash before a pipe is not valid jq).
	  consume(",")
	  | field
	  | (fields // .) ;

	# field ("," field)*
	# i.e. field fields
	def record:
	  # A first field, then any number of comma-separated further fields.
	  # The pipe is written as plain "|" (a backslash before it is not valid jq).
	  field | (fields // .) ;

	def fromcsv:
	  # Input: a string of CSV text. Output: a stream with one .result array
	  # per record. The parser state carries the unread text in .remainder,
	  # the fields of the current record in .result, and a count in .record.
	  # Pipes are written as plain "|" (a backslash before a pipe is not valid jq).

	  ## Loop for processing all the records
	  def _fromcsv:
	    if at_eof then empty
	    else (EOR // .)
	    | record
	    | select(.result)
	    | .result,
	      # reset the state for the next record and carry on with the rest
	      (.result = null | .record += 1 | _fromcsv)
	    end ;

	  {record:0, remainder: .}
	  | _fromcsv ;

	def fromcsvfile:
	  # Feed each remaining input, minus a trailing CR or CR CR, to fromcsv.
	  # Pipes and the regex alternation are written with plain "|"; the
	  # backslash-escaped form is not valid jq.
	  def trim: sub("(\r\r|\r)$";"");
	  inputs | trim | fromcsv;