aappddeevv · August 29, 2015 14:11
diff --git a/ImportUtilities.m b/ImportUtilities.m
 (* ::Package:: *)

 (* :Title: Import Delimited *)

 (* :Summary: Contains declarations for importing a delimited text file into a session. *)

 BeginPackage["ImportUtilities`"]


 (* Canned functions and constants that can be passed as option values to ImportDelimited. *)
 WhitespaceSplitter::usage = "Function that splits string records on whitespace."

 ForeachTrim::usage = "Option that trims the string argument."

 SplitFieldOnComma::usage = "Option that splits a record on commas."

 IncludeAllLines::usage = "Option that includes all lines by always indicating that the line should not be ignored. Always returns False."

 ReadSingleRecord::usage = "Option that reads a single record from a stream."

 NoHeader::usage = "Always 0 indicating that no records should be skipped at the start."

 SingleLineHeader::usage = "Always 1 indicating that a single record should be skipped at the start."

 IdentityRecordTransformer::usage="Option that always returns the argument directly."

 CopyLine::usage = "Option that just returns the argument directly, thereby just copying the input to the output."

 ImportDelimited::usage = "ImportDelimited[filename] imports the delimited file. Many useful options exist to control the import process."

 MDYH24MS::usage = "Pattern spec to convert from date time string."

 MDYH12MSMAM::usage = "Pattern spec to convert from date time string, very long version."

 MDYH12MSMAMTransformer::usage = "Transformer date time to absolute time"

 MDYH24MSTransformer::usage = "Transformer date time to absolute time"

 (* Option names accepted by ImportDelimited. StartSkip and Transformers are declared here
    so that every option has a usage message and lives in the package context. *)
 StartSkip::usage = "Number of records to skip at the start of the stream. Skipped records are not passed to IgnoreLine. Default is NoHeader."
 Splitter::usage = "Function to split a line."
 MaxProcessed::usage ="All or number representing the number of processed lines to keep."
 MaxLine::usage="All or a number representing the last line number to read to. This is an absolute record position. If you want to control your sample size, use MaxProcessed."
 ForeachSplit::usage="A function to apply to each field. Default is to trim whitespace."
 Transformers::usage="Association of field index (1 based) \[Rule] f[string] applied to the corresponding field of each parsed row."
 RecordTransformer::usage="A function applied to the parsed row assuming it is not skipped or ignored. The output of this function is the new parsed row content."
 ApplyAt::usage="Association of line index (1 based) \[Rule] f[line] applied to the unparsed and unprocessed record. The collected {line, value} pairs are returned under the \"ApplyAt\" key of the result. It ignores skipping and ignoring."
 IgnoreLine::usage="Return a boolean when a line should be ignored. Only applies to lines that are not skipped. Default is to include all lines."
 Reader::usage="Read a line from the input stream. Return EndOfFile when end of file."

 			     
 Begin["`Private`"]

  (* Date element lists used as the second part of a {string, elements} DateList specification. *)
  MDYH12MSMAM = {"Month", "Day", "Year", "Hour12", "Minute", "Second", "Millisecond", "AMPM"}
  MDYH24MS = {"Month", "Day", "Year", "Hour24", "Minute", "Second"}

  (* Turn a date-time string into an absolute time via DateList and the element lists above. *)
  MDYH12MSMAMTransformer = Function[AbsoluteTime[DateList[{#, MDYH12MSMAM}]]]
  MDYH24MSTransformer = Function[AbsoluteTime[DateList[{#, MDYH24MS}]]]

  (* Record splitters. *)
  WhitespaceSplitter = Function[StringSplit[#, Repeated[Whitespace]]]
  SplitFieldOnComma = Function[StringSplit[#, ","]]

  (* Per-field processor: strip surrounding whitespace. *)
  ForeachTrim = Function[StringTrim[#]]

  (* Line filter that never asks for a line to be ignored. *)
  IncludeAllLines = Function[False]

  (* Reader that pulls one Record from the stream it is given. *)
  ReadSingleRecord = Function[Read[#, Record]]

  (* Header sizes: how many leading records to skip. *)
  NoHeader = 0
  SingleLineHeader = 1

  (* Pass-through functions: both return their argument unchanged. *)
  IdentityRecordTransformer = Function[#]
  CopyLine = Function[#]
  
  (* ImportDelimited[file, opts] reads `file` one record at a time through the Reader
     function and returns an Association with these keys:
       "Processed"    - number of records that were parsed and kept
       "LinesRead"    - number of records actually read (the final EndOfFile read is not counted)
       "StartSkipped" - number of leading records skipped because of StartSkip
       "Ignored"      - number of records rejected by IgnoreLine
       "ApplyAt"      - list of {line number, value} pairs produced by the ApplyAt functions
       "Data"         - list of parsed rows ({} when no row was kept)
     Returns $Failed when OpenRead cannot open the file.
     The file customizes every step of the import through the options below. *)
  ImportDelimited[file_String?FileExistsQ,

  (* lines to skip at the start of the stream, ignores results of IgnoreLine *)
    opts: OptionsPattern[{StartSkip -> NoHeader,

    (* All or a number representing the number of processed lines to keep. A processed line is like a sample, it was not skipped or ignored *)
    MaxProcessed -> All,

    (* All or a number representing the last line number to read to. This is an absolute record position. If you want to control your sample size, use MaxProcessed *)
    MaxLine -> All,

    (* function to split a line *)
    Splitter -> SplitFieldOnComma,

    (* could use Identity[#]& to preserve surrounding whitespace. Default is to trim whitespace. *)
    ForeachSplit -> ForeachTrim,

    (* field index \[Rule] f[string] *)
    Transformers -> <||>,

    (* a function applied to the parsed row assuming it is not skipped or ignored. The output of this function is the new parsed row content. The default is identity. *)
    RecordTransformer -> IdentityRecordTransformer,

    (* line index (1-based) \[Rule] f[line], applied to the unparsed and unprocessed record which is typically one line of the input stream.
       The resulting value is returned in the "ApplyAt" list as {line, result value} tuples. ApplyAt
       uses simple AppendTo so use it sparingly to collect values. It ignores skipping and ignoring. *)
    ApplyAt -> <||>,

    (* a function returning a boolean that determines whether a line should be ignored, only applies to lines that are not skipped *)
    IgnoreLine -> IncludeAllLines,

    (* read a line from the input stream. Return EndOfFile when end of file. *)
    Reader -> ReadSingleRecord }]] :=

  Module[{str, reaped, line, parsed, specials = {}, dataTag,
      linecount = 0, ignoredcount = 0, startcount = 0, linesprocessed = 0,
      rr = OptionValue[Reader],
      sp = OptionValue[Splitter],
      applies = OptionValue[ApplyAt],
      recordTransformer = OptionValue[RecordTransformer],
      transformers = OptionValue[Transformers],
      fieldProcessor = OptionValue[ForeachSplit],
      ignoreLine = OptionValue[IgnoreLine],
      skips = OptionValue[StartSkip],
      (* All is mapped to -1; any negative limit means "no limit" *)
      maxlines = Replace[OptionValue[MaxLine], All -> -1],
      maxProcessed = Replace[OptionValue[MaxProcessed], All -> -1]},
    str = OpenRead[file];
    If[str === $Failed,
      (* Without this guard the reader would never yield EndOfFile and the loop would not terminate. *)
      $Failed,
      reaped = Reap[
        While[maxlines < 0 || linecount < maxlines,
          line = rr[str];
          (* Test for end of file before counting, so "LinesRead" only counts real records. *)
          If[line === EndOfFile, Break[]];
          linecount++;
          (* If an ApplyAt function exists for this record number, run it on the raw record. *)
          If[KeyExistsQ[applies, linecount],
            AppendTo[specials, {linecount, applies[linecount][line]}]];
          If[linecount <= skips, startcount++; Continue[]];
          If[ignoreLine[line], ignoredcount++; Continue[]];

          parsed = recordTransformer[fieldProcessor /@ sp[line]];
          (* Visit only the field positions that have a transformer, not every field. *)
          If[Length[transformers] > 0,
            Scan[
              Function[idx,
                If[IntegerQ[idx] && 1 <= idx <= Length[parsed],
                  parsed[[idx]] = transformers[idx][parsed[[idx]]]]],
              Keys[transformers]]];
          (* dataTag is local to this call, so only rows sown here are collected below. *)
          Sow[parsed, dataTag];

          linesprocessed++;
          If[maxProcessed >= 0 && linesprocessed >= maxProcessed, Break[]]
        ],
        dataTag];
      Close[str];
      <|"Processed" -> linesprocessed,
        "LinesRead" -> linecount,
        "StartSkipped" -> startcount,
        "Ignored" -> ignoredcount,
        "ApplyAt" -> specials,
        (* Reap yields {} as its second part when nothing was sown; report that as an empty data list. *)
        "Data" -> Replace[reaped[[2]], {{rows_List} :> rows, _ -> {}}]|>
    ]
  ]
 (* Close the context opened by Begin["`Private`"]. *)
 End[]
 (* Close the package opened by BeginPackage["ImportUtilities`"]. *)
 EndPackage[]
	(* ::Package:: *)

	(* :Title: Import Delimited *)

	(* :Summary: Contains declarations for importing a delimited text file into a session. *)

	BeginPackage["ImportUtilities`"]


	(* Canned functions and constants that can be passed as option values to ImportDelimited. *)
	WhitespaceSplitter::usage = "Function that splits string records on whitespace."

	ForeachTrim::usage = "Option that trims the string argument."

	SplitFieldOnComma::usage = "Option that splits a record on commas."

	IncludeAllLines::usage = "Option that includes all lines by always indicating that the line should not be ignored. Always returns False."

	ReadSingleRecord::usage = "Option that reads a single record from a stream."

	NoHeader::usage = "Always 0 indicating that no records should be skipped at the start."

	SingleLineHeader::usage = "Always 1 indicating that a single record should be skipped at the start."

	IdentityRecordTransformer::usage="Option that always returns the argument directly."

	CopyLine::usage = "Option that just returns the argument directly, thereby just copying the input to the output."

	ImportDelimited::usage = "ImportDelimited[filename] imports the delimited file. Many useful options exist to control the import process."

	MDYH24MS::usage = "Pattern spec to convert from date time string."

	MDYH12MSMAM::usage = "Pattern spec to convert from date time string, very long version."

	MDYH12MSMAMTransformer::usage = "Transformer date time to absolute time"

	MDYH24MSTransformer::usage = "Transformer date time to absolute time"

	(* Option names accepted by ImportDelimited. StartSkip and Transformers are declared here
	   so that every option has a usage message and lives in the package context. *)
	StartSkip::usage = "Number of records to skip at the start of the stream. Skipped records are not passed to IgnoreLine. Default is NoHeader."
	Splitter::usage = "Function to split a line."
	MaxProcessed::usage ="All or number representing the number of processed lines to keep."
	MaxLine::usage="All or a number representing the last line number to read to. This is an absolute record position. If you want to control your sample size, use MaxProcessed."
	ForeachSplit::usage="A function to apply to each field. Default is to trim whitespace."
	Transformers::usage="Association of field index (1 based) \[Rule] f[string] applied to the corresponding field of each parsed row."
	RecordTransformer::usage="A function applied to the parsed row assuming it is not skipped or ignored. The output of this function is the new parsed row content."
	ApplyAt::usage="Association of line index (1 based) \[Rule] f[line] applied to the unparsed and unprocessed record. The collected {line, value} pairs are returned under the \"ApplyAt\" key of the result. It ignores skipping and ignoring."
	IgnoreLine::usage="Return a boolean when a line should be ignored. Only applies to lines that are not skipped. Default is to include all lines."
	Reader::usage="Read a line from the input stream. Return EndOfFile when end of file."


	Begin["`Private`"]

	(* Date element lists used as the second part of a {string, elements} DateList specification. *)
	MDYH12MSMAM = {"Month", "Day", "Year", "Hour12", "Minute", "Second", "Millisecond", "AMPM"}
	MDYH24MS = {"Month", "Day", "Year", "Hour24", "Minute", "Second"}

	(* Turn a date-time string into an absolute time via DateList and the element lists above. *)
	MDYH12MSMAMTransformer = Function[AbsoluteTime[DateList[{#, MDYH12MSMAM}]]]
	MDYH24MSTransformer = Function[AbsoluteTime[DateList[{#, MDYH24MS}]]]

	(* Record splitters. *)
	WhitespaceSplitter = Function[StringSplit[#, Repeated[Whitespace]]]
	SplitFieldOnComma = Function[StringSplit[#, ","]]

	(* Per-field processor: strip surrounding whitespace. *)
	ForeachTrim = Function[StringTrim[#]]

	(* Line filter that never asks for a line to be ignored. *)
	IncludeAllLines = Function[False]

	(* Reader that pulls one Record from the stream it is given. *)
	ReadSingleRecord = Function[Read[#, Record]]

	(* Header sizes: how many leading records to skip. *)
	NoHeader = 0
	SingleLineHeader = 1

	(* Pass-through functions: both return their argument unchanged. *)
	IdentityRecordTransformer = Function[#]
	CopyLine = Function[#]

	(* ImportDelimited[file, opts] reads `file` one record at a time through the Reader
	   function and returns an Association with these keys:
	     "Processed"    - number of records that were parsed and kept
	     "LinesRead"    - number of records actually read (the final EndOfFile read is not counted)
	     "StartSkipped" - number of leading records skipped because of StartSkip
	     "Ignored"      - number of records rejected by IgnoreLine
	     "ApplyAt"      - list of {line number, value} pairs produced by the ApplyAt functions
	     "Data"         - list of parsed rows ({} when no row was kept)
	   Returns $Failed when OpenRead cannot open the file.
	   The options below customize every step of the import. *)
	ImportDelimited[file_String?FileExistsQ,

	(* lines to skip at the start of the stream, ignores results of IgnoreLine *)
	opts: OptionsPattern[{StartSkip -> NoHeader,

	(* All or a number representing the number of processed lines to keep. A processed line is like a sample, it was not skipped or ignored *)
	MaxProcessed -> All,

	(* All or a number representing the last line number to read to. This is an absolute record position. If you want to control your sample size, use MaxProcessed *)
	MaxLine -> All,

	(* function to split a line *)
	Splitter -> SplitFieldOnComma,

	(* could use Identity[#]& to preserve surrounding whitespace. Default is to trim whitespace. *)
	ForeachSplit -> ForeachTrim,

	(* field index \[Rule] f[string] *)
	Transformers -> <||>,

	(* a function applied to the parsed row assuming it is not skipped or ignored. The output of this function is the new parsed row content. The default is identity. *)
	RecordTransformer -> IdentityRecordTransformer,

	(* line index (1-based) \[Rule] f[line], applied to the unparsed and unprocessed record which is typically one line of the input stream.
	   The resulting value is returned in the "ApplyAt" list as {line, result value} tuples. ApplyAt
	   uses simple AppendTo so use it sparingly to collect values. It ignores skipping and ignoring. *)
	ApplyAt -> <||>,

	(* a function returning a boolean that determines whether a line should be ignored, only applies to lines that are not skipped *)
	IgnoreLine -> IncludeAllLines,

	(* read a line from the input stream. Return EndOfFile when end of file. *)
	Reader -> ReadSingleRecord }]] :=

	Module[{str, reaped, line, parsed, specials = {}, dataTag,
	    linecount = 0, ignoredcount = 0, startcount = 0, linesprocessed = 0,
	    rr = OptionValue[Reader],
	    sp = OptionValue[Splitter],
	    applies = OptionValue[ApplyAt],
	    recordTransformer = OptionValue[RecordTransformer],
	    transformers = OptionValue[Transformers],
	    fieldProcessor = OptionValue[ForeachSplit],
	    ignoreLine = OptionValue[IgnoreLine],
	    skips = OptionValue[StartSkip],
	    (* All is mapped to -1; any negative limit means "no limit" *)
	    maxlines = Replace[OptionValue[MaxLine], All -> -1],
	    maxProcessed = Replace[OptionValue[MaxProcessed], All -> -1]},
	  str = OpenRead[file];
	  If[str === $Failed,
	    (* Without this guard the reader would never yield EndOfFile and the loop would not terminate. *)
	    $Failed,
	    reaped = Reap[
	      While[maxlines < 0 || linecount < maxlines,
	        line = rr[str];
	        (* Test for end of file before counting, so "LinesRead" only counts real records. *)
	        If[line === EndOfFile, Break[]];
	        linecount++;
	        (* If an ApplyAt function exists for this record number, run it on the raw record. *)
	        If[KeyExistsQ[applies, linecount],
	          AppendTo[specials, {linecount, applies[linecount][line]}]];
	        If[linecount <= skips, startcount++; Continue[]];
	        If[ignoreLine[line], ignoredcount++; Continue[]];

	        parsed = recordTransformer[fieldProcessor /@ sp[line]];
	        (* Visit only the field positions that have a transformer, not every field. *)
	        If[Length[transformers] > 0,
	          Scan[
	            Function[idx,
	              If[IntegerQ[idx] && 1 <= idx <= Length[parsed],
	                parsed[[idx]] = transformers[idx][parsed[[idx]]]]],
	            Keys[transformers]]];
	        (* dataTag is local to this call, so only rows sown here are collected below. *)
	        Sow[parsed, dataTag];

	        linesprocessed++;
	        If[maxProcessed >= 0 && linesprocessed >= maxProcessed, Break[]]
	      ],
	      dataTag];
	    Close[str];
	    <|"Processed" -> linesprocessed,
	      "LinesRead" -> linecount,
	      "StartSkipped" -> startcount,
	      "Ignored" -> ignoredcount,
	      "ApplyAt" -> specials,
	      (* Reap yields {} as its second part when nothing was sown; report that as an empty data list. *)
	      "Data" -> Replace[reaped[[2]], {{rows_List} :> rows, _ -> {}}]|>
	  ]
	]
	(* Close the context opened by Begin["`Private`"]. *)
	End[]
	(* Close the package opened by BeginPackage["ImportUtilities`"]. *)
	EndPackage[]
No results found