Last active
August 29, 2015 14:11
-
-
Save aappddeevv/0203276b4c1c502336d0 to your computer and use it in GitHub Desktop.
mathematica delimited file importer
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
(* ::Package:: *) | |
(* :Title: Import Delimited *) | |
(* :Summary: Contains declarations for importing a delimited text file into a session. *)
BeginPackage["ImportUtilities`"] | |
(* Public symbols of the package. Assigning a ::usage message here creates each
   symbol in the package context so that it is visible to users after loading. *)

(* Canned functions that can be used as arguments (option values). *)
WhitespaceSplitter::usage = "Function that splits string records on whitespace."
ForeachTrim::usage = "Option that trims the string argument."
SplitFieldOnComma::usage = "Option that splits a record on commas."
IncludeAllLines::usage = "Option that includes all lines by always indicating that the line should not be ignored. Always returns false."
ReadSingleRecord::usage = "Option that reads a single record from a stream."
NoHeader::usage = "Always 0 indicating that no records should be skipped at the start."
SingleLineHeader::usage = "Always 1 indicating that a single record should be skipped at the start."
IdentityRecordTransformer::usage = "Option that always returns the argument directly."
CopyLine::usage = "Option that just returns the argument directly, thereby just copying the input to the output."

(* The importer itself. *)
ImportDelimited::usage = "ImportDelimited[filename] imports the delimited file. Many useful options exist to control the import process."

(* Date-time helpers usable as field transformers. *)
MDYH24MS::usage = "Pattern spec to convert from date time string."
MDYH12MSMAM::usage = "Pattern spec to convert from date time string, very long version."
MDYH12MSMAMTransformer::usage = "Transformer date time to absolute time"
MDYH24MSTransformer::usage = "Transformer date time to absolute time"

(* Option names accepted by ImportDelimited. *)
StartSkip::usage = "Number of records to skip at the start of the stream. Skipped records are not passed to IgnoreLine. Default is NoHeader."
Splitter::usage = "Function to split a line."
MaxProcessed::usage = "All or number representing the number of processed lines to keep."
MaxLine::usage = "All or a number representing the last line number to read to. This is an absolute record position. If you want to control your sample size, use MaxProcessed."
ForeachSplit::usage = "A function to apply to each field. Default is to trim whitespace."
Transformers::usage = "Association of field index (1 based) \[Rule] f[string] applied to that field of each parsed row."
RecordTransformer::usage = "A function applied to the parsed row assuming it is not skipped or ignored. The output of this function is the new parsed row content."
ApplyAt::usage = "Association of line index (1 based) \[Rule] f[line] applied to the unparsed and unprocessed record. The collected results are returned under ApplyAt in the returned values. It ignores skipping and ignoring."
IgnoreLine::usage = "Return a boolean when a line should be ignored. Only applies to lines that are not skipped. Default is to include all lines."
Reader::usage = "Read a line from the input stream. Return EndOfFile when end of file"
Begin["`Private`"] | |
(* Date element specifications: the order in which date elements appear in a
   date-time string. Used together with a string as {string, spec}. *)
MDYH12MSMAM = {"Month", "Day", "Year", "Hour12", "Minute", "Second", "Millisecond", "AMPM"}
MDYH24MS = {"Month", "Day", "Year", "Hour24", "Minute", "Second"}

(* Field transformers: parse a date-time string laid out per the matching spec
   above and return the result of AbsoluteTime on it. *)
MDYH12MSMAMTransformer = AbsoluteTime[DateList[{#, MDYH12MSMAM}]] &
MDYH24MSTransformer = AbsoluteTime[DateList[{#, MDYH24MS}]] &

(* Split a record into fields on runs of whitespace. *)
WhitespaceSplitter = StringSplit[#, Whitespace ..] &

(* Strip leading and trailing whitespace from a single field. *)
ForeachTrim = StringTrim[#] &

(* Split a record into fields on commas. The third argument All keeps empty
   fields at the beginning and end of the record; without it StringSplit drops
   them, which shifts the field positions that Transformers are keyed on. *)
SplitFieldOnComma = StringSplit[#, ",", All] &

(* IgnoreLine predicate that never ignores a line. *)
IncludeAllLines = False &

(* Read the next record from an open input stream.
   NOTE(review): with Read's default NullRecords setting, blank lines are
   presumably skipped rather than returned as empty records - confirm. *)
ReadSingleRecord = Read[#, Record] &

(* StartSkip values: number of leading records to skip. *)
NoHeader = 0
SingleLineHeader = 1

(* Pass-through functions: return the argument unchanged. *)
IdentityRecordTransformer = # &
CopyLine = # &
(* ImportDelimited[file, opts] reads `file` record by record and returns an
   association with the keys
     "Processed"    - number of records parsed and kept,
     "LinesRead"    - number of records actually read (the EndOfFile sentinel is not counted),
     "StartSkipped" - number of records skipped because of StartSkip,
     "Ignored"      - number of records rejected by IgnoreLine,
     "ApplyAt"      - list of {line index, result} pairs produced by the ApplyAt functions,
     "Data"         - list of parsed rows ({} when no row was kept).
   Returns $Failed when the file cannot be opened for reading. *)
ImportDelimited[file_String?FileExistsQ,
  opts: OptionsPattern[{
    (* lines to skip at the start of the stream, ignores results of IgnoreLine *)
    StartSkip -> NoHeader,
    (* All or a number representing the number of processed lines to keep. A processed line is like a sample, it was not skipped or ignored *)
    MaxProcessed -> All,
    (* All or a number representing the last line number to read to. This is an absolute record position. If you want to control your sample size, use MaxProcessed *)
    MaxLine -> All,
    (* function to split a line *)
    Splitter -> SplitFieldOnComma,
    (* could use Identity[#]& to preserve surrounding whitespace. Default is to trim whitespace. *)
    ForeachSplit -> ForeachTrim,
    (* field index \[Rule] f[string] *)
    Transformers -> <||>,
    (* a function applied to the parsed row assuming it is not skipped or ignored. The output of this function is the new parsed row content. The default is identity. *)
    RecordTransformer -> IdentityRecordTransformer,
    (* line index (1-based) \[Rule] f[line], applied to the unparsed and unprocessed record which is typically one line of the input stream.
       The resulting value is returned in the "ApplyAt" list as {line, result value} tuples. It ignores skipping and ignoring. *)
    ApplyAt -> <||>,
    (* a function returning a boolean that determines whether a line should be ignored, only applies to lines that are not skipped *)
    IgnoreLine -> IncludeAllLines,
    (* read a line from the input stream. Return EndOfFile when end of file. *)
    Reader -> ReadSingleRecord}]] :=
  Module[{str, line, parsed, reaped,
      linecount = 0, startcount = 0, ignoredcount = 0, linesprocessed = 0,
      rr = OptionValue[Reader],
      sp = OptionValue[Splitter],
      applies = OptionValue[ApplyAt],
      recordTransformer = OptionValue[RecordTransformer],
      transformers = OptionValue[Transformers],
      fieldProcessor = OptionValue[ForeachSplit],
      ignoreLine = OptionValue[IgnoreLine],
      skips = OptionValue[StartSkip],
      (* a negative limit means "no limit" *)
      maxlines = Replace[OptionValue[MaxLine], All -> -1],
      maxProcessed = Replace[OptionValue[MaxProcessed], All -> -1]},
    str = OpenRead[file];
    If[Head[str] =!= InputStream,
      (* e.g. `file` is a directory or unreadable: without this guard the loop
         below would keep calling the reader on a non-stream forever *)
      $Failed,
      (* String tags keep the two collections apart and avoid depending on an
         unlocalized symbol being free of values. *)
      reaped = Reap[
        While[maxlines < 0 || linecount < maxlines,
          line = rr @ str;
          (* test for end of file before counting, so that "LinesRead" only
             counts real records *)
          If[line === EndOfFile, Break[]];
          linecount++;
          (* If an ApplyAt function exists for this record, use it. This happens
             before skipping/ignoring so it sees every record. *)
          If[KeyExistsQ[applies, linecount],
            Sow[{linecount, Lookup[applies, linecount] @ line}, "ApplyAt"]];
          If[linecount <= skips, startcount++; Continue[]];
          If[TrueQ[ignoreLine @ line], ignoredcount++; Continue[]];
          parsed = recordTransformer @ (fieldProcessor /@ (sp @ line));
          (* Apply field transformers by walking the transformer keys rather
             than every field; keys outside the row are silently skipped. *)
          If[ListQ[parsed],
            Do[
              If[IntegerQ[k] && 1 <= k <= Length[parsed],
                parsed[[k]] = Lookup[transformers, k] @ parsed[[k]]],
              {k, Keys[transformers]}]];
          Sow[parsed, "Data"];
          linesprocessed++;
          If[maxProcessed >= 0 && linesprocessed >= maxProcessed, Break[]]
        ],
        {"Data", "ApplyAt"}][[2]];
      Close[str];
      (* For each tag Reap yields {} when nothing was sown and {{e1, e2, ...}}
         otherwise; Join @@ turns both shapes into a flat list of entries. *)
      <|"Processed" -> linesprocessed,
        "LinesRead" -> linecount,
        "StartSkipped" -> startcount,
        "Ignored" -> ignoredcount,
        "ApplyAt" -> Join @@ reaped[[2]],
        "Data" -> Join @@ reaped[[1]]|>
    ]
  ]
End[] | |
EndPackage[] |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment