howellcc · November 9, 2016 22:00 · howellcc · Nov 9, 2016
diff --git a/ParseLargeXMLFiles.cfc b/ParseLargeXMLFiles.cfc
 component
 	output="false"
 	hint="I help to parse large XML files by matching patterns and then only parsing sub-nodes of the document." {


 	/*
 		I set up the parser and open the XML file for streaming.

 		Nodes       - List of node names to extract. Commas and
 		              spaces are both treated as list delimiters.
 		XmlFilePath - Path of the XML file to read.
 		BufferSize  - Maximum number of bytes pulled from the file
 		              per read. Defaults to 5 MB.

 		I return this component so the call can be chained.
 	*/
 	public any function Init(required string Nodes, required string XmlFilePath, numeric BufferSize=(1024*1024*5)) {

 		/*
 			A zero-length transfer buffer could never move any
 			data, so reject it here rather than failing later
 			inside GetNextNode().
 		*/
 		if (BufferSize LT 1){
 			throw(
 				type = "ParseLargeXMLFiles.InvalidBufferSize",
 				message = "BufferSize must be at least 1."
 				);
 		}

 		/*
 			Convert the node list into a regular expression
 			alternation (a|b|c).
 		*/
 		Nodes = ListChangeDelims(Nodes,"|",	", ");

 		/*
 			Define the pattern. The first alternative matches
 			self-closing nodes (the ">" must be preceded by "/").
 			The second alternative matches an opening tag, the
 			shortest possible run of content and the closing tag;
 			\2 refers to the node name captured by the second
 			group.
 		*/
 		var Pattern = (	"(?i)" & "<(#Nodes#)\b[^>]*(?<=/)>|" & "<(#Nodes#)\b[^>]*>[\w\W]*?</\2>");

 		/* Set up the instance variables. */
 		VARIABLES.Instance = {

 			/*
 				The compiled java.util.regex.Pattern. Compiling
 				it once lets every GetNextNode() call create a
 				Matcher from it.
 			*/
 			Pattern = CreateObject(
 				"java",
 				"java.util.regex.Pattern"
 				).Compile(
 					JavaCast( "string", Pattern )
 					),

 			/*
 				Text that has been read from the file but not yet
 				handed back as a node.
 			*/
 			DataBuffer = "",

 			/*
 				Reusable byte array that each file read fills. Its
 				length is the size of a single read.
 			*/
 			TransferBuffer = RepeatString( " ", BufferSize ).GetBytes(),

 			/* Placeholder until the stream is opened below. */
 			InputStream = "",

 			/*
 				True while there is no open stream to read from.
 				Starts out true so that nothing tries to read if
 				opening the file fails.
 			*/
 			IsClosed = true

 		};

 		/*
 			Open the file through a buffered input stream so that
 			it can be read in chunks rather than as a whole.
 		*/
 		VARIABLES.Instance.InputStream = CreateObject(
 			"java",
 			"java.io.BufferedInputStream"
 			).Init(
 				CreateObject(
 					"java",
 					"java.io.FileInputStream"
 					).Init(
 						JavaCast(
 							"string",
 							XmlFilePath
 							)
 						)
 				);

 		VARIABLES.Instance.IsClosed = false;

 		/* Return an initialized object. */
 		return THIS ;
 	}


 	/*
 		I close the file input stream. Calling me more than once
 		is harmless: the stream is only closed the first time.
 	*/
 	public void function Close(){

 		if (NOT VARIABLES.Instance.IsClosed){

 			VARIABLES.Instance.InputStream.Close();
 			VARIABLES.Instance.IsClosed = true;

 		}

 		return;
 	}


 	/*
 		I return the root element of the next matching node in the
 		XML document. If no further node can be found, I close the
 		stream and return VOID. Calling me again after that keeps
 		returning VOID.
 	*/
 	public any function GetNextNode() {

 		var Arrays = CreateObject( "java", "java.util.Arrays" );
 		var Matcher = "";
 		var XMLData = "";
 		var CharsToLeave = 0;
 		var BytesRead = 0;

 		/*
 			Keep pulling chunks of the file into the data buffer
 			until either a node matches or the file runs out. A
 			loop is used instead of recursion so that many reads
 			in a row cannot exhaust the call stack.
 		*/
 		while (true){

 			/* Create a matcher for the current buffer. */
 			Matcher = VARIABLES.Instance.Pattern.Matcher(
 				JavaCast( "string", VARIABLES.Instance.DataBuffer )
 				);

 			if (Matcher.Find()){

 				/* Pull out the matching XML. */
 				XMLData = Matcher.Group();

 				/*
 					Everything up to the end of the match is
 					consumed; only what follows stays buffered.
 				*/
 				CharsToLeave = (
 					Len( VARIABLES.Instance.DataBuffer ) -
 					Matcher.End()
 					);

 				if (CharsToLeave){

 					VARIABLES.Instance.DataBuffer = Right(
 						VARIABLES.Instance.DataBuffer,
 						CharsToLeave
 						);

 				}else{

 					/* Right() cannot return zero characters. */
 					VARIABLES.Instance.DataBuffer = "" ;

 				}

 				/* Parse the fragment and return its root element. */
 				return XmlParse( Trim( XMLData ) ).XmlRoot ;

 			}

 			/*
 				Nothing matched. If the stream is already closed
 				there is no more data to wait for.
 			*/
 			if (VARIABLES.Instance.IsClosed){
 				return;
 			}

 			/* Read the next chunk into the transfer buffer. */
 			BytesRead = VARIABLES.Instance.InputStream.Read(
 				VARIABLES.Instance.TransferBuffer,
 				JavaCast( "int", 0 ),
 				JavaCast( "int", ArrayLen( VARIABLES.Instance.TransferBuffer ) )
 				) ;

 			/* -1 signals the end of the file. */
 			if (BytesRead EQ -1){

 				/* Release the file input stream. */
 				THIS.Close();

 				/* No more data to be matched. */
 				return;

 			}

 			/*
 				Decode only the bytes that this read produced. The
 				transfer buffer is reused, so anything past
 				BytesRead is left over from an earlier read.
 				Slicing the bytes before decoding (rather than
 				trimming the decoded text by a byte count) keeps
 				that stale data out even when characters occupy
 				more than one byte.
 			*/
 			VARIABLES.Instance.DataBuffer &= ToString(
 				Arrays.copyOfRange(
 					VARIABLES.Instance.TransferBuffer,
 					JavaCast( "int", 0 ),
 					JavaCast( "int", BytesRead )
 					)
 				);

 		}
 	}

 }
	component
		output="false"
		hint="I help to parse large XML files by matching patterns and then only parsing sub-nodes of the document." {


		/*
			I set up the parser and open the XML file for streaming.

			Nodes       - List of node names to extract. Commas and
			              spaces are both treated as list delimiters.
			XmlFilePath - Path of the XML file to read.
			BufferSize  - Maximum number of bytes pulled from the file
			              per read. Defaults to 5 MB.

			I return this component so the call can be chained.
		*/
		public any function Init(required string Nodes, required string XmlFilePath, numeric BufferSize=(1024*1024*5)) {

			/*
				A zero-length transfer buffer could never move any
				data, so reject it here rather than failing later
				inside GetNextNode().
			*/
			if (BufferSize LT 1){
				throw(
					type = "ParseLargeXMLFiles.InvalidBufferSize",
					message = "BufferSize must be at least 1."
					);
			}

			/*
				Convert the node list into a regular expression
				alternation (a|b|c). The delimiter is a bare pipe:
				CFML strings have no backslash escape, so "\|" would
				put a literal backslash into the pattern.
			*/
			Nodes = ListChangeDelims(Nodes,"|", ", ");

			/*
				Define the pattern. The first alternative matches
				self-closing nodes (the ">" must be preceded by "/").
				The second alternative matches an opening tag, the
				shortest possible run of content and the closing tag;
				\2 refers to the node name captured by the second
				group. The "*" after each [^>] lets a tag carry any
				number of attribute characters.
			*/
			var Pattern = ( "(?i)" & "<(#Nodes#)\b[^>]*(?<=/)>|" & "<(#Nodes#)\b[^>]*>[\w\W]*?</\2>");

			/* Set up the instance variables. */
			VARIABLES.Instance = {

				/*
					The compiled java.util.regex.Pattern. Compiling
					it once lets every GetNextNode() call create a
					Matcher from it.
				*/
				Pattern = CreateObject(
					"java",
					"java.util.regex.Pattern"
					).Compile(
						JavaCast( "string", Pattern )
						),

				/*
					Text that has been read from the file but not yet
					handed back as a node.
				*/
				DataBuffer = "",

				/*
					Reusable byte array that each file read fills. Its
					length is the size of a single read.
				*/
				TransferBuffer = RepeatString( " ", BufferSize ).GetBytes(),

				/* Placeholder until the stream is opened below. */
				InputStream = "",

				/*
					True while there is no open stream to read from.
					Starts out true so that nothing tries to read if
					opening the file fails.
				*/
				IsClosed = true

			};

			/*
				Open the file through a buffered input stream so that
				it can be read in chunks rather than as a whole.
			*/
			VARIABLES.Instance.InputStream = CreateObject(
				"java",
				"java.io.BufferedInputStream"
				).Init(
					CreateObject(
						"java",
						"java.io.FileInputStream"
						).Init(
							JavaCast(
								"string",
								XmlFilePath
								)
							)
					);

			VARIABLES.Instance.IsClosed = false;

			/* Return an initialized object. */
			return THIS ;
		}


		/*
			I close the file input stream. Calling me more than once
			is harmless: the stream is only closed the first time.
		*/
		public void function Close(){

			if (NOT VARIABLES.Instance.IsClosed){

				VARIABLES.Instance.InputStream.Close();
				VARIABLES.Instance.IsClosed = true;

			}

			return;
		}


		/*
			I return the root element of the next matching node in the
			XML document. If no further node can be found, I close the
			stream and return VOID. Calling me again after that keeps
			returning VOID.
		*/
		public any function GetNextNode() {

			var Arrays = CreateObject( "java", "java.util.Arrays" );
			var Matcher = "";
			var XMLData = "";
			var CharsToLeave = 0;
			var BytesRead = 0;

			/*
				Keep pulling chunks of the file into the data buffer
				until either a node matches or the file runs out. A
				loop is used instead of recursion so that many reads
				in a row cannot exhaust the call stack.
			*/
			while (true){

				/* Create a matcher for the current buffer. */
				Matcher = VARIABLES.Instance.Pattern.Matcher(
					JavaCast( "string", VARIABLES.Instance.DataBuffer )
					);

				if (Matcher.Find()){

					/* Pull out the matching XML. */
					XMLData = Matcher.Group();

					/*
						Everything up to the end of the match is
						consumed; only what follows stays buffered.
					*/
					CharsToLeave = (
						Len( VARIABLES.Instance.DataBuffer ) -
						Matcher.End()
						);

					if (CharsToLeave){

						VARIABLES.Instance.DataBuffer = Right(
							VARIABLES.Instance.DataBuffer,
							CharsToLeave
							);

					}else{

						/* Right() cannot return zero characters. */
						VARIABLES.Instance.DataBuffer = "" ;

					}

					/* Parse the fragment and return its root element. */
					return XmlParse( Trim( XMLData ) ).XmlRoot ;

				}

				/*
					Nothing matched. If the stream is already closed
					there is no more data to wait for.
				*/
				if (VARIABLES.Instance.IsClosed){
					return;
				}

				/* Read the next chunk into the transfer buffer. */
				BytesRead = VARIABLES.Instance.InputStream.Read(
					VARIABLES.Instance.TransferBuffer,
					JavaCast( "int", 0 ),
					JavaCast( "int", ArrayLen( VARIABLES.Instance.TransferBuffer ) )
					) ;

				/* -1 signals the end of the file. */
				if (BytesRead EQ -1){

					/* Release the file input stream. */
					THIS.Close();

					/* No more data to be matched. */
					return;

				}

				/*
					Decode only the bytes that this read produced. The
					transfer buffer is reused, so anything past
					BytesRead is left over from an earlier read.
					Slicing the bytes before decoding (rather than
					trimming the decoded text by a byte count) keeps
					that stale data out even when characters occupy
					more than one byte.
				*/
				VARIABLES.Instance.DataBuffer &= ToString(
					Arrays.copyOfRange(
						VARIABLES.Instance.TransferBuffer,
						JavaCast( "int", 0 ),
						JavaCast( "int", BytesRead )
						)
					);

			}
		}

	}