nicholasdunbar · August 29, 2015 14:05
diff --git a/parseApproxColumns.php b/parseApproxColumns.php
 <?php
 //To parse the following unevenly categorized columns:
 /*
 * col1          col2     col3   
 * ====          ====     ====      
 * 1 a b c d e  103       14 as d9   
 * 2   a     103       14 as d9          
 * 3   a      103     14 as d9
 *
 * Into structured data like this:
 *
 * col1         col2  col3
 * ====         ====  ====
 * 1 a b c d e  103   14 as d9
 * 2   a        103   14 as d9
 * 3   a        103   14 as d9
 */

 $headings = array('col1','col2','col3');
 $header = "col1         col2  col3";
 //get the $heading_pos_list by parsing the headings of each column with 
 list($heading_pos_list, $lengths) = parse_heading($headings, $header);
 //Parse each line into a row structure
 $line = '1 a b c d e  103       14 as d9';
 $row = parse_line_approx($line, $heading_pos_list);
 //this works for each column and line
 echo $row('col1');
 //output: 
 //1 a b c d e


 /************* Functions Below **************/
 /**
 * parse_heading
 * 
 * Get the position and length of each column name in $heading
 * if a column name is not in $line then remove it from $heading
 * 
 * @param array $heading 
 * List of column names to get info about from $line
 * Ex:  array("Logical Drive Name", "LUN", "Controller", "Accessible by", "Logical Drive status");
 * 
 * @param string $line 
 * The header for the columns
 * Ex:  "Logical Drive Name    LUN  Controller  Accessible by         Logical Drive status"
 * 
 * @param boolean $is_logging
 * Turn off log messages when a header is not found
 *
 * @return array $return_val
 * Multidimentional array containing the start position of each column and length
 * $return_val[0] = multi-element associative array  
 * Ex:  $return_val[0]['Controller']
 *      $return_val[0]['Accessible by']
 * $return_val[0] contains the starting position for each header
 * $heading_key is the column name that has been like 'CONTROLLER' above
 * Ex:  $return_val[0][$heading_key] = starting position of heading
 * $return_val[1] contains the length of each header
 * Ex:  $return_val[1][$heading_key] = length of heading including white space after it
 * 
 */
 function parse_heading(&$heading, $line, $is_logging = TRUE) {

   //search $line to find each start position of $heading
   $start_pos=array();
   $lengthOf=array();
   $num_headings = count($heading);
   
   for ($i = 0; $i < $num_headings; $i++){
        $column_name = $heading[$i];
        $matches = array();
        $start = strpos($line, $column_name);
        if (preg_match("/^.*?$column_name\s*/", $line, $matches)){
            $end = strlen($matches[0]);
            if($start !== FALSE) {
               $start_pos[$column_name] = $start; 
               $lengthOf[$column_name] = $end-$start;
            }
        } else {
            $heading[$i] = NULL;
        if ($is_logging){
                echo "Alert: Heading '$column_name' was not found.\n";
        }
        }
   }
   
   //filter the elements of the array using the function strlen, 
   //if strlen returns 0 then the element is removed
   $heading = array_filter($heading, 'strlen');
   
   //return start positions and lengths 
   return(array($start_pos, $lengthOf));
 }


 /**
 * parse_line_approx
 * 
 * For parsing columns with raged edges. Pull the data out of $line and 
 * put it into an associative array of fields and keys found in $heading 
 * Ex: 
 * Array(0=>Array('key'=>0),1=>Array('key'=>3)).     
 * 
 * If the positions are close, but off by a few characters, then adjust  
 * the start and end to the real edge of the column. This function is an 
 * improvement on parse_line because it deals with ragged edges of columns.
 * 
 * Example Usage:
 * Say we have the following data:
 * col1          col2     col3   
 * ====          ====     ====      
 * 1 a b c d e  103       14 as d9   
 * 2   a     103       14 as d9          
 * 3   a      103     14 as d9
 *
 * If we want to transform the data into this
 * col1         col2  col3
 * ====         ====  ====
 * 1 a b c d e  103   14 as d9
 * 2   a        103   14 as d9
 * 3   a        103   14 as d9
 * 
 * We would do the following:
 * 1.) get the $heading_pos_list by parsing the headings of each column with 
 * list($heading_pos_list,$lengths) = parse_heading(array('col1','col2','col3'),"col1         col2  col3")
 * 2.) Parse each line into a row structure
 * $row = parse_line_approx($line, $heading_pos_list) 
 * 3.) echo "$row('col1')" etc. etc. for each column and line
 * 4.) format it into columns
 * 
 * @param string $line
 * The input to be processed
 * 
 * @param array $heading_pos_list 
 * The return value you get from list($heading_pos_list,$lengths)=parse_heading(...) 
 * used to determine where columns start ($return_value['some_key']) in a line
 *
 * @return array
 * An array of keys and coresponding values like 
 * array('some_key1'=>'column_value1','some_key2'=>'column_value2','etc'=>'etc')
 */
 function parse_line_approx($line, $heading_pos_list) {

    //init vars
    //return variable
       $row = array();
    //list of whitespaces and values in $line
    $chunks_array = array();
    //list of positions of chunks in string $line
    $chunks_pos_array = array();
    //list of distances of each chunk from every heading
    $chunks_dist_array = array();
    //chunks as they are matched via preg_match_all
    $matches = array();

    //get chunks
    preg_match_all("/\s+|[^\s]+/",$line,$matches,PREG_OFFSET_CAPTURE);
    while ( $el = array_shift($matches[0]) ){
        $chunks_array[] = $el[0];
        $chunks_pos_array[] = $el[1];
    }

    //calculate distances between chunks and each column heading
    $counter = 0;
    $chunks_dist_array = array();
    foreach ($heading_pos_list as $heading=>$start_val){
        
        //begining of column
        $start = $start_val;
        
        foreach ($chunks_pos_array as $index=>$chunk_pos){
            $distance = abs($start-$chunk_pos);
            $chunks_dist_array[$counter][] = array( "heading"=>$heading,
                                "distance"=>$distance,
                                "chunk_index"=>$index,
                                "heading_pos"=>$start,
                                "chunk_pos"=>$chunk_pos,
                                "chunk_val"=>$chunks_array[$index] );
        }
        $counter++;
    }

    //categorize into each heading by whichever is closest to the heading
    $processed_indexes = array();
    foreach ($chunks_dist_array as $index=>$info){
        if (!isset($chunks_dist_array[$index+1])){
            //add remaining to the last column
            for ($i=0;$i<$len;$i++){
                if (!in_array($info[$i]['chunk_index'],$processed_indexes)){
                    if (!isset($row[$info[$i]['heading']])){
                        $row[$info[$i]['heading']] = "";
                    }
                    $row[$info[$i]['heading']] .= $info[$i]['chunk_val'];
                    $processed_indexes[] = $info[$i]['chunk_index'];
                }
            }
            break;
        } 
        $next_info = $chunks_dist_array[$index+1];
        $len = count($info);
        $next_heading_pos = $next_info[0]['heading_pos'];
        $last_proc_index = 0;
        for ($i=0;$i<$len;$i++){
            //categorize into each heading by whichever is closest to the heading
            /*
            col1          col2     col3           col1          col2     col3 
            ================================      ================================
            a b c d 103         14 as d9      =>  a b c         103      14 as d9
            */
            if ($info[$i]['distance'] < $next_info[$i]['distance']){
                if (!in_array($info[$i]['chunk_index'],$processed_indexes)){
                    if (!isset($row[$info[$i]['heading']])){
                        $row[$info[$i]['heading']] = "";
                    }
                    $row[$info[$i]['heading']] .= $info[$i]['chunk_val'];
                    $processed_indexes[] = $info[$i]['chunk_index'];
                    $last_proc_index = $i;
                }
            } else if ($next_info[$i]['chunk_pos'] < $next_heading_pos ) {
                //columns are not always seperated by which chunk is closer to a column
                //take into account cases (like col1) with wide columns full of spaces:
                /*
                col1          col2     col3           col1          col2     col3 
                ================================      ================================
                a b c d e  103       14 as d9     =>  a b c d e     103      14 as d9
                */

                //where the start of a col2 is demarcated by a larger space:
                if ( preg_match("/^\s{2,}|\t$/",$info[$i]['chunk_val']) ){
                    //if there is a big space then break into previous heading
                    for ($j=$last_proc_index+1;$j<=$i;$j++){
                        $row[$info[$j]['heading']] .= $info[$j]['chunk_val'];
                        $processed_indexes[] = $info[$j]['chunk_index'];
                    }
                    //found a logical break, process next column
                    break 1;
                } 

            } else {
                //too far away to categorize in this heading. Move to next heading
                break 1;
            }
        }
    }

    //clean leading and trailing whitespace
    foreach ($row as $heading=>$col_val){
        $row[$heading] = trim($col_val);
    }

    return $row;

 }
	<?php
	//To parse the following unevenly categorized columns:
	/*
	* col1 col2 col3
	* ==== ==== ====
	* 1 a b c d e 103 14 as d9
	* 2 a 103 14 as d9
	* 3 a 103 14 as d9
	*
	* Into structured data like this:
	*
	* col1 col2 col3
	* ==== ==== ====
	* 1 a b c d e 103 14 as d9
	* 2 a 103 14 as d9
	* 3 a 103 14 as d9
	*/

	$headings = array('col1','col2','col3');
	$header = "col1 col2 col3";
	//get the $heading_pos_list by parsing the headings of each column with
	list($heading_pos_list, $lengths) = parse_heading($headings, $header);
	//Parse each line into a row structure
	$line = '1 a b c d e 103 14 as d9';
	$row = parse_line_approx($line, $heading_pos_list);
	//this works for each column and line
	echo $row('col1');
	//output:
	//1 a b c d e


	/*********** Functions Below ************/
	/**
	* parse_heading
	*
	* Get the position and length of each column name in $heading
	* if a column name is not in $line then remove it from $heading
	*
	* @param array $heading
	* List of column names to get info about from $line
	* Ex: array("Logical Drive Name", "LUN", "Controller", "Accessible by", "Logical Drive status");
	*
	* @param string $line
	* The header for the columns
	* Ex: "Logical Drive Name LUN Controller Accessible by Logical Drive status"
	*
	* @param boolean $is_logging
	* Turn off log messages when a header is not found
	*
	* @return array $return_val
	* Multidimentional array containing the start position of each column and length
	* $return_val[0] = multi-element associative array
	* Ex: $return_val[0]['Controller']
	* $return_val[0]['Accessible by']
	* $return_val[0] contains the starting position for each header
	* $heading_key is the column name that has been like 'CONTROLLER' above
	* Ex: $return_val[0][$heading_key] = starting position of heading
	* $return_val[1] contains the length of each header
	* Ex: $return_val[1][$heading_key] = length of heading including white space after it
	*
	*/
	function parse_heading(&$heading, $line, $is_logging = TRUE) {

	//search $line to find each start position of $heading
	$start_pos=array();
	$lengthOf=array();
	$num_headings = count($heading);

	for ($i = 0; $i < $num_headings; $i++){
	$column_name = $heading[$i];
	$matches = array();
	$start = strpos($line, $column_name);
	if (preg_match("/^.?$column_name\s/", $line, $matches)){
	$end = strlen($matches[0]);
	if($start !== FALSE) {
	$start_pos[$column_name] = $start;
	$lengthOf[$column_name] = $end-$start;
	}
	} else {
	$heading[$i] = NULL;
	if ($is_logging){
	echo "Alert: Heading '$column_name' was not found.\n";
	}
	}
	}

	//filter the elements of the array using the function strlen,
	//if strlen returns 0 then the element is removed
	$heading = array_filter($heading, 'strlen');

	//return start positions and lengths
	return(array($start_pos, $lengthOf));
	}


	/**
	* parse_line_approx
	*
	* For parsing columns with raged edges. Pull the data out of $line and
	* put it into an associative array of fields and keys found in $heading
	* Ex:
	* Array(0=>Array('key'=>0),1=>Array('key'=>3)).
	*
	* If the positions are close, but off by a few characters, then adjust
	* the start and end to the real edge of the column. This function is an
	* improvement on parse_line because it deals with ragged edges of columns.
	*
	* Example Usage:
	* Say we have the following data:
	* col1 col2 col3
	* ==== ==== ====
	* 1 a b c d e 103 14 as d9
	* 2 a 103 14 as d9
	* 3 a 103 14 as d9
	*
	* If we want to transform the data into this
	* col1 col2 col3
	* ==== ==== ====
	* 1 a b c d e 103 14 as d9
	* 2 a 103 14 as d9
	* 3 a 103 14 as d9
	*
	* We would do the following:
	* 1.) get the $heading_pos_list by parsing the headings of each column with
	* list($heading_pos_list,$lengths) = parse_heading(array('col1','col2','col3'),"col1 col2 col3")
	* 2.) Parse each line into a row structure
	* $row = parse_line_approx($line, $heading_pos_list)
	* 3.) echo "$row('col1')" etc. etc. for each column and line
	* 4.) format it into columns
	*
	* @param string $line
	* The input to be processed
	*
	* @param array $heading_pos_list
	* The return value you get from list($heading_pos_list,$lengths)=parse_heading(...)
	* used to determine where columns start ($return_value['some_key']) in a line
	*
	* @return array
	* An array of keys and coresponding values like
	* array('some_key1'=>'column_value1','some_key2'=>'column_value2','etc'=>'etc')
	*/
	function parse_line_approx($line, $heading_pos_list) {

	//init vars
	//return variable
	$row = array();
	//list of whitespaces and values in $line
	$chunks_array = array();
	//list of positions of chunks in string $line
	$chunks_pos_array = array();
	//list of distances of each chunk from every heading
	$chunks_dist_array = array();
	//chunks as they are matched via preg_match_all
	$matches = array();

	//get chunks
	preg_match_all("/\s+\|[^\s]+/",$line,$matches,PREG_OFFSET_CAPTURE);
	while ( $el = array_shift($matches[0]) ){
	$chunks_array[] = $el[0];
	$chunks_pos_array[] = $el[1];
	}

	//calculate distances between chunks and each column heading
	$counter = 0;
	$chunks_dist_array = array();
	foreach ($heading_pos_list as $heading=>$start_val){

	//begining of column
	$start = $start_val;

	foreach ($chunks_pos_array as $index=>$chunk_pos){
	$distance = abs($start-$chunk_pos);
	$chunks_dist_array[$counter][] = array( "heading"=>$heading,
	"distance"=>$distance,
	"chunk_index"=>$index,
	"heading_pos"=>$start,
	"chunk_pos"=>$chunk_pos,
	"chunk_val"=>$chunks_array[$index] );
	}
	$counter++;
	}

	//categorize into each heading by whichever is closest to the heading
	$processed_indexes = array();
	foreach ($chunks_dist_array as $index=>$info){
	if (!isset($chunks_dist_array[$index+1])){
	//add remaining to the last column
	for ($i=0;$i<$len;$i++){
	if (!in_array($info[$i]['chunk_index'],$processed_indexes)){
	if (!isset($row[$info[$i]['heading']])){
	$row[$info[$i]['heading']] = "";
	}
	$row[$info[$i]['heading']] .= $info[$i]['chunk_val'];
	$processed_indexes[] = $info[$i]['chunk_index'];
	}
	}
	break;
	}
	$next_info = $chunks_dist_array[$index+1];
	$len = count($info);
	$next_heading_pos = $next_info[0]['heading_pos'];
	$last_proc_index = 0;
	for ($i=0;$i<$len;$i++){
	//categorize into each heading by whichever is closest to the heading
	/*
	col1 col2 col3 col1 col2 col3
	================================ ================================
	a b c d 103 14 as d9 => a b c 103 14 as d9
	*/
	if ($info[$i]['distance'] < $next_info[$i]['distance']){
	if (!in_array($info[$i]['chunk_index'],$processed_indexes)){
	if (!isset($row[$info[$i]['heading']])){
	$row[$info[$i]['heading']] = "";
	}
	$row[$info[$i]['heading']] .= $info[$i]['chunk_val'];
	$processed_indexes[] = $info[$i]['chunk_index'];
	$last_proc_index = $i;
	}
	} else if ($next_info[$i]['chunk_pos'] < $next_heading_pos ) {
	//columns are not always seperated by which chunk is closer to a column
	//take into account cases (like col1) with wide columns full of spaces:
	/*
	col1 col2 col3 col1 col2 col3
	================================ ================================
	a b c d e 103 14 as d9 => a b c d e 103 14 as d9
	*/

	//where the start of a col2 is demarcated by a larger space:
	if ( preg_match("/^\s{2,}\|\t$/",$info[$i]['chunk_val']) ){
	//if there is a big space then break into previous heading
	for ($j=$last_proc_index+1;$j<=$i;$j++){
	$row[$info[$j]['heading']] .= $info[$j]['chunk_val'];
	$processed_indexes[] = $info[$j]['chunk_index'];
	}
	//found a logical break, process next column
	break 1;
	}

	} else {
	//too far away to categorize in this heading. Move to next heading
	break 1;
	}
	}
	}

	//clean leading and trailing whitespace
	foreach ($row as $heading=>$col_val){
	$row[$heading] = trim($col_val);
	}

	return $row;

	}