Last active
August 29, 2015 14:05
-
-
Save nicholasdunbar/9c5388d243bd4dff48f2 to your computer and use it in GitHub Desktop.
For parsing columns from plain text with raged or unevenly spaced edges.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?php | |
//To parse the following unevenly categorized columns: | |
/* | |
* col1 col2 col3 | |
* ==== ==== ==== | |
* 1 a b c d e 103 14 as d9 | |
* 2 a 103 14 as d9 | |
* 3 a 103 14 as d9 | |
* | |
* Into structured data like this: | |
* | |
* col1 col2 col3 | |
* ==== ==== ==== | |
* 1 a b c d e 103 14 as d9 | |
* 2 a 103 14 as d9 | |
* 3 a 103 14 as d9 | |
*/ | |
$headings = array('col1','col2','col3'); | |
$header = "col1 col2 col3"; | |
//get the $heading_pos_list by parsing the headings of each column with | |
list($heading_pos_list, $lengths) = parse_heading($headings, $header); | |
//Parse each line into a row structure | |
$line = '1 a b c d e 103 14 as d9'; | |
$row = parse_line_approx($line, $heading_pos_list); | |
//this works for each column and line | |
echo $row('col1'); | |
//output: | |
//1 a b c d e | |
/************* Functions Below **************/ | |
/** | |
* parse_heading | |
* | |
* Get the position and length of each column name in $heading | |
* if a column name is not in $line then remove it from $heading | |
* | |
* @param array $heading | |
* List of column names to get info about from $line | |
* Ex: array("Logical Drive Name", "LUN", "Controller", "Accessible by", "Logical Drive status"); | |
* | |
* @param string $line | |
* The header for the columns | |
* Ex: "Logical Drive Name LUN Controller Accessible by Logical Drive status" | |
* | |
* @param boolean $is_logging | |
* Turn off log messages when a header is not found | |
* | |
* @return array $return_val | |
* Multidimentional array containing the start position of each column and length | |
* $return_val[0] = multi-element associative array | |
* Ex: $return_val[0]['Controller'] | |
* $return_val[0]['Accessible by'] | |
* $return_val[0] contains the starting position for each header | |
* $heading_key is the column name that has been like 'CONTROLLER' above | |
* Ex: $return_val[0][$heading_key] = starting position of heading | |
* $return_val[1] contains the length of each header | |
* Ex: $return_val[1][$heading_key] = length of heading including white space after it | |
* | |
*/ | |
function parse_heading(&$heading, $line, $is_logging = TRUE) { | |
//search $line to find each start position of $heading | |
$start_pos=array(); | |
$lengthOf=array(); | |
$num_headings = count($heading); | |
for ($i = 0; $i < $num_headings; $i++){ | |
$column_name = $heading[$i]; | |
$matches = array(); | |
$start = strpos($line, $column_name); | |
if (preg_match("/^.*?$column_name\s*/", $line, $matches)){ | |
$end = strlen($matches[0]); | |
if($start !== FALSE) { | |
$start_pos[$column_name] = $start; | |
$lengthOf[$column_name] = $end-$start; | |
} | |
} else { | |
$heading[$i] = NULL; | |
if ($is_logging){ | |
echo "Alert: Heading '$column_name' was not found.\n"; | |
} | |
} | |
} | |
//filter the elements of the array using the function strlen, | |
//if strlen returns 0 then the element is removed | |
$heading = array_filter($heading, 'strlen'); | |
//return start positions and lengths | |
return(array($start_pos, $lengthOf)); | |
} | |
/** | |
* parse_line_approx | |
* | |
* For parsing columns with raged edges. Pull the data out of $line and | |
* put it into an associative array of fields and keys found in $heading | |
* Ex: | |
* Array(0=>Array('key'=>0),1=>Array('key'=>3)). | |
* | |
* If the positions are close, but off by a few characters, then adjust | |
* the start and end to the real edge of the column. This function is an | |
* improvement on parse_line because it deals with ragged edges of columns. | |
* | |
* Example Usage: | |
* Say we have the following data: | |
* col1 col2 col3 | |
* ==== ==== ==== | |
* 1 a b c d e 103 14 as d9 | |
* 2 a 103 14 as d9 | |
* 3 a 103 14 as d9 | |
* | |
* If we want to transform the data into this | |
* col1 col2 col3 | |
* ==== ==== ==== | |
* 1 a b c d e 103 14 as d9 | |
* 2 a 103 14 as d9 | |
* 3 a 103 14 as d9 | |
* | |
* We would do the following: | |
* 1.) get the $heading_pos_list by parsing the headings of each column with | |
* list($heading_pos_list,$lengths) = parse_heading(array('col1','col2','col3'),"col1 col2 col3") | |
* 2.) Parse each line into a row structure | |
* $row = parse_line_approx($line, $heading_pos_list) | |
* 3.) echo "$row('col1')" etc. etc. for each column and line | |
* 4.) format it into columns | |
* | |
* @param string $line | |
* The input to be processed | |
* | |
* @param array $heading_pos_list | |
* The return value you get from list($heading_pos_list,$lengths)=parse_heading(...) | |
* used to determine where columns start ($return_value['some_key']) in a line | |
* | |
* @return array | |
* An array of keys and coresponding values like | |
* array('some_key1'=>'column_value1','some_key2'=>'column_value2','etc'=>'etc') | |
*/ | |
function parse_line_approx($line, $heading_pos_list) { | |
//init vars | |
//return variable | |
$row = array(); | |
//list of whitespaces and values in $line | |
$chunks_array = array(); | |
//list of positions of chunks in string $line | |
$chunks_pos_array = array(); | |
//list of distances of each chunk from every heading | |
$chunks_dist_array = array(); | |
//chunks as they are matched via preg_match_all | |
$matches = array(); | |
//get chunks | |
preg_match_all("/\s+|[^\s]+/",$line,$matches,PREG_OFFSET_CAPTURE); | |
while ( $el = array_shift($matches[0]) ){ | |
$chunks_array[] = $el[0]; | |
$chunks_pos_array[] = $el[1]; | |
} | |
//calculate distances between chunks and each column heading | |
$counter = 0; | |
$chunks_dist_array = array(); | |
foreach ($heading_pos_list as $heading=>$start_val){ | |
//begining of column | |
$start = $start_val; | |
foreach ($chunks_pos_array as $index=>$chunk_pos){ | |
$distance = abs($start-$chunk_pos); | |
$chunks_dist_array[$counter][] = array( "heading"=>$heading, | |
"distance"=>$distance, | |
"chunk_index"=>$index, | |
"heading_pos"=>$start, | |
"chunk_pos"=>$chunk_pos, | |
"chunk_val"=>$chunks_array[$index] ); | |
} | |
$counter++; | |
} | |
//categorize into each heading by whichever is closest to the heading | |
$processed_indexes = array(); | |
foreach ($chunks_dist_array as $index=>$info){ | |
if (!isset($chunks_dist_array[$index+1])){ | |
//add remaining to the last column | |
for ($i=0;$i<$len;$i++){ | |
if (!in_array($info[$i]['chunk_index'],$processed_indexes)){ | |
if (!isset($row[$info[$i]['heading']])){ | |
$row[$info[$i]['heading']] = ""; | |
} | |
$row[$info[$i]['heading']] .= $info[$i]['chunk_val']; | |
$processed_indexes[] = $info[$i]['chunk_index']; | |
} | |
} | |
break; | |
} | |
$next_info = $chunks_dist_array[$index+1]; | |
$len = count($info); | |
$next_heading_pos = $next_info[0]['heading_pos']; | |
$last_proc_index = 0; | |
for ($i=0;$i<$len;$i++){ | |
//categorize into each heading by whichever is closest to the heading | |
/* | |
col1 col2 col3 col1 col2 col3 | |
================================ ================================ | |
a b c d 103 14 as d9 => a b c 103 14 as d9 | |
*/ | |
if ($info[$i]['distance'] < $next_info[$i]['distance']){ | |
if (!in_array($info[$i]['chunk_index'],$processed_indexes)){ | |
if (!isset($row[$info[$i]['heading']])){ | |
$row[$info[$i]['heading']] = ""; | |
} | |
$row[$info[$i]['heading']] .= $info[$i]['chunk_val']; | |
$processed_indexes[] = $info[$i]['chunk_index']; | |
$last_proc_index = $i; | |
} | |
} else if ($next_info[$i]['chunk_pos'] < $next_heading_pos ) { | |
//columns are not always seperated by which chunk is closer to a column | |
//take into account cases (like col1) with wide columns full of spaces: | |
/* | |
col1 col2 col3 col1 col2 col3 | |
================================ ================================ | |
a b c d e 103 14 as d9 => a b c d e 103 14 as d9 | |
*/ | |
//where the start of a col2 is demarcated by a larger space: | |
if ( preg_match("/^\s{2,}|\t$/",$info[$i]['chunk_val']) ){ | |
//if there is a big space then break into previous heading | |
for ($j=$last_proc_index+1;$j<=$i;$j++){ | |
$row[$info[$j]['heading']] .= $info[$j]['chunk_val']; | |
$processed_indexes[] = $info[$j]['chunk_index']; | |
} | |
//found a logical break, process next column | |
break 1; | |
} | |
} else { | |
//too far away to categorize in this heading. Move to next heading | |
break 1; | |
} | |
} | |
} | |
//clean leading and trailing whitespace | |
foreach ($row as $heading=>$col_val){ | |
$row[$heading] = trim($col_val); | |
} | |
return $row; | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment