Created
December 7, 2018 19:22
-
-
Save reinvented/7e50b84cd85a03325bd090f6822b6ae9 to your computer and use it in GitHub Desktop.
A PHP script to convert text files of Prince Edward Island Hansard to a CSV
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?php

/**
 * Convert plain-text Prince Edward Island Hansard transcripts to a CSV file.
 *
 * Every "*.txt" file found under ./documents is scanned line by line. A line
 * that starts with a speaker label ("Mr. Smith:", "Chair (Name):", ...) opens
 * a new speech; the lines that follow are appended to that speech until the
 * next speaker label. One CSV row is written per speech with the columns
 * date, text, speaker and wordcount.
 */

/**
 * Try to recognise a speaker label at the start of a transcript line.
 *
 * @param string $text A single transcript line with trailing whitespace removed.
 *
 * @return array|null The preg_match() result (index 0 is the label including
 *                    the colon, index 1 is the speaker name), or null when the
 *                    line does not start with a speaker label.
 */
function matchHansardSpeaker($text)
{
    $patterns = [
        // One to four capitalised words followed by a colon, like "Mr. Smith:".
        '/^(([A-Z][a-zA-Z\.\-]+\s?\b){1,4}):/',
        '/^(Leader of the Opposition):/',
        '/^(Chair \(.*\)):/',
        '/^(Clerk Assistant \(.*\)):/',
    ];

    foreach ($patterns as $pattern) {
        if (preg_match($pattern, $text, $matches) === 1) {
            return $matches;
        }
    }

    return null;
}

/**
 * Group the lines of one transcript into speeches.
 *
 * Lines seen before the first speaker label, lines containing "HANSARD PEI",
 * lines that start with a digit and empty lines are discarded. A line that
 * looks like a speaker label but contains "Published by Order of the
 * Legislature" is discarded as well and does not start a new speech.
 *
 * @param array  $lines Transcript lines with trailing whitespace removed.
 * @param string $date  Value stored in the "date" column of every row.
 *
 * @return array List of speeches; each one is an array with the keys
 *               date, text, speaker and wordcount, in that order.
 */
function parseHansardLines(array $lines, $date)
{
    $speeches = [];
    // Index in $speeches of the speech currently being accumulated.
    $current = null;

    foreach ($lines as $text) {
        $matches = matchHansardSpeaker($text);

        if ($matches !== null) {
            if (strpos($text, 'Published by Order of the Legislature') !== false) {
                continue;
            }

            $speeches[] = [
                'date' => $date,
                // Cut the label off the front only. A str_replace() of
                // "label + space" would leave the label in place when nothing
                // follows the colon and would also delete later occurrences.
                'text' => ltrim((string) substr($text, strlen($matches[0]))),
                'speaker' => $matches[1],
            ];
            $current = count($speeches) - 1;
            continue;
        }

        if ($current === null || $text === '') {
            // Nothing to attach the line to yet, or nothing to attach.
            continue;
        }

        if (strpos($text, 'HANSARD PEI') !== false || preg_match('/^\d+/', $text) === 1) {
            continue;
        }

        $speeches[$current]['text'] .= ' ' . $text;
    }

    foreach ($speeches as $index => $speech) {
        $speeches[$index]['wordcount'] = str_word_count($speech['text']);
    }

    return $speeches;
}

/**
 * Convert every "*.txt" transcript below a directory into one CSV file.
 *
 * The date of a transcript is its base file name with "-hansard.txt" removed.
 * Transcripts that cannot be read are reported with a warning and skipped.
 *
 * @param string $documentsDir Directory searched (recursively, via find) for transcripts.
 * @param string $csvPath      Path of the CSV file to create or overwrite.
 *
 * @return void
 *
 * @throws RuntimeException When the CSV file cannot be opened for writing or
 *                          the file search cannot be started.
 */
function convertHansardDocuments($documentsDir = './documents', $csvPath = 'hansard.csv')
{
    $out = fopen($csvPath, 'w');
    if ($out === false) {
        throw new RuntimeException('Unable to open ' . $csvPath . ' for writing');
    }

    $files = popen('find ' . escapeshellarg($documentsDir) . ' -name "*.txt"', 'r');
    if ($files === false) {
        fclose($out);
        throw new RuntimeException('Unable to list transcripts in ' . $documentsDir);
    }

    try {
        // The header names all four columns that each row contains.
        fwrite($out, "date,text,speaker,wordcount\n");

        // Testing the fgets() result (rather than feof() up front) avoids
        // processing a phantom empty entry once the stream is exhausted.
        while (($found = fgets($files)) !== false) {
            $filename = rtrim($found);
            if ($filename === '') {
                continue;
            }

            $raw = is_readable($filename) ? file($filename) : false;
            if ($raw === false) {
                trigger_error('Skipping unreadable transcript: ' . $filename, E_USER_WARNING);
                continue;
            }

            $date = str_replace('-hansard.txt', '', basename($filename));
            $lines = array_map('rtrim', $raw);

            foreach (parseHansardLines($lines, $date) as $speech) {
                // Delimiter, enclosure and escape are spelled out with their
                // historical defaults so the output does not depend on them.
                fputcsv($out, array_values($speech), ',', '"', '\\');
            }
        }
    } finally {
        pclose($files);
        fclose($out);
    }
}

convertHansardDocuments();
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment