Skip to content

Instantly share code, notes, and snippets.

@peter-mcconnell
Last active August 29, 2015 13:57
Show Gist options
  • Save peter-mcconnell/9896870 to your computer and use it in GitHub Desktop.
Save peter-mcconnell/9896870 to your computer and use it in GitHub Desktop.
ngram
<?php
/**
* Simple Ngram word extraction from phrases
* For large dictionaries it is advisable to run this from the terminal:
* php ngram.php
*
* @author Peter McConnell
**/
# Lift PHP's memory limit (-1 means "no limit"): the script below reads the
# whole dictionary into memory and keeps every generated n-gram in $grams.
ini_set('memory_limit', '-1');
/**
 * Word-level n-gram extraction helper.
 */
class Ngram
{
    /**
     * Returns the window of up to $gram consecutive words around $position.
     *
     * Odd widths are centred on $position; even widths start one word before
     * it. A window that would run past either end of the phrase is shifted
     * back inside it, and a phrase shorter than $gram is returned whole.
     *
     * @param int   $position  Zero-based index of the word the window is built around.
     * @param int   $max_words Index of the last usable word (normally count($tokens) - 1).
     * @param int   $gram      Number of words wanted in the window.
     * @param array $tokens    Zero-indexed list of the phrase's words.
     *
     * @return array The selected words in phrase order, re-indexed from 0;
     *               empty when $gram < 1 or there are no usable words.
     */
    public static function getNgram($position, $max_words, $gram, $tokens)
    {
        $position = (int) $position;
        $gram = (int) $gram;

        // Never trust $max_words beyond the real end of the list: reading past
        // it would raise "undefined offset" and put null entries in the result.
        $lastIndex = min((int) $max_words, count($tokens) - 1);
        if ($gram < 1 || $lastIndex < 0) {
            return array();
        }

        // Even widths cannot be centred, so they begin one word to the left.
        if ($gram % 2 === 0) {
            $start = $position - 1;
        } else {
            $start = $position - (int) floor($gram / 2);
        }

        // Window overruns the right edge: pull it back so it ends on the last word.
        if ($start + $gram > $lastIndex) {
            $start = $lastIndex - $gram + 1;
        }
        // Window overruns the left edge (or the phrase is shorter than $gram).
        if ($start < 0) {
            $start = 0;
        }

        // Slice the window directly instead of scanning every token.
        $length = min($gram, $lastIndex - $start + 1);

        return array_slice($tokens, $start, $length);
    }
}
# Read the whole phrase list up front (one phrase per line); every stage
# below re-scans it.
$dictionary = file_get_contents('dictionary.txt');
if ($dictionary === false) {
    # Without this guard a missing file would silently produce tables that
    # contain a single empty phrase.
    throw new RuntimeException('Unable to read dictionary.txt');
}
$filelines = explode("\n", $dictionary);
# the keys in this array dictate which ngrams are generated
$grams = array(
    1 => array(),
    2 => array(),
    3 => array(),
    4 => array(),
    5 => array()
);
# 'w' truncates/creates the output file ('wa' is not a valid fopen mode).
$fh = fopen('ngrams_new.sql', 'w');
if ($fh === false) {
    throw new RuntimeException('Unable to open ngrams_new.sql for writing');
}
$c = count($filelines);
$gc = count($grams);
# Ordinal of the stage being run; the array key $r is the n-gram width and
# need not match it when the key list above is customised.
$gi = 1;
foreach ($grams as $r => $null) {
    echo "Running: ngram #".$r."\n";
    foreach ($filelines as $key => $line) {
        echo ((($key+1)/$c)*100) . "% complete [stage $gi of $gc]\n";
        # Normalise: drop everything except ASCII letters, digits and
        # whitespace, collapse whitespace runs, lower-case, trim.
        $phrase = trim(strtolower(preg_replace('/\s+/', ' ', preg_replace('/[^A-Za-z0-9\s]+/', '', $line))));
        # Test the cleaned phrase, not the raw line: a line made only of
        # punctuation would otherwise be stored as an empty phrase.
        if ($phrase === '') {
            continue;
        }
        $arr = explode(" ", $phrase);
        $max_count = count($arr)-1;
        foreach ($arr as $head => $word) {
            $gram = implode(" ", Ngram::getNgram($head, $max_count, $r, $arr));
            # Keyed by the phrase itself so duplicates collapse into one entry.
            $grams[$r][str_replace(" ","_", $gram)] = $gram;
        }
    }
    # CREATE TABLE
    $tbl = 'ngram_' . $r;
    $sql = "\n"
        . "DROP TABLE IF EXISTS `$tbl`;\n\n"
        . "CREATE TABLE $tbl (\n\n"
        . "`id` INT(11) AUTO_INCREMENT NOT NULL,\n"
        . "`phrase` VARCHAR(150) NOT NULL,\n"
        . "FULLTEXT (`phrase`),\n"
        . "PRIMARY KEY (`id`)\n"
        . ") ENGINE=MyISAM;";
    # Only emit the INSERT when there is something to insert; an empty set
    # would otherwise become VALUES ('') and add a blank row.
    if (count($grams[$r]) > 0) {
        $sql .= "\n"
            . "INSERT INTO $tbl\n"
            . "(phrase) VALUES \n('";
        $sql .= implode("'),\n('", $grams[$r]);
        $sql .= "');";
    }
    if (fwrite($fh, $sql) === false) {
        fclose($fh);
        throw new RuntimeException('Unable to write to ngrams_new.sql');
    }
    $gi++;
}
fclose($fh);
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment