Created
September 16, 2015 03:23
-
-
Save bhu1st/3342f0f468ad23590e39 to your computer and use it in GitHub Desktop.
Word Frequency Count
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?php
/*
 Word Frequency Count: prints the ten most frequent words found in a text file,
 skipping one-letter words and a short list of English stop words.

 URL: http://stackoverflow.com/questions/3169051/code-golf-word-frequency-chart
 Credit for this code goes to the original author; see the discussion on Stack Overflow.
*/

/**
 * Count word occurrences in a text and return the most frequent ones.
 *
 * The text is lowercased, then split on every character that is not a-z, so a
 * "word" is a run of the letters a-z only. One-letter words and the stop words
 * listed below are discarded before counting.
 *
 * @param string $text  Text to analyse.
 * @param int    $limit Maximum number of entries to return.
 * @return array Map of word => occurrence count, sorted by count, highest first.
 */
function topWordFrequencies($text, $limit = 10)
{
    /* Stop words as array keys so each word is checked with a hash lookup
       instead of a regex match. One-letter words (including "a") are handled
       by the length check below. */
    $stopWords = [
        'an'   => true,
        'and'  => true,
        'the'  => true,
        'this' => true,
        'at'   => true,
        'in'   => true,
        'or'   => true,
        'of'   => true,
        'is'   => true,
        'for'  => true,
        'to'   => true,
    ];

    /* split the lowercased text into words; empty pieces are dropped */
    $wordArray = preg_split('/[^a-z]/', strtolower($text), -1, PREG_SPLIT_NO_EMPTY);

    /* filter out one-letter words and stop words */
    $filteredArray = array_filter($wordArray, function ($word) use ($stopWords) {
        return strlen($word) > 1 && !isset($stopWords[$word]);
    });

    /* word => frequency count */
    $wordFrequencyArray = array_count_values($filteredArray);

    /* sort from higher to lower count, keeping the words as keys */
    arsort($wordFrequencyArray);

    return array_slice($wordFrequencyArray, 0, $limit);
}

$filename = "largefile.txt";

/* Read the file only when it is readable, so a missing file is reported
   explicitly instead of being treated as an empty text. */
$content = is_readable($filename) ? file_get_contents($filename) : false;

$top10words = [];
if ($content === false) {
    echo "Unable to read $filename<br/>";
} else {
    $top10words = topWordFrequencies($content, 10);
}

/* display them */
foreach ($top10words as $topWord => $frequency) {
    echo "$topWord -- $frequency<br/>";
}
?> |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment