Created
March 16, 2013 21:35
-
-
Save stevenwoodson/5178425 to your computer and use it in GitHub Desktop.
Determines the key terms in a provided $content string and returns the $return most popular terms. This works by clearing all the most common words from the content and counting the number of occurrences of what's left. We then find all phrases that contain any of those words that occurred more than once in an attempt to isolate the most importa…
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?php | |
/**
 * Key Terms
 *
 * Determines the key terms in a provided $content string and returns the $return most popular terms. This works by
 * clearing all the most common words (stop words) from the content and counting the number of occurrences of what's
 * left. We then find all phrases (runs of adjacent words) made up only of words that occurred more than once, in an
 * attempt to isolate the most important phrases. A phrase is scored by summing the occurrence counts of its words.
 * The number of terms returned can be set, or set to 0 to return all phrases. If there's room for more results
 * ($return has to be greater than zero) the remaining slots are filled with the words that occurred exactly once,
 * in order of first appearance.
 *
 * Notes on tokenising: markup is stripped first, then every character other than letters, digits, spaces and
 * apostrophes becomes a space. The text is split on single spaces, so punctuation, tags and repeated whitespace
 * yield empty tokens; those (like stop words) end the phrase currently being collected.
 *
 * @author Steve Woodson <[email protected]>
 * @copyright 2013 Steve Woodson
 * @license http://opensource.org/licenses/MIT MIT
 * @version 0.1.0
 *
 * @param string $content Text (plain or HTML) to analyse
 * @param int $return Maximum number of terms to return. Set to 0 to return all phrases (no single-word fill)
 * @return array|false List of term strings, best first; false when $content is empty
 */
function keyTerms( $content = NULL, $return = 10 ) {
    // Loose comparison on purpose: NULL, '' and other empty values all bail out here
    if ( $content == NULL ) { return false; }

    $stopwords = array(
        "a", "about", "above", "across", "after", "afterwards", "again", "against", "all", "almost", "alone", "along",
        "already", "also", "although", "always", "am", "among", "amongst", "amoungst", "amount", "an", "and",
        "another", "any", "anyhow", "anyone", "anything", "anyway", "anywhere", "are", "around", "as", "at", "back",
        "be", "became", "because", "become", "becomes", "becoming", "been", "before", "beforehand", "behind", "being",
        "below", "beside", "besides", "between", "beyond", "bill", "both", "bottom", "but", "by", "call", "can",
        "cannot", "cant", "co", "con", "could", "couldnt", "cry", "de", "describe", "detail", "do", "done", "down",
        "due", "during", "each", "eg", "eight", "either", "eleven", "else", "elsewhere", "empty", "enough", "etc",
        "even", "ever", "every", "everyone", "everything", "everywhere", "except", "few", "fifteen", "fify", "fill",
        "find", "fire", "first", "five", "for", "former", "formerly", "forty", "found", "four", "from", "front",
        "full", "further", "get", "give", "go", "had", "has", "hasnt", "have", "he", "hence", "her", "here",
        "hereafter", "hereby", "herein", "hereupon", "hers", "herself", "him", "himself", "his", "how", "however",
        "hundred", "ie", "if", "in", "inc", "indeed", "interest", "into", "is", "it", "its", "itself", "keep", "last",
        "latter", "latterly", "least", "less", "let", "ltd", "made", "many", "may", "me", "meanwhile", "might",
        "mill", "mine", "more", "moreover", "most", "mostly", "move", "much", "must", "my", "myself", "name",
        "namely", "neither", "never", "nevertheless", "next", "nine", "no", "nobody", "none", "noone", "nor", "not",
        "nothing", "now", "nowhere", "of", "off", "often", "on", "once", "one", "only", "onto", "or", "other",
        "others", "otherwise", "our", "ours", "ourselves", "out", "over", "own", "part", "per", "perhaps", "please",
        "put", "rather", "re", "same", "see", "seem", "seemed", "seeming", "seems", "serious", "several", "shall",
        "she", "should", "show", "side", "since", "sincere", "six", "sixty", "so",
        "some", "somehow", "someone", "something", "sometime", "sometimes", "somewhere", "still", "such", "system",
        "take", "ten", "than", "that", "the", "their", "them", "themselves", "then", "thence", "there", "thereafter",
        "thereby", "therefore", "therein", "thereupon", "the", "these", "they", "thickv", "thin", "third", "this",
        "those", "though", "three", "through", "throughout", "thru", "thus", "to", "together", "too", "top",
        "toward", "towards", "twelve", "twenty", "two", "un", "under", "until", "up", "upon", "us", "very", "via",
        "was", "we", "well", "were", "what", "whatever", "when", "whence", "whenever", "where", "whereafter",
        "whereas", "whereby", "wherein", "whereupon", "wherever", "whether", "which", "while", "whither", "who",
        "whoever", "whole", "whom", "whose", "why", "will", "with", "within", "without", "would", "yet", "you",
        "your", "yours", "yourself", "yourselves",
        "i'm", "i'd", "it's", "i've", "he's", "i'll", "he'd", "we'd", "it'd", "don't", "can't", "we're", "isn't",
        "won't", "we've", "we'll", "she's", "you'd", "let's", "who's", "he'll", "it'll", "she'd", "ain't", "who'd",
        "that's", "didn't", "you're", "you'll", "what's", "wasn't", "you've", "aren't", "here's", "hasn't", "hadn't",
        "they'd", "here's", "who've", "she'll", "who'll", "that'd", "doesn't", "there's", "they're", "world's",
        "haven't", "they've", "weren't", "they'll", "o'clock", "mustn't", "needn't", "must've", "that'll",
        "couldn't", "wouldn't", "could've", "would've", "there'll", "shouldn't", "should've",
        "im", "id", "its", "ive", "hes", "ill", "hed", "wed", "itd", "dont", "cant", "were", "isnt", "wont", "weve",
        "well", "shes", "youd", "lets", "whos", "hell", "itll", "shed", "aint", "whod", "thats", "didnt", "youre",
        "youll", "whats", "wasnt", "youve", "arent", "heres", "hasnt", "hadnt", "theyd", "heres", "whove", "shell",
        "wholl", "thatd", "doesnt", "theres", "theyre", "worlds", "havent", "theyve", "werent", "theyll", "oclock",
        "mustnt", "neednt", "mustve", "thatll", "couldnt", "wouldnt",
        "couldve", "wouldve", "therell", "shouldnt", "shouldve"
    );
    // Flip once so each stop-word check is a hash lookup instead of a linear scan
    $stopwordLookup = array_flip( $stopwords );

    // Strip markup BEFORE removing special characters: once '<' and '>' are gone strip_tags() can no longer
    // recognise a tag and the tag names would be counted as words. A space is inserted in front of every '<'
    // so that text in adjacent elements ("</p><p>") is not glued together when the tags disappear.
    $content = strip_tags( str_replace( '<', ' <', $content ) );
    // All lowercase, no special characters, no surrounding whitespace
    $content = strtolower( trim( preg_replace( '/[^a-zA-Z0-9 \' ]/', ' ', $content ) ) );

    // Every individual token of the cleaned content (may contain empty strings, see docblock)
    $words = explode( ' ', $content );

    // Keep only meaningful words: longer than one character and not a stop word
    $wordsFiltered = array();
    foreach ( $words as $key => $item ) {
        if ( strlen( $item ) > 1 && !isset( $stopwordLookup[$item] ) ) { $wordsFiltered[$key] = $item; }
    }

    // word => number of occurrences (purely numeric words become integer keys here)
    $count = array_count_values( $wordsFiltered );

    // Narrow down to only words used more than once; only these may be part of a phrase
    $possibleTermWords = array();
    foreach ( $count as $word => $occurrences ) {
        if ( $occurrences > 1 ) { $possibleTermWords[$word] = $occurrences; }
    }

    // Collect phrases as phrase => score. Using the phrase as key de-duplicates repeated phrases,
    // which always carry the same score because the score only depends on the words involved.
    $termScores = array();
    $tempTerm = array();
    $scan = $words;
    $scan[] = ''; // Sentinel: never a term word, so a phrase running to the very end still gets flushed
    foreach ( $scan as $word ) {
        if ( isset( $possibleTermWords[$word] ) ) {
            $tempTerm[] = $word;
            continue;
        }
        if ( count( $tempTerm ) > 0 ) {
            // The current phrase just ended; store it with the summed counts of its words
            $score = 0;
            foreach ( $tempTerm as $termWord ) { $score += $count[$termWord]; }
            $termScores[ implode( ' ', $tempTerm ) ] = $score;
            $tempTerm = array();
        }
    }

    // Highest scoring phrases first, limited to $return entries (0 means no limit)
    arsort( $termScores );
    $finalTerms = array();
    foreach ( $termScores as $term => $score ) {
        if ( $return != 0 && count( $finalTerms ) >= $return ) { break; }
        $finalTerms[] = (string) $term; // Cast: purely numeric phrases were turned into integer keys
    }

    // Still room left? Every repeated word is already part of a phrase above, so what remains are the
    // words that occurred exactly once; add them in order of first appearance.
    if ( $return > 0 && count( $finalTerms ) < $return ) {
        foreach ( $count as $word => $occurrences ) {
            if ( count( $finalTerms ) >= $return ) { break; }
            if ( $occurrences === 1 ) { $finalTerms[] = (string) $word; }
        }
    }

    return $finalTerms;
}
?> |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment