Skip to content

Instantly share code, notes, and snippets.

@stevenwoodson
Created March 16, 2013 21:35
Show Gist options
  • Save stevenwoodson/5178425 to your computer and use it in GitHub Desktop.
Save stevenwoodson/5178425 to your computer and use it in GitHub Desktop.
Determines the key terms in a provided $content string and returns the $return most popular terms. This works by clearing all the most common words from the content and counting the number of occurrences of what's left. We then find all phrases that contain any of those words that occurred more than once in an attempt to isolate the most important phrases.
<?php
/**
 * Key Terms
 *
 * Determines the key terms in a provided $content string and returns the $return most popular terms.
 *
 * How it works:
 *  1. HTML tags are stripped, every character other than a-z, A-Z, 0-9, space and apostrophe is replaced
 *     by a space, and the text is lowercased.
 *  2. The text is split on single spaces. Punctuation therefore leaves an empty token behind, which acts
 *     as a phrase boundary in step 4.
 *  3. Stop words and tokens of one character or less are discarded and the remaining words are counted.
 *  4. Every run of consecutive words that each occur more than once forms a phrase ("term"). A phrase is
 *     scored with the sum of the occurrence counts of its words.
 *  5. Unique phrases are returned ordered by score, highest first.
 *  6. If $return is greater than zero and there is still room, the list is filled with the remaining
 *     (non-phrase) words, most frequent first.
 *
 * @author Steve Woodson <[email protected]>
 * @copyright 2013 Steve Woodson
 * @license http://opensource.org/licenses/MIT MIT
 * @version 0.1.0
 *
 * @param string $content Text (plain or HTML) to analyse
 * @param int $return Maximum number of terms to return. Set to 0 to return all phrases (no single-word fill)
 * @return array|false List of term strings, or false when $content is NULL or an empty string
 */
function keyTerms( $content = NULL, $return = 10 ) {
    // Loose comparison on purpose: NULL and '' both mean "nothing to analyse"
    if ( $content == NULL ) { return false; }

    $stopwords = array("a", "about", "above", "across", "after", "afterwards", "again", "against", "all", "almost", "alone", "along", "already", "also","although","always","am","among", "amongst", "amoungst", "amount", "an", "and", "another", "any","anyhow","anyone","anything","anyway", "anywhere", "are", "around", "as", "at", "back","be","became", "because","become","becomes", "becoming", "been", "before", "beforehand", "behind", "being", "below", "beside", "besides", "between", "beyond", "bill", "both", "bottom","but", "by", "call", "can", "cannot", "cant", "co", "con", "could", "couldnt", "cry", "de", "describe", "detail", "do", "done", "down", "due", "during", "each", "eg", "eight", "either", "eleven","else", "elsewhere", "empty", "enough", "etc", "even", "ever", "every", "everyone", "everything", "everywhere", "except", "few", "fifteen", "fify", "fill", "find", "fire", "first", "five", "for", "former", "formerly", "forty", "found", "four", "from", "front", "full", "further", "get", "give", "go", "had", "has", "hasnt", "have", "he", "hence", "her", "here", "hereafter", "hereby", "herein", "hereupon", "hers", "herself", "him", "himself", "his", "how", "however", "hundred", "ie", "if", "in", "inc", "indeed", "interest", "into", "is", "it", "its", "itself", "keep", "last", "latter", "latterly", "least", "less", "let", "ltd", "made", "many", "may", "me", "meanwhile", "might", "mill", "mine", "more", "moreover", "most", "mostly", "move", "much", "must", "my", "myself", "name", "namely", "neither", "never", "nevertheless", "next", "nine", "no", "nobody", "none", "noone", "nor", "not", "nothing", "now", "nowhere", "of", "off", "often", "on", "once", "one", "only", "onto", "or", "other", "others", "otherwise", "our", "ours", "ourselves", "out", "over", "own","part", "per", "perhaps", "please", "put", "rather", "re", "same", "see", "seem", "seemed", "seeming", "seems", "serious", "several", "shall", "she", "should", "show", "side", "since", "sincere", "six", "sixty", "so",
        "some", "somehow", "someone", "something", "sometime", "sometimes", "somewhere", "still", "such", "system", "take", "ten", "than", "that", "the", "their", "them", "themselves", "then", "thence", "there", "thereafter", "thereby", "therefore", "therein", "thereupon", "the", "these", "they", "thickv", "thin", "third", "this", "those", "though", "three", "through", "throughout", "thru", "thus", "to", "together", "too", "top", "toward", "towards", "twelve", "twenty", "two", "un", "under", "until", "up", "upon", "us", "very", "via", "was", "we", "well", "were", "what", "whatever", "when", "whence", "whenever", "where", "whereafter", "whereas", "whereby", "wherein", "whereupon", "wherever", "whether", "which", "while", "whither", "who", "whoever", "whole", "whom", "whose", "why", "will", "with", "within", "without", "would", "yet", "you", "your", "yours", "yourself", "yourselves", "i'm", "i'd", "it's", "i've", "he's", "i'll", "he'd", "we'd", "it'd", "don't", "can't", "we're", "isn't", "won't", "we've", "we'll", "she's", "you'd", "let's", "who's", "he'll", "it'll", "she'd", "ain't", "who'd", "that's", "didn't", "you're", "you'll", "what's", "wasn't", "you've", "aren't", "here's", "hasn't", "hadn't", "they'd", "here's", "who've", "she'll", "who'll", "that'd", "doesn't", "there's", "they're", "world's", "haven't", "they've", "weren't", "they'll", "o'clock", "mustn't", "needn't", "must've", "that'll", "couldn't", "wouldn't", "could've", "would've", "there'll", "shouldn't", "should've", "im", "id", "its", "ive", "hes", "ill", "hed", "wed", "itd", "dont", "cant", "were", "isnt", "wont", "weve", "well", "shes", "youd", "lets", "whos", "hell", "itll", "shed", "aint", "whod", "thats", "didnt", "youre", "youll", "whats", "wasnt", "youve", "arent", "heres", "hasnt", "hadnt", "theyd", "heres", "whove", "shell", "wholl", "thatd", "doesnt", "theres", "theyre", "worlds", "havent", "theyve", "werent", "theyll", "oclock", "mustnt", "neednt", "mustve", "thatll", "couldnt", "wouldnt",
        "couldve", "wouldve", "therell", "shouldnt", "shouldve",
        // The list above carries the misspellings "fify" and "thickv"; add the words they stand for
        "fifty", "thick");
    // Flip the list so each stop word check is a hash lookup instead of a linear scan (duplicates collapse)
    $stopwordLookup = array_flip( $stopwords );

    // Strip tags BEFORE removing special characters: once '<' and '>' are gone strip_tags() can no longer
    // recognise markup and tag/attribute names would be counted as words. A space is inserted in front of
    // every '<' so that words separated only by a tag ("foo<br>bar") do not fuse together.
    $content = strip_tags( str_replace( '<', ' <', $content ) );
    // Replace special characters with spaces, trim whitespace and lowercase everything
    $content = strtolower( trim( preg_replace( '/[^a-zA-Z0-9 \' ]/', ' ', $content ) ) );

    // Every individual token of the cleaned content; consecutive spaces produce empty tokens
    $words = explode( ' ', $content );

    // Keep only meaningful words (original positions are preserved as keys)
    $wordsFiltered = array();
    foreach ( $words as $key => $item ) {
        if ( strlen( $item ) > 1 && !isset( $stopwordLookup[$item] ) ) { $wordsFiltered[$key] = $item; }
    }

    // Occurrences per meaningful word
    $count = array_count_values( $wordsFiltered );
    // Only words used more than once can be part of a phrase
    $possibleTermWords = array_diff( $count, array( 1 ) );

    // Collect phrases: phrase => score. The same phrase always yields the same score, so the first
    // occurrence is enough and duplicates are dropped on the fly.
    $termScores = array();
    $tempTerm = array();
    // The trailing empty token is a sentinel that can never be a phrase word (it is shorter than two
    // characters); it guarantees that a phrase ending exactly at the end of the content is still flushed.
    $tokens = $words;
    $tokens[] = '';
    foreach ( $tokens as $word ) {
        if ( isset( $possibleTermWords[$word] ) ) {
            $tempTerm[] = $word;
            continue;
        }
        // Anything else (stop word, single-use word, empty token) ends the phrase collected so far
        if ( count( $tempTerm ) > 0 ) {
            $score = 0;
            foreach ( $tempTerm as $term ) { $score += $count[$term]; }
            $phrase = implode( ' ', $tempTerm );
            if ( !isset( $termScores[$phrase] ) ) { $termScores[$phrase] = $score; }
            $tempTerm = array();
        }
    }

    // Highest scoring phrases first, limited to $return entries (0 = no limit)
    arsort( $termScores );
    $finalTerms = array();
    foreach ( $termScores as $phrase => $score ) {
        if ( $return != 0 && count( $finalTerms ) >= $return ) { break; }
        // Purely numeric phrases become integer array keys, cast them back to strings
        $finalTerms[] = (string) $phrase;
    }

    // If there is still room, fill up with the most used words that are not part of any phrase
    if ( $return > 0 && count( $finalTerms ) < $return ) {
        arsort( $count );
        foreach ( $count as $countTerm => $countNum ) {
            if ( count( $finalTerms ) >= $return ) { break; }
            // Every word used more than once already belongs to a phrase
            if ( !isset( $possibleTermWords[$countTerm] ) ) { $finalTerms[] = (string) $countTerm; }
        }
    }

    return $finalTerms;
}
?>
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment