stevenwoodson · March 16, 2013 21:35
diff --git a/KeyTerms.php b/KeyTerms.php
 <?php
 /**
  * Key Terms
  *
  * Determines the key terms in a provided $content string and returns the $return most popular terms.
  *
  * How it works:
  *  1. HTML tags are stripped, every character other than ASCII letters, digits, spaces and apostrophes is
  *     replaced by a space, and the text is lowercased.
  *  2. Stop words and words of one character or less are discarded; the remaining words are counted.
  *  3. Every run of consecutive words that each occur more than once is treated as a phrase ("term"). A phrase
  *     is scored with the sum of the occurrence counts of its words.
  *  4. The highest scoring unique phrases are returned first. If there is still room ($return has to be greater
  *     than zero) the list is filled with the most used words that are not part of any phrase.
  *
  * @author Steve Woodson <[email protected]>
  * @copyright 2013 Steve Woodson
  * @license http://opensource.org/licenses/MIT MIT
  * @version 0.1.0
  *
  * @param string $content Text (may contain HTML) to analyse
  * @param int $return Maximum number of terms to return. Set to 0 to return all phrases
  * @return array|false List of terms as strings, or false when $content is empty / NULL
  */
 function keyTerms( $content = NULL, $return = 10 ) {
 	// Loose comparison is intentional: NULL and '' are both treated as "nothing to analyse"
 	if ( $content == NULL ) { return false; }

 	// NOTE(review): "fify" and "thickv" look like typos for "fifty" / "thick"; left untouched to keep the
 	// filtering behaviour identical to the published list.
 	$stopwords = array(
 		"a", "about", "above", "across", "after", "afterwards", "again", "against", "all", "almost", "alone",
 		"along", "already", "also", "although", "always", "am", "among", "amongst", "amoungst", "amount", "an",
 		"and", "another", "any", "anyhow", "anyone", "anything", "anyway", "anywhere", "are", "around", "as", "at",
 		"back", "be", "became", "because", "become", "becomes", "becoming", "been", "before", "beforehand",
 		"behind", "being", "below", "beside", "besides", "between", "beyond", "bill", "both", "bottom", "but",
 		"by", "call", "can", "cannot", "cant", "co", "con", "could", "couldnt", "cry", "de", "describe", "detail",
 		"do", "done", "down", "due", "during", "each", "eg", "eight", "either", "eleven", "else", "elsewhere",
 		"empty", "enough", "etc", "even", "ever", "every", "everyone", "everything", "everywhere", "except", "few",
 		"fifteen", "fify", "fill", "find", "fire", "first", "five", "for", "former", "formerly", "forty", "found",
 		"four", "from", "front", "full", "further", "get", "give", "go", "had", "has", "hasnt", "have", "he",
 		"hence", "her", "here", "hereafter", "hereby", "herein", "hereupon", "hers", "herself", "him", "himself",
 		"his", "how", "however", "hundred", "ie", "if", "in", "inc", "indeed", "interest", "into", "is", "it",
 		"its", "itself", "keep", "last", "latter", "latterly", "least", "less", "let", "ltd", "made", "many",
 		"may", "me", "meanwhile", "might", "mill", "mine", "more", "moreover", "most", "mostly", "move", "much",
 		"must", "my", "myself", "name", "namely", "neither", "never", "nevertheless", "next", "nine", "no",
 		"nobody", "none", "noone", "nor", "not", "nothing", "now", "nowhere", "of", "off", "often", "on", "once",
 		"one", "only", "onto", "or", "other", "others", "otherwise", "our", "ours", "ourselves", "out", "over",
 		"own", "part", "per", "perhaps", "please", "put", "rather", "re", "same", "see", "seem", "seemed",
 		"seeming", "seems", "serious", "several", "shall", "she", "should", "show", "side", "since", "sincere",
 		"six", "sixty", "so", "some", "somehow", "someone", "something", "sometime", "sometimes", "somewhere",
 		"still", "such", "system", "take", "ten", "than", "that", "the", "their", "them", "themselves", "then",
 		"thence", "there", "thereafter", "thereby", "therefore", "therein", "thereupon", "the", "these", "they",
 		"thickv", "thin", "third", "this", "those", "though", "three", "through", "throughout", "thru", "thus",
 		"to", "together", "too", "top", "toward", "towards", "twelve", "twenty", "two", "un", "under", "until",
 		"up", "upon", "us", "very", "via", "was", "we", "well", "were", "what", "whatever", "when", "whence",
 		"whenever", "where", "whereafter", "whereas", "whereby", "wherein", "whereupon", "wherever", "whether",
 		"which", "while", "whither", "who", "whoever", "whole", "whom", "whose", "why", "will", "with", "within",
 		"without", "would", "yet", "you", "your", "yours", "yourself", "yourselves", "i'm", "i'd", "it's", "i've",
 		"he's", "i'll", "he'd", "we'd", "it'd", "don't", "can't", "we're", "isn't", "won't", "we've", "we'll",
 		"she's", "you'd", "let's", "who's", "he'll", "it'll", "she'd", "ain't", "who'd", "that's", "didn't",
 		"you're", "you'll", "what's", "wasn't", "you've", "aren't", "here's", "hasn't", "hadn't", "they'd",
 		"here's", "who've", "she'll", "who'll", "that'd", "doesn't", "there's", "they're", "world's", "haven't",
 		"they've", "weren't", "they'll", "o'clock", "mustn't", "needn't", "must've", "that'll", "couldn't",
 		"wouldn't", "could've", "would've", "there'll", "shouldn't", "should've", "im", "id", "its", "ive", "hes",
 		"ill", "hed", "wed", "itd", "dont", "cant", "were", "isnt", "wont", "weve", "well", "shes", "youd", "lets",
 		"whos", "hell", "itll", "shed", "aint", "whod", "thats", "didnt", "youre", "youll", "whats", "wasnt",
 		"youve", "arent", "heres", "hasnt", "hadnt", "theyd", "heres", "whove", "shell", "wholl", "thatd",
 		"doesnt", "theres", "theyre", "worlds", "havent", "theyve", "werent", "theyll", "oclock", "mustnt",
 		"neednt", "mustve", "thatll", "couldnt", "wouldnt", "couldve", "wouldve", "therell", "shouldnt",
 		"shouldve"
 	);
 	// Flip once so each membership test is a hash lookup instead of a scan over the whole list
 	$stopwordLookup = array_flip( $stopwords );

 	// Tags have to be stripped BEFORE the special characters are replaced, otherwise "<" and ">" are already
 	// gone and the tag names end up being counted as words. A space is put in front of every "<" so that the
 	// text of adjacent elements ("<p>a</p><p>b</p>") is not glued together when the tags disappear.
 	$content = strip_tags( str_replace( '<', ' <', (string) $content ) );
 	$content = strtolower( trim( preg_replace( '/[^a-zA-Z0-9 \' ]/', ' ', $content ) ) );

 	// Every individual word, in document order. Consecutive spaces yield empty strings; they are kept on purpose
 	// because they act as phrase boundaries where punctuation or tags used to be.
 	$words = explode( ' ', $content );

 	// Same words minus stop words and anything of one character or less (original positions are kept as keys)
 	$wordsFiltered = array();
 	foreach ( $words as $key => $item ) {
 		if ( strlen( $item ) > 1 && !isset( $stopwordLookup[$item] ) ) { $wordsFiltered[$key] = $item; }
 	}

 	// Occurrences per word, narrowed down to the words used more than once
 	$count = array_count_values( $wordsFiltered );
 	$possibleTermWords = array_diff( $count, array( 1 ) );

 	// Collect phrases: runs of consecutive words that are all in $possibleTermWords
 	$terms = $termsScore = $tempTerm = $tempTermCollection = array();
 	$wordsToScan = $words;
 	$wordsToScan[] = ''; // sentinel that can never be a term word, so a phrase ending the content is flushed too
 	foreach ( $wordsToScan as $key => $word ) {
 		if ( array_key_exists( $word, $possibleTermWords ) ) {
 			$tempTerm[$key] = $tempTermCollection[$key] = $word;
 			continue;
 		}
 		if ( count( $tempTerm ) > 0 ) {
 			// The run just ended: store it as one string, scored by the summed counts of its words
 			$phraseScore = 0;
 			foreach ( $tempTerm as $term ) { $phraseScore += $count[ $term ]; }
 			$termsScore[] = $phraseScore;
 			$terms[] = implode( ' ', $tempTerm );
 		}
 		$tempTerm = array();
 	}

 	// Words that are neither stop words nor part of any phrase
 	$wordsFiltered = array_diff( $wordsFiltered, $tempTermCollection );

 	// Drop repeated phrases; keys are preserved, so they still line up with $termsScore
 	$terms = array_unique( $terms );

 	// Highest scoring phrases first, up to $return of them (0 means no limit)
 	$finalTerms = array();
 	arsort( $termsScore );
 	foreach ( $termsScore as $key => $score ) {
 		if ( $return != 0 && count( $finalTerms ) >= $return ) { break; }
 		// Keys of duplicate phrases were removed by array_unique() above and are skipped here
 		if ( array_key_exists( $key, $terms ) ) { $finalTerms[] = $terms[$key]; }
 	}

 	// Still room left: fill up with the most used remaining single words
 	if ( $return > 0 && count( $finalTerms ) < $return ) {
 		arsort( $count );
 		$leftoverLookup = array_flip( $wordsFiltered );
 		foreach ( $count as $countTerm => $countNum ) {
 			if ( count( $finalTerms ) >= $return ) { break; }
 			// Numeric words become integer array keys, cast back so only strings are returned
 			if ( isset( $leftoverLookup[$countTerm] ) ) { $finalTerms[] = (string) $countTerm; }
 		}
 	}

 	return $finalTerms;
 }
 ?>
	<?php
	/**
	 * Key Terms
	 *
	 * Determines the key terms in a provided $content string and returns the $return most popular terms.
	 *
	 * How it works:
	 *  1. HTML tags are stripped, every character other than ASCII letters, digits, spaces and apostrophes is
	 *     replaced by a space, and the text is lowercased.
	 *  2. Stop words and words of one character or less are discarded; the remaining words are counted.
	 *  3. Every run of consecutive words that each occur more than once is treated as a phrase ("term"). A phrase
	 *     is scored with the sum of the occurrence counts of its words.
	 *  4. The highest scoring unique phrases are returned first. If there is still room ($return has to be greater
	 *     than zero) the list is filled with the most used words that are not part of any phrase.
	 *
	 * NOTE(review): this file appears to contain another keyTerms() definition further up; PHP cannot declare
	 * the same function twice, so one of the two copies has to be removed before the file can be loaded.
	 *
	 * @author Steve Woodson <[email protected]>
	 * @copyright 2013 Steve Woodson
	 * @license http://opensource.org/licenses/MIT MIT
	 * @version 0.1.0
	 *
	 * @param string $content Text (may contain HTML) to analyse
	 * @param int $return Maximum number of terms to return. Set to 0 to return all phrases
	 * @return array|false List of terms as strings, or false when $content is empty / NULL
	 */
	function keyTerms( $content = NULL, $return = 10 ) {
		// Loose comparison is intentional: NULL and '' are both treated as "nothing to analyse"
		if ( $content == NULL ) { return false; }

		// NOTE(review): "fify" and "thickv" look like typos for "fifty" / "thick"; left untouched to keep the
		// filtering behaviour identical to the published list.
		$stopwords = array(
			"a", "about", "above", "across", "after", "afterwards", "again", "against", "all", "almost", "alone",
			"along", "already", "also", "although", "always", "am", "among", "amongst", "amoungst", "amount", "an",
			"and", "another", "any", "anyhow", "anyone", "anything", "anyway", "anywhere", "are", "around", "as", "at",
			"back", "be", "became", "because", "become", "becomes", "becoming", "been", "before", "beforehand",
			"behind", "being", "below", "beside", "besides", "between", "beyond", "bill", "both", "bottom", "but",
			"by", "call", "can", "cannot", "cant", "co", "con", "could", "couldnt", "cry", "de", "describe", "detail",
			"do", "done", "down", "due", "during", "each", "eg", "eight", "either", "eleven", "else", "elsewhere",
			"empty", "enough", "etc", "even", "ever", "every", "everyone", "everything", "everywhere", "except", "few",
			"fifteen", "fify", "fill", "find", "fire", "first", "five", "for", "former", "formerly", "forty", "found",
			"four", "from", "front", "full", "further", "get", "give", "go", "had", "has", "hasnt", "have", "he",
			"hence", "her", "here", "hereafter", "hereby", "herein", "hereupon", "hers", "herself", "him", "himself",
			"his", "how", "however", "hundred", "ie", "if", "in", "inc", "indeed", "interest", "into", "is", "it",
			"its", "itself", "keep", "last", "latter", "latterly", "least", "less", "let", "ltd", "made", "many",
			"may", "me", "meanwhile", "might", "mill", "mine", "more", "moreover", "most", "mostly", "move", "much",
			"must", "my", "myself", "name", "namely", "neither", "never", "nevertheless", "next", "nine", "no",
			"nobody", "none", "noone", "nor", "not", "nothing", "now", "nowhere", "of", "off", "often", "on", "once",
			"one", "only", "onto", "or", "other", "others", "otherwise", "our", "ours", "ourselves", "out", "over",
			"own", "part", "per", "perhaps", "please", "put", "rather", "re", "same", "see", "seem", "seemed",
			"seeming", "seems", "serious", "several", "shall", "she", "should", "show", "side", "since", "sincere",
			"six", "sixty", "so", "some", "somehow", "someone", "something", "sometime", "sometimes", "somewhere",
			"still", "such", "system", "take", "ten", "than", "that", "the", "their", "them", "themselves", "then",
			"thence", "there", "thereafter", "thereby", "therefore", "therein", "thereupon", "the", "these", "they",
			"thickv", "thin", "third", "this", "those", "though", "three", "through", "throughout", "thru", "thus",
			"to", "together", "too", "top", "toward", "towards", "twelve", "twenty", "two", "un", "under", "until",
			"up", "upon", "us", "very", "via", "was", "we", "well", "were", "what", "whatever", "when", "whence",
			"whenever", "where", "whereafter", "whereas", "whereby", "wherein", "whereupon", "wherever", "whether",
			"which", "while", "whither", "who", "whoever", "whole", "whom", "whose", "why", "will", "with", "within",
			"without", "would", "yet", "you", "your", "yours", "yourself", "yourselves", "i'm", "i'd", "it's", "i've",
			"he's", "i'll", "he'd", "we'd", "it'd", "don't", "can't", "we're", "isn't", "won't", "we've", "we'll",
			"she's", "you'd", "let's", "who's", "he'll", "it'll", "she'd", "ain't", "who'd", "that's", "didn't",
			"you're", "you'll", "what's", "wasn't", "you've", "aren't", "here's", "hasn't", "hadn't", "they'd",
			"here's", "who've", "she'll", "who'll", "that'd", "doesn't", "there's", "they're", "world's", "haven't",
			"they've", "weren't", "they'll", "o'clock", "mustn't", "needn't", "must've", "that'll", "couldn't",
			"wouldn't", "could've", "would've", "there'll", "shouldn't", "should've", "im", "id", "its", "ive", "hes",
			"ill", "hed", "wed", "itd", "dont", "cant", "were", "isnt", "wont", "weve", "well", "shes", "youd", "lets",
			"whos", "hell", "itll", "shed", "aint", "whod", "thats", "didnt", "youre", "youll", "whats", "wasnt",
			"youve", "arent", "heres", "hasnt", "hadnt", "theyd", "heres", "whove", "shell", "wholl", "thatd",
			"doesnt", "theres", "theyre", "worlds", "havent", "theyve", "werent", "theyll", "oclock", "mustnt",
			"neednt", "mustve", "thatll", "couldnt", "wouldnt", "couldve", "wouldve", "therell", "shouldnt",
			"shouldve"
		);
		// Flip once so each membership test is a hash lookup instead of a scan over the whole list
		$stopwordLookup = array_flip( $stopwords );

		// Tags have to be stripped BEFORE the special characters are replaced, otherwise "<" and ">" are already
		// gone and the tag names end up being counted as words. A space is put in front of every "<" so that the
		// text of adjacent elements ("<p>a</p><p>b</p>") is not glued together when the tags disappear.
		$content = strip_tags( str_replace( '<', ' <', (string) $content ) );
		$content = strtolower( trim( preg_replace( '/[^a-zA-Z0-9 \' ]/', ' ', $content ) ) );

		// Every individual word, in document order. Consecutive spaces yield empty strings; they are kept on
		// purpose because they act as phrase boundaries where punctuation or tags used to be.
		$words = explode( ' ', $content );

		// Same words minus stop words and anything of one character or less (original positions are kept as keys).
		// The escaped "\|\|" operator that used to be in this condition was not valid PHP.
		$wordsFiltered = array();
		foreach ( $words as $key => $item ) {
			if ( strlen( $item ) > 1 && !isset( $stopwordLookup[$item] ) ) { $wordsFiltered[$key] = $item; }
		}

		// Occurrences per word, narrowed down to the words used more than once
		$count = array_count_values( $wordsFiltered );
		$possibleTermWords = array_diff( $count, array( 1 ) );

		// Collect phrases: runs of consecutive words that are all in $possibleTermWords
		$terms = $termsScore = $tempTerm = $tempTermCollection = array();
		$wordsToScan = $words;
		$wordsToScan[] = ''; // sentinel that can never be a term word, so a phrase ending the content is flushed too
		foreach ( $wordsToScan as $key => $word ) {
			if ( array_key_exists( $word, $possibleTermWords ) ) {
				$tempTerm[$key] = $tempTermCollection[$key] = $word;
				continue;
			}
			if ( count( $tempTerm ) > 0 ) {
				// The run just ended: store it as one string, scored by the summed counts of its words
				$phraseScore = 0;
				foreach ( $tempTerm as $term ) { $phraseScore += $count[ $term ]; }
				$termsScore[] = $phraseScore;
				$terms[] = implode( ' ', $tempTerm );
			}
			$tempTerm = array();
		}

		// Words that are neither stop words nor part of any phrase
		$wordsFiltered = array_diff( $wordsFiltered, $tempTermCollection );

		// Drop repeated phrases; keys are preserved, so they still line up with $termsScore
		$terms = array_unique( $terms );

		// Highest scoring phrases first, up to $return of them (0 means no limit)
		$finalTerms = array();
		arsort( $termsScore );
		foreach ( $termsScore as $key => $score ) {
			if ( $return != 0 && count( $finalTerms ) >= $return ) { break; }
			// Keys of duplicate phrases were removed by array_unique() above and are skipped here
			if ( array_key_exists( $key, $terms ) ) { $finalTerms[] = $terms[$key]; }
		}

		// Still room left: fill up with the most used remaining single words
		if ( $return > 0 && count( $finalTerms ) < $return ) {
			arsort( $count );
			$leftoverLookup = array_flip( $wordsFiltered );
			foreach ( $count as $countTerm => $countNum ) {
				if ( count( $finalTerms ) >= $return ) { break; }
				// Numeric words become integer array keys, cast back so only strings are returned
				if ( isset( $leftoverLookup[$countTerm] ) ) { $finalTerms[] = (string) $countTerm; }
			}
		}

		return $finalTerms;
	}
	?>