Last active
August 29, 2015 14:19
-
-
Save turanct/654cce490594742378b5 to your computer and use it in GitHub Desktop.
Monoids for map/reduce
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?php | |
require_once __DIR__ . '/util.php'; | |
require_once __DIR__ . '/wordcount.php'; | |
/** | |
* Create a document, containing 100 pages of 1000 times "foo bar baz qux" | |
*/ | |
$page = str_repeat("foo bar baz qux\n", 1000); | |
$document = array_fill(0, 100, $page); | |
/** | |
* First, we'll use the map/reduce way: map counting words over the pages, then reduce to a single count | |
*/ | |
$combinedEffort = profile(function() use ($document) { | |
$counts = array_map( | |
@countWords, | |
$document | |
); | |
return array_reduce($counts, @combineCounts, 0); | |
}); | |
var_dump($combinedEffort); | |
/** | |
* Now, we'll first reduce the document to a single string, and then count the words. | |
*/ | |
$singleEffort = profile(function() use ($document) { | |
$fullText = array_reduce($document, @combineStrings, ''); | |
return countWords($fullText); | |
}); | |
var_dump($singleEffort); | |
/** | |
* the count of a number of words is a monoid: | |
* | |
* closure: number_of_words + number_of_words => number_of_words | |
* associativity: (number_of_words_1 + number_of_words_2) + number_of_words3 = number_of_words_1 + (number_of_words_2 + number_of_words_3) | |
* identity: number_of_words + 0 = number_of_words | |
* | |
* see also: http://fsharpforfunandprofit.com/posts/monoids-without-tears/ | |
*/ |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?php | |
function profile($function) | |
{ | |
echo "start chrono\n"; | |
$starttime = microtime(true); | |
$return = $function(); | |
$stoptime = microtime(true); | |
$seconds = $stoptime - $starttime; | |
echo "stop chrono: " . $seconds . " seconds\n"; | |
return $return; | |
} |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?php | |
function splitWords($string) | |
{ | |
return preg_split('/[^\w]+/', $string, -1, PREG_SPLIT_NO_EMPTY); | |
} | |
function countWords($string) | |
{ | |
return count(splitWords($string)); | |
} | |
function combineStrings($string1, $string2) | |
{ | |
return $string1 . $string2; | |
} | |
function combineCounts($count1, $count2) | |
{ | |
return $count1 + $count2; | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
output is as following on my machine:
meaning that the map/reduce way (using monoids) is way more efficient than the second example. Also, when we make
$document
contain1000
pages instead of100
, the second example just crashes the program:start chrono stop chrono: 1.9045460224152 seconds int(4000000) start chrono PHP Fatal error: Allowed memory size of 134217728 bytes exhausted (tried to allocate 71 bytes) in /private/tmp/mapreduce-wordcount/wordcount.php on line 5