A tutorial on Guzzle Promises in PHP
<?php
/**
 * Promise-based programming tutorial.
 *
 * In this tutorial, we use promises to tackle several concurrent problems and make them run faster.
 * Promises are now available in most languages (C#, PHP, JavaScript, etc.).
 *
 * Instead of performing a long-running, I/O-blocking task immediately
 * (e.g., reading a file, fetching a URL), a promise-based API promises to
 * do the task in the future and immediately returns a placeholder for the result.
 *
 * Using this *promise*, we can do the things we wanted to do with the result, e.g.,
 * send it to another task or to the user.
 *
 * Think of it like a cheque instead of actual money. You can pay for other goods using this
 * cheque, you can split it in half, aggregate multiple cheques, and do whatever you want.
 * In the end, to actually get cash after all of your operations, you need to wait() on the
 * final result. If you didn't use cheques, you would have to wait on each
 * operation independently, for a much longer total wait, rather than waiting once
 * while all of the cheques cash concurrently.
 *
 * A promise object is returned by a promise-based I/O library (e.g., GuzzleHttp in PHP).
 * Its result can be chained to another action using the ->then() method on the promise object.
 * This means that the ->then() callback will be executed once the promise is fulfilled.
 * ->then() itself returns another promise, which can be used in another ->then().
 * Again, think of a cheque that you receive for $100, and another for $50, and
 * once both are cashed ->then() you send a cheque for $150 to someone else.
 * Then you ->wait() on the final promise to settle.
 *
 * To handle exceptions (i.e., a cheque failed to cash, even though it was promised),
 * a promise object also has an ->otherwise() method, which again returns a new
 * promise with the exception handled.
 *
 * If the description above did not make promises clear, don't worry: the 7 examples
 * included in this code show exactly how they work. (A minimal sketch of the basic
 * chain also follows the dispatcher below.)
 *
 * PHP currently supports promises for HTTP requests via the Guzzle library,
 * and for other operations via the ReactPHP library.
 * Node.js supports promises for almost all I/O, as does C#.
 */
require 'vendor/autoload.php'; # `composer require guzzlehttp/guzzle` before running.
# Usage: php me.php [1-7]
if (isset($argv[1]) and function_exists("example_{$argv[1]}"))
	call_user_func("example_{$argv[1]}");
else
	for ($i = 0; $i < 10; ++$i)
		function_exists("example_{$i}") and call_user_func("example_{$i}");
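/**
 * A minimal sketch of the basic then()/otherwise()/wait() chain described in
 * the header comment. Not one of the numbered examples (so the dispatcher
 * above never calls it); the function name and URL are illustrative only.
 * Call it manually if you want to try it.
 */
function promise_sketch()
{
	$client = new GuzzleHttp\Client();
	$promise = $client->requestAsync('GET', 'https://www.example.com')
		->then(function ($res) {
			// Runs on fulfillment; the return value becomes the
			// value of the next promise in the chain.
			return $res->getStatusCode();
		})
		->otherwise(function ($err) {
			// Runs on rejection (e.g., DNS failure); returning a value
			// here converts the rejection into a fulfillment.
			return -1;
		});
	echo $promise->wait(), PHP_EOL; // Block until the chain settles.
}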
/**
 * List of sites to fetch, used in all examples.
 * @return string[]
 */
function sites()
{
	return [
		0 => 'www.google.com',
		1 => 'www.microsoft.com',
		2 => 'www.yahoo.com',
		3 => 'www.facebook.com',
		4 => 'www.apple.com',
		5 => 'www.amazon.com',
		6 => 'en.wikipedia.org',
		7 => 'twitter.com',
		8 => 'non.existing.website', // Should cause an error
		9 => 'another.non.existing.website',
	];
}
/**
 * Fetch sites sequentially, using GuzzleHttp, and without promises.
 * Provided as the baseline of what we want to do.
 *
 * Takes 12.78s on my box.
 */
function example_1()
{
	$client = new GuzzleHttp\Client();
	# Example 1: fetching sites() sequentially
	echo "Example 1: fetching websites sequentially\n";
	timer();
	foreach (sites() as $site)
	{
		try {
			$res = $client->request('GET', $site);
			echo ".";
		}
		catch (Exception $e) {
			echo "x";
		}
	}
	echo " Took ", timer(), " seconds to fetch ", count(sites()), " websites.", PHP_EOL;
}
/**
 * Fetches websites sequentially, like example 1.
 * However, it uses promises to show how they work.
 *
 * Specifically, only the success and error handling are replaced with
 * the promise's then() and otherwise() callbacks.
 *
 * Takes 12.72s on my box.
 */
function example_2()
{
	$client = new GuzzleHttp\Client();
	echo "Example 2: fetching websites sequentially using promises\n";
	timer();
	foreach (sites() as $site)
	{
		$promise = $client->requestAsync('GET', $site);
		$res = $promise->then(function ($res) {
			// then() runs on fulfillment of the promise
			echo ".";
		})->otherwise(function ($err) {
			// otherwise() runs on rejection of the promise, i.e., error
			echo "x";
		})->wait(); // wait() blocks until the promise is settled.
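		// Note: wait() unwraps the promise: it returns the fulfilled value,
		// or throws the rejection reason. Here the ->otherwise() handler has
		// already converted any rejection into a fulfillment, so wait()
		// never throws in this example.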
	}
	echo " Took ", timer(), " seconds to fetch ", count(sites()), " websites.", PHP_EOL;
}
/**
 * Similar to example 2: uses promises to fetch the sites.
 *
 * However, it aggregates all the promises into one promise instead of
 * waiting on each of them to finish.
 *
 * It then waits on the aggregate promise, resulting in concurrent execution
 * of all promises.
 *
 * Takes 2.37s on my box.
 */
function example_3()
{
	$client = new GuzzleHttp\Client();
	echo "Example 3: fetching websites concurrently using promises\n";
	timer();
	$promises = [];
	foreach (sites() as $site)
	{
		$promise = $client->requestAsync('GET', $site);
		$promise = $promise->then(function ($res) {
			// then() returns a new (modified) promise
			echo ".";
		});
		$promise = $promise->otherwise(function ($err) {
			// otherwise() returns a new (modified) promise
			echo "x";
		});
		$promises[] = $promise;
		// We do not wait for this promise here.
		// We want to wait on all promises together.
	}
	// Aggregate the promises into one promise, which resolves to an array of all results
	$res = GuzzleHttp\Promise\all($promises)->wait();
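	// Note: all() rejects as soon as any input promise rejects. Here every
	// input already has an ->otherwise() handler, which converts rejections
	// into fulfillments, so the aggregate promise always fulfills. Without
	// those handlers, GuzzleHttp\Promise\settle() would be the safer
	// aggregator: it waits for every promise and never rejects.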
echo " Took ", timer(), " seconds to fetch ", count(sites()), " websites.", PHP_EOL; | |
} | |
/**
 * Similar to example 3; however, odd sites (2x+1) are dependent on even sites (2x),
 * i.e., site[0] needs to be fetched first and then site[1], but site[0] and site[2]
 * can be fetched in parallel.
 *
 * Uses count(sites())/2 promises, each of which has a then() clause that fetches another website.
 * Each promise's then() clause also prints its corresponding site index, to show the order
 * of execution.
 *
 * Takes 3.9s on my box (roughly 2x example 3, which is expected).
 *
 * Note that if the dependency chain were longer than 2, the code would become ugly.
 * For example, if we needed to do X, which depended on Y, which depended on Z, which
 * depended on W (W->Z->Y->X), we could either do them sequentially,
 * or use 4 chained ->then() clauses (see the sketch after this function). The latter
 * would be very ugly and unreadable code. The former would mean that two such chains
 * could not run concurrently (this is solved with coroutines in example 7).
 */
function example_4()
{
	$client = new GuzzleHttp\Client();
	echo "Example 4: fetching websites using promises, but n+1 is dependent on n\n";
	timer();
	$promises = [];
	for ($i = 0; $i < count(sites()); $i += 2)
	{
		$next = sites()[$i+1];
		$promise = $client->requestAsync('GET', sites()[$i]);
		$promise = $promise->then(function ($res) use ($client, $next, $i) {
			echo $i;
			return $client->requestAsync('GET', $next)->then(function ($res) use ($i) {
				echo $i+1;
				return $res;
			});
		})
		->otherwise(function ($err) {
			// This otherwise() handles errors for both site[$i] and site[$i+1]
			echo "x";
		});
		$promises[] = $promise;
	}
	$res = GuzzleHttp\Promise\all($promises)->wait();
	echo " Took ", timer(), " seconds to fetch ", count(sites()), " websites.", PHP_EOL;
}
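/*
 * For illustration, the W->Z->Y->X chain mentioned above would look roughly
 * like this with chained ->then() calls (hypothetical URLs; not run anywhere
 * in this file):
 *
 *   $client->requestAsync('GET', $w)
 *       ->then(function ($res) use ($client, $z) { return $client->requestAsync('GET', $z); })
 *       ->then(function ($res) use ($client, $y) { return $client->requestAsync('GET', $y); })
 *       ->then(function ($res) use ($client, $x) { return $client->requestAsync('GET', $x); })
 *       ->wait();
 *
 * Each ->then() that returns a promise delays the next ->then() until that
 * promise resolves. This is tolerable for one chain, but it gets unwieldy
 * when several such chains must also run concurrently (see example 7).
 */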
/**
 * A simple crawler, which fetches each site, then fetches all links in that site
 * (and repeats, if needed).
 *
 * In each round, it fetches all links, then processes all the results to extract
 * new links, and feeds these links to the next round.
 *
 * Causes errors, because after round 0 there are too many links to open in parallel,
 * resulting in file-descriptor exhaustion and many failed fetches.
 *
 * This is solved in example 6, which is more complicated. This example serves as a reference.
 *
 * Takes 36.80s on my box, with 246 fetched URLs (0.15s per URL).
 */
function example_5()
{
	$client = new GuzzleHttp\Client();
	echo "Example 5: crawling websites using promises, going deep twice\n";
	timer();
	// Add https:// to all sites
	$new_links = array_map(function($_) {return "https://{$_}";}, sites());
	$depth = 2; // How deep to go in crawling
	$success = 0;
	for ($round = 0; $round < $depth; ++$round)
	{
		echo "Round {$round}: ";
		// Create requests
		$promises = [];
		foreach ($new_links as $link)
		{
			$promises[] = $client->requestAsync('GET', $link)
				->then(function ($res) use ($link, &$success) {
					echo ".";
					$success++; // Count successes, because many fetches fail in this example
					return [$link, $res];
				})
				->otherwise(function ($err) use ($link) {
					echo "x";
				});
		}
		// Run requests
		$results = [];
		$concurrency = 7;
		GuzzleHttp\Promise\each_limit($promises, $concurrency, function ($res, $id) use (&$results) {
			$results[$id] = $res;
		})->wait();
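		// Note: each_limit() only limits concurrency when the promises are
		// created lazily (e.g., by a generator, as in example 6). Here every
		// requestAsync() call above has already queued its request on
		// Guzzle's curl multi handle, so the limit has little effect and
		// too many sockets are opened at once.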
		// Extract links from results
		foreach ($results as $tuple)
		{
			list($origin, $res) = $tuple;
			if (!$res) continue; // Can be null for failed fetches
			// Extract the base of the URL from the full URL
			preg_match("|^(https?://.*?)/|i", $origin."/", $matches);
			$base = $matches[1];
			// Extract all <a href=...> from the page
			$body = $res->getBody();
			preg_match_all('/<a\s+href\s*=\s*[\'"](.*?)[\'"].*?>/i', $body, $matches);
			foreach ($matches[1] as $link)
			{
				if (strpos($link, "javascript") !== false) continue; // Skip javascript links
				$link = html_entity_decode($link);
				if (!preg_match("#^https?://#", $link)) // If relative URL, prepend base
					$link = $base.$link;
				$new_links[] = $link;
			}
		}
		$new_links = array_unique($new_links); // Remove duplicates
		echo count($new_links), " new links discovered.\n";
	}
	echo "{$success} links crawled.\n";
	echo " Took ", timer(), " seconds to crawl ", count(sites()), " websites.", PHP_EOL;
}
/**
 * Similar in behavior to example 5; however, it uses a generator and a queue
 * to dynamically add URLs and process them.
 *
 * Because we are not creating too many requests in parallel, there are no errors.
 * Requests are created as they are needed for execution, using PHP's generator
 * feature (yield).
 *
 * Also, once the result of one fetch is available, it is processed immediately
 * and the resulting URLs are added back to the queue.
 *
 * Takes 58.61s on my box, fetching 366 URLs (0.16s per URL).
 */
function example_6()
{
	$client = new GuzzleHttp\Client();
	echo "Example 6: crawling websites using promises, going deep twice, processing immediately\n";
	timer();
	// Put the list of sites as entry points of the queue, with depth 0
	$queue = array_map(function($_) {return [0, "https://{$_}"];}, sites());
	$max_depth = 2; // How deep to go in crawling
	$success = 0; // How many links crawled (counted on dequeue, so failed fetches are included)
	$link_generator = function() use (&$success, &$queue, $client, $max_depth) {
		while (!empty($queue))
		{
			list($depth, $link) = array_shift($queue);
			$success++;
			$promise = $client->requestAsync('GET', $link)
				->then(function ($res) use (&$queue, $link, $depth, $max_depth) {
					echo $depth ? ":" : ".";
					if ($depth+1 >= $max_depth) return; // If already at max depth, don't process results
					preg_match("|^(https?://.*?)/|i", $link."/", $matches);
					$base = $matches[1];
					preg_match_all('/<a\s+href\s*=\s*[\'"](.*?)[\'"].*?>/i', $res->getBody(), $matches);
					foreach ($matches[1] as $newlink)
					{
						if (strpos($newlink, "javascript") !== false) continue; // Skip javascript links
						$newlink = html_entity_decode($newlink);
						if (!preg_match("#^https?://#", $newlink)) // If relative URL, prepend base
							$newlink = $base.$newlink;
						array_push($queue, [$depth+1, $newlink]);
					}
				})
				->otherwise(function ($err) use ($link) {
					echo "x";
				});
			yield $promise; // Hand this request to each_limit() for execution
		}
	};
	$concurrency = 7;
	GuzzleHttp\Promise\each_limit($link_generator(), $concurrency)->wait();
	echo $success, " links crawled.\n";
	echo " Took ", timer(), " seconds to crawl ", count(sites()), " websites.", PHP_EOL;
}
/**
 * Example 7 is similar to example 4, where 1 was dependent on 0, and 3 on 2.
 *
 * This time, 1 is dependent on 0, 2 on 1, 3 on 2, and 4 on 3 (0->1->2->3->4).
 * Similarly, 6 is dependent on 5, 7 on 6, 8 on 7, and 9 on 8 (5->6->7->8->9).
 *
 * Instead of nesting 5 then()->then() calls, resulting in ugly code,
 * we will use coroutines. A coroutine is internally serial (not concurrent),
 * but multiple coroutines can run concurrently with each other!
 *
 * This is based on the magic of generators and yield. Each coroutine has one
 * active request at a time, and multiple coroutines together can have a set
 * of active, concurrent requests.
 *
 * This is similar to the async keyword in other languages such as JavaScript
 * (Node.js) and C#, where multiple "async FUNC()" calls can run together at
 * multiple points of a program.
 *
 * Takes 7.82s on my box, a little less than 2x example 4.
 */
function example_7()
{
	$client = new GuzzleHttp\Client();
	echo "Example 7: fetching websites using promises, 0->1->2->3->4 and 5->6->7->8->9, via coroutine/async\n";
	timer();
	// Add https:// to all sites
	$new_links = array_map(function($_) {return "https://{$_}";}, sites());
	$promises = [];
	for ($i = 0; $i < count(sites()); $i += 5)
	{
		$chunk = array_slice($new_links, $i, 5);
		// Create a coroutine for this $chunk of the sites
		$promises[] = GuzzleHttp\Promise\coroutine(function() use ($chunk, $client) {
			foreach ($chunk as $link)
			{
				// Issue each request sequentially, using yield
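				// Each yield hands a promise to the coroutine driver, which
				// suspends this generator until that promise settles and then
				// resumes it with the result. The coroutine's own promise
				// fulfills when the generator finishes.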
				yield $client->requestAsync('GET', $link)
					->then(function ($res) use ($link) {
						echo ".";
						return [$link, $res];
					})
					->otherwise(function ($err) use ($link) {
						echo "x";
					});
			}
		});
	}
	$res = GuzzleHttp\Promise\all($promises)->wait(); // Run all coroutines concurrently
	echo " Took ", timer(), " seconds to fetch ", count(sites()), " websites.", PHP_EOL;
}
/**
 * Reset the timer and return the time passed since the last call.
 * @return string
 */
function timer()
{
	static $timer = null;
	$res = "Timer Started";
	if ($timer !== null)
		$res = sprintf("%.2f", microtime(true) - $timer);
	$timer = microtime(true);
	return $res;
}