Created
September 2, 2015 23:10
-
-
Save beporter/ff5998a40191b2608034 to your computer and use it in GitHub Desktop.
A quick command line PHP script to scrape "week by week" pregnancy updates from thebump.com's API into local .html files.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?php | |
/** | |
* A command line script to grab the "week-by-week" updates from | |
* TheBump.com's API and save them to local .html files. | |
* | |
* Tries to be respectful and cache all results to avoid unnecessary HTTP
* calls. Uses the API key that thebump.com's own website uses. No malice | |
* is intended by this. | |
* | |
* Usage: | |
* | |
* $ mkdir weeks | |
* $ cd weeks/ | |
* # (Save the script in this folder as `weekbyweek.php`) | |
* $ php weekbyweek.php | |
* # (Files like `week_4.json` and `week_4.html` will be created in this folder.) | |
* # (This process only needs to be run once to generate the .html output files.) | |
* | |
* Then from another page, to embed a given week's html block: | |
* | |
* <?php include 'weeks/week_4.html'; ?> | |
* | |
* @author [email protected] | |
* @version 1.0.0 | |
*/ | |
/**
 * The URL pattern to make API requests against.
 *
 * The `%s` token is replaced by the current week number in each request.
 *
 * @var string
 */
$urlPattern = 'http://services.thebump.com/core/v1//feeds/pregnant/static?apikey=mtycrccp3w2tefa7gud68jfvzvgt9gw7sv8&week=%s';

/**
 * main()
 *
 * Loops through pregnancy weeks 4 to 42 and, for each week, either
 * queries thebump.com's API (saving the JSON result) or re-uses the
 * already-saved result, then formats the data into an HTML block.
 *
 * The `week_N.json` and `week_N.html` files are created relative to the
 * current working directory, so run the script from the folder that
 * should hold them.
 *
 * A week whose API response could not be fetched, or is not valid JSON,
 * is reported and skipped WITHOUT writing any cache file, so a later run
 * retries it instead of re-using a broken cache entry.
 */
for ($i = 4; $i <= 42; $i++) {
    echo PHP_EOL . '==============================';
    echo PHP_EOL . "Processing week {$i}...";

    $jsonFile = "week_{$i}.json";
    $htmlFile = "week_{$i}.html";

    // Cache the JSON api responses so we don't annoy the API server.
    if (is_readable($jsonFile)) {
        echo PHP_EOL . "Using existing JSON file `{$jsonFile}`.";
        $json = file_get_contents($jsonFile);
    } else {
        $url = sprintf($urlPattern, $i);
        echo PHP_EOL . "Fetching URL `{$url}`.";
        $json = fetchHtml($url);

        // fetchHtml() yields `false` when the transfer fails. Never cache a
        // failed or non-JSON response: it would be mistaken for valid data
        // on every subsequent run.
        if (!is_string($json) || json_decode($json) === null) {
            echo PHP_EOL . "Fetch failed or returned invalid JSON for week {$i}; skipping." . PHP_EOL;
            continue;
        }

        echo PHP_EOL . "Saving JSON to file `{$jsonFile}`.";
        if (file_put_contents($jsonFile, $json) === false) {
            echo PHP_EOL . "WARNING: could not write `{$jsonFile}`.";
        }
    }

    // Cache the final HTML blocks as files for easy inclusion elsewhere.
    if (is_readable($htmlFile)) {
        echo PHP_EOL . "Using existing HTML file `{$htmlFile}`.";
        $html = file_get_contents($htmlFile);
    } else {
        // A cached JSON file may be unreadable or corrupt (e.g. left over
        // from an earlier failed run); don't turn it into an empty HTML block.
        if (!is_string($json) || json_decode($json) === null) {
            echo PHP_EOL . "JSON in `{$jsonFile}` is invalid; delete the file and re-run. Skipping week {$i}." . PHP_EOL;
            continue;
        }

        echo PHP_EOL . "Converting JSON and saving HTML to file `{$htmlFile}`.";
        $html = formatBlock(extractBlock($json), $i);
        if (file_put_contents($htmlFile, $html) === false) {
            echo PHP_EOL . "WARNING: could not write `{$htmlFile}`.";
        }
    }

    echo PHP_EOL . "Final HTML for week {$i}:" . PHP_EOL . $html . PHP_EOL . PHP_EOL;
}

echo PHP_EOL . 'Processing complete!' . PHP_EOL . PHP_EOL;
/**
 * Fetch the contents of the provided URL and return them as a string.
 *
 * Your copy of PHP must have the cURL extension installed in order for
 * this to work.
 *
 * The request follows up to 5 redirects, gives up after 5 seconds without
 * a connection or 30 seconds overall, and treats HTTP status codes >= 400
 * as failures so an error page is never returned as if it were content.
 * Failures are reported through error_log().
 *
 * @param string $url The remote URL to fetch.
 * @return string|false The body returned from $url, or `false` when the
 *                      transfer failed. Note that the body may NOT be
 *                      HTML if the request was to a JSON api service.
 */
function fetchHtml($url) {
    $c = curl_init();
    if ($c === false) {
        error_log('fetchHtml: unable to initialise cURL.');
        return false;
    }

    curl_setopt($c, CURLOPT_URL, $url);
    curl_setopt($c, CURLOPT_RETURNTRANSFER, 1);
    curl_setopt($c, CURLOPT_CONNECTTIMEOUT, 5);
    // Bound the whole transfer too; CONNECTTIMEOUT alone does not protect
    // against a server that accepts the connection and then stalls.
    curl_setopt($c, CURLOPT_TIMEOUT, 30);
    // Make curl_exec() return false for HTTP 4xx/5xx responses.
    curl_setopt($c, CURLOPT_FAILONERROR, 1);
    curl_setopt($c, CURLOPT_FOLLOWLOCATION, 1);
    curl_setopt($c, CURLOPT_MAXREDIRS, 5);

    $data = curl_exec($c);
    if ($data === false) {
        error_log(sprintf('fetchHtml: request for %s failed: %s', $url, curl_error($c)));
    }

    // curl_close() has no effect from PHP 8.0 on; only call it where it
    // still releases the handle.
    if (PHP_VERSION_ID < 80000) {
        curl_close($c);
    }

    return $data;
}
/**
 * Processes the returned json string and extracts the bits relevant to us.
 *
 * This function is specific to the data returned from thebump.com's api
 * service: it reads the first entry of the payload's `cards` list.
 *
 * Invalid JSON, a missing/empty `cards` list, or a missing field never
 * raises a warning; the affected values are simply `null`.
 *
 * @param string $json The raw JSON string to decode and process.
 * @return array An array with the keys `title`, `summary`, `body`,
 *               `img_url` and `source_url` (in that order). A value is
 *               `null` when the payload does not provide it.
 */
function extractBlock($json) {
    $payload = json_decode($json);

    // Only accept the first card when the decoded payload really has the
    // expected object -> cards[0] -> object shape.
    $card = null;
    if (is_object($payload) && isset($payload->cards[0]) && is_object($payload->cards[0])) {
        $card = $payload->cards[0];
    }

    // isset() is safe on a null $card and on missing nested properties.
    $data = array(
        'title' => isset($card->title) ? $card->title : null,
        'summary' => isset($card->dek) ? $card->dek : null,
        'body' => isset($card->content->body) ? $card->content->body : null,
        'img_url' => isset($card->media->url) ? $card->media->url : null,
        'source_url' => isset($card->content_url) ? $card->content_url : null,
    );

    return $data;
}
/**
 * Converts a processed entry into a formatted HTML string.
 *
 * Expects as input the result of calling extractBlock(). Because the data
 * comes from an untrusted third party, the image URL, source URL and
 * title are HTML-escaped before being placed into attributes/text, and
 * the week number is forced to an integer.
 *
 * The `summary` and `body` values are inserted verbatim, on the
 * assumption that they are HTML fragments supplied by the API (TODO:
 * confirm for `summary`). Sanitize them separately if you do not trust
 * the source.
 *
 * @param array $data The array returned from extractBlock() for a
 *                    given json payload. Missing or null keys are
 *                    rendered as empty strings.
 * @param int $weekNum The integer week number. Used to hint the
 *                     image's alt tag.
 * @return string A formatted HTML <div> suitable for displaying
 *                on a page.
 */
function formatBlock($data, $weekNum) {
    // Fetch a value as a string, treating a missing/null key as ''.
    $get = function ($key) use ($data) {
        return isset($data[$key]) ? (string) $data[$key] : '';
    };

    $flags = ENT_QUOTES | ENT_SUBSTITUTE;
    $imgUrl = htmlspecialchars($get('img_url'), $flags, 'UTF-8');
    $sourceUrl = htmlspecialchars($get('source_url'), $flags, 'UTF-8');
    // double_encode = false: keep entities the API may already have encoded.
    $title = htmlspecialchars($get('title'), $flags, 'UTF-8', false);
    $summary = $get('summary');
    $body = $get('body');
    $weekNum = (int) $weekNum;

    $format = <<<EOD
<div class="week-by-week">
<img style="float:right;" src="{$imgUrl}" alt="Week {$weekNum}" />
<h2><a href="{$sourceUrl}">{$title}</a></h2>
{$summary}
{$body}
</div>
EOD;

    return $format;
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment