Skip to content

Instantly share code, notes, and snippets.

@fich
Last active June 3, 2018 08:41
Show Gist options
  • Save fich/570d1e423bb0f141535b0fb76eb54906 to your computer and use it in GitHub Desktop.
Save fich/570d1e423bb0f141535b0fb76eb54906 to your computer and use it in GitHub Desktop.
Scrape and Parse Goodreads Quotes
<?php
/**
 * Fetch one page of Goodreads quotes and print the parsed entries.
 *
 * For every quote block found on the page a row is produced with the keys:
 *   - quoteText:  text before the first "―" separator, trimmed
 *   - authorUrl:  href of the first "authorOrTitle" element ('' when absent)
 *   - authorName: text of that element ('' when absent)
 *   - tags:       comma-separated text of the links in the quote footer
 *
 * The rows are printed with print_r(). On a transport failure or a non-200
 * response nothing is printed and the reason is sent to error_log().
 */
$url = "https://www.goodreads.com/quotes?page=1";

$ch = curl_init();
curl_setopt($ch, CURLOPT_URL, $url);
curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
// Keep response headers out of the returned string: the body is fed straight
// into the HTML parser, and with redirects enabled every hop would prepend
// its own header block.
curl_setopt($ch, CURLOPT_HEADER, false);
curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true);
curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, 120);
// Verify the server certificate and host name; disabling these lets anyone
// on the network path tamper with the scraped page.
curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, true);
curl_setopt($ch, CURLOPT_SSL_VERIFYHOST, 2);
curl_setopt($ch, CURLOPT_USERAGENT, 'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.13) Gecko/20080311 Firefox/2.0.0.13');
$output = curl_exec($ch);
$info = curl_getinfo($ch);
// Capture the error text before the handle is released.
$curlError = curl_error($ch);
curl_close($ch);

if ($output === false || $output === '') {
    error_log("Request to {$url} failed: " . ($curlError !== '' ? $curlError : 'empty response'));
} elseif ((int) $info['http_code'] !== 200) {
    error_log("Request to {$url} returned HTTP status " . (int) $info['http_code']);
} else {
    $page = new DOMDocument;
    // Real-world HTML is rarely well-formed; collect parser warnings
    // internally instead of emitting them, then drop them.
    libxml_use_internal_errors(true);
    $page->loadHTML($output);
    libxml_clear_errors();

    $page_xpath = new DOMXPath($page);
    $entries = $page_xpath->query("//div[contains(@class, 'quotes')]/div[contains(@class, 'quote')]/div[contains(@class, 'quoteDetails')]");

    $result = [];
    foreach ($entries as $entry) {
        // item(0) yields null when nothing matched, which is checked below.
        $quoteText = $page_xpath->query("div[contains(@class, 'quoteText')]", $entry)->item(0);
        if ($quoteText === null) {
            // Without the text node there is nothing to extract for this entry.
            continue;
        }
        $author = $page_xpath->query("*[contains(@class, 'authorOrTitle')]", $quoteText)->item(0);

        // The node text has the form "<quote> ― <author> ..."; keep the part
        // before the first separator as the quote itself.
        $parse = explode("―", $quoteText->nodeValue);

        $quoteFooter = $page_xpath->query("div[contains(@class, 'quoteFooter')]/div[contains(@class, 'left')]/a", $entry);
        // Anonymous function instead of create_function(), which was removed
        // in PHP 8.0.
        $tags = implode(',', array_map(
            static function ($link) {
                return $link->nodeValue;
            },
            iterator_to_array($quoteFooter)
        ));

        $row = [];
        $row['quoteText'] = trim($parse[0]);
        // A quote without an author element gets empty author fields rather
        // than aborting the whole run.
        $row['authorUrl'] = $author !== null ? trim($author->getAttribute('href')) : '';
        $row['authorName'] = $author !== null ? trim($author->nodeValue) : '';
        $row['tags'] = $tags;
        $result[] = $row;
    }

    print_r($result);
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment