Skip to content

Instantly share code, notes, and snippets.

@mitry
Created May 24, 2012 01:07
Show Gist options
  • Save mitry/2778754 to your computer and use it in GitHub Desktop.
Save mitry/2778754 to your computer and use it in GitHub Desktop.
Radikal.ru parser
<?php
/**
 * Radikal.ru parser.
 *
 * Scans the current directory for saved pages matching "???.htm", finds every
 * table that follows the element with id "form4action", and downloads the
 * image linked from each of those tables.
 *
 * When a table carries a non-empty description, the image is stored in a
 * sub-directory named after that description; otherwise it is stored in the
 * current directory. The local file name is the basename of the image URL
 * (the display name shown in div.imgname on the page is not used).
 *
 * Existing files are never overwritten: targets are opened in exclusive
 * create mode and an image whose target already exists is skipped.
 */

// Saved pages are rarely valid HTML; collect parser diagnostics internally
// instead of printing a warning for every malformed tag.
libxml_use_internal_errors(true);

$ch = curl_init();
if ($ch === false) {
    die("curl_init error");
}

// glob() may return false instead of an empty list on some systems.
$files = glob("???.htm");
if ($files === false) {
    $files = array();
}

foreach ($files as $file) {
    $dom = new DOMDocument();
    $dom->recover = true;
    $loaded = $dom->loadHTMLFile($file);
    libxml_clear_errors(); // do not let buffered diagnostics pile up across files
    if (!$loaded) {
        die("loadHTML error for '$file'");
    }

    $xp = new DOMXPath($dom);
    foreach ($xp->query('id("form4action")/following-sibling::table') as $table) {
        // Image URL
        $hrefs = $xp->query('.//td[@class="imgcont"]/a[@target="_blank"]/@href', $table);
        $href = $hrefs ? $hrefs->item(0) : null;
        $url = $href ? $href->nodeValue : '';
        $fname = basename($url);
        if ($url === '' || $fname === '') {
            // Not every sibling table necessarily holds an image link.
            trigger_error("No image link found in a table of '$file'");
            continue;
        }

        // Image description (optional)
        $descNodes = $xp->query('.//div[@class="imginfo4"][2]//b', $table);
        $descNode = $descNodes ? $descNodes->item(0) : null;
        $desc = $descNode ? trim($descNode->nodeValue) : '';

        // The description is scraped text that becomes a directory name:
        // neutralise path separators and NUL bytes, and refuse "." / ".."
        // so a page cannot steer the download outside the working directory.
        $desc = strtr($desc, "/\\\0", '___');
        if ($desc === '.' || $desc === '..') {
            $desc = '';
        }

        // Compare against '' explicitly: a description of "0" is a valid name.
        if ($desc !== '') {
            $dir = "./$desc";
            // The second is_dir() covers a directory created concurrently.
            if (!is_dir($dir) && !mkdir($dir, 0755) && !is_dir($dir)) {
                trigger_error("Can't create directory '$dir'");
                continue;
            }
            $fname = "$dir/$fname";
        }

        // 'x' = create exclusively; fopen() fails (and emits its own warning)
        // when the file already exists, in which case the image is skipped.
        $fh = fopen($fname, 'xb');
        if ($fh === false) {
            continue;
        }

        curl_setopt($ch, CURLOPT_FILE, $fh);
        curl_setopt($ch, CURLOPT_URL, $url);
        $ok = curl_exec($ch);

        // Close before deleting: fclose() flushes pending data, and removing a
        // file that is still open fails on Windows.
        fclose($fh);

        if (!$ok) {
            trigger_error("Can't save $url as '$fname'");
            unlink($fname); // drop the partial download
        }
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment