Skip to content

Instantly share code, notes, and snippets.

@ckhung
Last active April 9, 2021 04:01
Show Gist options
  • Save ckhung/7424304627692444febe to your computer and use it in GitHub Desktop.
Save ckhung/7424304627692444febe to your computer and use it in GitHub Desktop.
extract a piece of a web page using css selector
#!/usr/bin/php
<?php
require_once 'QueryPath/qp.php';
// 若是使用舊版 (2.1.2) 的 QueryPath, 應該這樣寫:
// require_once 'QueryPath/QueryPath.php';
// 詳見 「網頁搜括小工具: 用 extract.php 擷取網頁當中的一小塊」
// http://newtoypia.blogspot.tw/2016/03/web-scraping.html
// extract.php -s 'div.hentry' < ckhung.html > ckhung2.html
// extract.php -p blogger < ckhung.html > ckhung3.html
// For an incomplete input (no html header) with utf8 contents,
// use -u to wrap the input html so that utf8 characters
// can be correctly processed.
// use -t to remove all html tags and retain only text
// use -c div,table to clothe div (outer) and table (inner) around the extracted piece
// use -w to further wrap html header and body around the wrapped piece
// so that it becomes a complete html.
// Predefined names accepted by -p, each mapped to the CSS selector that is
// searched for when that name is given.
$n2ss = array(
    'eff' => '#main',
    'yro' => '.body',
    'blogger' => 'div.hentry',
);

// Command-line flags: -p, -s and -c take a value; -t, -u and -w are switches.
$options = getopt("p:s:tc:uw");
if ($options === false) {
    // getopt() reports failure with false; every later array_key_exists()
    // on $options needs a real array, so stop here with a clear message.
    fwrite(STDERR, "cannot parse command line options\n");
    exit(1);
}

// Options passed to htmlqp() when the input is parsed. Going by the key
// names they keep the text as UTF-8 and leave low-ASCII characters alone;
// background: https://github.com/technosophos/querypath/issues/94
$qp_options = array(
    'convert_from_encoding' => 'UTF-8',
    'convert_to_encoding' => 'UTF-8',
    'strip_low_ascii' => FALSE,
);
// Work out the CSS selector to search for and store it in $selector.
//   -p <predefined>  looks the selector up in $n2ss
//   -s <selector>    supplies the selector directly
// The two are mutually exclusive; with neither, $selector is the empty
// string. Diagnostics are written to stderr and the script exits with a
// non-zero status, so an error message never lands in the (typically
// redirected) output and callers can detect the failure.
$has_predefined = array_key_exists("p", $options);
$has_selector = array_key_exists("s", $options);

if ($has_predefined && $has_selector) {
    fwrite(STDERR, "please use either -p <predefined> or -s <selector> (but not both)\n");
    exit(1);
}

$selector = "";
if ($has_predefined) {
    $predefined = $options["p"];
    // getopt() returns an array when an option is repeated; an array cannot
    // be used as a lookup key, so reject it explicitly.
    if (is_array($predefined)) {
        fwrite(STDERR, "please give -p <predefined> only once\n");
        exit(1);
    }
    if (! array_key_exists($predefined, $n2ss)) {
        fwrite(STDERR, "'$predefined' is not a predefined name\n");
        exit(1);
    }
    $selector = $n2ss[$predefined];
} elseif ($has_selector) {
    // Same reasoning as for -p: only a single selector string is supported.
    if (is_array($options["s"])) {
        fwrite(STDERR, "please give -s <selector> only once\n");
        exit(1);
    }
    $selector = $options["s"];
}
// Minimal HTML scaffolding declaring a UTF-8 charset. It is put around the
// input when -u is given and around the output when -w is given.
// Lines are joined with "\n" and carry no trailing newline, so the wrapped
// content starts directly after <body>.
$header = implode("\n", array(
    "<html>",
    "<head>",
    "<meta http-equiv='Content-Type' content='text/html; charset=UTF-8' />",
    "</head>",
    "<body>",
));
$footer = implode("\n", array(
    "</body>",
    "</html>",
));
// Read the whole document from standard input.
$in = file_get_contents('php://stdin');
if ($in === false) {
    // file_get_contents() signals failure with false; do not hand that on
    // to the parser as if it were a document.
    fwrite(STDERR, "cannot read input from stdin\n");
    exit(1);
}

// -u: surround the input with the html/head/body scaffolding (which declares
// UTF-8) before it is parsed.
if (array_key_exists("u", $options)) {
    $in = $header . $in . $footer;
}
// Parse the input with QueryPath's htmlqp() and build $out.
// With a non-empty $selector every match contributes one piece followed by
// a newline; without a selector the whole parsed document is rendered.
// -t picks the text() rendering, otherwise html() is used.
$qp = htmlqp($in, NULL, $qp_options);
$render = array_key_exists("t", $options) ? "text" : "html";

if (strlen($selector) > 0) {
    $pieces = array();
    foreach ($qp->find($selector) as $found) {
        $pieces[] = $found->$render() . "\n";
    }
    $out = implode("", $pieces);
} else {
    $out = $qp->$render();
}
// -c tag1,tag2,...: clothe the extracted piece in the listed elements, the
// first one outermost and the last one innermost.
if (array_key_exists("c", $options)) {
    // getopt() returns an array when -c is repeated; treat
    // "-c div -c table" the same as "-c div,table".
    $spec = is_array($options["c"]) ? implode(",", $options["c"]) : $options["c"];

    // Wrap from the innermost layer outwards, hence the reversal.
    foreach (array_reverse(explode(",", $spec)) as $layer) {
        // Tolerate "div, table" and stray commas instead of emitting
        // "< table>" or "<>".
        $layer = trim($layer);
        if ($layer === "") {
            continue;
        }
        // A layer may carry attributes (e.g. div class="x"); the closing tag
        // must contain the element name only.
        $tag = substr($layer, 0, strcspn($layer, " \t\r\n"));
        $out = "<$layer>\n$out\n</$tag>";
    }
}

// -w: add the html/head/body scaffolding so the result is a complete page.
if (array_key_exists("w", $options)) {
    $out = "$header$out$footer";
}

echo $out;
?>
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment