Skip to content

Instantly share code, notes, and snippets.

@sasezaki
Last active December 20, 2015 14:28
Show Gist options
  • Save sasezaki/6146296 to your computer and use it in GitHub Desktop.
Save sasezaki/6146296 to your computer and use it in GitHub Desktop.
proposal
<?php
// Usage sketch: a Scraper instance is created and the HTML formatter is
// configured through the scraper's document resolver.
// NOTE(review): a full `class Scraper` declaration follows this snippet; PHP
// does not allow the same class name to be declared twice in one file, so this
// empty declaration is presumably only a placeholder — confirm and keep one.
class Scraper{} // just a util or alias
$scraper = new Scraper();
// NOTE(review): setHtmlFormatter() is called without an argument here;
// presumably a formatter instance is meant to be passed — confirm.
$scraper->getDocumentResolver()->setHtmlFormatter();
/**
 * Entry point of the scraping proposal: asks the document resolver for the
 * current document and builds that document's DOM.
 *
 * NOTE(review): getDocumentResolver() is not defined in this class body;
 * presumably it is supplied by the trait or still has to be written.
 */
class Scraper
{
    use EventManagerAware;

    /**
     * Resolves the current document and builds its DOM in XHTML format.
     *
     * NOTE(review): $expr is not used yet and nothing is returned; the actual
     * lookup step of the proposal is still missing.
     *
     * @param mixed $expr Expression to look up (currently unused).
     */
    public function find($expr)
    {
        $document = $this->getDocumentResolver()->getDocument();
        //$this->getEventManager()->dispatch();
        // Fixed: this call used to target `$documenet`, an undefined variable,
        // instead of the document resolved on the line above.
        $document->getDom(Document::FORMAT_XHTML);
    }
}
/**
 * Static factory for DocumentResolver instances.
 *
 * Only fromUri() is implemented; the other two factories are placeholders
 * that return null.
 */
class DocumentResolverFactory
{
    /**
     * Creates a resolver and hands the given URI to its setUri().
     *
     * @param mixed $uri Value passed to DocumentResolver::setUri().
     * @return DocumentResolver
     */
    public static function fromUri($uri)
    {
        $resolver = new DocumentResolver();
        $resolver->setUri($uri);

        return $resolver;
    }

    /**
     * Placeholder for building a resolver from already fetched content.
     * Not implemented: all arguments are ignored.
     *
     * @return null
     */
    public static function fromContent($content, $http_respones_header = [], $uri = null)
    {
        return null;
    }

    /**
     * Placeholder for building a resolver from a response object.
     * Not implemented: all arguments are ignored.
     *
     * @return null
     */
    public static function fromResponseOfZF2($response, $uri = null)
    {
        return null;
    }
}
/**
 * Produces the Document to scrape and caches it after the first call.
 *
 * NOTE(review): the properties read here ($document, $response, $uri,
 * $htmlFormatter) are not declared in this class body; presumably they come
 * from the traits or are still to be added — confirm.
 */
class DocumentResolver implements DocumentResolverInterface, HtmlFormatterAwareInterface
{
    use HttpClientAwareTrait;
    use EventManagerAware;

    /** Stub: not implemented yet, the URI is not stored. */
    public function setUri($uri){}

    /** Stub: not implemented yet. */
    public function setDocument(){}

    /**
     * Returns the cached document, or builds one, hands it the HTML formatter
     * and caches it.
     *
     * A document is built from the response already held by the resolver when
     * there is one; otherwise the generate-document callback is invoked.
     *
     * @return Document
     */
    public function getDocument()
    {
        if ($this->document) {
            return $this->document;
        } elseif ($this->response) {
            // Fixed: this branch used to call `new Document($this->uri)`, which
            // dropped the response and put the URI in the first position. The
            // callback below constructs Document as (response, uri), so the
            // same argument order is used here.
            $document = new Document($this->response, $this->uri);
        } else {
            $generateDocument = $this->getGenerateDocumentCallback();
            $document = $generateDocument();
        }

        $document->setHtmlFormatter($this->htmlFormatter);
        $this->document = $document;

        return $this->document;
    }

    /**
     * Returns a closure that sends the HTTP client's request and wraps the
     * result, together with the resolver's URI, in a new Document.
     *
     * @return \Closure
     */
    public function getGenerateDocumentCallback()
    {
        return function () {
            return new Document($this->getHttpClient()->send(), $this->uri);
        };
    }
}
/**
 * Holds the content of one scraped document and exposes it as a string or as
 * a DOMDocument.
 *
 * NOTE(review): nothing in this class assigns $content, $dom or $response yet;
 * presumably a constructor or setters are still to be written.
 */
class Document
{
    const FORMAT_NONE = 0;
    const FORMAT_XHTML = 1;
    const FORMAT_HTML5 = 2;

    /** @var string|null Raw content, when available. */
    protected $content;

    /** @var DOMDocument|null DOM used as content source when there is no response. */
    protected $dom;

    /** @var mixed Response the document was created from, if any. */
    protected $response;

    /** @var object|null Formatter exposing getXhtml($content). */
    protected $htmlFormatter;

    /**
     * Stores the formatter used when formatted content is requested.
     *
     * @param object|null $htmlFormatter Object exposing getXhtml($content).
     * @return $this
     */
    public function setHtmlFormatter($htmlFormatter = null)
    {
        $this->htmlFormatter = $htmlFormatter;

        return $this;
    }

    /**
     * @return object|null The formatter set through setHtmlFormatter().
     */
    public function getHtmlFormatter()
    {
        return $this->htmlFormatter;
    }

    /**
     * Returns the document content, optionally run through the HTML formatter.
     *
     * @param int|bool $format Any truthy value requests formatting; false and
     *                         FORMAT_NONE (0) return the content untouched.
     * @return string|null Empty string when no content source is available.
     * @throws LogicException When formatting is requested but no formatter is set.
     */
    public function getContent($format = false)
    {
        // Start from a defined value: the original returned an undefined
        // variable when neither content nor DOM was available.
        $content = '';

        if ($this->content) {
            $content = $this->content;
        }

        // Fixed: the property name was misspelt `reponse`, so the response
        // check never saw the real property.
        if (!$this->response && $this->dom) {
            $content = $this->dom->saveHTML();
        }

        if ($format) {
            // Fixed: the accessor name was misspelt `getHtmlFormmater`.
            $formatter = $this->getHtmlFormatter();
            if ($formatter === null) {
                throw new LogicException('Formatted content requested but no HTML formatter is set.');
            }
            $content = $formatter->getXhtml($content);
        }

        return $content;
    }

    /**
     * Builds a DOMDocument from the (optionally formatted) content.
     *
     * @param int|bool $format Passed through to getContent().
     * @return DOMDocument Empty document when there is no content.
     */
    public function getDom($format = false)
    {
        $doc = new DOMDocument();
        $content = (string) $this->getContent($format);

        // loadHTML() rejects an empty source, so only parse real content.
        if ($content !== '') {
            $doc->loadHTML($content);
        }

        // Fixed: the DOM was built but never returned.
        return $doc;
    }
}
/**
 * Formatter that is meant to clean up HTML content into XHTML.
 */
class HtmlFormatter implements HtmlFormatterInterface
{
    use EventManagerAware;

    /**
     * Returns the XHTML form of the given content.
     *
     * The tidy step is not implemented yet, so the content is currently
     * returned unchanged. The method used to take no argument and return
     * nothing, although Document::getContent() passes the content in and uses
     * the result.
     *
     * @param string $content Markup to format; optional so that existing
     *                        zero-argument calls keep working.
     * @return string
     */
    public function getXhtml($content = '')
    {
        //$this->getEventManager()->dispatch();
        // some tidy (TODO: not implemented)
        //$this->getEventManager()->dispatch();
        return $content;
    }
}
/**
 * Sketch of the component that turns a scrape target into a process and runs it.
 *
 * NOTE(review): this class is an unfinished outline and does not parse as
 * written (see the empty `foreach ()` below).
 */
class Processer
{
/**
 * Builds a process for the target and runs it through progressScrape().
 *
 * NOTE(review): the string branch is empty, so $process is unassigned when
 * $target is a string; the result stored in $ret is not returned.
 *
 * @param mixed $target A string, or a value handed to self::process().
 */
public function find($target)
{
if (is_string($target)) {
} else {
$process = self::process($target, $this->keyMapChanger);
}
$ret = $this->progressScrape($process);
}
/**
 * Fetches the HTTP response for $this->uri and obtains a context from the
 * strategy.
 *
 * NOTE(review): $resourceLocator is not defined inside this method, $process
 * is not used, and the loop below has no iteration expression yet.
 *
 * @param mixed $process The process produced by find().
 */
protected function progressScrape($process)
{
$response = $resourceLocator->getHttpResponse($this->uri);
$context = $this->strategy->som();
foreach () {
}
}
}
// Single expression: scrape '#id', take its html() and filter the result with a callback.
$id = Scraper::uri($uri)->scrape('#id')->html()->filter(function($element) {});
// Array form: each key is an expression; each value is either a value-type
// string, an array containing a value type and a callback, or a bare callback.
$ret = Scraper::uri($uri)->scrape([
'//title' => 'html',
'//a' => ['@href', 'callback', function($href) {return $href;}],
'//tr' => [['html'], function ($trs) { return new LimitIterator($trs, 0, 2);}],
'//id2' => function ($element) {
return $element;
}
]);
// Multidimensional form: a process nested under an expression.
// NOTE(review): the result is assigned to $processor but the following lines
// use $process — presumably the same object; confirm the intended name.
$processor = Scraper::processor([
]);
$scraper->scrape(['#id' => [$process]]);
// Key mapping: maps the '//tr[@class="item"]' expression to the key 'entry',
// then sets a hydrator on the process.
$process->keyMap([
'//tr[@class="item"]' => 'entry'
]);
$process->setHydrator($hydrator);
// Generator form: the callback yields expression => value pairs.
// NOTE(review): the yielded value is missing, so this example is incomplete.
$scraper->scrape(function(){
yield '//td' =>
});
<?php
use Diggin\Scraper\DocumentResolverFactory;
use Diggin\Scraper\DocumentResolver;
use Diggin\Scraper\Finder;
/**
* Constructor: the argument forms a Finder can be built from — a URI, an array
* of header/body/base URI, or a document resolver created by the factory.
*
* NOTE(review): every callback below has a bare `return` with its expression
* left out, so these examples are incomplete as written.
*/
$resultSet = (new Finder($uri))->each('.postlist', function($postlist){
return
});
// NOTE(review): $body is the only array entry without a key — confirm whether
// a 'body' key is intended.
$resultSet = (new Finder(['header' => $headers, $body, 'base-uri' => $uri]))->each('.postlist', function($postlist){
return
});
// From a resolver created by the factory for a URI.
$resultSet = (new Finder(DocumentResolverFactory::uri($uri)))->each('.postlist', function($postlist){
return
});
// From a resolver created by the factory for an existing response and base URI.
$resultSet = (new Finder(DocumentResolverFactory::response($response, $baseUri)))->each('.postlist', function($postlist){
return
});
/**
* Shortcuts: findOne() followed by an accessor, and find() followed by current().
*/
$finder->findOne('h3 a')->href();
$finder->find('h3')->current();
/**
 * Complex: several bindings chained on one finder and run with execute();
 * the result set is then iterated.
 */
$resultSet = $finder
    ->bindfirst('content .key1')
    // Fixed: this link used to end with `;`, which terminated the statement
    // and left the rest of the chain starting with a dangling `->`.
    ->bindfirst('li .key3', function($key3) {return (int) $key3;})
    ->bindfirst('li .key4', 'html', 'specified-key-name')
    ->bindEach('div .list', function($list) {
        $list->bindfirst();
        return $list;
    })
    ->bindHelper('feed')
    ->execute();
foreach ($resultSet as $v) {
    $v->html();
}
/**
* Attachment: registers $callback for the 'afterGet' event on the document
* resolver's event manager.
*/
$documentResolver->getEventManager()->attach('afterGet', $callback);
/**
* Crawling: the bindings are declared once, then the same finder is executed
* for each URI after the URI is set on its document resolver.
*
* NOTE(review): the bindEach() arguments are left out in this sketch, and the
* result of execute() is discarded.
*/
$finder->bindEach(/**/);
foreach ($uris as $uri) {
$finder->getDocumentResolver()->setUri($uri);
$finder->execute();
}
/**
 * Pagerize: uses the 'autopagerize' helper and executes the finder repeatedly.
 *
 * NOTE(review): the loop has no exit condition yet; presumably the helper is
 * meant to signal when there is no further page — confirm.
 */
$finder->getHelper('autopagerize')->some();
// Fixed: the method name was misspelt `bindHeper`; the chained example
// earlier in this file calls `bindHelper`.
$finder->bindHelper('autopagerize');
while (true) {
    $finder->execute();
}
/**
* Setup http client: configures the HTTP client of a new document resolver.
*
* NOTE(review): the Finder receives whatever setConfig() returns, not the
* resolver itself — confirm that this is the intended constructor argument.
* setConfig() is also called without any configuration in this sketch.
*/
new Finder((new DocumentResolver)->getHttpClient()->setConfig());
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment