Last active
December 20, 2015 14:28
-
-
Save sasezaki/6146296 to your computer and use it in GitHub Desktop.
proposal
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?php | |
// Placeholder declaration for the proposed Scraper facade.
// NOTE(review): a full `class Scraper` is declared again further down in this
// file; PHP cannot declare the same class name twice, so one of the two has to
// go before this sketch can run.
class Scraper{} // just a util or alias | |
// Usage sketch: create the facade, then reach the formatter setter through its document resolver.
$scraper = new Scraper(); | |
// NOTE(review): setHtmlFormatter() is called without an argument, and neither
// getDocumentResolver() nor setHtmlFormatter() is defined in the visible code — TODO confirm the intended API.
$scraper->getDocumentResolver()->setHtmlFormatter(); | |
/**
 * Proposed scraper entry point: resolves the current document and loads its DOM.
 *
 * NOTE(review): getDocumentResolver() is called but not defined in the visible
 * code; presumably it is supplied elsewhere — TODO confirm.
 */
class Scraper
{
    use EventManagerAware;

    /**
     * Resolve the document and request its DOM in XHTML format.
     *
     * @param mixed $expr Expression to look for. NOTE(review): the sketch does
     *                    not evaluate it yet, and nothing is returned.
     */
    public function find($expr)
    {
        $document = $this->getDocumentResolver()->getDocument();
        //$this->getEventManager()->dispatch();

        // The original called getDom() on `$documenet`, an undefined variable.
        $document->getDom(Document::FORMAT_XHTML);
    }
}
/**
 * Named constructors for DocumentResolver.
 *
 * Only fromUri() is implemented; the other two factories are empty stubs that
 * currently return null.
 */
class DocumentResolverFactory
{
    /**
     * Create a resolver and hand it the given URI.
     *
     * @param mixed $uri Passed unchanged to DocumentResolver::setUri().
     *
     * @return DocumentResolver
     */
    public static function fromUri($uri)
    {
        $documentResolver = new DocumentResolver();
        $documentResolver->setUri($uri);

        return $documentResolver;
    }

    /**
     * Stub: intended to build a resolver from already-fetched content.
     *
     * NOTE(review): `$http_respones_header` looks like a misspelling of
     * "response"; the name is kept so existing callers are not affected.
     *
     * @param mixed $content
     * @param array $http_respones_header
     * @param mixed $uri
     */
    public static function fromContent($content, $http_respones_header = [], $uri = null)
    {
    }

    /**
     * Stub: intended to build a resolver from a ZF2 response object.
     *
     * @param mixed $response
     * @param mixed $uri
     */
    public static function fromResponseOfZF2($response, $uri = null)
    {
    }
}
/**
 * Resolves the Document to work on: a cached one, one built from a response
 * the resolver already holds, or one fetched through the HTTP client.
 *
 * NOTE(review): no setter for $response or $htmlFormatter is visible in this
 * class; presumably they are supplied elsewhere — TODO confirm.
 */
class DocumentResolver implements DocumentResolverInterface, HtmlFormatterAwareInterface
{
    use HttpClientAwareTrait;
    use EventManagerAware;

    /** @var Document|null Document cached by getDocument(). */
    protected $document;

    /** @var mixed Response to build the document from, when one is held. */
    protected $response;

    /** @var mixed URI handed to every Document this resolver creates. */
    protected $uri;

    /** @var mixed Formatter passed on to the resolved document. */
    protected $htmlFormatter;

    /**
     * Remember the URI; getDocument() and the generate-document callback read it.
     *
     * @param mixed $uri
     */
    public function setUri($uri)
    {
        $this->uri = $uri;
    }

    /**
     * Stub kept from the original sketch.
     *
     * NOTE(review): it takes no argument yet, so it cannot set anything.
     */
    public function setDocument(){}

    /**
     * Return the document, creating and caching it on first use.
     *
     * @return Document
     */
    public function getDocument()
    {
        if ($this->document) {
            return $this->document;
        }

        if ($this->response) {
            // Same (response, uri) argument order as the generate-document
            // callback; the original passed only the URI here.
            $document = new Document($this->response, $this->uri);
        } else {
            $generateDocument = $this->getGenerateDocumentCallback();
            $document = $generateDocument();
        }

        $document->setHtmlFormatter($this->htmlFormatter);
        $this->document = $document;

        return $this->document;
    }

    /**
     * Callback that sends the HTTP request and wraps the result in a Document.
     *
     * @return callable
     */
    public function getGenerateDocumentCallback()
    {
        return function () {
            return new Document($this->getHttpClient()->send(), $this->uri);
        };
    }
}
/**
 * A scraped document that can hand out its markup as a string or as a DOM.
 *
 * NOTE(review): callers construct this class with (response, uri) arguments,
 * but no constructor is defined in the sketch, so those arguments are ignored;
 * how $content, $response and $dom get populated is not shown — TODO confirm.
 */
class Document
{
    const FORMAT_NONE = 0;
    const FORMAT_XHTML = 1;
    const FORMAT_HTML5 = 2;

    /** @var string|null Raw markup, when available. */
    protected $content;

    /** @var mixed Response the document came from, if any. */
    protected $response;

    /** @var DOMDocument|null DOM used as the markup source when no response is held. */
    protected $dom;

    /** @var mixed Formatter used by getContent() when a format is requested. */
    protected $htmlFormatter;

    /**
     * Store the formatter that getContent() uses for formatted output.
     *
     * @param mixed $htmlFormatter Object exposing getXhtml($content).
     */
    public function setHtmlFormatter($htmlFormatter)
    {
        $this->htmlFormatter = $htmlFormatter;
    }

    /**
     * @return mixed The formatter given to setHtmlFormatter(), or null.
     */
    public function getHtmlFormatter()
    {
        return $this->htmlFormatter;
    }

    /**
     * Return the document markup, optionally run through the formatter.
     *
     * @param int|bool $format Any truthy value routes the markup through the
     *                         formatter's getXhtml(); FORMAT_NONE/false skips it.
     *                         NOTE(review): FORMAT_HTML5 is not distinguished
     *                         from FORMAT_XHTML here.
     *
     * @return string|null Null when neither content nor a DOM is available.
     */
    public function getContent($format = false)
    {
        // Start from a defined value; the original returned an undefined
        // variable when neither branch below applied.
        $content = null;

        if ($this->content) {
            $content = $this->content;
        }

        // Without a response the DOM, when present, is the source of truth.
        if (!$this->response && $this->dom) {
            $content = $this->dom->saveHTML();
        }

        if ($format) {
            $content = $this->getHtmlFormatter()->getXhtml($content);
        }

        return $content;
    }

    /**
     * Parse the (optionally formatted) markup into a new DOMDocument.
     *
     * @param int|bool $format Forwarded to getContent().
     *
     * @return DOMDocument
     */
    public function getDom($format = false)
    {
        $doc = new DOMDocument();
        $doc->loadHTML($this->getContent($format));

        // The original built the DOM and then discarded it.
        return $doc;
    }
}
/**
 * Formatter placeholder: the tidy step is not implemented yet.
 */
class HtmlFormatter implements HtmlFormatterInterface
{
    use EventManagerAware;

    /**
     * Return the given markup as XHTML.
     *
     * The visible caller passes the markup in and assigns the result back, so
     * the stub accepts the content and returns it unchanged instead of
     * returning null. The parameter is optional so zero-argument calls keep
     * working.
     *
     * @param string|null $content Markup to format.
     *
     * @return string|null The markup, currently unmodified.
     */
    public function getXhtml($content = null)
    {
        //$this->getEventManager()->dispatch();
        // some tidy
        //$this->getEventManager()->dispatch();

        return $content;
    }
}
// Sketch of the component that turns a scrape target into a process and runs it.
// NOTE(review): the class name reads like a misspelling of "Processor" — confirm before renaming.
class Processer | |
{ | |
// Builds a process for a non-string $target and hands it to progressScrape().
// NOTE(review): the is_string() branch is empty, so $process is never assigned
// for string targets; self::process() is not defined in the visible code; and
// the value stored in $ret is not returned.
public function find($target) | |
{ | |
if (is_string($target)) { | |
} else { | |
$process = self::process($target, $this->keyMapChanger); | |
} | |
$ret = $this->progressScrape($process); | |
} | |
// Requests the HTTP response for $this->uri and asks the strategy for a context.
// NOTE(review): $resourceLocator is not defined in this scope, $process is not
// used, and the foreach below has no iteration expression yet — unfinished sketch.
protected function progressScrape($process) | |
{ | |
$response = $resourceLocator->getHttpResponse($this->uri); | |
$context = $this->strategy->som(); | |
foreach () { | |
} | |
} | |
} | |
// Fluent usage: scrape one selector, take its HTML, then filter with a callback.
$id = Scraper::uri($uri)->scrape('#id')->html()->filter(function($element) {});

// array
$ret = Scraper::uri($uri)->scrape([
    '//title' => 'html',
    '//a' => ['@href', 'callback', function($href) {return $href;}],
    '//tr' => [['html'], function ($trs) { return new LimitIterator($trs, 0, 2);}],
    '//id2' => function ($element) {
        return $element;
    }
]);

//// multidimensional
// The original assigned this to `$processor` while every later statement used
// `$process`, which was never defined; the two are unified here.
// NOTE(review): confirm that a single object was intended.
$process = Scraper::processor([
]);
$scraper->scrape(['#id' => [$process]]);

////
$process->keyMap([
    '//tr[@class="item"]' => 'entry'
]);
$process->setHydrator($hydrator);
// Generator-style usage: scrape() receives a closure that yields with an expression as the key.
// NOTE(review): the yield below has a key but no value yet, so this closure is unfinished.
$scraper->scrape(function(){ | |
yield '//td' => | |
}); |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?php | |
use Diggin\Scraper\DocumentResolverFactory; | |
use Diggin\Scraper\DocumentResolver; | |
use Diggin\Scraper\Finder; | |
/** | |
* constructor | |
*/ | |
// Four ways to construct a Finder: from a URI, from an array of parts, and from
// resolvers produced by DocumentResolverFactory.
// NOTE(review): each callback ends in a bare `return` with no expression or
// semicolon, so these closures are unfinished placeholders.
$resultSet = (new Finder($uri))->each('.postlist', function($postlist){ | |
return | |
}); | |
// NOTE(review): $body is passed positionally between the keyed 'header' and 'base-uri' entries — confirm the intended key.
$resultSet = (new Finder(['header' => $headers, $body, 'base-uri' => $uri]))->each('.postlist', function($postlist){ | |
return | |
}); | |
// NOTE(review): the factory sketch seen elsewhere in this proposal names its
// methods fromUri()/fromResponseOfZF2(); uri()/response() below may be a newer
// naming — TODO confirm which is intended.
$resultSet = (new Finder(DocumentResolverFactory::uri($uri)))->each('.postlist', function($postlist){ | |
return | |
}); | |
$resultSet = (new Finder(DocumentResolverFactory::response($response, $baseUri)))->each('.postlist', function($postlist){ | |
return | |
}); | |
/** | |
* shortcuts | |
*/ | |
// One-off lookups on an existing $finder: the href of the first match, and the current element of a result set.
$finder->findOne('h3 a')->href(); | |
$finder->find('h3')->current(); | |
/**
 * complex
 *
 * Several bindings are registered on the finder, then executed in one go, and
 * the result set is iterated.
 */
$resultSet = $finder
    ->bindfirst('content .key1')
    // The original ended this line with `;`, which terminated the chain and
    // left the following `->` calls as a syntax error.
    ->bindfirst('li .key3', function($key3) {return (int) $key3;})
    ->bindfirst('li .key4', 'html', 'specified-key-name')
    ->bindEach('div .list', function($list) {
        $list->bindfirst();
        return $list;
    })
    ->bindHelper('feed')
    ->execute();

foreach ($resultSet as $v) {
    $v->html();
}
/**
 * attachment
 *
 * Listen for the resolver's 'afterGet' event.
 */
$documentResolver->getEventManager()->attach('afterGet', $callback);

/**
 * crawling
 *
 * Bind once, then re-point the resolver at each URI and execute again.
 */
$finder->bindEach(/**/);
foreach ($uris as $uri) {
    $finder->getDocumentResolver()->setUri($uri);
    $finder->execute();
}

/**
 * pagerize
 */
$finder->getHelper('autopagerize')->some();
// Was `bindHeper`; the method is spelled bindHelper() in the complex example.
$finder->bindHelper('autopagerize');
// NOTE(review): no exit condition is shown, so as written this loop never ends.
while (true) {
    $finder->execute();
}

/**
 * setup http client
 */
// NOTE(review): Finder receives whatever setConfig() returns, not the
// resolver itself — confirm that this is intended.
new Finder((new DocumentResolver)->getHttpClient()->setConfig());
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment