Last active
December 16, 2015 21:50
-
-
Save gbili/5503007 to your computer and use it in GitHub Desktop.
This is the script that I'm trying to run.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?php
/**
 * Configures a miner blueprint for one host and saves it.
 *
 * The script creates a BluePrint\Savable, attaches a tree of "get contents"
 * and "extract" actions to it, and finally calls save() on the blueprint.
 *
 * Action tree as wired by the addChild() calls below:
 *
 *   $a[0] GetContentsSavable  (data: 'http://myhost.com')
 *     $a[2] ExtractSavable    (categories table)
 *       $a[3] ExtractSavable  (category links)
 *         $a[4] GetContentsSavable
 *           $a[5] ExtractSavable  (video entries)
 *             $a[7] ExtractSavable  (video source)
 *           $a[6] ExtractSavable  (page header / "showing X-Y of Z")
 *
 * Index 1 of $a is never assigned. The statement order is kept as written
 * because the Gbili setters are opaque from here and may be order-sensitive.
 */
use Gbili\Miner\BluePrint,
    Gbili\Miner\BluePrint\Action\Extract\Savable as ExtractSavable,
    Gbili\Miner\BluePrint\Action\GetContents\Savable as GetContentsSavable,
    Gbili\Vid\Savable\Lexer;

$a = array(); // actions, keyed by an arbitrary integer id

$b = new BluePrint\Savable();
$b->setHost('myhost.com');
$b->setBasePath('./../');

// Root action: the only one that is given the blueprint explicitly.
$a[0] = new GetContentsSavable();
$a[0]->setBluePrint($b);
$a[0]->setData('http://myhost.com');

// Child of the root: single match on the "Categories" table, capturing its body.
$a[2] = new ExtractSavable();
$a[0]->addChild($a[2]);
$a[2]->setUseMatchAll(false);
$a[2]->setData('<h3>Categories</h3>[^<]*<table width="100%" border="0" cellspacing="0" cellpadding="0">[^<]*<tr>[^<]*<td width="50%" valign="top"><strong>#</strong><br />(.+?)</td></tr>');

// CATEGORIES: match-all over the category links; group 1 is the href, group 2 the link text.
$a[3] = new ExtractSavable();
$a[2]->addChild($a[3]);
$a[3]->setUseMatchAll(true);
$a[3]->spitGroupAsEntity(2, Lexer::CATEGORY);
// Group 1 is routed through the 'prependHostToUrl' interceptor
// (presumably turns a relative href into an absolute URL -- confirm in the library).
$a[3]->interceptGroupsOneByOne(1, 'prependHostToUrl');
$a[3]->setData('<a href="([^"]+)" target="_blank">([^<]+)</a> ?\(\d*\)<br />');

// CATEGORY page: no setData() here; input is configured from the parent's regex group 1.
$a[4] = new GetContentsSavable();
$a[3]->addChild($a[4]);
$a[4]->setInputParentRegexGroupNumber(1);

// VIDEO: match-all over the video tables; groups 2..6 are emitted as entities.
$a[5] = new ExtractSavable();
$a[4]->addChild($a[5]);
$a[5]->setUseMatchAll(true);
$a[5]->setAsNewInstanceGeneratingPoint();
// The chaining relies on spitGroupAsEntity() returning the action itself.
$a[5]->spitGroupAsEntity(2, Lexer::TITLE)
     ->spitGroupAsEntity(3, Lexer::IMAGE)
     ->spitGroupAsEntity(4, Lexer::TIME_LENGTH)
     ->spitGroupAsEntity(5, Lexer::DATE)
     ->spitGroupAsEntity(6, Lexer::HOST_NAME);
// Variant of the pattern noted by the author as working (backslashes doubled):
// <table cellpadding=2 cellspacing=0 width="185">[^<]*<tr><td colspan=2><a href="([^"]+)" target="_blank">([^<]+)</a><br /></td></tr>[^<]*<tr><td colspan=2><a href="[^"]+" class="thumb" target="_blank"><img id="[^"]+" src="([^"]+)" width="180" height="135"></a><br /></td></tr>[^<]*<tr><td><font class="s">([^<]+)</font><br /></td><td align=right><font class="s">([^<]+)</font><br /><a href="/search/\\?rs=1\\&c=0\\&s=\\d+" target="_blank">([^<]+)</a><br /></td></tr>[^<]*</table>[^<]*<br />
$a[5]->setData('<table cellpadding=2 cellspacing=0 width="185">[^<]*<tr><td colspan=2><a href="([^"]+)" target="_blank">([^<]+)</a><br /></td></tr>[^<]*<tr><td colspan=2><a href="[^"]+" class="thumb" target="_blank"><img id="[^"]+" src="([^"]+)" width="180" height="135"></a><br /></td></tr>[^<]*<tr><td><font class="s">([^<]+)</font><br /></td><td align=right><font class="s">([^<]+)</font><br /><a href="/search/\?rs=1\&c=0\&s=\d+" target="_blank">([^<]+)</a><br /></td></tr>[^<]*</table>[^<]*<br />');

// VIDEO SOURCE: single match on the parent's group 1, emitted as the SOURCE entity.
$a[7] = new ExtractSavable();
$a[5]->addChild($a[7]);
$a[7]->setInputParentRegexGroupNumber(1);
$a[7]->setUseMatchAll(false);
$a[7]->spitGroupAsEntity(1, Lexer::SOURCE);
$a[7]->setData('(http://.+)$');

// Loop parameters: second child of the category page action ($a[4]); its result
// is injected back into $a[4] (presumably to drive pagination -- confirm).
$a[6] = new ExtractSavable();
$a[4]->addChild($a[6]);
$a[6]->injectResultTo($a[4]);
$a[6]->setUseMatchAll(false);
// NOTE(review): "(:?,\s)?" is a capturing group matching an optional ':' followed
// by ", "; it looks like a typo for the non-capturing "(?:,\s)?". It is left
// untouched because fixing it would renumber the (\d+) groups that follow.
$a[6]->setData('<h1>\W*((?:(:?,\s)?(?:(?:[\w\d]+)(?:\s[\w\d]+)*))+)[^<]*</h1>[^<]*<table width="97%" border="0" cellspacing="0" cellpadding="0">[^<]*<tr>[^<]*<td width="40%">\W*showing ?(\d+)-(\d+) of (\d+)\.[^<]*</td>');

$b->save();
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment