Created
June 20, 2016 09:29
-
-
Save andronex/0d59eac87eb683d862bd5c77de02325c to your computer and use it in GitHub Desktop.
Парсер вопросов/ответов с сайта pravoved.ru для наполнения сайта на MODX Revolution
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?php | |
/**
 * Requires the phpQuery library.
 **/
// Category listings to import: the key is the tag assigned to every question
// found on the listing page, the value is the listing page URL on pravoved.ru.
$parserURL = [
    'Уголовные дела'      => 'https://pravoved.ru/questions/criminal-law/',
    'Дела семейные'       => 'https://pravoved.ru/questions/family-law/',
    'Недвижимость'        => 'https://pravoved.ru/questions/realty/',
    'Автоюрист'           => 'https://pravoved.ru/questions/auto-law/',
    'Наследственные дела' => 'https://pravoved.ru/questions/inheritance/',
    'Арбитраж'            => 'https://pravoved.ru/questions/arbitration/',
    'Трудовое право'      => 'https://pravoved.ru/questions/labour-law/',
];
require(dirname(__FILE__).'/query/phpquery-master/phpQuery/phpQuery.php'); | |
/**
 * Downloads a URL with cURL and reports both the transfer details and the body.
 *
 * @param string $url Address to request.
 * @return array The curl_getinfo() result extended with three keys:
 *               'errno' (curl_errno), 'errmsg' (curl_error) and
 *               'content' (the curl_exec return value: body string, or false on failure).
 */
function get_web_page( $url ){
    $handle = curl_init( $url );
    curl_setopt_array( $handle, array(
        CURLOPT_RETURNTRANSFER => true,   // hand the body back instead of printing it
        CURLOPT_HEADER         => false,  // keep response headers out of the body
        CURLOPT_FOLLOWLOCATION => false,  // redirects are NOT followed
        CURLOPT_ENCODING       => "",     // accept every encoding cURL supports
        CURLOPT_USERAGENT      => "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.17 (KHTML, like Gecko) Chrome/24.0.1312.52 Safari/537.17", // browser-like user agent
        CURLOPT_AUTOREFERER    => true,   // only relevant if redirects were followed
        CURLOPT_CONNECTTIMEOUT => 120,    // seconds allowed for connecting
        CURLOPT_TIMEOUT        => 120,    // seconds allowed for the whole transfer
        CURLOPT_MAXREDIRS      => 10,     // only relevant if redirects were followed
    ) );

    $body         = curl_exec( $handle );
    $errorCode    = curl_errno( $handle );
    $errorMessage = curl_error( $handle );
    $result       = curl_getinfo( $handle );
    curl_close( $handle );

    // Append the error state and the payload to the transfer info.
    $result['errno']   = $errorCode;
    $result['errmsg']  = $errorMessage;
    $result['content'] = $body;

    return $result;
}
// Bootstrap MODX from the site's index.php, one directory above this script.
// MODX_API_MODE is defined before the include so that it is already set when
// index.php runs; the rest of the script relies on index.php providing $modx.
define('MODX_API_MODE', true);
require_once dirname(__DIR__).'/index.php';

// Load the error service and the request object that processors expect.
$modx->getService('error','error.modError');
$modx->getRequest();

// Write errors (and only errors) to the MODX log file.
$modx->setLogTarget('FILE');
$modx->setLogLevel(modX::LOG_LEVEL_ERROR);

// Start with a clean error message before any processor is run.
$modx->error->message = null;
define('MODX_ACTION_MODE', true);
/*
 * Import loop.
 *
 * For every category listing in $parserURL:
 *   1. download the listing and collect the question links (.questionBody h3 > a);
 *   2. skip questions that were already imported (the question number is stored
 *      in the resource's link_attributes field) or that have no lawyer answers;
 *   3. download the question page, build the resource content, run it through
 *      the typograf.ru web service and create a MODX resource;
 *   4. attach every answer / client follow-up as a Tickets comment.
 *
 * Relies on $parserURL, get_web_page(), phpQuery and the bootstrapped $modx.
 */

// Fields shared by every created resource; per-question fields are added below.
$data = [
    'parent'      => 217,
    'context_key' => 'web',
    'template'    => 8,
    'class_key'   => 'modDocument',
    'published'   => 1,
];
$count = 0;

// Matches <a ... href=...>text</a>; used with '$1' to keep only the link text.
$stripLinksPattern = "~<a[^>]+href\s*=\s*[\x27\x22]?[^\x20\x27\x22\x3E]+[\x27\x22]?[^>]*>(.+?)</a>~is";

// Footer appended to the resource content with the question date and author.
$authorTpl = '
<div class="row">
<div class="col-md-6"><p><i class="icon-calendar"></i> Вопрос задан: [[+date]]</div>
<div class="col-md-6"><p><i class="icon-user"></i> Автор вопроса: [[+author]]</div>
</div>';

// Preferences sent to the typograf.ru web service together with the text.
$typografPrefs = '<?xml version="1.0" encoding="windows-1251" ?>
<preferences>
<nowraped insert="0" nonbsp="1" length="0">
<start><![CDATA[<nobr>]]></start>
<end><![CDATA[</nobr>]]></end>
</nowraped>
<acronym insert="0"></acronym>
<link target="_blank" rel="nofollow" />
</preferences>';

// Word forms of "адвокат" in lawyer answers are turned into links to resource 3.
$place = [' адвоката ', ' адвокату ', ' адвокат '];
$new = [
    ' <a href="[[~3]]" title="Адвокат Тарабрин А.И.">адвоката</a> ',
    ' <a href="[[~3]]" title="Адвокат Тарабрин А.И.">адвокату</a> ',
    ' <a href="[[~3]]" title="Адвокат Тарабрин А.И.">адвокат</a> ',
];

foreach ($parserURL as $tag => $pURL) {
    $page = get_web_page($pURL);
    // A failed or empty download would otherwise be parsed as an empty document.
    if ($page['errno'] !== 0 || empty($page['content'])) {
        $modx->log(modX::LOG_LEVEL_ERROR, "Cannot load listing {$pURL}: {$page['errmsg']}");
        continue;
    }

    $listing = phpQuery::newDocument($page['content']);
    $data['tagger-2'] = $tag;

    foreach ($listing->find('.questionBody') as $link) {
        $uri = pq($link)->find('h3 > a')->attr('href');
        if (empty($uri)) {
            // Without a link the request below would hit the site root.
            continue;
        }
        $url = 'https://pravoved.ru'.$uri;
        $number = str_replace(['/', 'question'], '', $uri);
        if ($number === '') {
            continue;
        }

        // Already imported? Checked first so no further HTTP requests are spent on it.
        if ($modx->getObject($data['class_key'], ['link_attributes' => $number])) {
            continue;
        }

        $answer_page = get_web_page($url);
        if ($answer_page['errno'] !== 0 || empty($answer_page['content'])) {
            $modx->log(modX::LOG_LEVEL_ERROR, "Cannot load question {$url}: {$answer_page['errmsg']}");
            continue;
        }
        $answer = phpQuery::newDocument($answer_page['content']);

        $title  = $answer->find('h1:first')->text();
        $author = $answer->find('.questionBody .questionAuthor')->text();
        $date   = str_replace(', вопрос №'.$number, '', trim($answer->find('.questionBody .questionDate')->text()));

        // Collect lawyer answers and the client's follow-ups as (name, text) pairs.
        $comments = [];
        $lawyerAnswers = 0;
        foreach ($answer->find('.questionComments ul li') as $item) {
            $node = pq($item);
            $answerBody = $node->find('.commentWrapper .innerCommentText div[itemprop=text]');
            $answerText = $answerBody->text();
            if (empty($answerText)) {
                continue;
            }
            $comments[] = [
                'name' => $node->find('.prvd-lawyer-name .prvd-wrapper')->text(),
                'text' => str_replace($place, $new, preg_replace($stripLinksPattern, '$1', trim($answerBody->html()))),
            ];

            // The client's reply to this answer (if any) is stored under the question author's name.
            $followUp = $node->find('.clientComment p.title + *');
            $followUpText = $followUp->text();
            if (!empty($followUpText)) {
                $comments[] = [
                    'name' => $author,
                    'text' => preg_replace($stripLinksPattern, '$1', trim($followUp->htmlOuter())),
                ];
            }
            $lawyerAnswers++;
        }
        // Questions without any lawyer answer are not imported.
        if ($lawyerAnswers === 0) {
            continue;
        }

        // Question text with links reduced to their anchor text.
        $data['link_attributes'] = $number;
        $data['content'] = trim($answer->find('.questionBody div[itemprop=text]')->html());
        $data['content'] = preg_replace($stripLinksPattern, '$1', $data['content']);

        $additions = $answer->find('.questionBody .questionAdditions + *');
        if ($additions->text()) {
            $data['content'] .= '<h3>Дополнение по сути вопроса</h3>'.$additions->htmlOuter();
        }

        // Typography pass through the external typograf.ru service (best effort:
        // on any failure the untouched content is kept).
        if ($data['content']) {
            $typograf = curl_init();
            curl_setopt_array($typograf, [
                CURLOPT_URL            => 'http://www.typograf.ru/webservice/',
                CURLOPT_RETURNTRANSFER => true,
                CURLOPT_POST           => true,
                CURLOPT_POSTFIELDS     => http_build_query(['text' => $data['content'], 'chr' => 'UTF-8', 'xml' => $typografPrefs]),
                // Same limits as get_web_page() so a stalled service cannot hang the import.
                CURLOPT_CONNECTTIMEOUT => 120,
                CURLOPT_TIMEOUT        => 120,
            ]);
            $typeset = curl_exec($typograf);
            $typografStatus = (int) curl_getinfo($typograf, CURLINFO_HTTP_CODE);
            curl_close($typograf);
            // Only a successful, non-empty answer may replace the content;
            // an error page must never end up in the resource.
            if ($typeset && $typografStatus === 200) {
                $data['content'] = $typeset;
            }
        }

        // Short description produced by the site's "summary" snippet.
        // '(' becomes a space; ')' is removed because the replacement list has one entry.
        $description = str_replace(['(', ')'], [' '], $modx->runSnippet('summary', [
            'input'  => $data['content'],
            'len'    => 100,
            'dotted' => 0,
        ]));
        $data['description'] = $description.'…';

        $data['content'] .= str_replace(['[[+date]]', '[[+author]]'], [$date, $author], $authorTpl);

        // The page title is temporarily prefixed with "<next id>-"; the prefix is
        // removed again right after creation. NOTE(review): presumably this keeps
        // the generated alias unique when several questions share a title — confirm.
        $stmt = $modx->prepare('SELECT MAX(`id`) AS id FROM '.$modx->getTableName('modResource'));
        if (!$stmt || !$stmt->execute()) {
            $modx->log(modX::LOG_LEVEL_ERROR, "Cannot read the highest resource id, question {$number} skipped");
            continue;
        }
        $row = $stmt->fetch(PDO::FETCH_ASSOC);
        $tmpPrefix = ($row['id'] + 1).'-';
        $data['pagetitle'] = $tmpPrefix.$title;

        $modx->error->message = null;
        $response = $modx->runProcessor('resource/create', $data);
        if ($response->isError()) {
            $modx->log(modX::LOG_LEVEL_ERROR, "Error on create: \n". print_r($response->getAllErrors(), 1));
            continue;
        }

        $resource = $response->getObject();
        $res = $modx->getObject($data['class_key'], ['id' => $resource['id']]);
        if (!$res) {
            $modx->log(modX::LOG_LEVEL_ERROR, "Created resource {$resource['id']} could not be loaded");
            continue;
        }

        // Strip only the leading temporary prefix; the same digits may legitimately
        // occur inside the real title and must stay there.
        $storedTitle = $res->get('pagetitle');
        if (strpos($storedTitle, $tmpPrefix) === 0) {
            $storedTitle = substr($storedTitle, strlen($tmpPrefix));
        }
        $res->set('pagetitle', $storedTitle);
        $res->save();

        $prop = [
            'thread' => 'resource-'.$resource['id'],
            'parent' => 0,
            'id'     => 0,
            // NOTE(review): this value looks like an obfuscation placeholder, not a real address — confirm.
            'email'  => '[email protected]',
        ];

        // Request the freshly created page once on the live site.
        // NOTE(review): presumably this renders the page so its comment thread
        // exists before comments are saved — confirm.
        get_web_page('http://advo1.ru/'.$res->get('alias'));

        $savedComments = 0;
        foreach ($comments as $entry) {
            $prop['text'] = $entry['text'];
            $prop['name'] = $entry['name'];

            // Reuse the thread's stored properties when the thread already exists.
            $properties = [];
            $thread = $modx->getObject('TicketThread', ['name' => $prop['thread']]);
            if ($thread) {
                $properties = $thread->get('properties');
            }
            if (!is_array($properties)) {
                $properties = [];
            }
            $properties['enableCaptcha'] = 0;

            $Tickets = $modx->getService(
                'tickets',
                'Tickets',
                $modx->getOption('tickets.core_path', null, $modx->getOption('core_path').'components/tickets/').'model/tickets/',
                $properties
            );
            if (!$Tickets) {
                $modx->log(modX::LOG_LEVEL_ERROR, 'Tickets service is not available, comments were not imported');
                break;
            }
            $Tickets->saveComment($prop);
            $savedComments++;
        }

        $count++;
        echo "для ресурса {$resource['id']} создано {$savedComments} комментариев\n";
    }
}
echo "создано {$count} ресурсов\n";
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment