Skip to content

Instantly share code, notes, and snippets.

@andronex
Created June 20, 2016 09:29
Show Gist options
  • Save andronex/0d59eac87eb683d862bd5c77de02325c to your computer and use it in GitHub Desktop.
Save andronex/0d59eac87eb683d862bd5c77de02325c to your computer and use it in GitHub Desktop.
Парсер вопросов/ответов с сайта pravoved.ru для наполнения сайта на MODX Revolution
<?php
/**
 * Requires the phpQuery library.
 *
 * Map of category label => listing URL on pravoved.ru.
 * The label is passed along with each imported question as the "tagger-2" field.
 */
$parserURL = [
    'Уголовные дела'      => 'https://pravoved.ru/questions/criminal-law/',
    'Дела семейные'       => 'https://pravoved.ru/questions/family-law/',
    'Недвижимость'        => 'https://pravoved.ru/questions/realty/',
    'Автоюрист'           => 'https://pravoved.ru/questions/auto-law/',
    'Наследственные дела' => 'https://pravoved.ru/questions/inheritance/',
    'Арбитраж'            => 'https://pravoved.ru/questions/arbitration/',
    'Трудовое право'      => 'https://pravoved.ru/questions/labour-law/',
];
require(dirname(__FILE__).'/query/phpquery-master/phpQuery/phpQuery.php');
/**
 * Fetch a URL with cURL and return the transfer info together with the body.
 *
 * @param string $url             URL to request.
 * @param bool   $followRedirects Follow HTTP redirects (up to CURLOPT_MAXREDIRS).
 *                                Defaults to false, which is what this function
 *                                has always done.
 *
 * @return array The curl_getinfo() array extended with:
 *               - 'errno'   (int)          cURL error number, 0 on success
 *               - 'errmsg'  (string)       cURL error message, '' on success
 *               - 'content' (string|false) response body, false on failure
 */
function get_web_page( $url, $followRedirects = false ){
    $options = array(
        CURLOPT_RETURNTRANSFER => true, // return the body instead of printing it
        CURLOPT_HEADER => false, // don't include response headers in the body
        CURLOPT_FOLLOWLOCATION => (bool) $followRedirects, // redirects are NOT followed unless requested
        CURLOPT_ENCODING => "", // accept every encoding cURL supports
        CURLOPT_USERAGENT => "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.17 (KHTML, like Gecko) Chrome/24.0.1312.52 Safari/537.17", // who am i
        CURLOPT_AUTOREFERER => true, // set referer on redirect (only relevant when following)
        CURLOPT_CONNECTTIMEOUT => 120, // timeout on connect, seconds
        CURLOPT_TIMEOUT => 120, // timeout on the whole transfer, seconds
        CURLOPT_MAXREDIRS => 10, // stop after 10 redirects (only relevant when following)
    );

    $ch = curl_init( $url );
    if ($ch === false) {
        // No handle could be created: report it in the same shape as a
        // failed transfer so callers only ever have to inspect one format.
        return array(
            'url' => $url,
            'http_code' => 0,
            'errno' => CURLE_FAILED_INIT,
            'errmsg' => 'curl_init() failed',
            'content' => false,
        );
    }

    curl_setopt_array( $ch, $options );
    $content = curl_exec( $ch );

    // Error state and transfer info must be read before the handle is closed.
    $header = curl_getinfo( $ch );
    $header['errno'] = curl_errno( $ch );
    $header['errmsg'] = curl_error( $ch );
    $header['content'] = $content;
    curl_close( $ch );

    return $header;
}
// Load MODX through the site's index.php, one directory above this script.
// MODX_API_MODE is defined first; presumably it tells index.php to initialise
// MODX without handling a web request, leaving $modx available here - confirm.
define('MODX_API_MODE', true);
require_once dirname(__DIR__) . '/index.php';

// Error service, request object, and error-level logging to a file.
$modx->getService('error', 'error.modError');
$modx->getRequest();
$modx->setLogLevel(modX::LOG_LEVEL_ERROR);
$modx->setLogTarget('FILE');
$modx->error->message = null;
define('MODX_ACTION_MODE', true);

// State shared with the import loop below.
$count = 0;      // number of resources created during this run
$document = '';  // holds the phpQuery document of the current category page

// Fields sent with every created resource; per-question fields are added later.
$data = [
    'parent'      => 217,
    'context_key' => 'web',
    'template'    => 8,
    'class_key'   => 'modDocument',
    'published'   => 1,
];
// ---------------------------------------------------------------------------
// Import loop.
//
// For every category listing in $parserURL:
//   1. download the listing and collect the question blocks (.questionBody);
//   2. for each question that is not imported yet, download its page and
//      extract the title, body, author, date and the answers;
//   3. create a MODX resource for the question and store each answer as a
//      Tickets comment in the thread "resource-<id>".
//
// A question is identified by its number on the source site, which is stored
// in the resource field `link_attributes`.
// ---------------------------------------------------------------------------

// Values that do not change between questions are built once, up front.

// Replaces <a href="...">text</a> with just "text".
$stripLinksPattern = "~<a[^>]+href\s*=\s*[\x27\x22]?[^\x20\x27\x22\x3E]+[\x27\x22]?[^>]*>(.+?)</a>~is";

// Footer appended to the question body ([[+date]] / [[+author]] are filled in per question).
$authorTpl = '
<div class="row">
<div class="col-md-6"><p><i class="icon-calendar"></i> Вопрос задан: [[+date]]</p></div>
<div class="col-md-6"><p><i class="icon-user"></i> Автор вопроса: [[+author]]</p></div>
</div>';

// Preferences sent to the typograf.ru web service with every request.
$xml = '<?xml version="1.0" encoding="windows-1251" ?>
<preferences>
<nowraped insert="0" nonbsp="1" length="0">
<start><![CDATA[<nobr>]]></start>
<end><![CDATA[</nobr>]]></end>
</nowraped>
<acronym insert="0"></acronym>
<link target="_blank" rel="nofollow" />
</preferences>';

// Words in lawyer answers that get wrapped into a link to resource 3.
$place = array(' адвоката ', ' адвокату ', ' адвокат ');
$new = array(
    ' <a href="[[~3]]" title="Адвокат Тарабрин А.И.">адвоката</a> ',
    ' <a href="[[~3]]" title="Адвокат Тарабрин А.И.">адвокату</a> ',
    ' <a href="[[~3]]" title="Адвокат Тарабрин А.И.">адвокат</a> ',
);

foreach ($parserURL as $tag => $pURL) {
    $page = get_web_page($pURL);
    if (empty($page['content'])) {
        $modx->log(modX::LOG_LEVEL_ERROR, 'Cannot fetch category page ' . $pURL . ': ' . $page['errmsg']);
        continue;
    }

    unset($document);
    $document = phpQuery::newDocument($page['content']);
    $data['tagger-2'] = $tag;

    foreach ($document->find('.questionBody') as $link) {
        $uri = pq($link)->find('h3 > a')->attr('href');
        // The question number is what remains of "/question/<number>/".
        $number = str_replace(array('/', 'question'), '', (string) $uri);
        if (empty($uri) || $number === '') {
            // No usable link in this block: nothing to import.
            continue;
        }
        $url = 'https://pravoved.ru' . $uri;
        $data['link_attributes'] = $number;

        // Already imported? Check before any further network traffic so that
        // re-runs do not re-download and re-typograph known questions.
        if ($modx->getObject($data['class_key'], array('link_attributes' => $number))) {
            continue;
        }

        $answer_page = get_web_page($url);
        if (empty($answer_page['content'])) {
            $modx->log(modX::LOG_LEVEL_ERROR, 'Cannot fetch question page ' . $url . ': ' . $answer_page['errmsg']);
            continue;
        }
        $answer = phpQuery::newDocument($answer_page['content']);

        $author = $answer->find('.questionBody .questionAuthor')->text();
        $date = str_replace(', вопрос №' . $number, '', trim($answer->find('.questionBody .questionDate')->text()));

        // Collect the answers first: a question without answers is skipped,
        // so there is no point in typographing its body beforehand.
        $comments = array();
        foreach ($answer->find('.questionComments ul li') as $quation) {
            $item = pq($quation);

            $answerBody = $item->find('.commentWrapper .innerCommentText div[itemprop=text]');
            if (!$answerBody->text()) {
                continue;
            }
            $comments[] = array(
                'name' => $item->find('.prvd-lawyer-name .prvd-wrapper')->text(),
                'text' => str_replace($place, $new, preg_replace($stripLinksPattern, '$1', trim($answerBody->html()))),
            );

            // A follow-up from the asker inside the same list item is stored
            // as a separate comment under the question author's name.
            $followUp = $item->find('.clientComment p.title + *');
            if ($followUp->text()) {
                $comments[] = array(
                    'name' => $author,
                    'text' => preg_replace($stripLinksPattern, '$1', trim($followUp->htmlOuter())),
                );
            }
        }
        if (!$comments) {
            continue;
        }

        // Question body, with links unwrapped and the optional addition appended.
        $content = trim($answer->find('.questionBody div[itemprop=text]')->html());
        $content = preg_replace($stripLinksPattern, '$1', $content);
        $addition = $answer->find('.questionBody .questionAdditions + *');
        if ($addition->text()) {
            $content .= '<h3>Дополнение по сути вопроса</h3>' . $addition->htmlOuter();
        }
        $data['content'] = $content;

        // Typography pass through the typograf.ru web service. Best effort:
        // when the request fails the content is left as it is.
        if ($data['content']) {
            $myCurl = curl_init();
            if ($myCurl !== false) {
                curl_setopt_array($myCurl, array(
                    CURLOPT_URL => 'http://www.typograf.ru/webservice/',
                    CURLOPT_RETURNTRANSFER => true,
                    CURLOPT_POST => true,
                    // Same limits as get_web_page(), so a stalled service cannot hang the import.
                    CURLOPT_CONNECTTIMEOUT => 120,
                    CURLOPT_TIMEOUT => 120,
                    CURLOPT_POSTFIELDS => http_build_query(array('text' => $data['content'], 'chr' => 'UTF-8', 'xml' => $xml))
                ));
                if ($response = curl_exec($myCurl)) {
                    $data['content'] = $response;
                }
                curl_close($myCurl);
            }
        }

        // Description: output of the "summary" snippet with parentheses removed.
        // The replacement list has one entry for two search strings, so "(" becomes
        // a space and ")" is dropped (str_replace pads missing replacements with '').
        $description = str_replace(array('(', ')'), array(' '), $modx->runSnippet('summary', array(
            'input' => $data['content'],
            'len' => 100,
            'dotted' => 0,
        )));
        $description .= '&hellip;';
        $data['description'] = $description;

        $data['content'] .= str_replace(array('[[+date]]', '[[+author]]'), array($date, $author), $authorTpl);

        // The title is temporarily prefixed with "<next id>-" and the prefix is
        // removed again after creation - presumably so that the alias generated
        // from the title stays unique; confirm against the site's alias settings.
        $q = 'SELECT MAX(`id`) AS id FROM ' . $modx->getTableName('modResource');
        $o = $modx->prepare($q);
        $o->execute();
        $row = $o->fetch(PDO::FETCH_ASSOC);
        $resmaxid = ($row['id'] + 1) . '-';
        $data['pagetitle'] = $resmaxid . $answer->find('h1:first')->text();

        $modx->error->message = null;
        $response = $modx->runProcessor('resource/create', $data);
        if ($response->isError()) {
            $modx->log(modX::LOG_LEVEL_ERROR, "Error on create: \n". print_r($response->getAllErrors(), 1));
            continue;
        }

        $resource = $response->getObject();
        $res = $modx->getObject($data['class_key'], array('id' => $resource['id']));
        if (!$res) {
            $modx->log(modX::LOG_LEVEL_ERROR, 'Created resource ' . $resource['id'] . ' could not be loaded');
            continue;
        }

        // Remove only the leading temporary prefix; the same digits may
        // legitimately occur later in the title and must stay untouched.
        $title = $res->get('pagetitle');
        if (strpos($title, $resmaxid) === 0) {
            $title = substr($title, strlen($resmaxid));
        }
        $res->set('pagetitle', $title);
        $res->save();

        $prop = array(
            'thread' => 'resource-' . $resource['id'],
            'parent' => 0,
            'id' => 0,
            // NOTE(review): this literal looks like an e-mail obfuscation
            // placeholder rather than a real address - confirm the intended value.
            'email' => '[email protected]',
        );

        // Request the new page once before posting comments - presumably so that
        // rendering it creates the comment thread; confirm.
        $linkURL = 'http://advo1.ru/' . $res->get('alias');
        get_web_page($linkURL);

        $savedComments = 0;
        foreach ($comments as $entry) {
            $prop['text'] = $entry['text'];
            $prop['name'] = $entry['name'];
            $savedComments++;

            // Reuse the properties stored on the thread (when it exists), with captcha disabled.
            $properties = array();
            $thread = $modx->getObject('TicketThread', array('name' => $prop['thread']));
            if ($thread) {
                $stored = $thread->get('properties');
                if (is_array($stored)) {
                    $properties = $stored;
                }
            }
            $properties['enableCaptcha'] = 0;

            $Tickets = $modx->getService('tickets','Tickets',$modx->getOption('tickets.core_path',null,$modx->getOption('core_path').'components/tickets/').'model/tickets/', $properties);
            $response = $Tickets->saveComment($prop);
        }

        $count++;
        echo "для ресурса {$resource['id']} создано {$savedComments} комментариев\n";
    }
}
echo "создано {$count} ресурсов\n";
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment