Created
June 5, 2018 19:56
-
-
Save codedokode/590f9094988bbd22db7164bb83e54405 to your computer and use it in GitHub Desktop.
Скрипт исправления сломанных тредов 1, 4b и 15
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?php | |
use Symfony\Component\DomCrawler\Crawler; | |
use phpClub\Util\DOMUtil; | |
set_time_limit(0); | |
require __DIR__ . '/../vendor/autoload.php'; | |
$di = require __DIR__ . '/../src/Bootstrap.php'; | |
/**
 * Repairs threads 1, 4b and 15, which contain many broken pieces of HTML.
 *
 * Usage: php script.php <input >output
 */

/*
 * Patterns that help to locate markup problems:
 *
 * ="[^"<>]*(<|>[^>])
 * <[^<>"]*?[^=>]"
 * <([^<>"]|"[^"<>]{0,100}?"){0,100}?"[^"<>]*(<|>[^>])
 */

// Diagnostics go to stderr so that stdout carries only the resulting HTML.
$stderr = fopen('php://stderr', 'a');
$html = file_get_contents('php://stdin');

// Optional command-line switch, stored for use after the page has been split.
$dupPostsOk = in_array('--dup-posts-ok', $argv, true);

// Work out which thread this is. Each known thread is recognised by a marker
// substring; markers are probed in the listed order and the first hit wins.
$threadMarkers = [
    'thread_236463' => 1,
    'thread_345388' => 15,
    'thread_280501' => '4b',
];

$threadNo = null;
foreach ($threadMarkers as $marker => $number) {
    if (strpos($html, $marker) !== false) {
        $threadNo = $number;
        break;
    }
}

// Unknown page: pass the input through unchanged and stop.
if ($threadNo === null) {
    fprintf($stderr, "Nothing to fix here\n");
    echo $html;
    exit(0);
}
// Replacement tables for strtr(): every key is a broken HTML fragment that
// occurs in the corresponding thread, and every fragment is replaced by an
// empty string, i.e. removed from the page.

// Broken fragments of thread 1.
$thread1Fixes = array_fill_keys([
    '<a ">',
    '<span class="postertripr />',
    '<blockquofom mobile">',
    '<a onmouseover="showPostPreviepan>',
    '</spclass="postinfom>',
    '<span class="postpне давал.',
    '<blockqw(event)"',
    '<a class="postbtn_adm" href="http://2ch.hk/pr/res/236463.html#" style="display:none" onclick="javascript:addAdminMenu(this); return false;" onmouseout="javasмотреть. ',
], '');

// Broken fragments of thread 15.
$thread15Fixes = array_fill_keys([
    '<div i); return false;">',
    '<blocst_347440" class="post">',
    '<a onmouseover="showPostPreview(event)" onmouseout="delPostь',
    '<table id="poишешь',
    '<span clabtn_rep" href="#" onclick="javascript:addQuickReply(\'347680\'); return false;">',
    '<span class="subjecss=" unkfunc">',
    '<a onmouseover="showP" reflink1">',
    '<span data-utc="1339778953" pr res 345388.html#348611">',
    '<a onmouseover="showPostPrevclass=" datetime postnum">',
], '');

// Broken fragments of thread 4b.
$thread4bFixes = array_fill_keys([
    '<span class="posternamlockquote id=" m281173">',
    '<a onmouseover="showPostPreview(event)" onmotpanel">',
    '</spa">',
    '<a class="postbtn_adm" href="http://2ch.pm/pr/res/280501.html#" style="display:none" onclick="javascript:addAdminMenu(tливость',
    '<a class="postbtn_adm" href="http://2ch.pm/pr/res/280501.html#" style="display:none" onclick="his);" return="" false;"="" onmouseout="javascript:removeAdminMenu(event); return false;">',
    '<input type="checkbox" name="delete" clm mobile">',
    '<a href="http://2ch.pm/pr/res/280501.html#286ass=" turnmeoff"="" value="286625">',
    '<a class="postbtn_adm" href="http://2ch.pm/pr/res/280501.html#" style="display:none" onclick="javascript:addAdminMenu(this); return false;" onmouseout="javascripts=" unkfunc"="">',
    '<span class="ref=" pr res 280501.html#288346">',
], '');
// Strip the known broken fragments for the detected thread and pick the two
// markers used to cut the page: $headerTill is the last piece of the header,
// $footerFrom is the first piece of the footer.
// Strict comparison is used because the thread number is either an int
// (1, 15) or the string '4b'.
if ($threadNo === 1) {
    $html = strtr($html, $thread1Fixes);
    $headerTill = '<div id="thread_236463" class="thread">';
    $footerFrom = '</div>[<a href="http://2ch.hk/pr/wakaba.html">Назад</a>]';
} elseif ($threadNo === '4b') {
    $html = strtr($html, $thread4bFixes);
    $headerTill = '<div id="thread_280501" class="thread">';
    $footerFrom = '</div>[<a href="';
} elseif ($threadNo === 15) {
    $html = strtr($html, $thread15Fixes);
    $headerTill = '<div id="thread_345388" class="thread">';
    $footerFrom = '</div>[<a href="http://2ch.hk/pr/wakaba.html">Назад</a>]';
} else {
    throw new \Exception("Unsupported thread number");
}

// Cut the page at the first occurrence of the header marker. explode() returns
// a single element when the marker is absent, so check before destructuring
// instead of reading an undefined offset.
$headerSplit = explode($headerTill, $html, 2);
if (count($headerSplit) !== 2) {
    throw new \Exception("Header marker not found: " . $headerTill);
}
list($header, $rest) = $headerSplit;
$header .= $headerTill;

// Cut the remainder at the first occurrence of the footer marker.
$footerSplit = explode($footerFrom, $rest, 2);
if (count($footerSplit) !== 2) {
    throw new \Exception("Footer marker not found: " . $footerFrom);
}
list($body, $footer) = $footerSplit;
$footer = $footerFrom . $footer;

// Sanity check: the three pieces must add up to the whole page byte for byte.
if (strlen($html) !== strlen($header) + strlen($body) + strlen($footer)) {
    throw new \Exception("Data lost while splitting");
}
// Build the new thread body and print the reassembled page.
if ($dupPostsOk) {
    // Duplicates are acceptable: keep the body exactly as it was split out.
    // Only the body belongs here; using the whole page would emit the header
    // and footer twice once they are concatenated below.
    $newBody = $body;
} else {
    // Parse the body and take every element that is a direct child of <body>.
    $bodyCrawler = new Crawler($body);
    $posts = $bodyCrawler->filterXPath('//body/*');

    $newNodes = fixDupPosts($posts, $stderr);

    // Serialise the surviving nodes back to HTML, one node per line.
    $newBodyParts = [];
    foreach ($newNodes as $node) {
        $newBodyParts[] = DOMUtil::getOuterHtml($node);
    }
    $newBody = implode("\n", $newBodyParts);

    fprintf(
        $stderr,
        "Old body: %d bytes, %d posts, new: %d bytes, %d posts\n",
        strlen($body),
        $posts->count(),
        strlen($newBody),
        $newNodes->count()
    );
}

echo $header . $newBody . $footer;
/**
 * Collapses nodes that share the same id, keeping the last occurrence of each.
 *
 * Nodes are keyed by their trimmed `id` attribute. A later node whose id was
 * already seen overwrites the stored node but stays at the position of the
 * first occurrence. Nodes with an empty id are reported to $stderr and are
 * left out of the result.
 *
 * @param Crawler  $posts  nodes to scan
 * @param resource $stderr stream that receives the diagnostics
 *
 * @return Crawler the remaining nodes
 */
function fixDupPosts(Crawler $posts, $stderr)
{
    fprintf($stderr, "%d posts found\n", $posts->count());

    $latestById = [];
    $replacedCount = 0;

    foreach ($posts as $element) {
        // Ids sometimes contain whitespace
        $id = trim($element->getAttribute('id'));

        if (!$id) {
            fprintf($stderr, "Empty id at html, skip it: %s\n\n", DOMUtil::getOuterHtml($element));
            continue;
        }

        if (array_key_exists($id, $latestById)) {
            $replacedCount++;
        }

        // Assigning to an existing key keeps its original position in the array.
        $latestById[$id] = $element;
    }

    fprintf($stderr, "%d nodes replaced\n", $replacedCount);

    $result = new Crawler;
    foreach ($latestById as $element) {
        $result->addNode($element);
    }

    return $result;
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment