Created
June 23, 2016 11:16
-
-
Save korchasa/92f7d797934f31f5749ddc49ad70a6d1 to your computer and use it in GitHub Desktop.
Download pages from TikiWiki and convert them to reStructuredText (via pandoc)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?php
/**
 * Download TikiWiki pages and convert them to reStructuredText.
 *
 * Usage: php <this-script> file_with_page_ids output_dir
 *
 * For every page name listed (one per line) in file_with_page_ids the script
 * fetches the tiki-print.php view of the page, keeps only the part between
 * the begin/end markers, expands colspan cells into separate cells, strips
 * every attribute except href, applies the site-specific clean-up and then
 * runs pandoc to produce output_dir/<page>.rst.
 *
 * WARNING: every regular file already present in output_dir is deleted first.
 */
$begin_symbol = '<div id="page-data" class="clearfix">';
$end_symbol = '<hr class="hrwikibottom" />';
$url_mask = 'https://help.megaplan.ru/tiki-print.php?page=%s';

if ($argc < 3) {
    // Usage errors go to STDERR with a non-zero status so callers can detect them.
    fwrite(STDERR, "Usage: {$argv[0]} file_with_page_ids output_dir\n");
    exit(1);
}

$out_dir = $argv[2];
if (!is_dir($out_dir) && !mkdir($out_dir, 0777, true)) {
    fwrite(STDERR, "Cannot create output directory {$out_dir}\n");
    exit(1);
}

// Start from a clean output directory (glob() may return false on error).
foreach (glob($out_dir . '/*') ?: array() as $file) {
    if (is_file($file)) {
        unlink($file);
    }
}

$page_names = file($argv[1], FILE_IGNORE_NEW_LINES | FILE_SKIP_EMPTY_LINES);
if ($page_names === false) {
    fwrite(STDERR, "Cannot read page list {$argv[1]}\n");
    exit(1);
}

foreach ($page_names as $page_name) {
    echo "Export {$page_name}...";

    $content = file_get_contents(sprintf($url_mask, $page_name));
    if ($content === false) {
        echo "FAILED (download error)\n";
        continue;
    }

    // Cut out the page body located between the two markers.
    $begin = strpos($content, $begin_symbol);
    if ($begin === false) {
        echo "FAILED (begin marker not found)\n";
        continue;
    }
    $from = $begin + strlen($begin_symbol);
    // Look for the end marker only after the begin marker.
    $end = strpos($content, $end_symbol, $from);
    if ($end === false) {
        echo "FAILED (end marker not found)\n";
        continue;
    }
    // The 12 bytes preceding the end marker are dropped as well; the original
    // author's note says this removes a trailing </div>.
    $content = substr($content, $from, max(0, $end - $from - 12));

    // Replace colspan by additional <td> cells. Matches are processed from the
    // last to the first: each insertion happens after the offset of every
    // earlier match, so the offsets captured up front stay valid.
    if (preg_match_all('/colspan="(\d+)/', $content, $colspan_matches, PREG_OFFSET_CAPTURE | PREG_SET_ORDER)) {
        foreach (array_reverse($colspan_matches) as $colspan_match) {
            list($repeats, $offset) = $colspan_match[1];
            $repeats = (int) $repeats;
            $colspan_td_end = strpos($content, '</td>', $offset);
            if ($repeats < 2 || $colspan_td_end === false) {
                continue;
            }
            $content = substr($content, 0, $colspan_td_end)
                . str_repeat('</td><td>', $repeats - 1)
                . substr($content, $colspan_td_end);
        }
    }

    $content = stripAttributes($content, ['href']);
    $content = customStuff($content);

    // pandoc works on files, so the cleaned HTML is written to a temporary file.
    $html_file = $out_dir . '/' . $page_name . '.html';
    if (file_put_contents($html_file, trim($content)) === false) {
        echo "FAILED (cannot write {$html_file})\n";
        continue;
    }

    $out_file = $out_dir . '/' . $page_name . '.rst';
    // Page names and the output directory come from user input: escape them
    // before they reach the shell.
    passthru(
        'pandoc --wrap=preserve -f html -t rst -o ' . escapeshellarg($out_file) . ' ' . escapeshellarg($html_file),
        $status
    );
    unlink($html_file);

    echo $status === 0 ? "DONE\n" : "FAILED (pandoc exit code {$status})\n";
}
/**
 * Remove every attribute from the opening tags in an HTML fragment, except
 * the ones whose names are listed in $allowedattr.
 *
 * Only quoted attributes (name="value" or name='value') can be kept;
 * valueless and unquoted attributes are always dropped. Closing tags, HTML
 * comments and text outside of tags are left untouched. Each processed tag is
 * rebuilt as "<name attr="value"...>" with single spaces, and a self-closing
 * slash is preserved.
 *
 * @param string $s           HTML fragment to clean.
 * @param array  $allowedattr Attribute names to keep (compared case-insensitively).
 *
 * @return string The fragment with disallowed attributes removed.
 */
function stripAttributes($s, $allowedattr = array())
{
    $allowed = array_map('strtolower', $allowedattr);

    // Match only real opening tags that contain whitespace after the tag name:
    // 1 = tag name, 2 = raw attribute text, 3 = optional self-closing slash.
    $result = preg_replace_callback(
        '/<([a-z][a-z0-9:_-]*)(\s[^>]*?)(\/?)>/i',
        function ($m) use ($allowed) {
            $kept = '';
            // 1 = attribute name, 2 = quoted value including its quotes.
            if (preg_match_all('/\s([^\s=\/>"\'<]+)\s*=\s*("[^"]*"|\'[^\']*\')/', $m[2], $attrs, PREG_SET_ORDER)) {
                foreach ($attrs as $attr) {
                    if (in_array(strtolower($attr[1]), $allowed, true)) {
                        $kept .= ' ' . $attr[1] . '=' . $attr[2];
                    }
                }
            }

            // Rebuild the tag instead of string-replacing inside it, so the tag
            // name can never be altered by accident.
            return '<' . $m[1] . $kept . $m[3] . '>';
        },
        $s
    );

    // preg_replace_callback() returns null on a PCRE error; fall back to the input.
    return $result === null ? $s : $result;
}
/**
 * Site-specific clean-up applied to the HTML before it is handed to pandoc.
 *
 * - every self-closing <br/> becomes a paragraph break (</p><p>);
 * - cells written as <td><strong>...</strong></td> become <th>...</th>;
 * - the "back to contents" and "see common request rules" navigation links
 *   (together with the paragraph break following them) are removed;
 * - a newline is inserted after the paragraph break that follows "URI:".
 *
 * @param string $content HTML fragment.
 *
 * @return string The cleaned fragment.
 */
function customStuff($content)
{
    // Regex rewrites; preg_replace() applies array entries one after another.
    $patterns = array(
        "/<br\s*\/>/i",
        "/<td><strong>/i",
        "/<\/strong>\s*<\/td>/i",
    );
    $substitutes = array(
        '</p><p>',
        '<th>',
        '</th>',
    );
    $html = preg_replace($patterns, $substitutes, $content);

    // Literal rewrites; str_replace() also processes array entries in order.
    $needles = array(
        '<a href="API">Назад к оглавлению</a></p><p>',
        '<a href="API_rules">См. "Общие правила запросов"</a></p><p>',
        'URI:</p><p>',
    );
    $literals = array(
        '',
        '',
        "URI:</p><p>\n",
    );

    return str_replace($needles, $literals, $html);
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment