Skip to content

Instantly share code, notes, and snippets.

@korchasa
Created June 23, 2016 11:16
Show Gist options
  • Save korchasa/92f7d797934f31f5749ddc49ad70a6d1 to your computer and use it in GitHub Desktop.
Save korchasa/92f7d797934f31f5749ddc49ad70a6d1 to your computer and use it in GitHub Desktop.
Download from tikiwiki and convert to reStructuredText
<?php
/**
* Convert tikiwiki to reStructuredText
*/
// Markers that delimit the page body inside the downloaded print view.
$begin_symbol = '<div id="page-data" class="clearfix">';
$end_symbol = '<hr class="hrwikibottom" />';
$url_mask = 'https://help.megaplan.ru/tiki-print.php?page=%s';

if ($argc < 3) {
    // Report usage errors on STDERR with a non-zero status so calling scripts can detect them.
    fwrite(STDERR, "Usage: {$argv[0]} file_with_page_ids output_dir\n");
    exit(1);
}

$out_dir = $argv[2];

// Start from a clean output directory: remove every regular file already in it.
// glob() may return false on error, hence the fallback to an empty list.
foreach (glob($out_dir . '/*') ?: [] as $file) {
    if (is_file($file)) {
        unlink($file);
    }
}

// One page ID per line; blank lines are skipped.
$page_names = file($argv[1], FILE_IGNORE_NEW_LINES | FILE_SKIP_EMPTY_LINES);
if ($page_names === false) {
    fwrite(STDERR, "Cannot read page list {$argv[1]}\n");
    exit(1);
}

foreach ($page_names as $page_name) {
    echo "Export {$page_name}...";

    // NOTE(review): the page ID is inserted into the URL verbatim, so it is presumably
    // already URL-safe - confirm before feeding IDs with spaces or non-ASCII characters.
    $content = file_get_contents(sprintf($url_mask, $page_name));
    if ($content === false) {
        echo "FAILED (download error)\n";
        continue;
    }

    // Cut the page body out of the surrounding layout. Both markers must be present,
    // otherwise strpos() returns false and the offsets below would be meaningless.
    $begin_pos = strpos($content, $begin_symbol);
    $end_pos = $begin_pos === false ? false : strpos($content, $end_symbol, $begin_pos);
    if ($begin_pos === false || $end_pos === false) {
        echo "FAILED (page markers not found)\n";
        continue;
    }
    $from = $begin_pos + strlen($begin_symbol);
    $length = $end_pos - $from;
    // The last 12 bytes before the end marker are dropped to get rid of the trailing </div>
    // (presumably plus surrounding whitespace). Clamp at 0: a negative length would make
    // substr() count from the end of the whole document instead.
    $content = substr($content, $from, max(0, $length - 12));

    // Emulate colspan="N" by appending N-1 empty cells after the spanning cell.
    // \d+ (not \d*) so an empty colspan="" is ignored instead of producing a bogus count.
    preg_match_all('/colspan="(\d+)/', $content, $colspan_matches, PREG_OFFSET_CAPTURE | PREG_SET_ORDER);
    // Walk the matches back to front: every insertion shifts the text behind it, so going
    // backwards keeps the captured offsets valid for the matches still to be processed.
    foreach (array_reverse($colspan_matches) as $colspan_match) {
        list($repeats, $offset) = $colspan_match[1];
        $colspan_td_end = strpos($content, '</td>', $offset);
        if ($colspan_td_end === false) {
            // No closing </td> after this colspan: nothing sensible to insert.
            continue;
        }
        $extra_cells = str_repeat('</td><td>', max(0, (int) $repeats - 1));
        $content = substr($content, 0, $colspan_td_end) . $extra_cells . substr($content, $colspan_td_end);
    }

    $content = stripAttributes($content, ['href']);
    $content = customStuff($content);

    // pandoc reads the cleaned HTML from a temporary file that is removed afterwards.
    $html_file = $out_dir . '/' . $page_name . '.html';
    if (file_put_contents($html_file, trim($content)) === false) {
        echo "FAILED (cannot write {$html_file})\n";
        continue;
    }
    $out_file = $out_dir . '/' . $page_name . '.rst';
    // Quote both paths so spaces or shell metacharacters in the page ID or the output
    // directory can neither break nor hijack the command line.
    passthru(
        'pandoc --wrap=preserve -f html -t rst -o ' . escapeshellarg($out_file) . ' ' . escapeshellarg($html_file),
        $pandoc_status
    );
    unlink($html_file);

    echo $pandoc_status === 0 ? "DONE\n" : "FAILED (pandoc exit code {$pandoc_status})\n";
}
/**
 * Strip every HTML attribute except the whitelisted ones from all tags in a fragment.
 *
 * Only tags that contain whitespace after the tag name are touched. Inside such a tag,
 * name="value" / name='value' pairs whose name is whitelisted are kept, separated by a
 * single space; everything else in the attribute area (other pairs, valueless attributes)
 * is removed. A leftover space before the closing ">" is removed at the end.
 *
 * Limitation: a valueless attribute directly in front of a pair is read as part of that
 * pair's name, so the pair is dropped even if it is whitelisted.
 *
 * @param string   $s           HTML fragment to clean.
 * @param string[] $allowedattr Attribute names to keep, compared case-insensitively.
 * @return string The fragment with all non-whitelisted attributes removed.
 */
function stripAttributes($s, $allowedattr = array()) {
    // Attribute names are lowercased before the lookup, so the whitelist has to be as well.
    $allowed = array_map('strtolower', $allowedattr);

    if (preg_match_all("/<[^>]*\\s([^>]*)\\/*>/msiU", $s, $res, PREG_SET_ORDER)) {
        foreach ($res as $r) {
            $tag = $r[0];
            $attr_area = $r[1];
            if ($attr_area === '') {
                // e.g. "<br />": there is no attribute text to rewrite.
                continue;
            }

            // The leading space lets the first attribute satisfy the "\s" in the pattern.
            preg_match_all("/\\s.*=(['\"]).*\\1/msiU", " " . $attr_area, $split, PREG_SET_ORDER);

            $kept = array();
            foreach ($split as $spl) {
                // Each match starts with the whitespace that preceded it; trim it so that
                // joining with ' ' below yields exactly one space between attributes
                // (untrimmed, '<a class="c" href="x">' became '<a  href="x">').
                $attr = trim($spl[0]);
                $name = strtolower(trim(strstr($attr, '=', true)));
                if (in_array($name, $allowed, true)) {
                    $kept[] = $attr;
                }
            }

            $rpl = str_replace($attr_area, implode(' ', $kept), $tag);
            // Identical tags elsewhere in the fragment are rewritten in the same pass.
            $s = str_replace($tag, $rpl, $s);
        }
    }

    // Tags that lost all their attributes are left as "<td >"; tidy that up.
    return str_replace(' >', '>', $s);
}
/**
 * Apply the site-specific clean-ups to an already attribute-stripped HTML fragment.
 *
 * In order: self-closing line breaks become paragraph breaks, table cells whose content
 * is wrapped in <strong> become header cells, two fixed navigation links are removed,
 * and a newline is added after the paragraph break that follows "URI:".
 *
 * @param string $content HTML fragment to post-process.
 * @return string The post-processed fragment.
 */
function customStuff($content)
{
    // Regular-expression rewrites, applied one after another in this order.
    $pattern_rules = [
        // <br/> or <br /> -> paragraph break
        "/<br\s*\/>/i" => '</p><p>',
        // <td><strong>...</strong></td> -> <th>...</th>
        "/<td><strong>/i" => '<th>',
        "/<\/strong>\s*<\/td>/i" => '</th>',
    ];
    foreach ($pattern_rules as $pattern => $replacement) {
        $content = preg_replace($pattern, $replacement, $content);
    }

    // Literal rewrites; they depend on the paragraph breaks produced above.
    $literal_rules = [
        // "Back to the table of contents" navigation link
        '<a href="API">Назад к оглавлению</a></p><p>' => '',
        // 'See "General request rules"' navigation link
        '<a href="API_rules">См. "Общие правила запросов"</a></p><p>' => '',
        // Put whatever follows "URI:" on a line of its own
        'URI:</p><p>' => "URI:</p><p>\n",
    ];
    foreach ($literal_rules as $needle => $replacement) {
        $content = str_replace($needle, $replacement, $content);
    }

    return $content;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment