Last active
December 27, 2015 09:28
-
-
Save ddebin/7303653 to your computer and use it in GitHub Desktop.
Ce script convertit un backup/export DotClear ("blog-backup.txt" flatExport) en posts markdown pour Octopress/Jekyll (ou autres) et en un fichier de commentaires WXR (WordPress eXtended RSS) pour import XML sur Disqus.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?php
// --
// - This script converts a DotClear backup/export into Markdown posts for Octopress/Jekyll (or others)
//   and into a WXR (WordPress eXtended RSS) comments file for XML import into Disqus.
// - The "flatExport" plugin must be installed in DotClear to obtain "blog-backup.txt"
//   cf. http://plugins.dotaddict.org/dc1/details/flatExport
// - Pandoc must be installed on this machine for the conversion to Markdown
//   cf. http://johnmacfarlane.net/pandoc/
// --

// blog prefix used to build the post URLs referenced in Disqus
define('POST_WEB_PREFIX', 'http://damiendebin.net/blog/');

// default values written into the YAML front matter of every post
define('POST_LAYOUT', 'post');
define('POST_CATEGORIES', '[paris.photobloggers.org]');
define('POST_PUBLISHED', 'false');
define('POST_COMMENTS', 'true');
define('POST_TAGS', '[]');

// path to the "flatExport" backup file
define('BLOG_BACKUP_PATH', __DIR__.'/blog-backup.txt');

// user ids for which no "author: ..." line is written in the front matter
// (use an empty array() to always write the author)
$SKIP_AUTHORS = array('ddebin');
//$SKIP_AUTHORS = array();

// timezone of the timestamps stored in the "flatExport" backup
date_default_timezone_set('Europe/Paris');

// -------------
/**
 * array_walk() callback that normalises one field read from the backup, in place.
 *
 * Turns the literal two-character escape sequences found in the export
 * (backslash + n, r, t, ", ~, >, <) back into the characters they stand for,
 * and replaces a few typographic characters (ellipsis, en dash, curly quotes)
 * with plain ASCII equivalents.
 *
 * @param string $v field value, modified by reference
 * @param mixed  $k field key supplied by array_walk(); not used
 */
function clean_value(&$v, $k)
{
    // search => replacement; str_replace() applies the pairs in this order.
    // Single-quoted keys are literal backslash sequences, not control characters.
    $replacements = array(
        '\n' => "\n",
        '\r' => "\r",
        '\t' => "\t",
        '\"' => '"',
        '…'  => '...',
        '–'  => '-',
        '’'  => "'",
        '“'  => '"',
        '”'  => '"',
        '\~' => '~',
        '\>' => '>',
        '\<' => '<',
    );

    $v = str_replace(array_keys($replacements), array_values($replacements), $v);
}
/**
 * Converts an HTML fragment to PHP Markdown Extra by running the `pandoc` command-line tool.
 *
 * The HTML is written to a temporary file, pandoc is run on it with line
 * wrapping disabled, and the temporary file is removed afterwards.
 *
 * @param string $html HTML fragment to convert
 * @return string pandoc's output lines joined with "\n"; an empty string when
 *                nothing could be converted (a warning is raised in that case)
 */
function convert_to_markdown($html)
{
    // tempnam() creates the file itself, so the name cannot collide the way a uniqid()-based one can
    $tmpFile = tempnam(sys_get_temp_dir(), 'pandoc');
    if ($tmpFile === false) {
        trigger_error('convert_to_markdown: unable to create a temporary file', E_USER_WARNING);
        return '';
    }

    $output = array();
    $status = 1;
    if (file_put_contents($tmpFile, $html) !== false) {
        // "--no-wrap" was replaced by "--wrap=none" in newer pandoc releases and later removed;
        // try the current spelling first and fall back to the legacy one for old installations.
        foreach (array('--wrap=none', '--no-wrap') as $wrapOption) {
            $output = array(); // exec() appends to the array, so reset it between attempts
            $command = 'pandoc --from=html --to=markdown_phpextra '.$wrapOption.' '.escapeshellarg($tmpFile);
            exec($command, $output, $status);
            if ($status === 0) break;
        }
    }
    unlink($tmpFile);

    // Make a failed conversion visible instead of silently producing an empty post
    if ($status !== 0) {
        trigger_error('convert_to_markdown: pandoc conversion failed (exit status '.$status.')', E_USER_WARNING);
    }

    return implode("\n", $output);
}
/**
 * Reads one section of the flatExport backup.
 *
 * A section starts with a header line "[<name> col1,col2,...]" and is followed
 * by one CSV line per record; every record line starts with a double quote.
 * Only the first section with the given name is read.
 *
 * @param array  $lines backup file lines (no line endings, no empty lines)
 * @param string $name  section name, e.g. "post"
 * @return array list of records, each an array keyed by the header's column names
 *               and already passed through clean_value(); empty if the section is absent
 */
function read_backup_section(array $lines, $name)
{
    $header = '['.$name.' ';
    $total = count($lines);

    // Locate the header; the bound is tested BEFORE indexing so a missing section cannot read past the end
    $i = 0;
    while ($i < $total && strpos($lines[$i], $header) !== 0) $i++;
    if ($i >= $total) return array();

    // Column names sit between "[<name> " and the closing "]".
    // The backslash escape is passed explicitly: the export writes embedded quotes as \"
    $keys = str_getcsv(substr($lines[$i], strlen($header), -1), ',', '"', '\\');

    $rows = array();
    for ($i++; $i < $total && substr($lines[$i], 0, 1) === '"'; $i++)
    {
        $values = str_getcsv($lines[$i], ',', '"', '\\');
        if (count($values) !== count($keys))
        {
            // array_combine() cannot pair these up; skip the record rather than abort the whole import
            trigger_error("Skipping malformed [$name] record at line ".($i + 1).' of the parsed backup', E_USER_WARNING);
            continue;
        }
        $row = array_combine($keys, $values);
        array_walk($row, 'clean_value');
        $rows[] = $row;
    }
    return $rows;
}

if (!is_readable(BLOG_BACKUP_PATH)) throw new RuntimeException('Backup file not found or not readable: '.BLOG_BACKUP_PATH);
$r = file(BLOG_BACKUP_PATH, FILE_IGNORE_NEW_LINES | FILE_SKIP_EMPTY_LINES);
if ($r === false) throw new RuntimeException('Unable to read backup file: '.BLOG_BACKUP_PATH);

// settings, indexed by setting_id
$settings = array();
foreach (read_backup_section($r, 'setting') as $l)
{
    $settings[$l['setting_id']] = $l;
}

// posts, indexed by post_id; each gets an (initially empty) list of comments
$posts = array();
foreach (read_backup_section($r, 'post') as $l)
{
    $l['comments'] = array();
    $posts[$l['post_id']] = $l;
}

// comments, attached to their post and also kept in a flat list
$comments = array();
foreach (read_backup_section($r, 'comment') as $l)
{
    // A comment whose post is not in the backup must not create a half-empty post entry
    if (isset($posts[$l['post_id']])) $posts[$l['post_id']]['comments'][] = $l;
    $comments[] = $l;
}

// users, indexed by user_id
$users = array();
foreach (read_backup_section($r, 'user') as $l)
{
    $users[$l['user_id']] = $l;
}
// export posts w/ comments
// --

/**
 * Wraps a string in a YAML double-quoted scalar, escaping backslashes and double quotes.
 *
 * @param string $text raw text
 * @return string quoted text, safe to place after "key: " in the front matter
 */
function yaml_quote($text)
{
    return '"'.str_replace(array('\\', '"'), array('\\\\', '\\"'), $text).'"';
}

/**
 * Makes a string safe to embed between "<![CDATA[" and "]]>".
 *
 * @param string $text raw text
 * @return string text in which every "]]>" is split across two CDATA sections
 */
function cdata_escape($text)
{
    return str_replace(']]>', ']]]]><![CDATA[>', $text);
}

// One Markdown file per post is written into this directory
$post_dir = __DIR__.'/_posts';
if (!is_dir($post_dir) && !mkdir($post_dir)) throw new RuntimeException('Unable to create directory: '.$post_dir);

// WXR document accumulated in memory; only posts that have comments get an <item>
$xml_comment = <<<EOT
<?xml version="1.0" encoding="UTF-8"?>
<rss version="2.0" xmlns:content="http://purl.org/rss/1.0/modules/content/" xmlns:dsq="http://www.disqus.com/" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:wp="http://wordpress.org/export/1.0/">
<channel>
EOT;

foreach ($posts as $post)
{
    //print_r($post);

    // Markdown body: optional excerpt followed by the "more" marker, then the main content
    $post_md = '';
    if (!empty($post['post_chapo'])) $post_md .= convert_to_markdown($post['post_chapo'])."<!-- more -->\n";
    if (!empty($post['post_content'])) $post_md .= convert_to_markdown($post['post_content']);
    // Charset and flags are pinned so the result does not depend on the PHP version's defaults
    $post_md = html_entity_decode($post_md, ENT_QUOTES, 'UTF-8');
    // Collapse the padding after a dash ("-   item" => "- item"). Only spaces/tabs are matched:
    // matching newlines too would glue a "-----" underline or rule onto the following paragraph.
    $post_md = preg_replace('/-[ \t]{2,}/', '- ', $post_md);
    $post_md = trim($post_md);

    $date = strtotime($post['post_creadt']);

    $post_layout = POST_LAYOUT;
    $post_categories = POST_CATEGORIES;
    $post_published = POST_PUBLISHED;
    $post_comments = POST_COMMENTS;
    $post_tags = POST_TAGS;
    $post_uri = POST_WEB_PREFIX.date("Y/m/d/", $date).$post['post_titre_url'].'/';
    $post_date = date('Y-m-d H:i:sP', $date);

    // Always quote the title: a bare title containing quotes or ": " is not valid YAML
    $post_title = yaml_quote(html_entity_decode($post['post_titre'], ENT_QUOTES, 'UTF-8'));

    // The author line is omitted for skipped authors and for user ids missing from the backup
    $author_id = $post['user_id'];
    if (is_array($SKIP_AUTHORS) && in_array($author_id, $SKIP_AUTHORS)) $post_author = '';
    elseif (!isset($users[$author_id])) $post_author = '';
    else $post_author = "\nauthor: {$users[$author_id]['user_prenom']} {$users[$author_id]['user_nom']}";

    $post_content = <<<EOT
---
date: $post_date
layout: $post_layout
title: $post_title$post_author
comments: $post_comments
categories: $post_categories
tags: $post_tags
published: $post_published
---
EOT;
    $post_content .= "\n".$post_md;

    echo "$post_title\n";

    $path = $post_dir.'/'.date('Y-m-d', $date).'-'.$post['post_titre_url'].'.markdown';
    file_put_contents($path, $post_content); // write Markdown post file

    // From here on the same values are re-derived, escaped for XML
    $post_title = htmlspecialchars(html_entity_decode($post['post_titre'], ENT_QUOTES, 'UTF-8'));
    $post_uri = htmlspecialchars($post_uri);
    $post_date_gmt = htmlspecialchars(gmdate('Y-m-d H:i:s', $date));

    if (!empty($post['comments']))
    {
        $xml_comment .= <<<EOT
<item>
<title>$post_title</title>
<link>$post_uri</link>
<dsq:thread_identifier>$post_uri</dsq:thread_identifier>
<wp:post_date_gmt>$post_date_gmt</wp:post_date_gmt>
<wp:comment_status>open</wp:comment_status>
EOT;

        foreach ($post['comments'] as $comment)
        {
            $comment_date_gmt = gmdate('Y-m-d H:i:s', strtotime($comment['comment_dt']));
            $comment_id = htmlspecialchars($comment['comment_id']);
            $comment_auteur = htmlspecialchars($comment['comment_auteur']);
            $comment_email = htmlspecialchars($comment['comment_email']);
            $comment_site = htmlspecialchars($comment['comment_site']);
            $comment_ip = htmlspecialchars($comment['comment_ip']);
            $comment_pub = htmlspecialchars($comment['comment_pub']);
            // The body goes into CDATA unescaped, so a literal "]]>" in it must be neutralised
            $comment_content = cdata_escape($comment['comment_content']);

            $xml_comment .= <<<EOT
<wp:comment>
<wp:comment_id>$comment_id</wp:comment_id>
<wp:comment_author>$comment_auteur</wp:comment_author>
<wp:comment_author_email>$comment_email</wp:comment_author_email>
<wp:comment_author_url>$comment_site</wp:comment_author_url>
<wp:comment_author_IP>$comment_ip</wp:comment_author_IP>
<wp:comment_date_gmt>$comment_date_gmt</wp:comment_date_gmt>
<wp:comment_content><![CDATA[{$comment_content}]]></wp:comment_content>
<wp:comment_approved>$comment_pub</wp:comment_approved>
<wp:comment_parent>0</wp:comment_parent>
</wp:comment>
EOT;
        }

        $xml_comment .= <<<EOT
</item>
EOT;
    }
    //break; //debugging purpose
}

$xml_comment .= <<<EOT
</channel>
</rss>
EOT;
file_put_contents(__DIR__.'/comments.xml', $xml_comment); // write WXR compatible XML file for Disqus
?>
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment