Created
June 20, 2013 20:25
-
-
Save BlakeTurner/5826280 to your computer and use it in GitHub Desktop.
Convert HTML to Markdown in PHP.
Great for migrating away from terrifying wysiwyg code.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?php
/*
 * to-markdown - an HTML to Markdown converter for PHP
 *
 * Copyright 2013, Blake Turner
 * Licenced under the MIT licence
 *
 */
/**
 * Convert a fragment of simple HTML into Markdown.
 *
 * Handles paragraphs, spans, headings (h1-h6), line breaks, strong/em,
 * images, links, flat (non-nested) ordered/unordered lists and blockquotes.
 * Any other markup is passed through unchanged. The conversion is
 * regex-based, so it expects reasonably well-formed input: double-quoted
 * attributes and no nested lists.
 *
 * @param string $string       HTML to convert.
 * @param bool   $housekeeping When TRUE (default), attributes are stripped
 *                             from the simple tags before conversion so that
 *                             e.g. <p class="x"> is treated like <p>.
 *
 * @return string The Markdown text.
 */
function toMarkdown($string, $housekeeping = TRUE) {
    $markdown = $string;

    // Get rid of attributes!
    // Particularly useful when converting garbage wysiwyg code to markdown
    if ($housekeeping) {
        $els = array(
            'p', 'span', 'ul', 'ol', 'li', 'strong', 'em', 'blockquote',
            'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
        );
        foreach ($els as $el) {
            // \b keeps "p" from matching <pre>, "li" from matching <link>, etc.
            $markdown = preg_replace('/<' . $el . '\b[^>]*>/', '<' . $el . '>', $markdown);
        }
    }

    // Tag pattern (without the angle brackets) => Markdown replacement.
    $regexMap = array(
        'p'           => '',
        '\/p'         => "\n",
        'span\b[^>]*' => '',
        '\/span'      => '',
        // A space after the hashes is needed for the heading to be recognised
        // by CommonMark-style renderers.
        'h1'          => '# ',
        'h2'          => '## ',
        'h3'          => '### ',
        'h4'          => '#### ',
        'h5'          => '##### ',
        'h6'          => '###### ',
        '\/h[1-6]'    => "\n",
        // Matches <br>, <br/> and <br />.
        'br\s*\/?'    => "\n",
        'strong'      => '__',
        '\/strong'    => '__',
        'em'          => '_',
        '\/em'        => '_',
    );

    // Replace simple tags w/ markdown equivalent
    foreach ($regexMap as $el => $replacement) {
        $markdown = preg_replace('/<' . $el . '>/', $replacement, $markdown);
    }

    // Images: match one tag at a time ([^>]* cannot run past the closing
    // bracket), then pull src and alt out of that tag only.
    $markdown = preg_replace_callback(
        '/<img\b[^>]*>/',
        function ($m) {
            if (!preg_match('/\ssrc="([^\s"]*)"/', $m[0], $src)) {
                // No usable src attribute: leave the tag untouched.
                return $m[0];
            }
            // Alt text may legitimately contain spaces.
            $alt = preg_match('/\salt="([^"]*)"/', $m[0], $alt_match) ? $alt_match[1] : '';
            return '![' . $alt . '](' . $src[1] . ')';
        },
        $markdown
    );

    // Links: capture <a> tags, isolate href and the link text.
    $markdown = preg_replace_callback(
        '/<a\b[^>]*?\shref="([^\s"]*)"[^>]*>(.*?)<\/a>/',
        function ($m) {
            return '[' . $m[2] . '](' . $m[1] . ')';
        },
        $markdown
    );

    // Lists: each list is rewritten in place, so identical <li> markup in a
    // different list is never touched by mistake.
    foreach (array('ul' => FALSE, 'ol' => TRUE) as $tag => $ordered) {
        $markdown = preg_replace_callback(
            '/<' . $tag . '>(.*?)<\/' . $tag . '>/s',
            function ($m) use ($ordered) {
                if (!preg_match_all('/<li>(.*?)<\/li>/s', $m[1], $items)) {
                    // No items found: just drop the list wrapper.
                    return $m[1];
                }
                $lines = array();
                foreach ($items[1] as $i => $inner) {
                    $bullet = $ordered ? ($i + 1) . '.' : '-';
                    $lines[] = $bullet . ' ' . trim($inner);
                }
                // One item per line, with a newline on each side of the list.
                return "\n" . implode("\n", $lines) . "\n";
            },
            $markdown
        );
    }

    // Blockquotes: prefix every line of the quoted content with "> ".
    $markdown = preg_replace_callback(
        '/<blockquote>(.*?)<\/blockquote>/s',
        function ($m) {
            $quoted = '';
            foreach (explode("\n", trim($m[1])) as $line) {
                $quoted .= '> ' . $line . "\n";
            }
            return $quoted;
        },
        $markdown
    );

    return $markdown;
}
Dirty but works for me:
// Table if (preg_match_all('/<table>(.*?)<\/table>/s', $markdown, $matches)) { $markdown = preg_replace('/<table>|<\/table>/', '', $markdown); foreach ($matches[0] as $table) { if (preg_match_all('/<tr>(.*?)<\/tr>/s', $table, $table_items)) { foreach ($table_items[0] as $i => $tr_markup) { if (preg_match_all('/.*?<(th|td)>(.*?)<\/(th|td)>.*?/s', $tr_markup, $tr_items)) { $colums = count($tr_items[1]); foreach ($tr_items[0] as $i => $tr_markup) { $td_inner = $tr_items[2][$i]; $td_markdown = $td_inner; if ($colums!=($i+1)) $td_markdown .= '|'; $markdown = str_replace($tr_markup, $td_markdown, $markdown); } } } } } $markdown = preg_replace('/<tr>/', '', $markdown); $head = PHP_EOL.'|'.str_repeat('---|', $colums).PHP_EOL; $markdown = preg_replace('/<\/tr>/', $head, $markdown, 1); $markdown = preg_replace('/<\/tr>/', PHP_EOL, $markdown); }
Will it work if there are in-line styles for table and rows?
If you add 'table', 'tr', 'th' and 'td' to the $els array and leave $housekeeping = TRUE (the default), all attributes are stripped from those tags, including the inline styles you asked about ;-)
@mykeysr or you directly use this little improvement https://gist.github.com/gaffling/94eca5d545b1781a2ea34324b1cf7a6c
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Dirty but works for me: