Created
November 11, 2008 22:32
-
-
Save al3x/24002 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/php -q | |
<?php | |
// Modified from http://3v1n0.tuxfamily.org/scripts/detextile/HTML-to-Textile.php
/**
 * Converts a fragment of HTML into Textile markup.
 *
 * Usage: $textile = (new html2textile)->detextile($html);
 *
 * The conversion is regex based: supported tags are rewritten one tag name at
 * a time, lists are turned into "# " / "* " items, typographic glyphs are
 * mapped to ASCII, and whatever markup is left over is stripped (only <pre>
 * is kept).
 */
class html2textile {

    /**
     * Convert an HTML string to Textile.
     *
     * @param string $text HTML source.
     * @return string Textile text; all remaining tags except <pre> are stripped.
     */
    function detextile($text) {
        // Lowercase every tag name so the tag tables below can be matched
        // literally. This used to rely on the /e regex modifier, which was
        // removed in PHP 7.0. The stripslashes() call that followed it only
        // existed to undo the quote escaping /e applied to back-references,
        // so it is gone too (it also ate legitimate backslashes in the text).
        $text = preg_replace_callback("/(<\/?)(\w+)([^>]*>)/",
            array($this, 'lowercaseTagName'), $text);

        // <br> becomes a newline; runs of blank lines collapse to one newline.
        $text = preg_replace("/<br[ \/]*>\s*/", "\n", $text);
        $text = preg_replace("/(^[\r\n]*|[\r\n]+)[\s\t]*[\r\n]+/", "\n", $text);

        $oktags = array('p','ol','ul','li','i','b','em','strong','span','a','h[1-6]',
            'table','tr','td','u','del','sup','sub','blockquote', 'center', 'code');

        // <img> has no closing tag, so it gets its own pattern.
        $text = preg_replace_callback("/\t*<(img)\s*([^>]*)>/Usi",
            array($this, 'processTag'), $text);

        foreach ($oktags as $tag) {
            // The \b after the tag name stops a short name from matching the
            // start of a longer one (<b> vs <blockquote>, <p> vs <pre>,
            // <u> vs <ul>, <li> vs <link>).
            $text = preg_replace_callback(
                "/\t*<(".$tag.")\\b\s*([^>]*)>(.*)<\/\\1>/Usi",
                array($this, 'processTag'), $text);
        }

        $text = $this->detextile_process_glyphs($text);
        $text = $this->detextile_process_lists($text);

        // Plain paragraphs need no explicit "p. " prefix in Textile.
        $text = preg_replace('/^\t* *p\. /m', '', $text);

        // sci() emits "/#" and "#\" as stand-ins for the alignment markers
        // "<" and ">" so that strip_tags() cannot mistake them for markup;
        // they are swapped back only after the tags have been stripped.
        return str_replace(array("#\\", "/#"),
            array(">", "<"),
            strip_tags($this->decode_high($text), '<pre>'));
    }

    /**
     * preg_replace_callback() helper: lowercases the tag name of one tag.
     *
     * @param array $matches [whole, "<" or "</", tag name, rest of tag incl. ">"].
     * @return string The same tag with a lowercase name.
     */
    function lowercaseTagName($matches) {
        return $matches[1].strtolower($matches[2]).$matches[3];
    }

    /**
     * Replace typographic glyphs with plain ASCII equivalents.
     *
     * Each glyph is handled both as a literal UTF-8 character and as its
     * decimal numeric entity (e.g. &#8217;). NOTE(review): the entity forms
     * are presumably what this table held originally - confirm against the
     * upstream script.
     *
     * @param string $text
     * @return string
     */
    function detextile_process_glyphs($text) {
        $glyphs = array(
            '’' => '\'',   '&#8217;' => '\'',    # single closing
            '‘' => '\'',   '&#8216;' => '\'',    # single opening
            '”' => '"',    '&#8221;' => '"',     # double closing
            '“' => '"',    '&#8220;' => '"',     # double opening
            '—' => '--',   '&#8212;' => '--',    # em dash
            '–' => ' - ',  '&#8211;' => ' - ',   # en dash
            '×' => 'x',    '&#215;'  => 'x',     # dimension sign
            '™' => '(TM)', '&#8482;' => '(TM)',  # trademark
            '®' => '(R)',  '&#174;'  => '(R)',   # registered
            '©' => '(C)',  '&#169;'  => '(C)',   # copyright
            '…' => '...',  '&#8230;' => '...'    # ellipsis
        );
        // No replacement text contains any of the search keys, so doing all
        // substitutions in one call gives the same result as one at a time.
        return str_replace(array_keys($glyphs), array_values($glyphs), $text);
    }

    /**
     * Turn <ol>/<ul>/<li> markup into Textile list items.
     *
     * <ol> and <ul> tags are removed; every <li> inside an ordered list
     * becomes "# " and inside an unordered list "* ". Each marker is forced
     * onto its own line so items from HTML written without line breaks do
     * not run together. Closing </li> tags are left in place (detextile()
     * strips them afterwards). Nested lists are not tracked: a list opened
     * inside another list is ignored and its closing tag ends list mode.
     *
     * @param string $text
     * @return string
     */
    function detextile_process_lists($text) {
        $list = false;   // false, 'o' (inside <ol>) or 'u' (inside <ul>)
        $out = '';
        // Split into alternating text / tag pieces, keeping the tags.
        $parts = preg_split("/(<.*>)/U", $text, -1, PREG_SPLIT_DELIM_CAPTURE);
        foreach ($parts as $line) {
            if ($list === false && preg_match('/<ol\b/', $line)) {
                $line = '';
                $list = 'o';
            } elseif (preg_match('/<\/ol\b/', $line)) {
                $line = '';
                $list = false;
            } elseif ($list === false && preg_match('/<ul\b/', $line)) {
                $line = '';
                $list = 'u';
            } elseif (preg_match('/<\/ul\b/', $line)) {
                $line = '';
                $list = false;
            } elseif ($list !== false && preg_match('/<li\b[^>]*>/', $line)) {
                $marker = ($list === 'o') ? '# ' : '* ';
                // Start a new line unless the output already sits at one.
                if (!preg_match('/(?:^|\n)[ \t]*\z/', $out)) {
                    $marker = "\n".$marker;
                }
                $line = preg_replace('/<li\b[^>]*>/', $marker, $line);
            }
            $out .= $line;
        }
        return $out;
    }

    /**
     * preg_replace_callback() helper: convert one matched element to Textile.
     *
     * @param array $matches [whole match, tag name, attribute string, inner
     *                       content]; the content entry is absent for <img>.
     * @return string Textile for the element, or the match unchanged when the
     *                tag has no Textile mapping here (lists, tables).
     */
    function processTag($matches) {
        $all = $matches[0];
        $tag = strtolower($matches[1]);
        $atts = isset($matches[2]) ? $matches[2] : '';
        $content = isset($matches[3]) ? $matches[3] : '';
        $a = $this->splat($atts);

        // Inline (phrase) tags: the content is wrapped in the marker.
        $phr = array(
            'em'=>'_',
            'i'=>'__',
            'b'=>'**',
            'strong'=>'*',
            'cite'=>'??',
            'del'=>'-',
            'ins'=>'+',
            'u'=>'+',
            'sup'=>'^',
            'sub'=>'~',
            'span'=>'%',
            'code'=>'@'
        );
        // Block tags: emitted as "tag. content".
        $blk = array('p','h1','h2','h3','h4','h5','h6');

        if (isset($phr[$tag])) {
            return $phr[$tag].$this->sci($a).$content.$phr[$tag];
        }
        if ($tag == 'blockquote') {
            return 'bq.'.$this->sci($a).' '.$content;
        }
        if ($tag == 'center') {
            return 'p=.'.$this->sci($a).' '.$content;
        }
        if (in_array($tag, $blk, true)) {
            return $tag.$this->sci($a).'. '.$content;
        }
        if ($tag == 'a') {
            $t = $this->filterAtts($a, array('href','title'));
            if (!isset($t['href'])) {
                // An anchor without a target cannot be a Textile link.
                return $content;
            }
            $out = '"'.$content;
            if (isset($t['title'])) {
                // Parentheses delimit the title in Textile, so swap them.
                $out .= ' ('.strtr($t['title'], '()', '[]').')';
            }
            return $out.'":'.$t['href'];
        }
        if ($tag == 'img') {
            $t = $this->filterAtts($a, array('src','alt'));
            if (!isset($t['src'])) {
                // Nothing to reference without a source.
                return '';
            }
            $out = '!'.$t['src'];
            if (isset($t['alt'])) {
                $out .= '('.strtr($t['alt'], '()', '[]').')';
            }
            return $out.'!';
        }
        return $all;
    }

    // -------------------------------------------------------------
    /**
     * Pick the named attributes out of a splat() result.
     *
     * @param array $atts Attribute list as returned by splat().
     * @param array $ok   Attribute names to keep.
     * @return array name => value for every listed attribute with a
     *               non-empty value; empty array when none is present.
     */
    function filterAtts($atts,$ok)
    {
        $out = array();
        foreach ($atts as $a) {
            if (in_array($a['name'], $ok, true) && $a['att'] !== '') {
                $out[$a['name']] = $a['att'];
            }
        }
        return $out;
    }

    // -------------------------------------------------------------
    /**
     * Build the Textile attribute modifiers for a tag.
     *
     * class, id, style and cite are rendered as "(=..)", "[=..]", "{=..}"
     * and ":..". align becomes an alignment marker; "<" and ">" are written
     * as the placeholders "/#" and "#\" which detextile() converts back
     * after strip_tags() has run.
     *
     * @param array $a Attribute list as returned by splat().
     * @return string
     */
    function sci($a)
    {
        $align = array(
            'left'    => '/#',
            'right'   => '#\\',
            'center'  => '=',
            'justify' => '/##\\'
        );
        $out = '';
        foreach ($a as $t) {
            switch ($t['name']) {
                case 'class':
                    $out .= '(='.$t['att'].')';
                    break;
                case 'id':
                    $out .= '[='.$t['att'].']';
                    break;
                case 'style':
                    $out .= '{='.$t['att'].'}';
                    break;
                case 'cite':
                    $out .= ':'.$t['att'];
                    break;
                case 'align':
                    if (isset($align[$t['att']])) {
                        $out .= $align[$t['att']];
                    }
                    break;
            }
        }
        return $out;
    }

    // -------------------------------------------------------------
    /**
     * Parse an HTML attribute string into a list of attributes.
     *
     * A small state machine: mode 0 expects a name, mode 1 an "=" (or
     * whitespace, which makes the name a bare attribute), mode 2 a value.
     * Input that fits none of these is skipped up to the next whitespace.
     * Quoted values may be followed directly by other characters such as
     * the "/" of a self-closing tag; unquoted values run to the next
     * whitespace.
     *
     * @param string $attr Raw attribute text from inside a tag.
     * @return array List of array('name'=>, 'whole'=>, 'att'=>) entries,
     *               where 'whole' is the attribute as name=value text and
     *               'att' the bare value (the name itself for bare attributes).
     */
    function splat($attr) // returns attributes as an array
    {
        $arr = array();
        $atnm = '';
        $mode = 0;
        $attr = (string) $attr;
        while (strlen($attr) != 0) {
            $ok = 0;
            $match = array();
            switch ($mode) {
                case 0: // name
                    if (preg_match('/^[a-z]+/i', $attr, $match)) {
                        $atnm = $match[0];
                        $ok = $mode = 1;
                    }
                    break;
                case 1: // =
                    if (preg_match('/^\s*=\s*/', $attr, $match)) {
                        $ok = 1; $mode = 2;
                    } elseif (preg_match('/^\s+/', $attr, $match)) {
                        $ok = 1; $mode = 0;
                        $arr[] = array('name'=>$atnm,'whole'=>$atnm,'att'=>$atnm);
                    }
                    break;
                case 2: // value
                    if (preg_match('/^"([^"]*)"\s*/', $attr, $match)) {
                        $arr[] = array('name'=>$atnm,
                            'whole'=>$atnm.'="'.$match[1].'"',
                            'att'=>$match[1]);
                        $ok = 1; $mode = 0;
                    } elseif (preg_match("/^'([^']*)'\s*/", $attr, $match)) {
                        $arr[] = array('name'=>$atnm,
                            'whole'=>$atnm."='".$match[1]."'",
                            'att'=>$match[1]);
                        $ok = 1; $mode = 0;
                    } elseif (preg_match('/^([^\s"\'<>]+)(\s+|$)/', $attr, $match)) {
                        $arr[] = array('name'=>$atnm,
                            'whole'=>$atnm.'="'.$match[1].'"',
                            'att'=>$match[1]);
                        $ok = 1; $mode = 0;
                    }
                    break;
            }
            if ($ok == 0) {
                // Unparseable: drop everything up to the next whitespace and
                // start over with a fresh attribute name.
                $attr = preg_replace('/^\S*\s*/', '', $attr);
                $mode = 0;
            } else {
                // Consume exactly what the successful match covered.
                $attr = (string) substr($attr, strlen($match[0]));
            }
        }
        // A trailing bare name (e.g. "checked") is still pending here.
        if ($mode == 1) {
            $arr[] = array('name'=>$atnm,'whole'=>$atnm.'="'.$atnm.'"','att'=>$atnm);
        }
        return $arr;
    }

    // -------------------------------------------------------------
    /**
     * Conversion map for mb_decode_numericentity().
     *
     * @return array Flat list of (first code point, last code point, offset,
     *               mask) groups; offset is always 0 and mask always 0xffff.
     */
    function cmap() {
        $f = 0xffff;
        // first,last pairs of the code point ranges that get decoded.
        $bounds = array(
            160, 255,    402, 402,    913, 929,    931, 937,
            945, 969,    977, 978,    982, 982,    8226, 8226,
            8230, 8230,  8242, 8243,  8254, 8254,  8260, 8260,
            8465, 8465,  8472, 8472,  8476, 8476,  8482, 8482,
            8501, 8501,  8592, 8596,  8629, 8629,  8656, 8660,
            8704, 8704,  8706, 8707,  8709, 8709,  8711, 8713,
            8715, 8715,  8719, 8719,  8721, 8722,  8727, 8727,
            8730, 8730,  8733, 8734,  8736, 8736,  8743, 8747,
            8756, 8756,  8764, 8764,  8773, 8773,  8776, 8776,
            8800, 8801,  8804, 8805,  8834, 8836,  8838, 8839,
            8853, 8853,  8855, 8855,  8869, 8869,  8901, 8901,
            8968, 8971,  9001, 9002,  9674, 9674,  9824, 9824,
            9827, 9827,  9829, 9830,  338, 339,    352, 353,
            376, 376,    710, 710,    732, 732,    8194, 8195,
            8201, 8201,  8204, 8207,  8211, 8212,  8216, 8218,
            8218, 8218,  8220, 8222,  8224, 8225,  8240, 8240,
            8249, 8250,  8364, 8364
        );
        $cmap = array();
        for ($i = 0, $n = count($bounds); $i < $n; $i += 2) {
            $cmap[] = $bounds[$i];
            $cmap[] = $bounds[$i + 1];
            $cmap[] = 0;
            $cmap[] = $f;
        }
        return $cmap;
    }

    // -------------------------------------------------------------
    /**
     * Decode the numeric HTML entities covered by cmap() into UTF-8.
     *
     * @param string $text
     * @return string The text unchanged when the mbstring extension is missing.
     */
    function decode_high($text) {
        if (!function_exists('mb_decode_numericentity')) {
            return $text;
        }
        return mb_decode_numericentity($text, $this->cmap(), "UTF-8");
    }
}
// Command-line entry point: read the HTML file named by the first argument,
// convert it to Textile and print the result followed by a newline.
if (!isset($argv[1])) {
    fwrite(STDERR, "Usage: " . (isset($argv[0]) ? $argv[0] : 'html2textile') . " <file.html>\n");
    exit(1);
}
$filename = $argv[1];
// file_get_contents() also copes with empty files, where fread() with a
// zero length raises an error on PHP 8.
$body = is_readable($filename) ? file_get_contents($filename) : false;
if ($body === false) {
    fwrite(STDERR, "Cannot read file: " . $filename . "\n");
    exit(1);
}
// NOTE(review): stripslashes() removes every backslash from the input. It is
// kept for backward compatibility - confirm the inputs really are
// slash-escaped, otherwise it should go.
$html = stripslashes($body);
$html2textile = new html2textile;
$textile = $html2textile->detextile($html);
print($textile . "\n");
?> |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment