-
-
Save hans/24003 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #!/usr/bin/php -q | |
| <?php | |
| // modified from http://3v1n0.tuxfamily.org/scripts/detextile/HTML-to-Textile.php | |
| class html2textile { | |
| function detextile($text) { | |
| $text = preg_replace("/(<\/?)(\w+)([^>]*>)/e", "'\\1'.strtolower('\\2').'\\3'", $text); | |
| $text = preg_replace("/<br[ \/]*>\s*/","\n",$text); | |
| $text = preg_replace("/(^[\r\n]*|[\r\n]+)[\s\t]*[\r\n]+/", "\n", $text); | |
| $text = stripslashes($text); | |
| $oktags = array('p','ol','ul','li','i','b','em','strong','span','a','h[1-6]', | |
| 'table','tr','td','u','del','sup','sub','blockquote', 'center', 'code'); | |
| $text = preg_replace_callback("/\t*<(img)\s*([^>]*)>/Usi", | |
| array($this,'processTag'),$text); | |
| foreach($oktags as $tag){ | |
| $text = preg_replace_callback("/\t*<(".$tag.")\s*([^>]*)>(.*)<\/\\1>/Usi", | |
| array($this,'processTag'),$text); | |
| } | |
| $text = $this->detextile_process_glyphs($text); | |
| $text = $this->detextile_process_lists($text); | |
| $text = preg_replace('/^\t* *p\. /m','',$text); | |
| return str_replace(array("#\\","/#"), | |
| array(">","<"), | |
| strip_tags($this->decode_high($text), '<pre>')); //strip_tags($text); | |
| } | |
| function detextile_process_glyphs($text) { | |
| $glyphs = array( | |
| '’'=>'\'', # single closing | |
| '‘'=>'\'', # single opening | |
| '”'=>'"', # double closing | |
| '“'=>'"', # double opening | |
| '—'=>'--', # em dash | |
| '–'=>' - ', # en dash | |
| '×' =>'x', # dimension sign | |
| '™'=>'(TM)', # trademark | |
| '®' =>'(R)', # registered | |
| '©' =>'(C)', # copyright | |
| '…'=>'...' # ellipsis | |
| ); | |
| foreach($glyphs as $f=>$r){ | |
| $text = str_replace($f,$r,$text); | |
| } | |
| return $text; | |
| } | |
| function detextile_process_lists($text) { | |
| $list = false; | |
| $text = preg_split("/(<.*>)/U",$text,-1,PREG_SPLIT_DELIM_CAPTURE); | |
| foreach($text as $line){ | |
| if ($list == false && preg_match('/<ol /',$line)){ | |
| $line = ""; | |
| $list = "o"; | |
| } else if (preg_match('/<\/ol/',$line)){ | |
| $line = ""; | |
| $list = false; | |
| } else if ($list == false && preg_match('/<ul/',$line)){ | |
| $line = ""; | |
| $list = "u"; | |
| } else if (preg_match('/<\/ul/',$line)){ | |
| $line = ""; | |
| $list = false; | |
| } else if ($list == 'o'){ | |
| $line = preg_replace('/<li.*>/U','# ', $line); | |
| } else if ($list == 'u'){ | |
| $line = preg_replace('/<li .*>/U','* ', $line); | |
| } | |
| $glyph_out[] = $line; | |
| } | |
| return $text = implode('',$glyph_out); | |
| } | |
| function processTag($matches) { | |
| list($all,$tag,$atts,$content) = $matches; | |
| $a = $this->splat($atts); | |
| $phr = array( | |
| 'em'=>'_', | |
| 'i'=>'__', | |
| 'b'=>'**', | |
| 'strong'=>'*', | |
| 'cite'=>'??', | |
| 'del'=>'-', | |
| 'ins'=>'+', | |
| 'u'=>'+', | |
| 'sup'=>'^', | |
| 'sub'=>'~', | |
| 'span'=>'%', | |
| 'code'=>'@' | |
| ); | |
| $blk = array('p','h1','h2','h3','h4','h5','h6'); | |
| if(isset($phr[$tag])) { | |
| return $phr[$tag].$this->sci($a).$content.$phr[$tag]; | |
| } elseif($tag=='blockquote') { | |
| return 'bq.'.$this->sci($a).' '.$content; | |
| } elseif($tag=='center') { | |
| return 'p=.'.$this->sci($a).' '.$content; | |
| } elseif(in_array($tag,$blk)) { | |
| return $tag.$this->sci($a).'. '.$content; | |
| } elseif ($tag=='a') { | |
| $t = $this->filterAtts($a,array('href','title')); | |
| $out = '"'.$content; | |
| $out.= (isset($t['title'])) ? ' ('.preg_replace(array("/\(/","/\)/"), array("[","]"), $t['title']).')' : ''; | |
| $out.= '":'.$t['href']; | |
| return $out; | |
| } elseif ($tag=='img') { | |
| $t = $this->filterAtts($a,array('src','alt')); | |
| $out = '!'.($t['src']); | |
| $out.= (isset($t['alt'])) ? '('.preg_replace(array("/\(/","/\)/"), array("[","]"), $t['alt']).')' : ''; | |
| $out.= '!'; | |
| return $out; | |
| } else { | |
| return $all; | |
| } | |
| } | |
| // ------------------------------------------------------------- | |
| function filterAtts($atts,$ok) | |
| { | |
| foreach($atts as $a) { | |
| if(in_array($a['name'],$ok)) { | |
| if($a['att']!='') { | |
| $out[$a['name']] = $a['att']; | |
| } | |
| } | |
| } | |
| # dump($out); | |
| return $out; | |
| } | |
| // ------------------------------------------------------------- | |
| function sci($a) | |
| { | |
| $out = ''; | |
| foreach($a as $t){ | |
| $out.= ($t['name']=='class') ? '(='.$t['att'].')' : ''; | |
| $out.= ($t['name']=='id') ? '[='.$t['att'].']' : ''; | |
| $out.= ($t['name']=='style') ? '{='.$t['att'].'}' : ''; | |
| $out.= ($t['name']=='cite') ? ':'.$t['att'] : ''; | |
| if ($t['name']=='align') | |
| if ($t['att'] == "left") | |
| $out.= '/#'; | |
| elseif ($t['att'] == "right") | |
| $out.= '#\\'; | |
| elseif ($t['att'] == "center") | |
| $out.= '='; | |
| elseif ($t['att'] == "justify") | |
| $out.= '/##\\'; | |
| } | |
| return $out; | |
| } | |
| // ------------------------------------------------------------- | |
| function splat($attr) // returns attributes as an array | |
| { | |
| $arr = array(); | |
| $atnm = ''; | |
| $mode = 0; | |
| while (strlen($attr) != 0){ | |
| $ok = 0; | |
| switch ($mode) { | |
| case 0: // name | |
| if (preg_match('/^([a-z]+)/i', $attr, $match)) { | |
| $atnm = $match[1]; $ok = $mode = 1; | |
| $attr = preg_replace('/^[a-z]+/i', '', $attr); | |
| } | |
| break; | |
| case 1: // = | |
| if (preg_match('/^\s*=\s*/', $attr)) { | |
| $ok = 1; $mode = 2; | |
| $attr = preg_replace('/^\s*=\s*/', '', $attr); | |
| break; | |
| } | |
| if (preg_match('/^\s+/', $attr)) { | |
| $ok = 1; $mode = 0; | |
| $arr[] = array('name'=>$atnm,'whole'=>$atnm,'att'=>$atnm); | |
| $attr = preg_replace('/^\s+/', '', $attr); | |
| } | |
| break; | |
| case 2: // value | |
| if (preg_match('/^("[^"]*")(\s+|$)/', $attr, $match)) { | |
| $arr[]=array('name' =>$atnm,'whole'=>$atnm.'='.$match[1], | |
| 'att'=>str_replace('"','',$match[1])); | |
| $ok = 1; $mode = 0; | |
| $attr = preg_replace('/^"[^"]*"(\s+|$)/', '', $attr); | |
| break; | |
| } | |
| if (preg_match("/^('[^']*')(\s+|$)/", $attr, $match)) { | |
| $arr[]=array('name' =>$atnm,'whole'=>$atnm.'='.$match[1], | |
| 'att'=>str_replace("'",'',$match[1])); | |
| $ok = 1; $mode = 0; | |
| $attr = preg_replace("/^'[^']*'(\s+|$)/", '', $attr); | |
| break; | |
| } | |
| if (preg_match("/^(\w+)(\s+|$)/", $attr, $match)) { | |
| $arr[]= | |
| array('name'=>$atnm,'whole'=>$atnm.'="'.$match[1].'"', | |
| 'att'=>$match[1]); | |
| $ok = 1; $mode = 0; | |
| $attr = preg_replace("/^\w+(\s+|$)/", '', $attr); | |
| } | |
| break; | |
| } | |
| if ($ok == 0){ | |
| $attr = preg_replace('/^\S*\s*/', '', $attr); | |
| $mode = 0; | |
| } | |
| } | |
| if ($mode == 1) $arr[] = | |
| array ('name'=>$atnm,'whole'=>$atnm.'="'.$atnm.'"','att'=>$atnm); | |
| return $arr; | |
| } | |
| // ------------------------------------------------------------- | |
| function cmap() { | |
| $f = 0xffff; | |
| $cmap = array( | |
| 160, 255, 0, $f, | |
| 402, 402, 0, $f, | |
| 913, 929, 0, $f, | |
| 931, 937, 0, $f, | |
| 945, 969, 0, $f, | |
| 977, 978, 0, $f, | |
| 982, 982, 0, $f, | |
| 8226, 8226, 0, $f, | |
| 8230, 8230, 0, $f, | |
| 8242, 8243, 0, $f, | |
| 8254, 8254, 0, $f, | |
| 8260, 8260, 0, $f, | |
| 8465, 8465, 0, $f, | |
| 8472, 8472, 0, $f, | |
| 8476, 8476, 0, $f, | |
| 8482, 8482, 0, $f, | |
| 8501, 8501, 0, $f, | |
| 8592, 8596, 0, $f, | |
| 8629, 8629, 0, $f, | |
| 8656, 8660, 0, $f, | |
| 8704, 8704, 0, $f, | |
| 8706, 8707, 0, $f, | |
| 8709, 8709, 0, $f, | |
| 8711, 8713, 0, $f, | |
| 8715, 8715, 0, $f, | |
| 8719, 8719, 0, $f, | |
| 8721, 8722, 0, $f, | |
| 8727, 8727, 0, $f, | |
| 8730, 8730, 0, $f, | |
| 8733, 8734, 0, $f, | |
| 8736, 8736, 0, $f, | |
| 8743, 8747, 0, $f, | |
| 8756, 8756, 0, $f, | |
| 8764, 8764, 0, $f, | |
| 8773, 8773, 0, $f, | |
| 8776, 8776, 0, $f, | |
| 8800, 8801, 0, $f, | |
| 8804, 8805, 0, $f, | |
| 8834, 8836, 0, $f, | |
| 8838, 8839, 0, $f, | |
| 8853, 8853, 0, $f, | |
| 8855, 8855, 0, $f, | |
| 8869, 8869, 0, $f, | |
| 8901, 8901, 0, $f, | |
| 8968, 8971, 0, $f, | |
| 9001, 9002, 0, $f, | |
| 9674, 9674, 0, $f, | |
| 9824, 9824, 0, $f, | |
| 9827, 9827, 0, $f, | |
| 9829, 9830, 0, $f, | |
| 338, 339, 0, $f, | |
| 352, 353, 0, $f, | |
| 376, 376, 0, $f, | |
| 710, 710, 0, $f, | |
| 732, 732, 0, $f, | |
| 8194, 8195, 0, $f, | |
| 8201, 8201, 0, $f, | |
| 8204, 8207, 0, $f, | |
| 8211, 8212, 0, $f, | |
| 8216, 8218, 0, $f, | |
| 8218, 8218, 0, $f, | |
| 8220, 8222, 0, $f, | |
| 8224, 8225, 0, $f, | |
| 8240, 8240, 0, $f, | |
| 8249, 8250, 0, $f, | |
| 8364, 8364, 0, $f | |
| ); | |
| return $cmap; | |
| } | |
| // ------------------------------------------------------------- | |
| function decode_high($text) { | |
| $cmap = $this->cmap(); | |
| return mb_decode_numericentity($text, $cmap, "UTF-8"); | |
| } | |
| } | |
| $filename = $argv[1]; | |
| $fp = fopen($filename, "r"); | |
| $body = fread($fp, filesize($filename)); | |
| fclose($fp); | |
| $html = stripslashes($body); | |
| $html2textile = new html2textile; | |
| $textile = $html2textile->detextile($html); | |
| print($textile . "\n"); | |
| ?> |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment