Last active
March 1, 2016 10:51
-
-
Save mytory/96e17f9386ee1afbb97e to your computer and use it in GitHub Desktop.
Created by [gnoownow10](https://github.com/gnoownow10). docx, doc, hwp to html.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?php | |
/**
 * Converts document contents (txt, html, doc, docx, hwp) into HTML/XHTML markup.
 *
 * Every supported extension is served by a static "<extension>_handler" method;
 * convert() locates the handler by name and dispatches to it.
 *
 * @see dependencies: libreoffice, pyhwp https://pythonhosted.org/pyhwp/ko/
 * Class Anything2html
 */
class Anything2html {
    /** @var string command name or path of the hwp5html executable (passed to the shell as one escaped word) */
    static $hwp5html;

    /** @var string command name or path of the libreoffice executable (passed to the shell as one escaped word) */
    static $libreoffice;

    /** @var bool set by convert(): true when the call did not end with a successful handler result */
    static $error;

    /**
     * Converts $content using the handler registered for $extension.
     *
     * @param string   $content         raw bytes of the document
     * @param string   $extension       file extension without the dot, e.g. "docx"
     * @param callable $default_handler called with $content when no handler exists for $extension
     * @param callable $error_handler   called with the ExecFailedException when the external command fails
     * @return mixed handler output, or whatever the fallback callable returns
     * @throws Exception any non-ExecFailedException raised by a handler is re-thrown
     *                   (for example when a required executable is missing)
     */
    static function convert($content, $extension, $default_handler, $error_handler) {
        static::$error = false;
        $handler = "{$extension}_handler";

        if (! method_exists(__CLASS__, $handler)) {
            static::$error = true;
            return call_user_func($default_handler, $content);
        }

        try {
            return call_user_func([__CLASS__, $handler], $content);
        } catch (ExecFailedException $e) {
            static::$error = true;
            return call_user_func($error_handler, $e);
        } catch (Exception $e) {
            // Keep the flag truthful for failures that are not routed to $error_handler.
            static::$error = true;
            throw $e;
        }
    }

    /**
     * Tells whether `which` can resolve $cmd.
     *
     * @param string $cmd command name or path
     * @return bool
     */
    static function executable_exists($cmd) {
        $cmd = (string) $cmd;
        if ($cmd === '') {
            return false;
        }

        $output = [];
        // The name is escaped so a configured value can never inject shell syntax.
        exec('which ' . escapeshellarg($cmd), $output, $exit_code);
        return $exit_code === 0 && (bool) $output;
    }

    /**
     * Returns the serialized children of the first <body> element of an XML/XHTML string.
     *
     * @param string $html well-formed XML markup (e.g. the output of html_handler())
     * @return string inner markup of <body>, or '' when the input is empty or has no <body>
     */
    static function get_body_inner_html($html) {
        if ((string) $html === '') {
            return '';
        }

        $dom = new DOMDocument();
        $dom->loadXML($html);
        $body = $dom->getElementsByTagName('body')->item(0);
        if ($body === null) {
            // Unparseable input or a document without <body>: nothing to extract.
            return '';
        }

        $inner_html = '';
        foreach ($body->childNodes as $child) {
            $inner_html .= $dom->saveHTML($child);
        }
        return $inner_html;
    }

    /**
     * Wraps plain text in <pre> and runs it through html_handler().
     *
     * @param string $content plain text, expected to be UTF-8
     * @return string XML serialization of the resulting document
     */
    static function txt_handler($content) {
        // Explicit charset matches the <meta> below; ENT_SUBSTITUTE keeps the text
        // (with U+FFFD replacements) instead of returning '' on invalid byte sequences.
        $content = htmlentities($content, ENT_QUOTES | ENT_SUBSTITUTE, 'UTF-8');
        return self::html_handler("<meta charset='UTF-8'><pre>$content</pre>");
    }

    /**
     * Parses HTML leniently and re-serializes it as XML.
     *
     * @param string $content HTML markup
     * @return string XML serialization of the parsed document
     */
    static function html_handler($content) {
        $dom = new DOMDocument();
        $clean = self::strip_invalid_xml_chars($content);
        // loadHTML() rejects an empty string, so an empty document is serialized as-is.
        if ((string) $clean !== '') {
            $dom->loadHTML($clean);
        }
        return $dom->saveXML();
    }

    /**
     * Converts a Word document to HTML with headless libreoffice.
     *
     * @param string $content   raw bytes of the document
     * @param string $extension extension used for the temporary input file ("docx" or "doc")
     * @return string XML serialization of the root element of the converted document
     * @throws ExecFailedException when libreoffice fails or leaves no output file
     * @throws Exception when libreoffice is not installed or the workspace cannot be prepared
     */
    static function docx_handler($content, $extension = 'docx') {
        if (! self::executable_exists(self::$libreoffice)) {
            throw new Exception("libreoffice executable is not found");
        }

        // prepare file names.
        $workspace = self::get_workspace();
        $origin_file = $workspace . DIRECTORY_SEPARATOR . "document.$extension";
        $out_dir = $workspace . DIRECTORY_SEPARATOR . 'html';
        $out_file = $out_dir . DIRECTORY_SEPARATOR . "document.html";
        if (file_put_contents($origin_file, $content) === false) {
            throw new RuntimeException("Failed to write the document into the workspace");
        }

        // execute the command; HOME points at the workspace for the duration of the command.
        // Every dynamic part is a sprintf argument and shell-escaped, so neither '%' nor
        // shell metacharacters in a path can alter the command.
        $cmd = sprintf("export HOME=%s && %s --headless --convert-to html:HTML %s --outdir %s 2>&1",
            escapeshellarg($workspace),
            escapeshellarg(self::$libreoffice),
            escapeshellarg($origin_file),
            escapeshellarg($out_dir)
        );
        $output = [];
        exec($cmd, $output, $exit_code);

        // sometimes libreoffice returns 139 even if it already did conversion fine
        $has_error = ($exit_code !== 0) && ($exit_code !== 139);
        if ($has_error) {
            throw new ExecFailedException("Failed to convert to html", $exit_code, $cmd, $output);
        }

        // Exit code 139 is tolerated above, so make sure a result was really produced.
        $converted = is_file($out_file) ? file_get_contents($out_file) : false;
        if ($converted === false || $converted === '') {
            throw new ExecFailedException("Failed to convert to html", $exit_code, $cmd, $output);
        }

        $dom = new DOMDocument();
        $dom->loadHTML($converted);
        return $dom->saveXML($dom->documentElement);
    }

    /**
     * Converts an HWP document to XHTML with hwp5html.
     *
     * @param string $content raw bytes of the .hwp file
     * @return string contents of the generated index.xhtml
     * @throws ExecFailedException when hwp5html fails or leaves no output file
     * @throws Exception when hwp5html is not installed or the workspace cannot be prepared
     */
    static function hwp_handler($content) {
        if (! self::executable_exists(self::$hwp5html)) {
            throw new Exception("hwp5html executable is not found.");
        }

        // prepare file names
        $workspace = self::get_workspace();
        $origin_file = $workspace . DIRECTORY_SEPARATOR . "document.hwp";
        $out_dir = $workspace . DIRECTORY_SEPARATOR . "html";
        $out_file = $out_dir . DIRECTORY_SEPARATOR . 'index.xhtml';
        if (file_put_contents($origin_file, $content) === false) {
            throw new RuntimeException("Failed to write the document into the workspace");
        }

        // execute the command; both paths and the executable are shell-escaped.
        $cmd = sprintf("%s %s %s 2>&1",
            escapeshellarg(self::$hwp5html),
            escapeshellarg($origin_file),
            escapeshellarg($out_dir)
        );
        $output = [];
        exec($cmd, $output, $exit_code);

        $has_error = ($exit_code !== 0);
        if ($has_error) {
            throw new ExecFailedException("Failed to convert to html", $exit_code, $cmd, $output);
        }

        $converted = is_file($out_file) ? file_get_contents($out_file) : false;
        if ($converted === false) {
            throw new ExecFailedException("Failed to convert to html", $exit_code, $cmd, $output);
        }
        return $converted;
    }

    /**
     * Converts a legacy .doc document; same pipeline as docx_handler().
     *
     * @param string $content raw bytes of the .doc file
     * @return string
     */
    static function doc_handler($content) {
        return self::docx_handler($content, 'doc');
    }

    /**
     * Removes every character that is not allowed in an XML 1.0 document.
     *
     * @param string $content text, expected to be UTF-8
     * @return string $content without the forbidden characters
     */
    static function strip_invalid_xml_chars($content) {
        // See http://www.w3.org/TR/xml/#charsets
        // Char ::= #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] | [#x10000-#x10FFFF]
        $clean = preg_replace(
            '/[^\x{9}\x{A}\x{D}\x{20}-\x{D7FF}\x{E000}-\x{FFFD}\x{10000}-\x{10FFFF}]/u',
            '',
            $content
        );
        if ($clean !== null) {
            return $clean;
        }

        // preg_replace() yields null when the subject is not valid UTF-8. Fall back to a
        // byte-wise pass that removes only the forbidden ASCII control bytes; those bytes
        // never occur inside a multi-byte sequence, so the remaining text is left untouched.
        return preg_replace('/[^\x09\x0A\x0D\x20-\xFF]/', '', $content);
    }

    /**
     * Creates a private temporary directory that is removed when the script shuts down.
     *
     * @return string absolute path of the new directory
     * @throws RuntimeException when the directory cannot be created
     */
    private static function get_workspace() {
        // Create a random file to obtain a unique name. Getting the name from exec output
        // caused compatibility problems on Mac, so that approach is no longer used.
        $workspace = tempnam(sys_get_temp_dir(), 'a2html.');
        if ($workspace === false) {
            throw new RuntimeException("Failed to reserve a temporary workspace name");
        }

        // Remove the file and create a directory with the same name. The directory is
        // owner-only because it holds the documents being converted, and a failing mkdir()
        // is an error rather than a reason to reuse a directory somebody else created.
        unlink($workspace);
        if (! mkdir($workspace, 0700, true)) {
            throw new RuntimeException("Failed to create the temporary workspace");
        }

        register_shutdown_function(function () use ($workspace) {
            self::rm_rf($workspace);
        });
        return $workspace;
    }

    /**
     * Recursively deletes $path, but only when it resolves to a location inside the
     * system temporary directory.
     *
     * @param string $path directory to delete
     */
    private static function rm_rf($path) {
        $real_path = realpath($path);
        if ($real_path === false) {
            // Already gone: nothing to delete.
            return;
        }

        // Compare resolved paths (the temp dir may itself be a symlink, as on macOS) and
        // require a true directory prefix; a substring match would also accept paths that
        // merely contain the temp dir name somewhere.
        $temp_root = realpath(sys_get_temp_dir());
        $is_inside_temp = $temp_root !== false
            && strpos($real_path, rtrim($temp_root, DIRECTORY_SEPARATOR) . DIRECTORY_SEPARATOR) === 0;

        if ($is_inside_temp) {
            @exec(sprintf("rm -rf %s", escapeshellarg($real_path)));
        } else {
            echo 'You do not rm folder that is not temp.';
            exit;
        }
    }
}
/**
 * Raised when an external command fails; carries the command line and its captured output.
 *
 * Class ExecFailedException
 */
class ExecFailedException extends Exception {
    /** @var string the command line that was executed */
    public $cmd;

    /** @var string captured command output, one line per "\n" */
    public $output;

    /**
     * @param string          $message
     * @param int             $code     exit code of the command
     * @param string          $cmd      the command line that was executed
     * @param string[]|string $output   output lines as filled in by exec(), or already-joined text
     * @param Exception|null  $previous
     */
    public function __construct($message, $code, $cmd, $output, $previous = null) {
        $this->cmd = $cmd;
        // exec() hands over an array of lines; a plain string is accepted as well.
        $this->output = is_array($output) ? implode("\n", $output) : (string) $output;
        parent::__construct($message, (int) $code, $previous);
    }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment