Created
March 29, 2016 16:46
-
-
Save forethoughtde/913d7380c09a77169d2e076b03a26e67 to your computer and use it in GitHub Desktop.
Program that reads an HTML file, splits it into chunks at marker tags, writes each chunk to its own file, and creates a JSON file containing the chapter headings.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?php | |
/**
 * Splits an HTML file into numbered chunk files and records chapter headings.
 *
 * Constructing the object runs the whole job (see init()):
 *  - a directory named after the input file (without extension) is created
 *    and becomes the current working directory;
 *  - the input is read line by line; a line matching the sub-chapter pattern
 *    or the user-defined token starts a new chunk file, and every following
 *    line is appended to the chunk that is currently open;
 *  - lines matching the chapter-heading pattern are not written to a chunk;
 *    their text (tags stripped) is collected and finally written to
 *    config.json as {"chapter": [...]}.
 *
 * NOTE: the working directory of the process stays changed after the run.
 */
class parseHTML
{
    /** @var int Counter appended to chunked file names. */
    protected $counter = 0;

    /**
     * @var array Book information written to config.json. Chapter headings are
     *            collected under the 'chapter' key; initialised so the JSON
     *            always contains that key even when no heading is found.
     */
    protected $bookInfo = ['chapter' => []];

    /** @var string Path of the input file as passed to the constructor. */
    protected $file;

    /** @var string Name of the input file without directory and extension. */
    protected $fileName;

    /** @var resource|null Handle of the input file while it is being read. */
    protected $fileHandle;

    /** @var resource|null Handle of the chunk file currently being written. */
    protected $writingFileHandle;

    /** @var bool True once the first chunk boundary has been seen. */
    protected $matchTag = false;

    /** @var mixed Value stored by setMatchTag(); not used by the parser itself. */
    protected $tag;

    /**
     * @var string Regex identifying a chapter-heading line.
     *             (Presumably a CSS class name emitted by the HTML export - confirm.)
     */
    protected $chapterHeadTag = "/x-berschrift-1--nur-f-r-Header-/";

    /** @var string Regex identifying a sub-chapter line; starts a new chunk. */
    protected $subChapterTag = "/x-berschrift-2/";

    /**
     * @var string Regex identifying a user-defined chunk marker. The brackets
     *             are escaped so only the literal text "[CHUNK]" matches; an
     *             unescaped "[CHUNK]" is a character class matching any single
     *             C, H, U, N or K.
     */
    protected $userDefinedToken = '/\[CHUNK\]/';

    /**
     * Stores the input path and immediately runs the whole chunking job.
     *
     * @param string $file Path of the HTML file to split.
     *
     * @throws RuntimeException When a file or directory operation fails.
     */
    public function __construct($file)
    {
        $this->file = $file;
        $this->init();
    }

    /**
     * Runs the job: prepare the output directory, chunk the input file and
     * write config.json.
     *
     * @return void
     *
     * @throws RuntimeException When a file or directory operation fails.
     */
    public function init()
    {
        $this->extractFileName($this->file);
        $this->makeDirectory($this->fileName);

        // Open the input before changing directory so a relative input path
        // is still resolved against the original working directory.
        $this->fileHandle = $this->openReadFile($this->file);
        $this->changeDirectory($this->fileName);
        $this->readingFile($this->fileHandle);

        // Release the last chunk file and the input file.
        if ($this->writingFileHandle) {
            $this->closeFile($this->writingFileHandle);
            $this->writingFileHandle = null;
        }
        $this->closeFile($this->fileHandle);
        $this->fileHandle = null;

        $this->openJSONFile();
    }

    /**
     * Reads the opened file line by line and feeds each line to
     * checkingMatchTag().
     *
     * @param resource $fHandle Readable file handle.
     *
     * @return void
     */
    public function readingFile($fHandle)
    {
        // fgets() returns false at end of file, so no stray non-string value
        // is ever passed on to the matcher.
        while (($line = fgets($fHandle)) !== false) {
            $this->checkingMatchTag($line);
        }
    }

    /**
     * Opens the given file for reading.
     *
     * @param string $file Path of the file to open.
     *
     * @return resource File pointer.
     *
     * @throws RuntimeException When the file is missing or cannot be opened.
     */
    public function openReadFile($file)
    {
        $handle = is_readable($file) ? fopen($file, 'r') : false;
        if ($handle === false) {
            throw new RuntimeException("Unable to open '{$file}' for reading.");
        }

        return $handle;
    }

    /**
     * Extracts and stores the file name (base name without directory and
     * without the last extension), e.g. "books/my.book.html" gives "my.book".
     *
     * @param string $file Path of the input file.
     *
     * @return void
     */
    public function extractFileName($file)
    {
        $this->fileName = pathinfo($file, PATHINFO_FILENAME);
    }

    /**
     * Returns the stored file name (name of the file without extension).
     *
     * @param mixed $file Ignored; accepted only for backward compatibility.
     *
     * @return string
     */
    public function getFileName($file = null)
    {
        return $this->fileName;
    }

    /**
     * Returns the input path as passed to the constructor.
     *
     * @return string
     */
    public function getFile()
    {
        return $this->file;
    }

    /**
     * Classifies one input line and acts on it:
     *  - chapter heading: record its title, write nothing;
     *  - sub-chapter or user-defined token: start a new chunk file and write
     *    the line to it;
     *  - anything else: append to the current chunk, if one has been started.
     *
     * @param string $line One line of the input file.
     *
     * @return void
     */
    public function checkingMatchTag($line)
    {
        if (preg_match($this->chapterHeadTag, $line)) {
            $this->extractTitle($line);
            return;
        }

        if (preg_match($this->subChapterTag, $line) || preg_match($this->userDefinedToken, $line)) {
            if ($this->writingFileHandle && $this->matchTag) {
                // A chunk is already open: finish it and move to the next number.
                $this->closeFile($this->writingFileHandle);
                $this->incrementCounter();
            }
            $this->openWriteFile();
            $this->matchTag = true;
            $this->writeLineToFile($line);
            return;
        }

        if ($this->matchTag) {
            $this->writeLineToFile($line);
        }
    }

    /**
     * Creates a directory with $name if it does not exist.
     *
     * @param string $name Directory path.
     *
     * @return void
     *
     * @throws RuntimeException When the directory cannot be created.
     */
    public function makeDirectory($name)
    {
        // The second is_dir() covers the case where the directory appeared
        // between the first check and the mkdir() call.
        if (!is_dir($name) && !mkdir($name) && !is_dir($name)) {
            throw new RuntimeException("Unable to create directory '{$name}'.");
        }
    }

    /**
     * Records the text of a chapter-heading line (tags stripped, surrounding
     * whitespace including the line break removed) in the book information.
     *
     * @param string $line The heading line.
     *
     * @return void
     */
    public function extractTitle($line)
    {
        $this->bookInfo['chapter'][] = trim(strip_tags($line));
    }

    /**
     * Changes the current working directory to $name.
     *
     * @param string $name Directory path.
     *
     * @return void
     *
     * @throws RuntimeException When the directory cannot be entered.
     */
    public function changeDirectory($name)
    {
        if (!chdir($name)) {
            throw new RuntimeException("Unable to change directory to '{$name}'.");
        }
    }

    /**
     * Writes the collected book information as JSON to config.json in the
     * current working directory.
     *
     * @return void
     *
     * @throws RuntimeException When config.json cannot be opened.
     */
    public function openJSONFile()
    {
        $handleJSON = fopen('config.json', 'w');
        if ($handleJSON === false) {
            throw new RuntimeException("Unable to open 'config.json' for writing.");
        }
        fwrite($handleJSON, json_encode($this->bookInfo));
        $this->closeFile($handleJSON);
    }

    /**
     * Opens the chunk file for the current counter value and makes it the
     * active write target. The name is "<fileName>000<counter>.html" in the
     * current working directory.
     *
     * @return void
     *
     * @throws RuntimeException When the chunk file cannot be opened.
     */
    public function openWriteFile()
    {
        $chunkName = $this->fileName . '000' . $this->counter . '.' . 'html';
        $handle = fopen($chunkName, 'w');
        if ($handle === false) {
            throw new RuntimeException("Unable to open '{$chunkName}' for writing.");
        }
        $this->writingFileHandle = $handle;
    }

    /**
     * Stores a match tag value.
     *
     * @param mixed $tag Value to store.
     *
     * @return void
     */
    public function setMatchTag($tag)
    {
        $this->tag = $tag;
    }

    /**
     * Returns the value stored by setMatchTag().
     *
     * @return mixed
     */
    public function getMatchTag()
    {
        return $this->tag;
    }

    /**
     * Writes a line to the currently opened chunk file.
     *
     * @param string $line Text to write.
     *
     * @return void
     */
    public function writeLineToFile($line)
    {
        fwrite($this->writingFileHandle, $line);
    }

    /**
     * Replaces the user-defined chunk marker pattern.
     *
     * @param string $token Regular expression, delimiters included; it is
     *                      passed unchanged to preg_match().
     *
     * @return void
     */
    public function setUserDefineToken($token)
    {
        $this->userDefinedToken = $token;
    }

    /**
     * Returns the user-defined chunk marker pattern.
     *
     * @return string
     */
    public function getUserDefineToken()
    {
        return $this->userDefinedToken;
    }

    /**
     * Increments the chunk counter.
     *
     * @return void
     */
    public function incrementCounter()
    {
        $this->counter++;
    }

    /**
     * Returns the chunk counter.
     *
     * @return int
     */
    public function getCounter()
    {
        return $this->counter;
    }

    /**
     * Closes the given file handle.
     *
     * @param resource $handle Open file handle.
     *
     * @return void
     */
    public function closeFile($handle)
    {
        fclose($handle);
    }
}
// Entry point: split the HTML file named by the first command-line argument.
// Without an argument (or outside the CLI) the original default file is used,
// so running the script with no arguments behaves exactly as before.
$inputFile = (PHP_SAPI === 'cli' && isset($argv[1])) ? $argv[1] : 'testing1.html';
new parseHTML($inputFile);
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment