Last active
August 29, 2015 14:24
-
-
Save mcantelon/082a6ceefcbfa66ede99 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?php | |
/** | |
* Remove HTML tags from various i18n information object fields. | |
* | |
* To execute script (from AtoM root directory): | |
* | |
* php symfony tools:run remove-html.php | |
*/ | |
class i18nRemoveHtmlTags | |
{ | |
public static $columns = array( | |
'title', | |
'alternate_title', | |
'edition', | |
'extent_and_medium', | |
'archival_history', | |
'acquisition', | |
'scope_and_content', | |
'appraisal', | |
'accruals', | |
'arrangement', | |
'access_conditions', | |
'reproduction_conditions', | |
'physical_characteristics', | |
'finding_aids', | |
'location_of_originals', | |
'location_of_copies', | |
'related_units_of_description', | |
'institution_responsible_identifier', | |
'rules', | |
'sources', | |
'revision_history' | |
); | |
public function execute() | |
{ | |
$rowCount = 0; | |
$changedCount = 0; | |
$columnsChangedCount = 0; | |
// Fetch all information object i18n rows | |
$query = "SELECT * FROM information_object_i18n WHERE id != ". QubitInformationObject::ROOT_ID; | |
$statement = QubitPdo::prepareAndExecute($query); | |
while ($io = $statement->fetch(PDO::FETCH_OBJ)) | |
{ | |
// Process HTML in row's columns | |
$columnsChanged = $this->processInformationObjectI18nHtml($io); | |
// Update total column values changed | |
if ($columnsChanged) | |
{ | |
$changedCount++; | |
$columnsChangedCount += $columnsChanged; | |
} | |
// Report progress | |
$message = 'Processed information object '.$io->id; | |
if ($columnsChanged) | |
{ | |
$message .= ' ('. $columnsChanged . ' changes)'; | |
} | |
print $message ."\n"; | |
$rowCount++; | |
} | |
// Report summary of processing | |
$message = 'Processed '. $rowCount .' information objects.'; | |
if ($changedCount) | |
{ | |
$message .= ' Changed '. $changedCount .' information objects'; | |
$message .= ' ('. $columnsChangedCount .' field values changed).'; | |
} | |
print $message ."\n"; | |
} | |
/** | |
* Determine what information object i18n columns are populated | |
* and update them. | |
* | |
* @param stdClass $io row of information object i18n data | |
* | |
* @return integer number of columns changed | |
*/ | |
private function processInformationObjectI18nHtml(&$io) | |
{ | |
// Determine what column values contain HTML | |
$columnValues = array(); | |
foreach(i18nRemoveHtmlTags::$columns as $column) | |
{ | |
// Store column name/value for processing if it contains tags | |
if ($io->{$column} && ($io->{$column} != strip_tags($io->{$column}))) | |
{ | |
$columnValues[$column] = $io->{$column}; | |
} | |
} | |
// Update database with transformed column values | |
$this->transformHtmlInI18nTableColumns( | |
'information_object_i18n', $io->id, $io->culture, $columnValues); | |
return count($columnValues); | |
} | |
/** | |
* Transform HTML column values into text and update specified i18n table row | |
* | |
* @param string i18n table name | |
* @param integer $id ID of row in an i18n table | |
* @param string $culture culture code of a row in an i18n table | |
* @param array $columnValues key/value array of column/value data to process | |
* | |
* @return void | |
*/ | |
private function transformHtmlInI18nTableColumns($table, $id, $culture, $columnValues) | |
{ | |
// Aseemble query and note parsed column values | |
$values = array(); | |
$query = 'UPDATE '. $table .' SET '; | |
foreach($columnValues as $column => $value) | |
{ | |
// Only update if tags are found | |
if ($value != strip_tags($value)) { | |
$transformedValue = $this->transformHtmlToText($value); | |
$query .= (count($values)) ? ', ' : ''; | |
$query .= $column ."=?"; | |
$values[] = $transformedValue; | |
} | |
} | |
$query .= " WHERE id='". $id ."' AND culture='". $culture ."'"; | |
if (count($values)) | |
{ | |
QubitPdo::prepareAndExecute($query, $values); | |
} | |
} | |
/** | |
* Transform HTML into text using the Document Object Model | |
* | |
* @param string $html HTML to transform into text | |
* | |
* @return string transformed text | |
*/ | |
private function transformHtmlToText($html) | |
{ | |
// Parse HTML | |
$doc = new DOMDocument(); | |
$doc->loadHTML(mb_convert_encoding($html, 'HTML-ENTITIES', 'UTF-8')); | |
// Apply transformations | |
$this->transformDocument($doc); | |
// Convert to string and strip leading/trailing whitespace | |
return trim(strip_tags($doc->saveXml($doc->documentElement))); | |
} | |
/** | |
* Transform specific tags within a DOM document | |
* | |
* @param DOMDocument $doc DOM document | |
* | |
* @return void | |
*/ | |
private function transformDocument(&$doc) | |
{ | |
// Create text representations of various HTML tags | |
$this->transformDocumentLinks($doc); | |
$this->transformDocumentLists($doc); | |
$this->transformDocumentDescriptionLists($doc); | |
$this->transformDocumentBreaks($doc); | |
// Deal with paragraphs last, as other transformations create them | |
$this->transformDocumentParasIntoNewlines($doc); | |
} | |
/** | |
* Transform link tags into text | |
* | |
* @param DOMDocument $doc DOM document | |
* | |
* @return void | |
*/ | |
private function transformDocumentLinks(&$doc) | |
{ | |
$linkList = $doc->getElementsByTagName('a'); | |
// Loop through each <a> tag and replace with text content | |
while ($linkList->length > 0) | |
{ | |
$linkNode = $linkList->item(0); | |
$linkText = $linkNode->textContent; | |
$linkHref = $linkNode->getAttribute('href'); | |
if ($linkHref) | |
{ | |
if (0 === strpos(strtolower($linkHref), 'mailto:')) { | |
$linkHref = removeFromStartOfString($linkHref, 'mailto:'); | |
} | |
$linkText .= ' ['. $linkHref .']'; | |
} | |
$newTextNode = $doc->createTextNode($linkText); | |
$linkNode->parentNode->replaceChild($newTextNode, $linkNode); | |
} | |
} | |
/** | |
* Transform unordered list-related tags into text and enclose in | |
* <p> tags | |
* | |
* @param DOMDocument $doc DOM document | |
* | |
* @return void | |
*/ | |
private function transformDocumentLists(&$doc) | |
{ | |
$ulList = $doc->getElementsByTagName('ul'); | |
// Loop through each <ul> tag and change to a <p> tag | |
while ($ulList->length > 0) | |
{ | |
$listNode = $ulList->item(0); | |
$newParaNode = $doc->createElement('p'); | |
// Assemble text representation of list | |
$paraText = ''; | |
foreach($listNode->childNodes as $childNode) | |
{ | |
$paraText .= '* '. $childNode->textContent ."\n"; | |
} | |
// Set <p> element's text | |
$newTextNode = $doc->createTextNode($paraText); | |
$newParaNode->appendChild($newTextNode); | |
$listNode->parentNode->replaceChild($newParaNode, $listNode); | |
} | |
} | |
/** | |
* Transform description list-related tags, removing description | |
* terms and enclosing the description text in <p> tags | |
* | |
* @param DOMDocument $doc DOM document | |
* | |
* @return void | |
*/ | |
private function transformDocumentDescriptionLists(&$doc) | |
{ | |
$termList = $doc->getElementsByTagName('dt'); | |
// Loop through each <dt> tag and remove it | |
while ($termList->length > 0) | |
{ | |
$termNode = $termList->item(0); | |
$termNode->parentNode->removeChild($termNode); | |
} | |
$descriptionList = $doc->getElementsByTagName('dd'); | |
// Look through each <dd> element and change to a <p> element | |
while ($descriptionList->length > 0) | |
{ | |
$descriptionNode = $descriptionList->item(0); | |
// Create <p> node with description's text | |
$newParaNode = $doc->createElement('p'); | |
$newTextNode = $doc->createTextNode($descriptionNode->textContent); | |
$newParaNode->appendChild($newTextNode); | |
$descriptionNode->parentNode->replaceChild($newParaNode, $descriptionNode); | |
} | |
} | |
/** | |
* Transform break tags into newlines | |
* | |
* @param DOMDocument $doc DOM document | |
* | |
* @return void | |
*/ | |
private function transformDocumentBreaks(&$doc) | |
{ | |
$breakList = $doc->getElementsByTagName('br'); | |
// Loop through each <p> and replace with text | |
while($breakList->length) | |
{ | |
$breakNode = $breakList->item(0); | |
$newTextNode = $doc->createTextNode("\n"); | |
$breakNode->parentNode->replaceChild($newTextNode, $breakNode); | |
} | |
} | |
/** | |
* Transform paragraph tags into newlines | |
* | |
* @param DOMDocument $doc DOM document | |
* | |
* @return void | |
*/ | |
private function transformDocumentParasIntoNewlines(&$doc) | |
{ | |
$paraList = $doc->getElementsByTagName('p'); | |
// Loop through each <p> and replace with text | |
while($paraList->length) | |
{ | |
$paraNode = $paraList->item(0); | |
$paraText = "\n". $paraNode->textContent ."\n"; | |
$newTextNode = $doc->createTextNode($paraText); | |
$paraNode->parentNode->replaceChild($newTextNode, $paraNode); | |
} | |
} | |
/** | |
* Remove text at the start of strings | |
* | |
* @param string $string text | |
* @param string $substring text to remove | |
* | |
* @return string processed text | |
*/ | |
private function removeFromStartOfString($string, $substring) | |
{ | |
return substr($string, strlen($substring), strlen($string) - strlen($substring)); | |
} | |
} | |
$script = new i18nRemoveHtmlTags(); | |
$script->execute(); |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment