Skip to content

Instantly share code, notes, and snippets.

@heddn
Created December 19, 2013 14:35
Show Gist options
  • Select an option

  • Save heddn/8040014 to your computer and use it in GitHub Desktop.

Select an option

Save heddn/8040014 to your computer and use it in GitHub Desktop.
<?php
/**
* @file
* HTML Migration.
*/
abstract class PfizerProMigration extends Migration {
  /**
   * Directory that holds the source HTML files.
   *
   * @var string
   */
  public $baseDir;

  /**
   * Map of site-relative URL path => source file path, built from the sitemap.
   *
   * @var array
   */
  public $siteMapArray;

  /**
   * Constructor.
   */
  public function __construct() {
    parent::__construct();
    // Build the site map array from the Pfizer provided sitemap.
    $this->_parseSiteMap();
    // The HTML files live in the "fixtures" directory of the pfizer_migration
    // module, so the location follows the module wherever it is installed.
    $this->baseDir = drupal_get_path('module', 'pfizer_migration') . '/fixtures';
  }

  /**
   * Builds $this->siteMapArray from the sitemap HTML file.
   *
   * Every link whose text contains "--" contributes one entry: the text after
   * the first "--" is the source file path (with the legacy directory prefixes
   * rewritten) and the link's href, minus the site root, is the URL path.
   */
  protected function _parseSiteMap() {
    $this->siteMapArray = array();
    $siteMapLocation = realpath('./') . '/sites/all/files/pfizer migration sitemap.html';
    $siteMapDocument = file_get_html($siteMapLocation);
    // A missing or unreadable sitemap must not be dereferenced. The map stays
    // empty and prepareAlias() derives every alias from the file path instead.
    if (!is_object($siteMapDocument)) {
      return;
    }
    foreach ($siteMapDocument->find('a') as $element) {
      $parts = explode('--', $element->innertext);
      if (!array_key_exists(1, $parts)) {
        continue;
      }
      $filepath = str_replace('/wyeth_html/home/minisites/', '/hcp/', trim($parts[1]));
      $filepath = str_replace('/wyeth_html/home/products_landing/', '/products_landing/', $filepath);
      $urlpath = str_replace('http://www.pfizerpro.com/', '', $element->href);
      $this->siteMapArray[$urlpath] = $filepath;
    }
  }

  /**
   * Computes the URL alias for a source row.
   *
   * @param object $row
   *   The source row; only $row->sourceid (the source file path) is read.
   *
   * @return string
   *   The alias from the sitemap when the file is listed there, otherwise a
   *   lower-cased alias derived from the file path.
   */
  protected function prepareAlias($row) {
    // If this document is listed in the sitemap then we know the alias.
    // Search strictly and compare against FALSE so that a key such as "0" is
    // not mistaken for "not found". An empty key is not a usable alias, so it
    // still falls through to the derived alias below.
    $key = array_search($row->sourceid, $this->siteMapArray, TRUE);
    if ($key !== FALSE && (string) $key !== '') {
      return (string) $key;
    }

    // Split the source path into its directories and the page file name.
    $path = explode('/', ltrim($row->sourceid, '/'));
    $page = strtolower(array_pop($path));
    $dir = empty($path) ? '' : strtolower(end($path));

    // Remove the .html extension, the directory name and "index" from the
    // page name. Without a parent directory there is nothing directory-based
    // to strip (and stripping "_" alone would mangle the page name).
    $search = array('.html');
    if ($dir !== '') {
      $search[] = $dir . '_';
      $search[] = $dir;
    }
    $search[] = 'index';
    $page = str_replace($search, '', $page);

    // An empty page name means this is the index page of its directory, so the
    // alias is just the directory; otherwise append the capitalised page name.
    if ($page !== '') {
      $path[] = ucfirst($page);
    }
    $alias = implode('/', $path);

    // Hand-maintained alias overrides, applied in order.
    $overrides = array(
      // effexorxr microsite.
      'hcp/effexor/Overview' => 'hcp/effexorxr',
      'hcp/effexor/Important_treatment_considerations' => 'hcp/effexorxr/important-safety-information',
      // flectorpatch microsite.
      'hcp/flectorpatch/Adverseevents' => 'hcp/flectorpatch/adverse-events',
      'hcp/flectorpatch/Flector-patch-samples' => 'hcp/flectorpatch/samples',
      'hcp/flectorpatch/Isi' => 'hcp/flectorpatch/safety-information',
    );
    $alias = str_replace(array_keys($overrides), array_values($overrides), $alias);

    return strtolower($alias);
  }
}
class PfizerProHTMLMigration extends PfizerProMigration {
  /**
   * Constructor.
   */
  public function __construct() {
    parent::__construct();
    // A map of source HTML filename -> destination node id.
    $this->map = new MigrateSQLMap($this->machineName,
      array(
        'sourceid' => array(
          'type' => 'varchar',
          'length' => 255,
          'not null' => TRUE,
        ),
      ),
      MigrateDestinationNode::getKeySchema()
    );
    // The source fields populated by prepareRow().
    $fields = array(
      'title' => t('Title'),
      'body' => t('Body'),
      'uid' => t('User id'),
      'url_alias' => t('URL alias'),
    );
    // Match HTML files.
    $regex = '/(.*\.htm$|.*\.html$)/i';
    // The source of the migration is HTML files from the old site.
    $list_files = new MigrateListFiles(array($this->baseDir), $this->baseDir, $regex);
    $item_file = new MigrateItemFile($this->baseDir);
    $this->source = new MigrateSourceList($list_files, $item_file, $fields);
    // The destination is the "page" content type.
    $this->destination = new MigrateDestinationNode('page');
    // Map the fields, pretty straightforward in this case.
    $this->addFieldMapping('uid', 'uid');
    $this->addFieldMapping('title', 'title');
    $this->addFieldMapping('body', 'body')
      ->arguments(array('format' => 'raw_html'));
    // NOTE(review): nothing in this class sets $row->created - confirm whether
    // the source provides it.
    $this->addFieldMapping('created', 'created');
    $this->addFieldMapping('path', 'url_alias');
  }

  /**
   * Prepare a row.
   *
   * Replaces wyethcomponent include tags with [[nid:...]] tokens, then pulls
   * the head styles/scripts, the body markup, the title and the URL alias out
   * of the HTML file onto the row.
   *
   * @param object $row
   *   The source row; reads sourceid and filedata.
   *
   * @return bool
   *   FALSE to skip the row, TRUE to import it.
   */
  public function prepareRow($row) {
    // Always include this fragment at the beginning of every prepareRow()
    // implementation, so parent classes can ignore rows.
    if (parent::prepareRow($row) === FALSE) {
      return FALSE;
    }
    // Set to admin for now.
    $row->uid = 1;
    // Files under '/includes/' are handled by another import; skip them here.
    if (stripos($row->sourceid, '/includes/') !== FALSE) {
      return FALSE;
    }
    // Find the occurrences of the wyethcomponent tag. $matches must start out
    // as an array so that a document without includes yields an empty list.
    $matches = array();
    $this->_findIncludes($row->filedata, $matches);
    $document = $row->filedata;
    $this->_replaceWyethComponent($document, $matches);
    // Load HTML from a string. The parser returns a non-object for documents
    // it refuses to load; such a file cannot be imported.
    $html = str_get_html($document);
    if (!is_object($html)) {
      return FALSE;
    }
    // Retrieve all the css and scripts.
    $styles = array();
    $scripts = array();
    $head = $html->find('head', 0);
    if (!empty($head)) {
      foreach ($head->find('style, script') as $element) {
        switch ($element->tag) {
          case 'style':
            $styles[] = $element->innertext;
            break;

          case 'script':
            $scripts[] = $element->innertext;
            break;
        }
      }
    }
    // Set the styles and scripts.
    $row->styles = $styles;
    $row->scripts = $scripts;
    // Retrieve the body; stays NULL when the document has no <body>.
    $body = $html->find('body', 0);
    $row->body = empty($body) ? NULL : $body->innertext;
    //Un comment these lines if you encounter problems with the import
    // $row->body = str_replace('’', "&apos;", $row->body);
    // $row->body = str_replace('∗', '*', $row->body);
    // $row->body = str_replace(chr(146), "'", $row->body);
    // $row->body = str_replace(chr(153), "&trade;", $row->body);
    // $row->body = str_replace(chr(151), "&mdash;", $row->body);
    // $row->body = str_replace(chr(174), "&reg;", $row->body);
    // $row->body = str_replace('®', "&reg;", $row->body);
    // The title is the filename if it's not found in the html.
    $title_element = $html->find('title', 0);
    $title = empty($title_element) ? '' : $title_element->innertext;
    // Everything needed has been copied out as strings; release the DOM so
    // memory does not pile up across a long run of files.
    $html->clear();
    unset($html);
    $row->title = $this->_decodeTitleEntities(empty($title) ? $row->sourceid : $title);
    $row->url_alias = $this->prepareAlias($row);
    return TRUE;
  }

  /**
   * Prepare the entity.
   *
   * Stores the head styles and scripts collected by prepareRow() on the
   * entity's cpn property, one entry per line.
   */
  public function prepare($entity, $row) {
    // Save the styles and scripts.
    $entity->cpn['css'] = implode("\n", $row->styles);
    $entity->cpn['js'] = implode("\n", $row->scripts);
  }

  /**
   * Decodes the entities handled for titles into literal characters.
   *
   * Covers &reg;, &mdash; and the decimal references &#33;-&#90; and
   * &#97;-&#122;, applied one after another in exactly that order.
   *
   * @param string $title
   *   The raw title text.
   *
   * @return string
   *   The title with the listed entities replaced.
   */
  private function _decodeTitleEntities($title) {
    // NOTE(review): &mdash; is mapped to an en dash, not an em dash - confirm
    // this is intended.
    $search = array('&reg;', '&mdash;');
    $replace = array('®', '–');
    foreach (array_merge(range(33, 90), range(97, 122)) as $code) {
      $search[] = '&#' . $code . ';';
      $replace[] = chr($code);
    }
    return str_replace($search, $replace, $title);
  }

  /**
   * Collects the wyethcomponent ResourceInclude tags found in a document.
   *
   * @param string $document
   *   The (remaining) document text to scan.
   * @param array $matches
   *   Receives one entry per include that is not inside an HTML comment:
   *   "<directory>/<include path>" => the tag text to replace.
   */
  private function _findIncludes($document, &$matches) {
    $tag_prefix = '<wyethcomponent com.wyeth.include.html.ResourceInclude';
    // Destination directory => legacy include root.
    $include_roots = array(
      'hcp' => '/wyeth_html/home/minisites',
      'products_landing' => '/wyeth_html/home/products_landing',
    );

    // Compare against FALSE: a tag at offset 0 is still a tag.
    $tag_start = stripos($document, $tag_prefix);
    if ($tag_start === FALSE) {
      return;
    }
    $from_tag = substr($document, $tag_start);
    $tag_close = strpos($from_tag, '>');
    if ($tag_close === FALSE) {
      // Unterminated tag: there is no reliable path to extract.
      return;
    }

    // Look for a known include root inside this tag only, so a root that
    // belongs to a later tag is never attributed to this one.
    $tag_text = substr($from_tag, 0, $tag_close);
    $directory_name = NULL;
    $path_end = 0;
    foreach ($include_roots as $key => $root) {
      $root_pos = stripos($tag_text, $root);
      if ($root_pos !== FALSE) {
        $path_end = $root_pos + strlen($root);
        $directory_name = $key;
        break;
      }
    }

    if ($directory_name === NULL) {
      // Unknown include root: skip this tag and keep scanning after it.
      $this->_findIncludes(substr($from_tag, $tag_close + 1), $matches);
      return;
    }

    // Everything after the include root; the relative path runs up to ">".
    $remaining_document = substr($from_tag, $path_end);
    $include_path = trim(substr($from_tag, $path_end, $tag_close - $path_end), '/ ');
    // NOTE(review): the length covers the tag through ">" plus one further
    // character, as before - confirm that extra character is intended.
    $entire_tag_to_replace = substr($document, $tag_start, $tag_close + 2);
    if (!$this->_isCommented($remaining_document)) {
      // The directory prefix makes the paths line up with the migrated files.
      $matches[$directory_name . '/' . $include_path] = $entire_tag_to_replace;
    }
    // Recursively call the function so we find all the matches in each file.
    $this->_findIncludes($remaining_document, $matches);
  }

  /**
   * Replaces every collected include tag with a [[nid:<path>]] token.
   *
   * @param string $document
   *   The document text, modified in place.
   * @param array $matches
   *   Include path => tag text, as collected by _findIncludes().
   */
  private function _replaceWyethComponent(&$document, $matches) {
    if (empty($matches)) {
      return;
    }
    foreach ($matches as $match_filename => $replace) {
      $document = str_replace($replace, '[[nid:' . $match_filename . ']]', $document);
    }
  }

  /**
   * Tells whether the start of $document lies inside an HTML comment.
   *
   * Looks ahead for the next "<!--" and "-->": meeting a close marker before
   * any open marker means the text started inside a comment.
   *
   * @param string $document
   *   The text following the position being tested.
   *
   * @return bool
   *   TRUE when that position is inside a comment, FALSE otherwise.
   */
  private function _isCommented($document) {
    // Index of the next open comment marker, FALSE when there is none.
    $open = stripos($document, "<!--");
    // Index of the next close comment marker, FALSE when there is none.
    $close = stripos($document, "-->");
    // FALSE must be handled explicitly: comparing it to an offset with "<"
    // compares as booleans and gives the wrong answer.
    if ($close === FALSE) {
      // No close marker ahead, so we cannot be inside a comment.
      return FALSE;
    }
    if ($open === FALSE) {
      // A close marker with no open marker before it: we are inside a comment.
      return TRUE;
    }
    // Both present: commented only if the close marker comes first.
    return $close < $open;
  }
}
class MetaTagsMigration extends PfizerProMigration {
  /**
   * Constructor.
   */
  public function __construct() {
    parent::__construct();
    $this->description = t('Migrate metatags into field collections.');
    $this->dependencies = array('PfizerProHTML');
    $fields = array(
      'field_meta_name' => 'Metatag Name',
      'field_meta_value' => 'Metatag Value',
    );
    // Match HTML files.
    $regex = '/(.*\.htm$|.*\.html$)/i';
    // The source of the migration is HTML files from the old site, split into
    // one chunk per meta tag by the parser.
    $parser = new PfizerMetatagContentParser();
    $list_files = new MigrateListFiles(array($this->baseDir), $this->baseDir, $regex, array(), $parser);
    $item_file = new MigrateItemFile($this->baseDir, TRUE, $parser);
    $this->source = new MigrateSourceList($list_files, $item_file, $fields);
    $this->destination = new MigrateDestinationFieldCollection(
      'field_custom_metatags',
      array('host_entity_type' => 'node')
    );
    $this->map = new MigrateSQLMap(
      $this->machineName,
      array(
        'sourceid' => array(
          'type' => 'varchar',
          'length' => 60,
          'not null' => FALSE,
        ),
        'field_meta_name' => array(
          'type' => 'varchar',
          'length' => 60,
          'not null' => FALSE,
        ),
        'field_meta_value' => array(
          'type' => 'varchar',
          'length' => 255,
          'not null' => FALSE,
        ),
      ),
      MigrateDestinationFieldCollection::getKeySchema()
    );
    $this->addFieldMapping('host_entity_id', 'nid')->sourceMigration('PfizerProHTML');
    // The source names must be the properties prepareRow() sets on the row.
    $this->addFieldMapping('field_meta_name', 'field_meta_name');
    $this->addFieldMapping('field_meta_value', 'field_meta_value');
  }

  /**
   * Prepare a row.
   *
   * Splits the chunked source id into file path and meta tag name, takes the
   * chunk data as the meta tag value and looks up the host node by alias.
   *
   * @param object $row
   *   The source row; reads sourceid and filedata.
   *
   * @return object|false
   *   The row, or FALSE when it must be skipped.
   */
  public function prepareRow($row) {
    // Always include this fragment at the beginning of every prepareRow()
    // implementation, so parent classes can ignore rows.
    if (parent::prepareRow($row) === FALSE || empty($row->filedata)) {
      return FALSE;
    }
    // Pad so that a source id without the chunk separator yields an empty
    // meta name (rejected below) rather than an undefined offset.
    $id_parts = array_pad(explode(MIGRATE_CHUNK_SEPARATOR, $row->sourceid), 2, NULL);
    $row->sourceid = $id_parts[0];
    $row->field_meta_name = $id_parts[1];
    $row->field_meta_value = $row->filedata;
    // Calculate the appropriate host entity id: the alias resolves to a
    // "node/<nid>" system path. Anything else is not a usable host.
    $source_path = drupal_lookup_path('source', $this->prepareAlias($row));
    $row->nid = FALSE;
    if (is_string($source_path) && strpos($source_path, 'node/') === 0) {
      $row->nid = substr($source_path, 5);
    }
    // If we can't find the host entity, then we can't import it.
    // Or if we don't have any valid metatags, also don't import.
    if (!$row->nid || empty($row->field_meta_name) || empty($row->field_meta_value)) {
      return FALSE;
    }
    return $row;
  }

  /**
   * Builds the source key for a row.
   *
   * @param array $source_key
   *   The source key schema, keyed by field name.
   * @param object $row
   *   The source row.
   *
   * @return array
   *   Field name => key value; a field that is empty on the row falls back to
   *   the row's sourceid.
   */
  public function prepareKey($source_key, $row) {
    $key = array();
    foreach ($source_key as $field_name => $field_schema) {
      if (empty($row->$field_name)) {
        $key[$field_name] = $row->sourceid;
      }
      else {
        $key[$field_name] = $row->$field_name;
      }
    }
    return $key;
  }

  /**
   * Prepare the entity.
   *
   * Copies styles and scripts onto the entity only when the row carries them;
   * prepareRow() in this class does not set either property.
   */
  public function prepare($entity, $row) {
    if (isset($row->styles) && is_array($row->styles)) {
      $entity->cpn['css'] = implode("\n", $row->styles);
    }
    if (isset($row->scripts) && is_array($row->scripts)) {
      $entity->cpn['js'] = implode("\n", $row->scripts);
    }
  }
}
/**
 * Content parser that splits an HTML file into one chunk per named meta tag.
 *
 * The chunk IDs are the name attributes of the meta elements found in the
 * document's head and each chunk is the matching content attribute. Meta
 * elements with an empty name or content are ignored.
 */
class PfizerMetatagContentParser extends MigrateContentParser {
  /**
   * Parses the meta tags out of an HTML document.
   *
   * @param string $content
   *   The HTML document.
   * @param mixed $item_id
   *   Unused. NOTE(review): accepted because newer Migrate releases appear to
   *   pass a second argument to setContent() - confirm against the installed
   *   version.
   */
  public function setContent($content, $item_id = NULL) {
    $this->content = array();
    // Load HTML from a string. The parser returns a non-object for documents
    // it refuses to load; such a file simply has no chunks.
    $html = str_get_html($content);
    if (!is_object($html)) {
      return;
    }
    // Retrieve all the named meta tags from the head.
    $head = $html->find('head', 0);
    if (!empty($head)) {
      foreach ($head->find('meta') as $element) {
        $attrs = $element->getAllAttributes();
        if (!empty($attrs['name']) && !empty($attrs['content'])) {
          $this->content[$attrs['name']] = $attrs['content'];
        }
      }
    }
    // The values are plain strings now; release the DOM.
    $html->clear();
  }

  /**
   * Returns the meta tag names found by setContent().
   */
  public function getChunkIDs() {
    return array_keys($this->content);
  }

  /**
   * Returns the content of the meta tag with the given name.
   *
   * @param string $id
   *   A meta tag name.
   *
   * @return string
   *   The meta tag content, or '' for an empty or unknown name.
   */
  public function getChunk($id) {
    if (empty($id) || !isset($this->content[$id])) {
      return '';
    }
    return $this->content[$id];
  }

  /**
   * Returns the number of meta tags found by setContent().
   */
  public function getChunkCount() {
    return count($this->content);
  }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment