Created
December 19, 2013 14:35
-
-
Save heddn/8040014 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| <?php | |
| /** | |
| * @file | |
| * HTML Migration. | |
| */ | |
/**
 * Base class for the PfizerPro HTML migrations.
 *
 * Builds a lookup table from the provided sitemap (URL path => source file
 * path) and derives the URL alias for every source row.
 */
abstract class PfizerProMigration extends Migration {

  /**
   * Directory that holds the source HTML files.
   *
   * @var string
   */
  public $baseDir;

  /**
   * Sitemap lookup table: URL path => source file path.
   *
   * @var array
   */
  public $siteMapArray;

  /**
   * Constructor.
   */
  public function __construct() {
    parent::__construct();

    // Build the site map array from the Pfizer provided sitemap.
    $this->_parseSiteMap();

    // The source HTML files are read from the "fixtures" directory of the
    // pfizer_migration module.
    $this->baseDir = drupal_get_path('module', 'pfizer_migration') . '/fixtures';
  }

  /**
   * Analyzes the sitemap to create "URL path => file path" pairs.
   *
   * Every link whose text contains a "--" separator contributes one entry:
   * the second "--"-separated part of the link text is the file path and the
   * link's href (without the site prefix) is the URL path. If the sitemap
   * cannot be loaded the table stays empty and aliases are derived from the
   * file paths instead.
   */
  protected function _parseSiteMap() {
    $this->siteMapArray = array();

    $siteMapLocation = realpath('./') . '/sites/all/files/pfizer migration sitemap.html';

    // The parser may not hand back a document (e.g. missing or empty file);
    // calling find() on such a result would be a fatal error.
    $siteMapDocument = is_readable($siteMapLocation) ? file_get_html($siteMapLocation) : FALSE;
    if (empty($siteMapDocument)) {
      watchdog(
        'pfizer_migration',
        'The sitemap %file could not be loaded; URL aliases will be derived from the file paths.',
        array('%file' => $siteMapLocation),
        WATCHDOG_WARNING
      );
      return;
    }

    foreach ($siteMapDocument->find('a') as $element) {
      $parts = explode('--', $element->innertext);
      if (!array_key_exists(1, $parts)) {
        continue;
      }

      // Translate the legacy directory layout to the layout of the fixtures.
      $filepath = str_replace('/wyeth_html/home/minisites/', '/hcp/', trim($parts[1]));
      $filepath = str_replace('/wyeth_html/home/products_landing/', '/products_landing/', $filepath);

      $urlpath = str_replace('http://www.pfizerpro.com/', '', $element->href);
      $this->siteMapArray[$urlpath] = $filepath;
    }
  }

  /**
   * Prepare aliases.
   *
   * @param object $row
   *   The source row; only $row->sourceid (the source file path) is read.
   *
   * @return string
   *   The URL path recorded in the sitemap for this file, or otherwise a
   *   lower-cased alias derived from the file path.
   */
  protected function prepareAlias($row) {
    // If we have a match from this document to the sitemap then we know the
    // alias. Compare against FALSE explicitly so a key of 0 still counts as a
    // hit; an empty key keeps falling through to the derived alias, as it
    // always did.
    $key = array_search((string) $row->sourceid, (array) $this->siteMapArray, TRUE);
    if ($key !== FALSE && $key !== '') {
      return $key;
    }

    // Split the path into its directories and the page (last segment).
    $segments = explode('/', ltrim($row->sourceid, '/'));
    $page = strtolower(array_pop($segments));
    $dir = empty($segments) ? '' : strtolower(end($segments));

    // Remove the .html extension and the dir name from the page. The dir
    // tokens are only used when there is a parent directory: with an empty
    // dir the "<dir>_" token would degrade to "_" and strip every underscore.
    // NOTE(review): a ".htm" extension is not stripped here — confirm whether
    // that is intended.
    $search = array('.html');
    if ($dir !== '') {
      $search[] = $dir . '_';
      $search[] = $dir;
    }
    $search[] = 'index';
    $page = str_replace($search, '', $page);

    // Nothing left means this was the index page of the directory, so the
    // alias is the directory itself; otherwise the cleaned page is appended.
    if ($page !== '') {
      $segments[] = ucfirst($page);
    }
    $alias = implode('/', $segments);

    // Hand-maintained alias overrides, applied in this order.
    $overrides = array(
      // effexorxr microsite.
      'hcp/effexor/Overview' => 'hcp/effexorxr',
      'hcp/effexor/Important_treatment_considerations' => 'hcp/effexorxr/important-safety-information',
      // flectorpatch microsite.
      'hcp/flectorpatch/Adverseevents' => 'hcp/flectorpatch/adverse-events',
      'hcp/flectorpatch/Flector-patch-samples' => 'hcp/flectorpatch/samples',
      'hcp/flectorpatch/Isi' => 'hcp/flectorpatch/safety-information',
    );
    $alias = str_replace(array_keys($overrides), array_values($overrides), $alias);

    return strtolower($alias);
  }

}
/**
 * Migrates the legacy HTML files into "page" nodes.
 *
 * Besides title and body, the inline styles and scripts of the document head
 * are collected, and <wyethcomponent> include tags are replaced by
 * "[[nid:<path>]]" placeholders.
 */
class PfizerProHTMLMigration extends PfizerProMigration {

  /**
   * Constructor.
   */
  public function __construct() {
    parent::__construct();

    // A map of source HTML filename -> destination node id.
    $this->map = new MigrateSQLMap(
      $this->machineName,
      array(
        'sourceid' => array(
          'type' => 'varchar',
          'length' => 255,
          'not null' => TRUE,
        ),
      ),
      MigrateDestinationNode::getKeySchema()
    );

    // The source fields.
    $fields = array(
      'title' => t('Title'),
      'body' => t('Body'),
      'uid' => t('User id'),
    );

    // Match HTML files.
    $regex = '/(.*\.htm$|.*\.html$)/i';

    // The source of the migration is HTML files from the old site.
    $list_files = new MigrateListFiles(array($this->baseDir), $this->baseDir, $regex);
    $item_file = new MigrateItemFile($this->baseDir);
    $this->source = new MigrateSourceList($list_files, $item_file, $fields);

    // The destination is the "page" content type.
    $this->destination = new MigrateDestinationNode('page');

    // Map the fields, pretty straightforward in this case.
    $this->addFieldMapping('uid', 'uid');
    $this->addFieldMapping('title', 'title');
    $this->addFieldMapping('body', 'body')
      ->arguments(array('format' => 'raw_html'));
    $this->addFieldMapping('created', 'created');
    $this->addFieldMapping('path', 'url_alias');
  }

  /**
   * Prepare a row.
   *
   * Fills $row->uid, $row->styles, $row->scripts, $row->body, $row->title and
   * $row->url_alias from $row->filedata and $row->sourceid.
   *
   * @param object $row
   *   The source row.
   *
   * @return bool
   *   FALSE when the row must be skipped, TRUE otherwise.
   */
  public function prepareRow($row) {
    // Always include this fragment at the beginning of every prepareRow()
    // implementation, so parent classes can ignore rows.
    if (parent::prepareRow($row) === FALSE) {
      return FALSE;
    }

    // Set to admin for now.
    $row->uid = 1;

    // Files below an "/includes/" directory are handled by another import.
    if (stripos($row->sourceid, '/includes/') !== FALSE) {
      return FALSE;
    }

    // Find the occurrences of the wyethcomponent tag and swap them for
    // placeholders. $matches has to start out as an array: it is filled by
    // reference and iterated afterwards.
    $matches = array();
    $this->_findIncludes($row->filedata, $matches);
    $document = $row->filedata;
    $this->_replaceWyethComponent($document, $matches);

    // Load HTML from a string. The parser may not return a document (for
    // instance for an empty file); such a row cannot be imported.
    $html = str_get_html($document);
    if (empty($html)) {
      watchdog(
        'pfizer_migration',
        'Skipping %file: the HTML could not be parsed.',
        array('%file' => $row->sourceid),
        WATCHDOG_WARNING
      );
      return FALSE;
    }

    // Retrieve all the inline css and scripts of the head.
    $styles = array();
    $scripts = array();
    $head = $html->find('head', 0);
    if (!empty($head)) {
      foreach ($head->find('style, script') as $element) {
        if ($element->tag === 'style') {
          $styles[] = $element->innertext;
        }
        elseif ($element->tag === 'script') {
          $scripts[] = $element->innertext;
        }
      }
    }
    $row->styles = $styles;
    $row->scripts = $scripts;

    // Retrieve the body; a document without a <body> yields an empty body.
    $body = $html->find('body', 0);
    $row->body = empty($body) ? '' : $body->innertext;

    // The title is the filename if it's not found in the html.
    $title_element = $html->find('title', 0);
    $title = empty($title_element) ? '' : $title_element->innertext;
    $row->title = empty($title) ? $row->sourceid : $title;

    // Node titles are plain text: decode named and numeric HTML entities in
    // one pass instead of replacing them one by one.
    $row->title = html_entity_decode($row->title, ENT_QUOTES, 'UTF-8');
    // Em dashes in titles are shown as en dashes.
    $row->title = str_replace('—', '–', $row->title);

    $row->url_alias = $this->prepareAlias($row);

    return TRUE;
  }

  /**
   * Prepare the entity.
   *
   * Stores the collected inline styles and scripts on the entity's "cpn"
   * property, one entry per line.
   */
  public function prepare($entity, $row) {
    $entity->cpn['css'] = implode("\n", $row->styles);
    $entity->cpn['js'] = implode("\n", $row->scripts);
  }

  /**
   * Collects the wyethcomponent include tags of a document.
   *
   * @param string $document
   *   The HTML to scan.
   * @param array $matches
   *   Filled by reference: "<directory>/<include path>" => the tag text that
   *   has to be replaced. Tags that sit inside an HTML comment are left out.
   */
  private function _findIncludes($document, &$matches) {
    $tag_marker = '<wyethcomponent com.wyeth.include.html.ResourceInclude';
    // Destination directory => legacy include root.
    $include_roots = array(
      'hcp' => '/wyeth_html/home/minisites',
      'products_landing' => '/wyeth_html/home/products_landing',
    );

    if (!is_array($matches)) {
      $matches = array();
    }

    $offset = 0;
    // Compare against FALSE: a tag at position 0 is a valid hit.
    while (($tag_start = stripos($document, $tag_marker, $offset)) !== FALSE) {
      // Use the include root that occurs first after the tag start, so a
      // root belonging to a later tag is never attributed to this one.
      $root_start = FALSE;
      $path_start = FALSE;
      $directory_name = NULL;
      foreach ($include_roots as $key => $root) {
        $position = stripos($document, $root, $tag_start);
        if ($position !== FALSE && ($root_start === FALSE || $position < $root_start)) {
          $root_start = $position;
          $path_start = $position + strlen($root);
          $directory_name = $key;
        }
      }

      // The tag ends at the first ">" after the include root.
      $tag_end = ($path_start === FALSE) ? FALSE : stripos($document, '>', $path_start);
      if ($tag_end === FALSE) {
        // No known root or no closing ">": nothing usable, move past the tag.
        $offset = $tag_start + strlen($tag_marker);
        continue;
      }

      // The path relative to the include root, without slashes and blanks
      // around it.
      $include_path = trim(substr($document, $path_start, $tag_end - $path_start), '/ ');
      // NOTE(review): the span covers the ">" plus one further character, as
      // it always did — confirm that this matches the source markup.
      $entire_tag_to_replace = substr($document, $tag_start, $tag_end - $tag_start + 2);

      if (!$this->_isCommented(substr($document, $path_start))) {
        // The directory prefix makes sure our paths will line up.
        $matches[$directory_name . '/' . $include_path] = $entire_tag_to_replace;
      }

      // Continue behind the include root to find the remaining tags.
      $offset = $path_start;
    }
  }

  /**
   * Replaces every matched include tag by a "[[nid:<path>]]" placeholder.
   *
   * @param string $document
   *   The HTML, changed in place.
   * @param array $matches
   *   The result of _findIncludes(): include path => tag text.
   */
  private function _replaceWyethComponent(&$document, $matches) {
    if (empty($matches)) {
      return;
    }
    foreach ($matches as $match_filename => $replace) {
      $document = str_replace($replace, '[[nid:' . $match_filename . ']]', $document);
    }
  }

  /**
   * Tells whether the start of $document lies inside an HTML comment.
   *
   * Looks at the next "<!--" and the next "-->": when a closing marker comes
   * first (or is the only marker left) we started inside a comment.
   *
   * @param string $document
   *   The remaining HTML, beginning at the position in question.
   *
   * @return bool
   *   TRUE when the position is commented out, FALSE otherwise.
   */
  private function _isCommented($document) {
    // Index of the next open / close comment marker, FALSE when absent.
    $open = stripos($document, '<!--');
    $close = stripos($document, '-->');

    // Without a closing marker ahead we cannot be inside a comment. This
    // also covers the end of the document, where neither marker is found.
    if ($close === FALSE) {
      return FALSE;
    }
    // A closing marker with no opener before it closes a comment we are in.
    if ($open === FALSE) {
      return TRUE;
    }
    // Both present: the one that comes first decides.
    return $close < $open;
  }

}
/**
 * Migrates the <meta> tags of the legacy HTML files into field collections.
 *
 * Every meta tag of a file is one source row; the rows are attached to the
 * node that PfizerProHTML created for the same file.
 */
class MetaTagsMigration extends PfizerProMigration {

  /**
   * Constructor.
   */
  public function __construct() {
    parent::__construct();

    $this->description = t('Migrate metatags into field collections.');
    $this->dependencies = array('PfizerProHTML');

    // The source fields; both are filled in prepareRow().
    $fields = array(
      'field_meta_name' => 'Metatag Name',
      'field_meta_value' => 'Metatag Value',
    );

    // Match HTML files.
    $regex = '/(.*\.htm$|.*\.html$)/i';

    // The source of the migration is HTML files from the old site, split
    // into one chunk per meta tag by the parser.
    $parser = new PfizerMetatagContentParser();
    $list_files = new MigrateListFiles(array($this->baseDir), $this->baseDir, $regex, array(), $parser);
    $item_file = new MigrateItemFile($this->baseDir, TRUE, $parser);
    $this->source = new MigrateSourceList($list_files, $item_file, $fields);

    $this->destination = new MigrateDestinationFieldCollection(
      'field_custom_metatags',
      array('host_entity_type' => 'node')
    );

    $this->map = new MigrateSQLMap(
      $this->machineName,
      array(
        'sourceid' => array(
          'type' => 'varchar',
          'length' => 60,
          'not null' => FALSE,
        ),
        'field_meta_name' => array(
          'type' => 'varchar',
          'length' => 60,
          'not null' => FALSE,
        ),
        'field_meta_value' => array(
          'type' => 'varchar',
          'length' => 255,
          'not null' => FALSE,
        ),
      ),
      MigrateDestinationFieldCollection::getKeySchema()
    );

    $this->addFieldMapping('host_entity_id', 'nid')->sourceMigration('PfizerProHTML');
    // Map from the row properties that prepareRow() actually sets (and that
    // are declared as source fields above).
    $this->addFieldMapping('field_meta_name', 'field_meta_name');
    $this->addFieldMapping('field_meta_value', 'field_meta_value');
  }

  /**
   * Prepare a row.
   *
   * Splits the combined source id into file path and meta tag name, uses the
   * chunk data as meta tag value and resolves the node of the file.
   *
   * @param object $row
   *   The source row.
   *
   * @return object|bool
   *   The row, or FALSE when it must be skipped.
   */
  public function prepareRow($row) {
    // Always include this fragment at the beginning of every prepareRow()
    // implementation, so parent classes can ignore rows.
    if (parent::prepareRow($row) === FALSE || empty($row->filedata)) {
      return FALSE;
    }

    // The source id is "<file path><separator><meta name>". Pad the result
    // so an id without separator yields an empty name instead of a notice.
    $parts = array_pad(explode(MIGRATE_CHUNK_SEPARATOR, $row->sourceid), 2, '');
    $row->sourceid = $parts[0];
    $row->field_meta_name = $parts[1];
    $row->field_meta_value = $row->filedata;

    // Calculate the appropriate host entity id: the alias has to resolve to
    // a "node/<nid>" system path, anything else is not a usable host.
    $row->nid = '';
    $system_path = drupal_lookup_path('source', $this->prepareAlias($row));
    if (is_string($system_path) && strpos($system_path, 'node/') === 0) {
      $row->nid = substr($system_path, 5);
    }

    // If we can't find the host entity, then we can't import it.
    // Or if we don't have any valid metatags, also don't import.
    if (!$row->nid || empty($row->field_meta_name) || empty($row->field_meta_value)) {
      return FALSE;
    }

    return $row;
  }

  /**
   * Builds the source key of a row.
   *
   * @param array $source_key
   *   The source key schema, keyed by field name.
   * @param object $row
   *   The source row.
   *
   * @return array
   *   Field name => key value; fields that are empty on the row fall back to
   *   the row's source id.
   */
  public function prepareKey($source_key, $row) {
    $key = array();
    foreach (array_keys($source_key) as $field_name) {
      $key[$field_name] = empty($row->$field_name) ? $row->sourceid : $row->$field_name;
    }
    return $key;
  }

  /**
   * Prepare the entity.
   *
   * Copies styles and scripts to the entity's "cpn" property when the row
   * carries them. Rows of this migration normally do not, and imploding a
   * missing value is an error on current PHP versions.
   */
  public function prepare($entity, $row) {
    if (isset($row->styles) && is_array($row->styles)) {
      $entity->cpn['css'] = implode("\n", $row->styles);
    }
    if (isset($row->scripts) && is_array($row->scripts)) {
      $entity->cpn['js'] = implode("\n", $row->scripts);
    }
  }

}
/**
 * Content parser that splits an HTML file into its meta tags.
 *
 * Every <meta> element of the document head that has a non-empty "name" and
 * a non-empty "content" attribute becomes one chunk: the name is the chunk
 * id, the content is the chunk data.
 */
class PfizerMetatagContentParser extends MigrateContentParser {

  /**
   * Parses the HTML and stores its meta tags as "name => content".
   *
   * @param string $content
   *   The HTML of one source file.
   */
  public function setContent($content) {
    $this->content = array();

    // Load HTML from a string. The parser may not return a document (for
    // instance for an empty string); then there are simply no chunks.
    $html = empty($content) ? FALSE : str_get_html($content);
    if (empty($html)) {
      return;
    }

    // Only the meta tags of the head are of interest.
    $head = $html->find('head', 0);
    if (empty($head)) {
      return;
    }

    foreach ($head->find('meta') as $element) {
      $attrs = $element->getAllAttributes();
      if (!empty($attrs['name']) && !empty($attrs['content'])) {
        $this->content[$attrs['name']] = $attrs['content'];
      }
    }
  }

  /**
   * Returns the chunk ids, i.e. the names of the meta tags found.
   *
   * @return array
   *   The meta tag names.
   */
  public function getChunkIDs() {
    return is_array($this->content) ? array_keys($this->content) : array();
  }

  /**
   * Returns the content of one meta tag.
   *
   * @param string $id
   *   The meta tag name.
   *
   * @return string
   *   The content attribute, or an empty string for an empty or unknown id.
   */
  public function getChunk($id) {
    if (empty($id) || !isset($this->content[$id])) {
      return '';
    }
    return $this->content[$id];
  }

  /**
   * Returns the number of meta tags found.
   *
   * @return int
   *   The chunk count; 0 when no content has been parsed yet.
   */
  public function getChunkCount() {
    return is_array($this->content) ? count($this->content) : 0;
  }

}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment