heddn · December 19, 2013 14:35
diff --git a/content.inc b/content.inc
 <?php
 /**
 * @file
 * HTML Migration.
 */

 /**
  * Shared base for the PfizerPro HTML migrations.
  *
  * Loads the sitemap lookup table once per instance and provides the URL
  * alias calculation used by the concrete migrations.
  */
 abstract class PfizerProMigration extends Migration {
  /**
   * Directory containing the HTML files to migrate.
   *
   * @var string
   */
  public $baseDir;

  /**
   * Sitemap lookup table, keyed by URL path with the source file path as value.
   *
   * @var array
   */
  public $siteMapArray;

  /**
   * Constructor.
   */
  public function __construct() {
    parent::__construct();

    // Build the site map array from the Pfizer provided sitemap.
    $this->_parseSiteMap();

    // The HTML files are read from the "fixtures" directory inside the
    // pfizer_migration module.
    $this->baseDir = drupal_get_path('module', 'pfizer_migration') . '/fixtures';
  }

  /**
   * Parses the sitemap into URL path => file path pairs.
   *
   * Every link whose text contains "--" contributes one entry: the text after
   * the first "--" is the legacy file path (rewritten to the /hcp/ and
   * /products_landing/ layout) and the link target, minus the site host, is
   * the URL path. If the sitemap cannot be loaded the table stays empty and a
   * warning is raised instead of dereferencing a failed load.
   */
  protected function _parseSiteMap() {
    $this->siteMapArray = array();
    $siteMapLocation = realpath('./') . '/sites/all/files/pfizer migration sitemap.html';

    // file_get_html() does not return a document object on failure, so guard
    // before calling find() on its result.
    $siteMapDocument = is_readable($siteMapLocation) ? file_get_html($siteMapLocation) : FALSE;
    if (empty($siteMapDocument)) {
      trigger_error('Unable to load the migration sitemap: ' . $siteMapLocation, E_USER_WARNING);
      return;
    }

    foreach ($siteMapDocument->find('a') as $element) {
      $parts = explode('--', $element->innertext);
      if (!array_key_exists(1, $parts)) {
        continue;
      }

      $filepath = str_replace('/wyeth_html/home/minisites/', '/hcp/', trim($parts[1]));
      $filepath = str_replace('/wyeth_html/home/products_landing/', '/products_landing/', $filepath);
      $urlpath = str_replace('http://www.pfizerpro.com/', '', $element->href);
      $this->siteMapArray[$urlpath] = $filepath;
    }
  }

  /**
   * Calculates the URL alias for a source row.
   *
   * @param object $row
   *   Source row; only $row->sourceid (the source file path) is read.
   *
   * @return string
   *   The alias from the sitemap when the file is listed there, otherwise an
   *   alias derived from the file path.
   */
  protected function prepareAlias($row) {
    // If we have a match from this document to the sitemap then we know the
    // alias. Both sides are strings, so compare strictly.
    $key = array_search($row->sourceid, $this->siteMapArray, TRUE);
    if ($key) {
      return $key;
    }

    // Derive the alias from the file path.
    $segments = explode('/', ltrim($row->sourceid, '/'));
    $last = count($segments) - 1;
    $page = strtolower($segments[$last]);
    $dir = $last > 0 ? strtolower($segments[$last - 1]) : '';

    // Remove the .html extension and the dir name from the page name. When
    // there is no directory segment, skip the directory entries: otherwise the
    // bare '_' entry would strip every underscore from the page name.
    $search = array('.html');
    if ($dir !== '') {
      $search[] = $dir . '_';
      $search[] = $dir;
    }
    $search[] = 'index';
    $page = str_replace($search, '', $page);

    // Drop the file name; an empty page name means this is the index page of
    // the directory, otherwise the cleaned page name goes back on the path.
    array_pop($segments);
    if (!empty($page)) {
      // The capital first letter is what the microsite rules below match on.
      $segments[] = ucfirst($page);
    }
    // Separator first: the reversed argument order no longer works in PHP 8.
    $alias = implode('/', $segments);

    // Microsite specific alias overrides, applied in order.
    $overrides = array(
      // effexorxr microsite.
      'hcp/effexor/Overview' => 'hcp/effexorxr',
      'hcp/effexor/Important_treatment_considerations' => 'hcp/effexorxr/important-safety-information',
      // flectorpatch microsite.
      'hcp/flectorpatch/Adverseevents' => 'hcp/flectorpatch/adverse-events',
      'hcp/flectorpatch/Flector-patch-samples' => 'hcp/flectorpatch/samples',
      'hcp/flectorpatch/Isi' => 'hcp/flectorpatch/safety-information',
    );
    $alias = str_replace(array_keys($overrides), array_values($overrides), $alias);

    return strtolower($alias);
  }
 }

 /**
  * Migrates the legacy HTML pages into "page" nodes.
  *
  * Files under an /includes/ directory are skipped here. References to them
  * (wyethcomponent ResourceInclude tags) are rewritten to [[nid:PATH]]
  * placeholders in the page body.
  */
 class PfizerProHTMLMigration extends PfizerProMigration {
  /**
   * Constructor.
   */
  public function __construct() {
    parent::__construct();

    // A map of source HTML filename -> destination node id.
    $this->map = new MigrateSQLMap($this->machineName,
      array(
        'sourceid' => array(
          'type' => 'varchar',
          'length' => 255,
          'not null' => TRUE,
        ),
      ),
      MigrateDestinationNode::getKeySchema()
    );

    // The source fields.
    $fields = array(
      'title' => t('Title'),
      'body' => t('Body'),
      'uid' => t('User id'),
    );

    // Match HTML files.
    $regex = '/(.*\.htm$|.*\.html$)/i';

    // The source of the migration is HTML files from the old site.
    $list_files = new MigrateListFiles(array($this->baseDir), $this->baseDir, $regex);
    $item_file = new MigrateItemFile($this->baseDir);
    $this->source = new MigrateSourceList($list_files, $item_file, $fields);

    // The destination is the page content type.
    $this->destination = new MigrateDestinationNode('page');

    // Map the fields, pretty straightforward in this case.
    $this->addFieldMapping('uid', 'uid');
    $this->addFieldMapping('title', 'title');
    $this->addFieldMapping('body', 'body')
      ->arguments(array('format' => 'raw_html'));
    // NOTE(review): nothing in this class sets $row->created - confirm this
    // mapping is still wanted.
    $this->addFieldMapping('created', 'created');
    $this->addFieldMapping('path', 'url_alias');
  }

  /**
   * Prepare a row.
   *
   * Sets uid, styles, scripts, body, title and url_alias on the row.
   *
   * @param object $row
   *   Source row; reads $row->sourceid and $row->filedata.
   *
   * @return bool|null
   *   FALSE to skip the row (parent veto, include file, unparsable HTML).
   */
  public function prepareRow($row) {
    // Always include this fragment at the beginning of every prepareRow()
    // implementation, so parent classes can ignore rows.
    if (parent::prepareRow($row) === FALSE) {
      return FALSE;
    }

    // Set to admin for now.
    $row->uid = 1;

    // Files under '/includes/' are handled by another import.
    if (stripos($row->sourceid, '/includes/') !== FALSE) {
      return FALSE;
    }

    // Find the occurrences of the wyethcomponent tag and swap them for
    // placeholders. $matches must start out as an array so that a document
    // without includes does not hand NULL to the replace step.
    $matches = array();
    $this->_findIncludes($row->filedata, $matches);
    $document = $row->filedata;
    $this->_replaceWyethComponent($document, $matches);

    // Load HTML from a string. The parser does not return a document object
    // when it cannot load the markup; skip such rows instead of crashing.
    $html = str_get_html($document);
    if (empty($html)) {
      return FALSE;
    }

    // Retrieve all the css and scripts declared in the head.
    $styles = array();
    $scripts = array();
    $head = $html->find('head', 0);
    if (!empty($head)) {
      foreach ($head->find('style, script') as $element) {
        switch ($element->tag) {
          case 'style':
            $styles[] = $element->innertext;
            break;

          case 'script':
            $scripts[] = $element->innertext;
            break;
        }
      }
    }
    $row->styles = $styles;
    $row->scripts = $scripts;

    // Retrieve the body; stays NULL when the document has no body element.
    $body = $html->find('body', 0);
    $row->body = empty($body) ? NULL : $body->innertext;

    // Uncomment these lines if you encounter problems with the import.
    // $row->body = str_replace('’', "&apos;", $row->body);
    // $row->body = str_replace('∗', '*', $row->body);
    // $row->body = str_replace(chr(146), "'", $row->body);
    // $row->body = str_replace(chr(153), "&trade;", $row->body);
    // $row->body = str_replace(chr(151), "&mdash;", $row->body);
    // $row->body = str_replace(chr(174), "&reg;", $row->body);
    // $row->body = str_replace('®', "&reg;", $row->body);

    // The title is the filename if it's not found in the html.
    $title_element = $html->find('title', 0);
    $title = empty($title_element) ? '' : $title_element->innertext;
    $row->title = empty($title) ? $row->sourceid : $title;

    // Titles are stored as plain text, so turn entities into characters.
    // &mdash; keeps its historical mapping to a dash of this exact form; all
    // other named and numeric entities are decoded in a single pass.
    $row->title = str_replace('&mdash;', '–', $row->title);
    $row->title = html_entity_decode($row->title, ENT_QUOTES, 'UTF-8');

    $row->url_alias = $this->prepareAlias($row);
  }

  /**
   * Prepare the entity.
   *
   * @param object $entity
   *   The node about to be saved.
   * @param object $row
   *   The source row as populated by prepareRow().
   */
  public function prepare($entity, $row) {
    // Save the styles and scripts.
    $entity->cpn['css'] = implode("\n", $row->styles);
    $entity->cpn['js'] = implode("\n", $row->scripts);
  }

  /**
   * Collects the ResourceInclude tags of a document.
   *
   * @param string $document
   *   Raw HTML of the page.
   * @param array $matches
   *   Receives one entry per include that is not commented out, keyed by
   *   "DIRECTORY/relative/path" with the tag text to replace as value.
   */
  private function _findIncludes($document, &$matches) {
    $tag_prefix = '<wyethcomponent com.wyeth.include.html.ResourceInclude';
    // Destination directory => legacy path prefix it stands for.
    $path_maps = array(
      'hcp' => '/wyeth_html/home/minisites',
      'products_landing' => '/wyeth_html/home/products_landing',
    );

    if (!is_array($matches)) {
      $matches = array();
    }

    $offset = 0;
    // Compare against FALSE: a tag at offset 0 is a valid hit.
    while (($tag_start = stripos($document, $tag_prefix, $offset)) !== FALSE) {
      $tag_end = strpos($document, '>', $tag_start);
      if ($tag_end === FALSE) {
        // Unterminated tag, nothing reliable left to extract.
        break;
      }
      $offset = $tag_end + 1;

      // Only look inside this tag, so a path that belongs to a later tag is
      // never attributed to this one.
      $tag_text = substr($document, $tag_start, $tag_end - $tag_start);

      $directory_name = NULL;
      $path_end = 0;
      foreach ($path_maps as $key => $path) {
        $path_pos = stripos($tag_text, $path);
        if ($path_pos !== FALSE) {
          $path_end = $path_pos + strlen($path);
          $directory_name = $key;
          break;
        }
      }
      if ($directory_name === NULL) {
        // The tag references a location we do not migrate.
        continue;
      }

      // Relative path: everything after the prefix, minus slashes and spaces.
      $include_path = trim(substr($tag_text, $path_end), '/ ');
      // The tag up to its '>' plus the one character that follows it.
      $entire_tag_to_replace = substr($document, $tag_start, $tag_end - $tag_start + 2);

      if (!$this->_isCommented(substr($document, $tag_start + $path_end))) {
        // The directory prefix makes the key line up with the migrated paths.
        $matches[$directory_name . '/' . $include_path] = $entire_tag_to_replace;
      }
    }
  }

  /**
   * Replaces every collected include tag with a [[nid:PATH]] placeholder.
   *
   * @param string $document
   *   HTML to rewrite, modified in place.
   * @param array $matches
   *   Include path => tag text, as filled by _findIncludes().
   */
  private function _replaceWyethComponent(&$document, $matches) {
    if (empty($matches)) {
      return;
    }

    foreach ($matches as $match_filename => $replace) {
      $document = str_replace($replace, '[[nid:' . $match_filename . ']]', $document);
    }
  }

  /**
   * Tells whether the start of $document lies inside an HTML comment.
   *
   * Looks at which comment marker comes next: a closing marker before any
   * opening marker means the text started inside a comment.
   *
   * @param string $document
   *   The text following the position to test.
   *
   * @return bool
   *   TRUE when the position is commented out.
   */
  private function _isCommented($document) {
    $open = strpos($document, '<!--');
    $close = strpos($document, '-->');

    // Handle the "not found" cases explicitly; comparing FALSE against an
    // offset with "<" gives the inverted answer.
    if ($close === FALSE) {
      // No comment ends after this point, so we are not inside one.
      return FALSE;
    }
    if ($open === FALSE) {
      // A comment ends ahead and none starts before that.
      return TRUE;
    }
    return $close < $open;
  }
 }

 /**
  * Migrates the meta tags of the legacy HTML pages into field collections.
  *
  * Each meta tag found by PfizerMetatagContentParser becomes one item of the
  * field_custom_metatags field collection on the node migrated by
  * PfizerProHTML.
  */
 class MetaTagsMigration extends PfizerProMigration {
  /**
   * Constructor.
   */
  public function __construct() {
    parent::__construct();
    $this->description = t('Migrate metatags into field collections.');
    $this->dependencies = array('PfizerProHTML');
    $fields = array(
      'field_meta_name' => 'Metatag Name',
      'field_meta_value' => 'Metatag Value',
    );

    // Match HTML files.
    $regex = '/(.*\.htm$|.*\.html$)/i';

    // The source of the migration is HTML files from the old site, split into
    // one chunk per meta tag by the parser.
    $parser = new PfizerMetatagContentParser();
    $list_files = new MigrateListFiles(array($this->baseDir), $this->baseDir, $regex, array(), $parser);
    $item_file = new MigrateItemFile($this->baseDir, TRUE, $parser);
    $this->source = new MigrateSourceList($list_files, $item_file, $fields);

    $this->destination = new MigrateDestinationFieldCollection(
      'field_custom_metatags',
      array('host_entity_type' => 'node')
    );
    // NOTE(review): sourceid holds a file path and is limited to 60 characters
    // here while the HTML migration allows 255 - confirm this is long enough.
    $this->map = new MigrateSQLMap(
      $this->machineName,
      array(
        'sourceid' => array(
          'type' => 'varchar',
          'length' => 60,
          'not null' => FALSE,
        ),
        'field_meta_name' => array(
          'type' => 'varchar',
          'length' => 60,
          'not null' => FALSE,
        ),
        'field_meta_value' => array(
          'type' => 'varchar',
          'length' => 255,
          'not null' => FALSE,
        ),
      ),
      MigrateDestinationFieldCollection::getKeySchema()
    );

    $this->addFieldMapping('host_entity_id', 'nid')->sourceMigration('PfizerProHTML');
    // The source names must be the properties prepareRow() fills in.
    $this->addFieldMapping('field_meta_name', 'field_meta_name');
    $this->addFieldMapping('field_meta_value', 'field_meta_value');
  }

  /**
   * Prepare a row.
   *
   * Splits the chunk id into file path and meta tag name, stores the chunk as
   * the meta tag value and looks up the host node through its alias.
   *
   * @param object $row
   *   Source row; reads $row->sourceid and $row->filedata.
   *
   * @return object|bool
   *   The row, or FALSE when it must be skipped.
   */
  public function prepareRow($row) {
    // Always include this fragment at the beginning of every prepareRow()
    // implementation, so parent classes can ignore rows.
    if (parent::prepareRow($row) === FALSE || empty($row->filedata)) {
      return FALSE;
    }

    // The id is "FILE<separator>META NAME". Split once only, and skip ids
    // that carry no meta tag name at all.
    $id_parts = explode(MIGRATE_CHUNK_SEPARATOR, $row->sourceid, 2);
    if (count($id_parts) < 2) {
      return FALSE;
    }
    $row->sourceid = $id_parts[0];
    $row->field_meta_name = $id_parts[1];
    $row->field_meta_value = $row->filedata;

    // Calculate the appropriate host entity id: the system path behind the
    // alias has to be "node/NID".
    $row->nid = FALSE;
    $system_path = drupal_lookup_path('source', $this->prepareAlias($row));
    if ($system_path && strpos($system_path, 'node/') === 0) {
      $row->nid = substr($system_path, 5);
    }

    // If we can't find the host entity, then we can't import it.
    // Or if we don't have any valid metatags, also don't import.
    if (!$row->nid || empty($row->field_meta_name) || empty($row->field_meta_value)) {
      return FALSE;
    }

    return $row;
  }

  /**
   * Builds the map key for a row.
   *
   * @param array $source_key
   *   Key schema, keyed by field name.
   * @param object $row
   *   The source row.
   *
   * @return array
   *   Key values per field; fields that are empty on the row fall back to
   *   $row->sourceid.
   */
  public function prepareKey($source_key, $row) {
    $key = array();

    foreach ($source_key as $field_name => $field_schema) {
      $key[$field_name] = empty($row->$field_name) ? $row->sourceid : $row->$field_name;
    }

    return $key;
  }

  /**
   * Prepare the entity.
   *
   * Rows of this migration carry no styles or scripts unless something else
   * adds them, so only copy what is actually present on the row.
   *
   * @param object $entity
   *   The entity about to be saved.
   * @param object $row
   *   The source row.
   */
  public function prepare($entity, $row) {
    if (isset($row->styles) && is_array($row->styles)) {
      $entity->cpn['css'] = implode("\n", $row->styles);
    }
    if (isset($row->scripts) && is_array($row->scripts)) {
      $entity->cpn['js'] = implode("\n", $row->scripts);
    }
  }
 }

 /**
  * Content parser that splits an HTML file into one chunk per meta tag.
  *
  * The chunk id is the "name" attribute of a meta element in the document
  * head and the chunk itself is its "content" attribute. Meta elements with
  * an empty name or content are ignored.
  */
 class PfizerMetatagContentParser extends MigrateContentParser {

  /**
   * Parses the meta tags out of an HTML document.
   *
   * @param string $content
   *   Raw HTML of the file.
   */
  public function setContent($content) {
    $this->content = array();

    // Load HTML from a string. Without a document object there are no meta
    // tags to collect, so leave the chunk list empty.
    $html = str_get_html($content);
    if (empty($html)) {
      return;
    }

    // Retrieve all the meta tags declared in the head.
    $head = $html->find('head', 0);
    if (empty($head)) {
      return;
    }

    foreach ($head->find('meta') as $element) {
      $attrs = $element->getAllAttributes();
      if (!empty($attrs['name']) && !empty($attrs['content'])) {
        $this->content[$attrs['name']] = $attrs['content'];
      }
    }
  }

  /**
   * Returns the names of the meta tags found by setContent().
   *
   * @return array
   *   List of chunk ids.
   */
  public function getChunkIDs() {
    return array_keys($this->content);
  }

  /**
   * Returns the content of one meta tag.
   *
   * @param string $id
   *   Meta tag name.
   *
   * @return string
   *   The meta tag content, or an empty string for an empty or unknown id.
   */
  public function getChunk($id) {
    if (empty($id) || !isset($this->content[$id])) {
      return '';
    }
    return $this->content[$id];
  }

  /**
   * Returns the number of meta tags found by setContent().
   *
   * @return int
   *   Chunk count.
   */
  public function getChunkCount() {
    return count($this->content);
  }
 }
	<?php
	/**
	* @file
	* HTML Migration.
	*/

	/**
	 * Shared base for the PfizerPro HTML migrations.
	 *
	 * Loads the sitemap lookup table once per instance and provides the URL
	 * alias calculation used by the concrete migrations.
	 */
	abstract class PfizerProMigration extends Migration {
	  /**
	   * Directory containing the HTML files to migrate.
	   *
	   * @var string
	   */
	  public $baseDir;

	  /**
	   * Sitemap lookup table, keyed by URL path with the source file path as value.
	   *
	   * @var array
	   */
	  public $siteMapArray;

	  /**
	   * Constructor.
	   */
	  public function __construct() {
	    parent::__construct();

	    // Build the site map array from the Pfizer provided sitemap.
	    $this->_parseSiteMap();

	    // The HTML files are read from the "fixtures" directory inside the
	    // pfizer_migration module.
	    $this->baseDir = drupal_get_path('module', 'pfizer_migration') . '/fixtures';
	  }

	  /**
	   * Parses the sitemap into URL path => file path pairs.
	   *
	   * Every link whose text contains "--" contributes one entry: the text after
	   * the first "--" is the legacy file path (rewritten to the /hcp/ and
	   * /products_landing/ layout) and the link target, minus the site host, is
	   * the URL path. If the sitemap cannot be loaded the table stays empty and a
	   * warning is raised instead of dereferencing a failed load.
	   */
	  protected function _parseSiteMap() {
	    $this->siteMapArray = array();
	    $siteMapLocation = realpath('./') . '/sites/all/files/pfizer migration sitemap.html';

	    // file_get_html() does not return a document object on failure, so guard
	    // before calling find() on its result.
	    $siteMapDocument = is_readable($siteMapLocation) ? file_get_html($siteMapLocation) : FALSE;
	    if (empty($siteMapDocument)) {
	      trigger_error('Unable to load the migration sitemap: ' . $siteMapLocation, E_USER_WARNING);
	      return;
	    }

	    foreach ($siteMapDocument->find('a') as $element) {
	      $parts = explode('--', $element->innertext);
	      if (!array_key_exists(1, $parts)) {
	        continue;
	      }

	      $filepath = str_replace('/wyeth_html/home/minisites/', '/hcp/', trim($parts[1]));
	      $filepath = str_replace('/wyeth_html/home/products_landing/', '/products_landing/', $filepath);
	      $urlpath = str_replace('http://www.pfizerpro.com/', '', $element->href);
	      $this->siteMapArray[$urlpath] = $filepath;
	    }
	  }

	  /**
	   * Calculates the URL alias for a source row.
	   *
	   * @param object $row
	   *   Source row; only $row->sourceid (the source file path) is read.
	   *
	   * @return string
	   *   The alias from the sitemap when the file is listed there, otherwise an
	   *   alias derived from the file path.
	   */
	  protected function prepareAlias($row) {
	    // If we have a match from this document to the sitemap then we know the
	    // alias. Both sides are strings, so compare strictly.
	    $key = array_search($row->sourceid, $this->siteMapArray, TRUE);
	    if ($key) {
	      return $key;
	    }

	    // Derive the alias from the file path.
	    $segments = explode('/', ltrim($row->sourceid, '/'));
	    $last = count($segments) - 1;
	    $page = strtolower($segments[$last]);
	    $dir = $last > 0 ? strtolower($segments[$last - 1]) : '';

	    // Remove the .html extension and the dir name from the page name. When
	    // there is no directory segment, skip the directory entries: otherwise the
	    // bare '_' entry would strip every underscore from the page name.
	    $search = array('.html');
	    if ($dir !== '') {
	      $search[] = $dir . '_';
	      $search[] = $dir;
	    }
	    $search[] = 'index';
	    $page = str_replace($search, '', $page);

	    // Drop the file name; an empty page name means this is the index page of
	    // the directory, otherwise the cleaned page name goes back on the path.
	    array_pop($segments);
	    if (!empty($page)) {
	      // The capital first letter is what the microsite rules below match on.
	      $segments[] = ucfirst($page);
	    }
	    // Separator first: the reversed argument order no longer works in PHP 8.
	    $alias = implode('/', $segments);

	    // Microsite specific alias overrides, applied in order.
	    $overrides = array(
	      // effexorxr microsite.
	      'hcp/effexor/Overview' => 'hcp/effexorxr',
	      'hcp/effexor/Important_treatment_considerations' => 'hcp/effexorxr/important-safety-information',
	      // flectorpatch microsite.
	      'hcp/flectorpatch/Adverseevents' => 'hcp/flectorpatch/adverse-events',
	      'hcp/flectorpatch/Flector-patch-samples' => 'hcp/flectorpatch/samples',
	      'hcp/flectorpatch/Isi' => 'hcp/flectorpatch/safety-information',
	    );
	    $alias = str_replace(array_keys($overrides), array_values($overrides), $alias);

	    return strtolower($alias);
	  }
	}

	class PfizerProHTMLMigration extends PfizerProMigration {
	/**
	* Consturctor.
	*/
	public function __construct() {
	parent::__construct();

	// A map of source HTML filename -> destination node id.
	$this->map = new MigrateSQLMap($this->machineName,
	array(
	'sourceid' => array(
	'type' => 'varchar',
	'length' => 255,
	'not null' => TRUE,
	),
	),
	MigrateDestinationNode::getKeySchema()
	);

	// The source fields.
	$fields = array(
	'title' => t('Title'),
	'body' => t('Body'),
	'uid' => t('User id'),
	'',
	);

	// Match HTML files.
	$regex = '/(.\.htm$\|.\.html$)/i';

	// The source of the migration is HTML files from the old site.
	$list_files = new MigrateListFiles(array($this->baseDir), $this->baseDir, $regex);

	$item_file = new MigrateItemFile($this->baseDir);

	$this->source = new MigrateSourceList($list_files, $item_file, $fields);

	// The destination is the mynode content type.
	$this->destination = new MigrateDestinationNode('page');

	// Map the fields, pretty straightforward in this case.
	$this->addFieldMapping('uid', 'uid');
	$this->addFieldMapping('title', 'title');
	$this->addFieldMapping('body', 'body')
	->arguments(array('format' => 'raw_html'));
	$this->addFieldMapping('created', 'created');
	$this->addFieldMapping('path', 'url_alias');
	}

	/**
	* Prepare a row.
	*/
	public function prepareRow($row) {
	// Always include this fragment at the beginning of every prepareRow()
	// implementation, so parent classes can ignore rows.
	if (parent::prepareRow($row) === FALSE) {
	return FALSE;
	}

	// Set to admin for now.
	$row->uid = 1;
	// If the string contains '/includes/' then we need to treat it differently and we should ignore that row as its handled by another import
	if (stripos($row->sourceid, '/includes/') !== FALSE) {
	return FALSE;
	}

	//Find the occurances of the wyethcomponent tag
	$this->_findIncludes($row->filedata, $matches);
	$document = $row->filedata;
	$this->_replaceWyethComponent($document, $matches);

	// Load HTML from a string.
	$html = str_get_html($document);

	// Retrieve all the css and scripts
	$head = $html->find('head', 0);
	$styles = array();
	$scripts = array();

	if (!empty($head)) {
	foreach ($head->find('style, script') as $element) {
	switch ($element->tag) {
	case 'style':
	$styles[] = $element->innertext;
	break;
	case 'script':
	$scripts[] = $element->innertext;
	break;
	}
	}
	}

	// Set the styles and scripts.
	$row->styles = $styles;
	$row->scripts = $scripts;

	// Retrieve the body.
	$row->body = $html->find('body', 0)->innertext;

	//Un comment these lines if you encounter problems with the import
	// $row->body = str_replace('’', "'", $row->body);
	// $row->body = str_replace('∗', '*', $row->body);
	// $row->body = str_replace(chr(146), "'", $row->body);
	// $row->body = str_replace(chr(153), "™", $row->body);
	// $row->body = str_replace(chr(151), "—", $row->body);
	// $row->body = str_replace(chr(174), "®", $row->body);
	// $row->body = str_replace('®', "®", $row->body);

	// The title is the filename if it's not found in the html.
	$title = $html->find('title', 0)->innertext;
	$row->title = empty($title) ? $row->sourceid : $title;

	$row->title = str_replace('®', '®', $row->title);
	$row->title = str_replace('—', '–', $row->title);

	$row->title = str_replace('!', '!', $row->title);
	$row->title = str_replace('"', '"', $row->title);
	$row->title = str_replace('#', '#', $row->title);
	$row->title = str_replace('$', '$', $row->title);
	$row->title = str_replace('%', '%', $row->title);
	$row->title = str_replace('&', '&', $row->title);
	$row->title = str_replace(''', '\'', $row->title);
	$row->title = str_replace('(', '(', $row->title);
	$row->title = str_replace(')', ')', $row->title);
	$row->title = str_replace('', '', $row->title);
	$row->title = str_replace('+', '+', $row->title);
	$row->title = str_replace(',', ',', $row->title);
	$row->title = str_replace('-', '-', $row->title);
	$row->title = str_replace('.', '.', $row->title);
	$row->title = str_replace('/', '/', $row->title);
	$row->title = str_replace('0', '0', $row->title);
	$row->title = str_replace('1', '1', $row->title);
	$row->title = str_replace('2', '2', $row->title);
	$row->title = str_replace('3', '3', $row->title);
	$row->title = str_replace('4', '4', $row->title);
	$row->title = str_replace('5', '5', $row->title);
	$row->title = str_replace('6', '6', $row->title);
	$row->title = str_replace('7', '7', $row->title);
	$row->title = str_replace('8', '8', $row->title);
	$row->title = str_replace('9', '9', $row->title);
	$row->title = str_replace(':', ':', $row->title);
	$row->title = str_replace(';', ';', $row->title);
	$row->title = str_replace('<', '<', $row->title);
	$row->title = str_replace('=', '=', $row->title);
	$row->title = str_replace('>', '>', $row->title);
	$row->title = str_replace('?', '?', $row->title);
	$row->title = str_replace('@', '@', $row->title);
	$row->title = str_replace('A', 'A', $row->title);
	$row->title = str_replace('B', 'B', $row->title);
	$row->title = str_replace('C', 'C', $row->title);
	$row->title = str_replace('D', 'D', $row->title);
	$row->title = str_replace('E', 'E', $row->title);
	$row->title = str_replace('F', 'F', $row->title);
	$row->title = str_replace('G', 'G', $row->title);
	$row->title = str_replace('H', 'H', $row->title);
	$row->title = str_replace('I', 'I', $row->title);
	$row->title = str_replace('J', 'J', $row->title);
	$row->title = str_replace('K', 'K', $row->title);
	$row->title = str_replace('L', 'L', $row->title);
	$row->title = str_replace('M', 'M', $row->title);
	$row->title = str_replace('N', 'N', $row->title);
	$row->title = str_replace('O', 'O', $row->title);
	$row->title = str_replace('P', 'P', $row->title);
	$row->title = str_replace('Q', 'Q', $row->title);
	$row->title = str_replace('R', 'R', $row->title);
	$row->title = str_replace('S', 'S', $row->title);
	$row->title = str_replace('T', 'T', $row->title);
	$row->title = str_replace('U', 'U', $row->title);
	$row->title = str_replace('V', 'V', $row->title);
	$row->title = str_replace('W', 'W', $row->title);
	$row->title = str_replace('X', 'X', $row->title);
	$row->title = str_replace('Y', 'Y', $row->title);
	$row->title = str_replace('Z', 'Z', $row->title);

	$row->title = str_replace('a', 'a', $row->title);
	$row->title = str_replace('b', 'b', $row->title);
	$row->title = str_replace('c', 'c', $row->title);
	$row->title = str_replace('d', 'd', $row->title);
	$row->title = str_replace('e', 'e', $row->title);
	$row->title = str_replace('f', 'f', $row->title);
	$row->title = str_replace('g', 'g', $row->title);
	$row->title = str_replace('h', 'h', $row->title);
	$row->title = str_replace('i', 'i', $row->title);
	$row->title = str_replace('j', 'j', $row->title);
	$row->title = str_replace('k', 'k', $row->title);
	$row->title = str_replace('l', 'l', $row->title);
	$row->title = str_replace('m', 'm', $row->title);
	$row->title = str_replace('n', 'n', $row->title);
	$row->title = str_replace('o', 'o', $row->title);
	$row->title = str_replace('p', 'p', $row->title);
	$row->title = str_replace('q', 'q', $row->title);
	$row->title = str_replace('r', 'r', $row->title);
	$row->title = str_replace('s', 's', $row->title);
	$row->title = str_replace('t', 't', $row->title);
	$row->title = str_replace('u', 'u', $row->title);
	$row->title = str_replace('v', 'v', $row->title);
	$row->title = str_replace('w', 'w', $row->title);
	$row->title = str_replace('x', 'x', $row->title);
	$row->title = str_replace('y', 'y', $row->title);
	$row->title = str_replace('z', 'z', $row->title);

	$row->url_alias = $this->prepareAlias($row);
	}

	/**
	* Prepare the entity.
	*/
	public function prepare($entity, $row) {
	// Save the styles and scripts.
	$entity->cpn['css'] = implode("\n", $row->styles);
	$entity->cpn['js'] = implode("\n", $row->scripts);
	}

	private function _findIncludes($document, &$matches) {
	$WYETH_COMPONENT_SEARCH_STRING = '<wyethcomponent com.wyeth.include.html.ResourceInclude';
	$WYETH_INCLUDE_PATH_MAPS = array(
	'hcp' => '/wyeth_html/home/minisites',
	'products_landing' => '/wyeth_html/home/products_landing',
	);
	$WYETH_INCLUDE_PATH = '/wyeth_html/home/minisites';
	// Does the document or remaining documnet contain the wyeth callout?
	if ($start_of_wyeth_tag = stripos($document, $WYETH_COMPONENT_SEARCH_STRING)) {

	// Find the start of the wyeth path and move the pointer the length of the string to find the relative path
	$remaining_document = substr($document, $start_of_wyeth_tag);
	$start_char_pos = 0;
	foreach($WYETH_INCLUDE_PATH_MAPS as $key => $path) {

	$start_char_pos = stripos($remaining_document, $path);

	if ($start_char_pos !== FALSE) {
	$start_char_pos += strlen($path);
	$directory_name = $key;
	break;
	}
	}

	//Chop off the beginning of the string
	$remaining_document = substr($remaining_document, $start_char_pos);
	//Find the end of the tag to calculate the difference
	$end_char_pos = stripos($remaining_document, '>');
	//Save the string from 0 to End
	$include_path = trim(substr($remaining_document, 0, $end_char_pos),'/ ');
	$entire_tag_to_replace = substr($document, $start_of_wyeth_tag, $start_char_pos + $end_char_pos + 2 );

	if ($this->_isCommented($remaining_document) == false) {
	//We include the hcp/ here to make sure our paths will line up.
	$matches[$directory_name . '/' . $include_path] = $entire_tag_to_replace;
	}
	//Recursively call the function so we find all the matches in each file.
	$this->_findIncludes($remaining_document, $matches);
	}
	}

	/**
	 * Swaps every collected include tag for a "[[nid:<key>]]" token.
	 *
	 * @param string $document
	 *   The HTML text, modified in place.
	 * @param array $matches
	 *   Token key => tag text to replace.
	 */
	private function _replaceWyethComponent(&$document, $matches) {
	  if (!count($matches)) {
	    return;
	  }

	  foreach ($matches as $include_key => $tag_text) {
	    $token = '[[nid:' . $include_key . ']]';
	    $document = str_replace($tag_text, $token, $document);
	  }
	}

	/**
	 * Tells whether the start of $document lies inside an HTML comment.
	 *
	 * The decision is made from whichever comment marker comes first: meeting
	 * "-->" before any "<!--" means the text began inside a comment.
	 *
	 * @param string $document
	 *   The text following the position being tested.
	 *
	 * @return bool
	 *   TRUE when a closing marker precedes the next opening marker (or no
	 *   opening marker follows at all), FALSE otherwise.
	 */
	private function _isCommented($document) {
	  // Offsets of the next opening and closing comment markers, or FALSE.
	  $open = strpos($document, '<!--');
	  $close = strpos($document, '-->');

	  // Test for FALSE explicitly. Comparing FALSE with an offset using "<"
	  // casts both sides to booleans and gives the wrong answer when only one
	  // of the two markers is present.
	  if ($close === FALSE) {
	    // No closing marker ahead, so we cannot be inside a comment.
	    return FALSE;
	  }
	  if ($open === FALSE) {
	    // A comment ends ahead of us and none starts before it.
	    return TRUE;
	  }

	  // Both markers exist: we are commented only if the close comes first.
	  return $close < $open;
	}
	}

	/**
	 * Migrates the meta tags of the legacy HTML files into field collections.
	 *
	 * Every meta tag chunk produced by PfizerMetatagContentParser becomes one
	 * 'field_custom_metatags' field collection item hosted on a node. The
	 * migration depends on PfizerProHTML.
	 */
	class MetaTagsMigration extends PfizerProMigration {

	  /**
	   * Sets up source, destination, map and field mappings.
	   */
	  public function __construct() {
	    parent::__construct();
	    $this->description = t('Migrate metatags into field collections.');
	    $this->dependencies = array('PfizerProHTML');
	    $fields = array(
	      'field_meta_name' => 'Metatag Name',
	      'field_meta_value' => 'Metatag Value',
	    );

	    // Match HTML files (*.htm and *.html).
	    $regex = '/(.\.htm$|.\.html$)/i';

	    // The source of the migration is HTML files from the old site; the parser
	    // splits each file into one chunk per meta tag.
	    $parser = new PfizerMetatagContentParser();
	    $list_files = new MigrateListFiles(array($this->baseDir), $this->baseDir, $regex, array(), $parser);
	    $item_file = new MigrateItemFile($this->baseDir, TRUE, $parser);
	    $this->source = new MigrateSourceList($list_files, $item_file, $fields);

	    $this->destination = new MigrateDestinationFieldCollection(
	      'field_custom_metatags',
	      array('host_entity_type' => 'node')
	    );

	    // NOTE(review): sourceid and field_meta_name are capped at 60 characters;
	    // confirm that no file id or meta name is longer than that.
	    $this->map = new MigrateSQLMap(
	      $this->machineName,
	      array(
	        'sourceid' => array(
	          'type' => 'varchar',
	          'length' => 60,
	          'not null' => FALSE,
	        ),
	        'field_meta_name' => array(
	          'type' => 'varchar',
	          'length' => 60,
	          'not null' => FALSE,
	        ),
	        'field_meta_value' => array(
	          'type' => 'varchar',
	          'length' => 255,
	          'not null' => FALSE,
	        ),
	      ),
	      MigrateDestinationFieldCollection::getKeySchema()
	    );

	    // NOTE(review): prepareRow() already resolves $row->nid to a node id, yet
	    // the mapping still routes it through the PfizerProHTML map - confirm the
	    // sourceMigration() lookup is wanted here.
	    $this->addFieldMapping('host_entity_id', 'nid')->sourceMigration('PfizerProHTML');
	    // Map from the row properties that prepareRow() actually populates.
	    $this->addFieldMapping('field_meta_name', 'field_meta_name');
	    $this->addFieldMapping('field_meta_value', 'field_meta_value');
	  }

	  /**
	   * Prepare a row.
	   *
	   * Splits the chunked source id into file id and meta tag name, copies the
	   * chunk data into the meta value and resolves the host node.
	   *
	   * @param object $row
	   *   The source row, modified in place.
	   *
	   * @return object|bool
	   *   The row, or FALSE when it must be skipped.
	   */
	  public function prepareRow($row) {
	    // Always include this fragment at the beginning of every prepareRow()
	    // implementation, so parent classes can ignore rows.
	    if (parent::prepareRow($row) === FALSE || empty($row->filedata)) {
	      return FALSE;
	    }

	    // The source id is "<file id>" . MIGRATE_CHUNK_SEPARATOR . "<meta name>".
	    // Pad the result so an id without a separator does not raise a notice;
	    // the empty name is rejected below.
	    $id_parts = array_pad(explode(MIGRATE_CHUNK_SEPARATOR, $row->sourceid), 2, NULL);
	    $row->sourceid = $id_parts[0];
	    $row->field_meta_name = $id_parts[1];
	    $row->field_meta_value = $row->filedata;

	    // Calculate the appropriate host entity id. Only a "node/<nid>" system
	    // path can provide one; a failed lookup or any other path gives no host.
	    $source_path = drupal_lookup_path('source', $this->prepareAlias($row));
	    if (is_string($source_path) && strpos($source_path, 'node/') === 0) {
	      $row->nid = substr($source_path, 5);
	    }
	    else {
	      $row->nid = FALSE;
	    }

	    // If we can't find the host entity, then we can't import it.
	    // Or if we don't have any valid metatags, also don't import.
	    if (!$row->nid || empty($row->field_meta_name) || empty($row->field_meta_value)) {
	      return FALSE;
	    }

	    return $row;
	  }

	  /**
	   * Builds the map key for a row.
	   *
	   * @param array $source_key
	   *   The source key schema, keyed by field name.
	   * @param object $row
	   *   The prepared source row.
	   *
	   * @return array
	   *   Key field name => row value, falling back to the row's sourceid when
	   *   the row has no value for that field.
	   */
	  public function prepareKey($source_key, $row) {
	    $key = array();

	    foreach ($source_key as $field_name => $field_schema) {
	      if (empty($row->$field_name)) {
	        $key[$field_name] = $row->sourceid;
	      }
	      else {
	        $key[$field_name] = $row->$field_name;
	      }
	    }

	    return $key;
	  }

	  /**
	   * Prepare the entity.
	   *
	   * Copies the row's styles and scripts onto the entity when the row carries
	   * them. Nothing visible in this class sets them on metatag rows, so their
	   * absence is skipped quietly instead of imploding a non-array.
	   */
	  public function prepare($entity, $row) {
	    // Save the styles and scripts.
	    if (isset($row->styles) && is_array($row->styles)) {
	      $entity->cpn['css'] = implode("\n", $row->styles);
	    }
	    if (isset($row->scripts) && is_array($row->scripts)) {
	      $entity->cpn['js'] = implode("\n", $row->scripts);
	    }
	  }
	}

	/**
	 * Content parser that turns each meta tag of an HTML file into a chunk.
	 *
	 * Only meta tags found inside <head> that carry both a non-empty "name" and
	 * a non-empty "content" attribute are kept. The chunk id is the tag's name
	 * and the chunk itself is the tag's content.
	 */
	class PfizerMetatagContentParser extends MigrateContentParser {

	  /**
	   * Parses an HTML string and stores its meta tags as name => content.
	   *
	   * @param string $content
	   *   The raw HTML of one file.
	   */
	  public function setContent($content) {
	    $this->content = array();

	    // Load HTML from a string. The loader can return FALSE instead of a DOM
	    // object; in that case the file simply contributes no chunks.
	    $html = str_get_html($content);
	    if (!is_object($html)) {
	      return;
	    }

	    // Retrieve all the named meta tags of the document head.
	    $head = $html->find('head', 0);

	    if (!empty($head)) {
	      foreach ($head->find('meta') as $element) {
	        $attrs = $element->getAllAttributes();
	        if (!empty($attrs['name']) && !empty($attrs['content'])) {
	          $this->content[$attrs['name']] = $attrs['content'];
	        }
	      }
	    }

	    // Release the DOM explicitly so memory does not pile up across files.
	    // The values kept above are plain strings and survive this.
	    $html->clear();
	    unset($html);
	  }

	  /**
	   * Returns the ids of the chunks, i.e. the meta tag names.
	   */
	  public function getChunkIDs() {
	    return array_keys($this->content);
	  }

	  /**
	   * Returns the content of the meta tag named $id.
	   *
	   * @return string
	   *   The stored content, or '' for an empty or unknown id.
	   */
	  public function getChunk($id) {
	    if (empty($id) || !isset($this->content[$id])) {
	      return '';
	    }
	    return $this->content[$id];
	  }

	  /**
	   * Returns the number of meta tags kept from the current file.
	   */
	  public function getChunkCount() {
	    return count($this->content);
	  }
	}
No results found