Last active
March 29, 2016 13:06
-
-
Save kenee/17c901108a7432c1a7f5 to your computer and use it in GitHub Desktop.
Import WordPress pages crawled from Google Cache into the database.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?php | |
// Display errors on screen and report everything except notices.
ini_set('display_errors',true);
error_reporting(E_ALL & ~E_NOTICE);

// Build the importer and start the import.
$importer = new HtmlImporter();
$importer->run();
/**
 * Imports saved post pages (files named p_<number>.html in the current
 * working directory) into the WordPress tables wp_posts, wp_terms,
 * wp_term_taxonomy and wp_term_relationships.
 *
 * WARNING: parsePage() truncates those four tables before importing.
 */
class HtmlImporter {
    /**
     * Lookup table term_id => term_taxonomy_id, filled by saveCat() and read
     * by getTermTaxonomyId(). Public because it used to be a dynamic
     * (implicitly public) property.
     *
     * @var array
     */
    public $termTaxonomyR = array();

    /**
     * Lazily created database connection shared by all methods.
     *
     * @var PDO|null
     */
    private $db = null;

    /**
     * Entry point: runs the whole import.
     *
     * @return void
     */
    public function run(){
        $this->parsePage();
    }

    /**
     * Returns the PDO connection, creating it on first use.
     *
     * wp-config.php (next to this file) is expected to define DB_HOST,
     * DB_NAME, DB_USER and DB_PASSWORD. It is loaded with require_once and
     * the connection is cached, so repeated calls neither re-run the config
     * file nor open additional connections.
     *
     * @return PDO
     */
    public function getDbInstance(){
        if ($this->db === null) {
            require_once __DIR__."/wp-config.php";
            $dsn = sprintf('mysql:host=%s;port=3306;dbname=%s;charset=UTF8;',DB_HOST,DB_NAME);
            $connection = new PDO($dsn,DB_USER,DB_PASSWORD);
            $connection->query("SET NAMES utf8;");
            $this->db = $connection;
        }
        return $this->db;
    }

    /**
     * Lists the page files in the current working directory.
     *
     * Only names that are exactly "p_<digits>.html" are returned; the pattern
     * is anchored so that names such as "p_2.html.bak" or "xp_3.html" are not
     * picked up.
     *
     * @return array list of file names (order as returned by the directory handle)
     */
    public function getPages(){
        $ret = array();
        $d = dir(".");
        if ($d === false) {
            // Directory could not be opened: nothing to import.
            return $ret;
        }
        while (false !== ($entry = $d->read())) {
            if (preg_match('/^p_\d+\.html$/', $entry)){
                $ret[] = $entry;
            }
        }
        $d->close();
        return $ret;
    }

    /**
     * Truncates the target tables, then parses and stores every page file.
     *
     * A page that cannot be parsed is reported and skipped; it is never
     * written with data left over from a previous page.
     *
     * @return void
     */
    public function parsePage(){
        $db = $this->getDbInstance();
        $tables = array('wp_posts', 'wp_term_taxonomy', 'wp_terms', 'wp_term_relationships');
        foreach ($tables as $table) {
            $db->exec('TRUNCATE ' . $table);
        }
        // The tables are empty now, so any previously recorded mapping is stale.
        $this->termTaxonomyR = array();

        foreach ($this->getPages() as $page) {
            echo "importing $page \n";
            try {
                $pageobj = new PageParser($page);
                $pagedb = array(
                    'id'      => $pageobj->getId(),
                    'title'   => $pageobj->getTitle(),
                    'content' => $pageobj->getContent(),
                    'pubdate' => $pageobj->getPubdate(),
                );
                $cats = $pageobj->getCategory();
            } catch (Exception $e) {
                echo $e->getMessage(). "\n";
                continue;
            }
            $this->savePage($pagedb);
            $this->saveCat($pagedb['id'],$cats);
        }
    }

    /**
     * Inserts one post row into wp_posts.
     *
     * Values are bound through a prepared statement so that quotes in the
     * scraped HTML cannot break or alter the query. As before, the script
     * stops with the driver's error info if the insert fails.
     *
     * @param array $pagedb keys: id, title, content, pubdate
     * @return void
     */
    public function savePage($pagedb){
        $db = $this->getDbInstance();
        $sql = "INSERT INTO `wp_posts` (`id`,`post_title`,`post_content`,`post_date`,`post_excerpt`,`to_ping`,`pinged`,`post_content_filtered`) values (?,?,?,?,'','','','')";
        $stmt = $db->prepare($sql);
        if ($stmt === false) {
            die(print_r($db->errorInfo(), true));
        }
        // Cast to string so a missing value is sent as '' (as the old sprintf did), not NULL.
        $params = array(
            (string) $pagedb['id'],
            (string) $pagedb['title'],
            (string) $pagedb['content'],
            (string) $pagedb['pubdate'],
        );
        if (!$stmt->execute($params)) {
            die(print_r($stmt->errorInfo(), true));
        }
        //todo:fix gmt date
    }

    /**
     * Stores the categories of a page and links the page to each of them.
     *
     * A term that was already stored during this run is not inserted again
     * (that would violate the term's key); only the relationship is added.
     *
     * @param mixed $page_id post id the categories belong to
     * @param array $cats    list of arrays with keys term_id and name
     * @return void
     */
    public function saveCat($page_id,$cats) {
        if (!is_array($cats) || count($cats) === 0) {
            return;
        }
        $db = $this->getDbInstance();
        $termStmt = $db->prepare("INSERT INTO `wp_terms` (`term_id`,`name`,`slug`) values (?,?,?)");
        $taxonomyStmt = $db->prepare("INSERT INTO `wp_term_taxonomy` (`term_id`,`taxonomy`,`description`) values (?,?,'')");
        foreach ($cats as $catdb) {
            $term_id = $catdb['term_id'];
            if (!isset($this->termTaxonomyR[$term_id])) {
                // The category name doubles as the slug.
                $termStmt->execute(array($term_id, $catdb['name'], $catdb['name']));
                if ($taxonomyStmt->execute(array($term_id, 'category'))) {
                    // Mapping lookup table, needed by getTermTaxonomyId().
                    $this->termTaxonomyR[$term_id] = $db->lastInsertId();
                }
            }
            $this->updatePageCat($page_id,$term_id);
        }
    }

    /**
     * Links a post to the taxonomy row recorded for the given term.
     *
     * If no taxonomy row is known for the term, the link is skipped (with a
     * message) instead of inserting an empty taxonomy id.
     *
     * @param mixed $page_id post id (object_id)
     * @param mixed $term_id term whose taxonomy row should be linked
     * @return void
     */
    public function updatePageCat($page_id,$term_id){
        $term_taxonomy_id = $this->getTermTaxonomyId($term_id);
        if ($term_taxonomy_id === null) {
            echo "no taxonomy row for term $term_id, page $page_id not linked \n";
            return;
        }
        $db = $this->getDbInstance();
        $stmt = $db->prepare("INSERT INTO `wp_term_relationships` (`object_id`,`term_taxonomy_id`) values (?,?)");
        $stmt->execute(array($page_id, $term_taxonomy_id));
    }

    /**
     * Looks up the term_taxonomy_id recorded for a term.
     *
     * @param mixed $term_id
     * @return mixed the recorded id, or null when the term is unknown
     */
    public function getTermTaxonomyId($term_id){
        return isset($this->termTaxonomyR[$term_id]) ? $this->termTaxonomyR[$term_id] : null;
    }
}
/**
 * Extracts title, body, publication date and categories from one saved
 * post page.
 */
class PageParser {
    /** @var string path of the page file as passed to the constructor */
    public $filename;

    /** @var string text captured between the entry-title <h1> tags */
    public $title;

    /** @var string markup captured inside the entry-content <div> */
    public $content;

    /** @var string markup captured after class="entry-meta"> up to a closing </footer> */
    public $content_meta;

    /**
     * Reads the file and splits it into title, content and meta sections.
     *
     * @param string $page path of the HTML file
     * @throws Exception when the file cannot be read or the expected markup is not found
     */
    public function __construct($page){
        $this->filename = $page;

        // file_get_contents() signals failure by returning false, not by throwing.
        $html = file_get_contents($page);
        if ($html === false) {
            throw new Exception("Error To Read $page", 1);
        }

        $regex = '/<header class="entry-header">([\s\S]+)<h1 class="entry-title">([\s\S]+)<\/h1>([\S\s]+)<div class="entry-content">([\s\S]+)<\/div>([\S\s]+)class="entry-meta">([\s\S]+)<\/footer>/';
        if(!preg_match($regex,$html,$match)){
            throw new Exception("Regex Not Match ", 1);
        }
        $this->title = $match[2];
        $this->content = $match[4];
        $this->content_meta = $match[6];
    }

    /**
     * Returns the numeric id embedded in the file name (p_<id>.html).
     *
     * @return string
     * @throws Exception when the file name does not contain an id
     */
    public function getId(){
        if(!preg_match('/p_(\d+)\.html/',$this->filename,$match)){
            throw new Exception('No Id found');
        }
        return $match[1];
    }

    /**
     * @return string the captured title
     */
    public function getTitle(){
        return $this->title;
    }

    /**
     * @return string the captured entry content markup
     */
    public function getContent(){
        return $this->content;
    }

    /**
     * Returns the publication date taken from the first datetime="..."
     * attribute in the meta section, formatted as "Y-m-d H:i:s".
     *
     * The attribute value is matched up to its closing quote, so several
     * <time> elements on one line no longer end up in a single capture.
     * Eight hours are subtracted from the parsed timestamp.
     * NOTE(review): presumably a fixed UTC+8 adjustment - confirm.
     *
     * @return string|null null when no date is present or it cannot be parsed
     */
    public function getPubdate(){
        if(!preg_match('/datetime="([^"]+)">.*<\/time>/',$this->content_meta,$match)){
            return null;
        }
        $timestamp = strtotime($match[1]);
        if ($timestamp === false) {
            return null;
        }
        return date("Y-m-d H:i:s",$timestamp - 8 * 3600);
    }

    /**
     * Collects the category links (cat=<id>" rel="category">name</a>) found
     * in the meta section.
     *
     * @return array list of arrays with keys term_id and name; empty when none found
     */
    public function getCategory(){
        $cats = array();
        $regex = '/cat=(\d+)" rel="category">([^<]*)<\/a>/';
        if(preg_match_all($regex,$this->content_meta,$matchs)){
            foreach ($matchs[1] as $i => $termId) {
                $cats[] = array(
                    'term_id' => $termId,
                    'name'    => $matchs[2][$i],
                );
            }
        }
        return $cats;
    }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment