Skip to content

Instantly share code, notes, and snippets.

@kenee
Last active March 29, 2016 13:06
Show Gist options
  • Save kenee/17c901108a7432c1a7f5 to your computer and use it in GitHub Desktop.
Save kenee/17c901108a7432c1a7f5 to your computer and use it in GitHub Desktop.
Import WordPress pages crawled from Google Cache into the database
<?php
// Report everything except notices, and print errors straight to the output.
error_reporting(E_ALL ^ E_NOTICE);
ini_set('display_errors',true);

// Build the importer and start the import right away.
$obj = new HtmlImporter();
$obj->run();
/**
 * Imports crawled page files (p_<id>.html in the current directory) into the
 * wp_posts / wp_terms / wp_term_taxonomy / wp_term_relationships tables.
 */
class HtmlImporter {
    /**
     * Lookup table term_id => term_taxonomy_id, filled by saveCat() and read
     * by getTermTaxonomyId().
     *
     * @var array
     */
    public $termTaxonomyR = array();

    /**
     * Connection shared by every method; created on first use.
     *
     * @var PDO|null
     */
    private $db = null;

    /**
     * Error details of the most recent failed insert (errorInfo array or message).
     *
     * @var mixed
     */
    private $lastError = null;

    /**
     * Entry point: runs the whole import.
     */
    public function run(){
        $this->parsePage();
    }

    /**
     * Returns the database connection, opening it on the first call.
     *
     * The connection is cached so wp-config.php is loaded once and a single
     * connection is reused instead of reconnecting for every statement.
     *
     * @return PDO
     * @throws PDOException when the connection cannot be established
     */
    public function getDbInstance(){
        if ($this->db === null) {
            require_once __DIR__."/wp-config.php";
            $dsn = sprintf('mysql:host=%s;port=3306;dbname=%s;charset=UTF8;',DB_HOST,DB_NAME);
            $db = new PDO($dsn,DB_USER,DB_PASSWORD);
            $db->query("SET NAMES utf8;");
            $this->db = $db;
        }
        return $this->db;
    }

    /**
     * Lists the entries of the current directory named exactly p_<digits>.html.
     *
     * @return array list of file names
     */
    public function getPages(){
        $ret = array();
        $d = dir(".");
        if ($d === false) {
            return $ret;
        }
        while (false !== ($entry = $d->read())) {
            // Anchored so that names such as "xp_1.html.bak" are not picked up.
            if (preg_match('/^p_\d+\.html$/D', $entry)){
                $ret[] = $entry;
            }
        }
        $d->close();
        return $ret;
    }

    /**
     * Empties the target tables, then parses and stores every page file.
     *
     * A page that cannot be parsed is reported and skipped; nothing is
     * written for it.
     */
    public function parsePage(){
        $db = $this->getDbInstance();
        $db->exec('TRUNCATE wp_posts');
        $db->exec('TRUNCATE wp_term_taxonomy');
        $db->exec('TRUNCATE wp_terms');
        $db->exec('TRUNCATE wp_term_relationships');
        foreach ($this->getPages() as $page) {
            echo "importing $page \n";
            try {
                $pageobj = new PageParser($page);
                $pagedb = array(
                    'id' => $pageobj->getId(),
                    'title' => $pageobj->getTitle(),
                    'content' => $pageobj->getContent(),
                    'pubdate' => $pageobj->getPubdate(),
                );
                $cats = $pageobj->getCategory();
            } catch (Exception $e) {
                echo $e->getMessage(). "\n";
                // Without this the previous page's data would be saved again.
                continue;
            }
            $this->savePage($pagedb);
            $this->saveCat($pagedb['id'],$cats);
        }
    }

    /**
     * Inserts one post row; terminates the script when the insert fails.
     *
     * @param array $pagedb keys: id, title, content, pubdate
     */
    public function savePage($pagedb){
        $sql = "INSERT INTO `wp_posts` (`id`,`post_title`,`post_content`,`post_date`,`post_excerpt`,`to_ping`,`pinged`,`post_content_filtered`) values (?,?,?,?,'','','','')";
        // Cast to string so a missing value is stored as '' exactly as the
        // former sprintf('%s') formatting did, rather than as SQL NULL.
        $params = array(
            (string) $pagedb['id'],
            (string) $pagedb['title'],
            (string) $pagedb['content'],
            (string) $pagedb['pubdate'],
        );
        if (!$this->executeInsert($sql, $params)) {
            die(print_r($this->lastError, true));
        }
        //todo:fix gmt date
    }

    /**
     * Stores each category as a term plus a 'category' taxonomy row and links
     * the page to it.
     *
     * @param mixed $page_id id of the post the categories belong to
     * @param array $cats    list of arrays with keys term_id and name
     */
    public function saveCat($page_id,$cats) {
        foreach ($cats as $catdb) {
            $term_id = (string) $catdb['term_id'];
            $name = (string) $catdb['name'];

            // The result is deliberately ignored: presumably the insert fails
            // with a duplicate key once an earlier page has stored the same
            // category (verify against the table definition).
            $this->executeInsert(
                "INSERT INTO `wp_terms` (`term_id`,`name`,`slug`) values (?,?,?)",
                array($term_id, $name, $name)
            );

            $inserted = $this->executeInsert(
                "INSERT INTO `wp_term_taxonomy` (`term_id`,`taxonomy`,`description`) values (?,?,'')",
                array($term_id, 'category')
            );
            if ($inserted) {
                // Mapping lookup table, needed by getTermTaxonomyId().
                $this->termTaxonomyR[$term_id] = $this->getDbInstance()->lastInsertId();
            }
            $this->updatePageCat($page_id,$term_id);
        }
    }

    /**
     * Links a page to the taxonomy row recorded for the given term.
     *
     * Does nothing when no taxonomy id is known for the term, because the
     * relationship row would point at nothing.
     *
     * @param mixed $page_id
     * @param mixed $term_id
     */
    public function updatePageCat($page_id,$term_id){
        $term_taxonomy_id = $this->getTermTaxonomyId($term_id);
        if ($term_taxonomy_id === null) {
            return;
        }
        $this->executeInsert(
            "INSERT INTO `wp_term_relationships` (`object_id`,`term_taxonomy_id`) values (?,?)",
            array((string) $page_id, (string) $term_taxonomy_id)
        );
    }

    /**
     * Returns the taxonomy id recorded for a term by saveCat().
     *
     * @param mixed $term_id
     * @return mixed the recorded id, or null when the term is unknown
     */
    public function getTermTaxonomyId($term_id){
        return isset($this->termTaxonomyR[$term_id]) ? $this->termTaxonomyR[$term_id] : null;
    }

    /**
     * Runs one parameterised statement and reports failure as false.
     *
     * Works the same whether PDO is in silent or exception error mode
     * (exceptions are the default since PHP 8.0); details of a failure are
     * kept in $this->lastError.
     *
     * @param string $sql    statement with ? placeholders
     * @param array  $params values bound to the placeholders, in order
     * @return int|false number of affected rows, or false on failure
     */
    private function executeInsert($sql, $params){
        $db = $this->getDbInstance();
        $this->lastError = null;
        try {
            $stmt = $db->prepare($sql);
            if ($stmt === false) {
                $this->lastError = $db->errorInfo();
                return false;
            }
            if (!$stmt->execute($params)) {
                $this->lastError = $stmt->errorInfo();
                return false;
            }
            return $stmt->rowCount();
        } catch (PDOException $e) {
            $this->lastError = $e->errorInfo !== null ? $e->errorInfo : $e->getMessage();
            return false;
        }
    }
}
/**
 * Extracts id, title, body, publication date and categories from one crawled
 * page file.
 */
class PageParser {
    /** @var string path the page was loaded from */
    public $filename;

    /** @var string inner HTML of the entry-content div */
    public $content;

    /** @var string text captured from the entry-title h1 */
    public $title;

    /** @var string markup captured between class="entry-meta"> and </footer> */
    public $content_meta;

    /**
     * Loads the file and splits it into title, content and meta sections.
     *
     * @param string $page path of the page file
     * @throws Exception when the file cannot be read or lacks the expected markup
     */
    public function __construct($page){
        #$page = "p_640.html";
        $this->filename = $page;
        // file_get_contents() signals failure by returning false, not by
        // throwing, so the result has to be checked explicitly.
        $raw = (is_file($page) && is_readable($page)) ? file_get_contents($page) : false;
        if ($raw === false) {
            throw new Exception("Error To Read $page", 1);
        }
        $regex = '/<header class="entry-header">([\s\S]+)<h1 class="entry-title">([\s\S]+)<\/h1>([\S\s]+)<div class="entry-content">([\s\S]+)<\/div>([\S\s]+)class="entry-meta">([\s\S]+)<\/footer>/';
        if(!preg_match($regex,$raw,$match)){
            throw new Exception("Regex Not Match ", 1);
        }
        $this->title = $match[2];
        $this->content = $match[4];
        $this->content_meta = $match[6];
    }

    /**
     * Returns the digits of the p_<digits>.html part of the file name.
     *
     * @return string
     * @throws Exception when the file name carries no id
     */
    public function getId(){
        if(!preg_match('/p_(\d+)\.html/',$this->filename,$match)){
            throw new Exception('No Id found');
        }
        return $match[1];
    }

    /**
     * @return string the captured title
     */
    public function getTitle(){
        return $this->title;
    }

    /**
     * @return string the captured entry content
     */
    public function getContent(){
        return $this->content;
    }

    /**
     * Returns the publication date from the meta section's datetime attribute.
     *
     * @return string|null "Y-m-d H:i:s", or null when no parsable date is found
     */
    public function getPubdate(){
        // [^"]+ keeps the capture inside the attribute value.
        if(!preg_match('/datetime="([^"]+)">.*<\/time>/',$this->content_meta,$match)){
            return null;
        }
        $timestamp = strtotime($match[1]);
        if ($timestamp === false) {
            return null;
        }
        // NOTE(review): the eight-hour shift presumably converts a UTC+8
        // time to GMT; confirm against the crawled site's timezone.
        return date("Y-m-d H:i:s",$timestamp - 8 * 3600);
    }

    /**
     * Returns every category link found in the meta section.
     *
     * @return array list of arrays with keys term_id and name
     */
    public function getCategory(){
        $cats = array();
        $regex = '/cat=(\d+)" rel="category">([^<]*)<\/a>/';
        if(preg_match_all($regex,$this->content_meta,$matchs)){
            foreach ($matchs[1] as $i => $id) {
                $cats[] = array(
                    'term_id' => $id,
                    'name' => $matchs[2][$i],
                );
            }
        }
        return $cats;
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment