maricris-sn · October 23, 2010 09:16
diff --git a/Migrate to your Wordpress Blog using Ruby and Atom b/Migrate to your Wordpress Blog using Ruby and Atom
 require 'rubygems'
 require 'cgi'
 require 'date'
 require 'net/http'
 require 'open-uri'
 require 'hpricot'
 require 'atom/entry'
 require 'atom/collection'

 # Migration settings -- edit these before running the script.

 # Text file of post URLs, read only by the commented-out "Option B" loader.
 urls_to_import = 'urls.txt'

 # Destination WordPress blog. wp_base is the URL the Atom collection
 # address is built from further down in the script.
 wp_blog_host = 'livinglife.sweetperceptions.com'
 wp_blog_uri = 'http://' + wp_blog_host
 wp_base = wp_blog_uri + '/wp-app.php'
 wp_blog_username = 'myusername'
 wp_blog_password = 'mypassword'

 # Blog being migrated, and which of its listing pages to scrape for links.
 your_blog_source = 'http://sweetperceptions.i.ph'
 which_pages = (1..19)

 # Per-author credentials (not referenced by the rest of this listing).
 authors = {
   'Maricris Nonato' => { 'user' => 'myusername', 'password' => 'mypassword' }
 }

 # Category names a tag may match exactly.
 registered_categories = [
   'About me', 'Artistry', 'Cool Finds', 'Dreams', 'Events',
   'Health and Beauty', 'Horoscope', 'Living Life', 'Meme', 'Movies',
   'Music', 'Notes', 'Pet Love', 'Quotes', 'Random thoughts',
   'Stories to share', 'Techie', 'Travel'
 ]

 # Category name => tags that should be filed under that category.
 synonym_categories = {
   'About me'          => %w[me],
   'Artistry'          => %w[poem],
   'Cool Finds'        => %w[cool],
   'Dreams'            => %w[dream dreams],
   'Events'            => ['event', 'bday', 'birthday', 'Christmas', 'New year', 'new-year', 'celebration'],
   'Health and Beauty' => %w[health sickness headache fever cancer],
   'Horoscope'         => %w[cookie fortune horoscope astrology psych],
   'Living Life'       => %w[life kalokohan],
   'Meme'              => %w[meme],
   'Movies'            => %w[hollywood movie movies movie-lines happy-feet],
   'Music'             => %w[song songs singer music ost],
   'Notes'             => %w[notes],
   'Pet Love'          => %w[pet cat dog animal animals pets],
   'Quotes'            => %w[quote quotes],
   'Random thoughts'   => %w[thought thoughts think logic],
   'Stories to share'  => %w[story stories adventure],
   'Techie'            => %w[tech techie work web2.0 development software online skype pc],
   'Travel'            => %w[philippines travel province]
 }

 # Rules of matching to categories:
 # 1. exact match
 # 2. synonyms/variations -> manual

 # Collect the URL of every post to migrate.
 #
 # Option A (default): fetch each listing page of the source blog and take
 # the href of every link inside an "entrytitle" heading.
 # URI.parse(...).open is used instead of Kernel#open, because Kernel#open
 # runs a shell command when its argument starts with "|" and, from
 # Ruby 3.0 on, no longer opens URLs at all.
 urls = which_pages.map do |page|
   listing = Hpricot(URI.parse("#{your_blog_source}/page/#{page}/").open)
   (listing/"h3[@class='entrytitle']/a").map { |link| link['href'] }
 end

 # Option B: uncomment to read the URLs from a text file instead; this
 # replaces whatever Option A collected.
 # urls = File.readlines(urls_to_import).map { |line| line.chomp }

 # One flat list, without entries for links that had no href.
 urls = urls.flatten.compact
 
 # Migrate every collected post: scrape its page, map its tags onto the
 # registered categories, and POST it to the blog as an Atom entry.

 # The HTTP client and the target collection do not depend on the post being
 # migrated, so they are built once instead of once per URL.
 req = Atom::HTTP.new
 req.user = wp_blog_username
 req.pass = wp_blog_password
 req.always_auth = :basic
 c = Atom::Collection.new(wp_base + "/posts", req)

 urls.each do |target|
   # URI.parse(...).open instead of Kernel#open: the URLs were scraped from
   # a web page, and Kernel#open would run a shell command for a string that
   # starts with "|" (and no longer opens URLs on Ruby 3.0+).
   doc = Hpricot(URI.parse(target).open)

   # Post title: HTML-unescaped, with CRLF pairs removed.
   title = CGI.unescapeHTML((doc/"div/h3[@class='entrytitle']/a").inner_html.strip).gsub(/\r\n/, '')

   # Kept in its own variable so the Atom::Author object built below does
   # not overwrite it before the name is assigned.
   author_name = "Chris"

   # The date comes from the "date" span, the time of day from the
   # "meta-post" div. timestr is nil when no H:MM:SS is found there; in that
   # case only the date is parsed instead of raising on String + nil.
   timestr = (doc/"div[@class='meta-post']").inner_html[/\d+:\d\d:\d\d/]
   datestr = (doc/"div/span[@class='date']").inner_html.strip.gsub(/\r\n/, '')
   datestr = [datestr, timestr].compact.join(' ')
   # NOTE: the UTC offset is hard-coded as -0500 in the output format.
   datestr = DateTime.parse(datestr).strftime('%a, %-d %b %Y %T -0500')

   hExcerpt = (doc/"div[@class='entry_summary']").inner_html.gsub(/\r\n/, '')

   tags = (doc/"div[@class='tag-list']/a").map { |link| link.inner_html }

   # Rule 1: keep tags that are exactly a registered category name.
   categories = tags.select { |tag| registered_categories.include?(tag) }

   # Rule 2: add every category that lists one of the tags as a synonym.
   synonym_categories.each do |category, synonyms|
     categories << category if tags.any? { |tag| synonyms.include?(tag) }
   end

   tags = categories.uniq.join(',')

   # The body container's id is "postentry-" plus the last dash-separated
   # part of the id of the div with class "blog".
   entry_id = "postentry-#{doc.at("div[@class='blog']")['id'].split('-').last}"

   # Drop the heading, date, tag list and meta block so that only the post
   # body is left inside the container.
   (doc/"##{entry_id}/h3").remove
   (doc/"##{entry_id}/span[@class='date']").remove
   (doc/"##{entry_id}/div[@class='tag-list']").remove
   (doc/"##{entry_id}/div[@class='meta-post']").remove

   # Remove the whitespace runs left behind by the deleted elements; they
   # are plain text and not reachable through any Hpricot element.
   content = (doc/"##{entry_id}").inner_html.gsub("\n        \n    \n    \n    \n        \n",'').gsub("\n        \n    \n",'')

   # Atom Author element
   author = Atom::Author.new
   author.name = author_name
   author.uri = wp_blog_uri

   # Atom Entry element
   entry = Atom::Entry.new
   entry.title = title
   entry.summary = hExcerpt
   entry.content = content
   entry.content.type = "html"
   entry.published = datestr
   entry.updated = datestr
   entry.tag_with(tags, ',')
   entry.authors << author

   res = c.post! entry

   puts "Imported URL: #{target}, at #{datestr}, #{res.message}\n"
 end
	require 'rubygems'
	require 'open-uri'
	require 'net/http'
	require 'hpricot'
	require 'atom/entry'
	require 'atom/collection'

	# Migration settings -- edit these before running the script.

	# Text file of post URLs, read only by the commented-out "Option B" loader.
	urls_to_import = 'urls.txt'

	# Destination WordPress blog. wp_base is the URL the Atom collection
	# address is built from further down in the script.
	wp_blog_host = 'livinglife.sweetperceptions.com'
	wp_blog_uri = 'http://' + wp_blog_host
	wp_base = wp_blog_uri + '/wp-app.php'
	wp_blog_username = 'myusername'
	wp_blog_password = 'mypassword'

	# Blog being migrated, and which of its listing pages to scrape for links.
	your_blog_source = 'http://sweetperceptions.i.ph'
	which_pages = (1..19)

	# Per-author credentials (not referenced by the rest of this listing).
	authors = {
	  'Maricris Nonato' => { 'user' => 'myusername', 'password' => 'mypassword' }
	}

	# Category names a tag may match exactly.
	registered_categories = [
	  'About me', 'Artistry', 'Cool Finds', 'Dreams', 'Events',
	  'Health and Beauty', 'Horoscope', 'Living Life', 'Meme', 'Movies',
	  'Music', 'Notes', 'Pet Love', 'Quotes', 'Random thoughts',
	  'Stories to share', 'Techie', 'Travel'
	]

	# Category name => tags that should be filed under that category.
	synonym_categories = {
	  'About me'          => %w[me],
	  'Artistry'          => %w[poem],
	  'Cool Finds'        => %w[cool],
	  'Dreams'            => %w[dream dreams],
	  'Events'            => ['event', 'bday', 'birthday', 'Christmas', 'New year', 'new-year', 'celebration'],
	  'Health and Beauty' => %w[health sickness headache fever cancer],
	  'Horoscope'         => %w[cookie fortune horoscope astrology psych],
	  'Living Life'       => %w[life kalokohan],
	  'Meme'              => %w[meme],
	  'Movies'            => %w[hollywood movie movies movie-lines happy-feet],
	  'Music'             => %w[song songs singer music ost],
	  'Notes'             => %w[notes],
	  'Pet Love'          => %w[pet cat dog animal animals pets],
	  'Quotes'            => %w[quote quotes],
	  'Random thoughts'   => %w[thought thoughts think logic],
	  'Stories to share'  => %w[story stories adventure],
	  'Techie'            => %w[tech techie work web2.0 development software online skype pc],
	  'Travel'            => %w[philippines travel province]
	}

	# Rules of matching to categories:
	# 1. exact match
	# 2. synonyms/variations -> manual

	# Collect the URL of every post to migrate.
	#
	# Option A (default): fetch each listing page of the source blog and take
	# the href of every link inside an "entrytitle" heading.
	# Block parameters are written with plain pipes; the backslash-escaped
	# form (\|page\|) is not valid Ruby.
	# URI.parse(...).open is used instead of Kernel#open, because Kernel#open
	# runs a shell command when its argument starts with "|" and, from
	# Ruby 3.0 on, no longer opens URLs at all.
	urls = which_pages.map do |page|
	  listing = Hpricot(URI.parse("#{your_blog_source}/page/#{page}/").open)
	  (listing/"h3[@class='entrytitle']/a").map { |link| link['href'] }
	end

	# Option B: uncomment to read the URLs from a text file instead; this
	# replaces whatever Option A collected.
	# urls = File.readlines(urls_to_import).map { |line| line.chomp }

	# One flat list, without entries for links that had no href.
	urls = urls.flatten.compact

	# Migrate every collected post: scrape its page, map its tags onto the
	# registered categories, and POST it to the blog as an Atom entry.
	# Block parameters are written with plain pipes; the backslash-escaped
	# form (\|target\|) is not valid Ruby.

	# The HTTP client and the target collection do not depend on the post being
	# migrated, so they are built once instead of once per URL.
	req = Atom::HTTP.new
	req.user = wp_blog_username
	req.pass = wp_blog_password
	req.always_auth = :basic
	c = Atom::Collection.new(wp_base + "/posts", req)

	urls.each do |target|
	  # URI.parse(...).open instead of Kernel#open: the URLs were scraped from
	  # a web page, and Kernel#open would run a shell command for a string that
	  # starts with "|" (and no longer opens URLs on Ruby 3.0+).
	  doc = Hpricot(URI.parse(target).open)

	  # Post title: HTML-unescaped, with CRLF pairs removed.
	  title = CGI.unescapeHTML((doc/"div/h3[@class='entrytitle']/a").inner_html.strip).gsub(/\r\n/, '')

	  # Kept in its own variable so the Atom::Author object built below does
	  # not overwrite it before the name is assigned.
	  author_name = "Chris"

	  # The date comes from the "date" span, the time of day from the
	  # "meta-post" div. timestr is nil when no H:MM:SS is found there; in that
	  # case only the date is parsed instead of raising on String + nil.
	  timestr = (doc/"div[@class='meta-post']").inner_html[/\d+:\d\d:\d\d/]
	  datestr = (doc/"div/span[@class='date']").inner_html.strip.gsub(/\r\n/, '')
	  datestr = [datestr, timestr].compact.join(' ')
	  # NOTE: the UTC offset is hard-coded as -0500 in the output format.
	  datestr = DateTime.parse(datestr).strftime('%a, %-d %b %Y %T -0500')

	  hExcerpt = (doc/"div[@class='entry_summary']").inner_html.gsub(/\r\n/, '')

	  tags = (doc/"div[@class='tag-list']/a").map { |link| link.inner_html }

	  # Rule 1: keep tags that are exactly a registered category name.
	  categories = tags.select { |tag| registered_categories.include?(tag) }

	  # Rule 2: add every category that lists one of the tags as a synonym.
	  synonym_categories.each do |category, synonyms|
	    categories << category if tags.any? { |tag| synonyms.include?(tag) }
	  end

	  tags = categories.uniq.join(',')

	  # The body container's id is "postentry-" plus the last dash-separated
	  # part of the id of the div with class "blog".
	  entry_id = "postentry-#{doc.at("div[@class='blog']")['id'].split('-').last}"

	  # Drop the heading, date, tag list and meta block so that only the post
	  # body is left inside the container.
	  (doc/"##{entry_id}/h3").remove
	  (doc/"##{entry_id}/span[@class='date']").remove
	  (doc/"##{entry_id}/div[@class='tag-list']").remove
	  (doc/"##{entry_id}/div[@class='meta-post']").remove

	  # Remove the whitespace runs left behind by the deleted elements; they
	  # are plain text and not reachable through any Hpricot element.
	  # NOTE(review): the runs of spaces inside these two literals look
	  # collapsed to single spaces compared with the space-indented listing
	  # of this script earlier in the file -- confirm which spacing matches
	  # the source pages.
	  content = (doc/"##{entry_id}").inner_html.gsub("\n \n \n \n \n \n",'').gsub("\n \n \n",'')

	  # Atom Author element
	  author = Atom::Author.new
	  author.name = author_name
	  author.uri = wp_blog_uri

	  # Atom Entry element
	  entry = Atom::Entry.new
	  entry.title = title
	  entry.summary = hExcerpt
	  entry.content = content
	  entry.content.type = "html"
	  entry.published = datestr
	  entry.updated = datestr
	  entry.tag_with(tags, ',')
	  entry.authors << author

	  res = c.post! entry

	  puts "Imported URL: #{target}, at #{datestr}, #{res.message}\n"
	end
No results found