Skip to content

Instantly share code, notes, and snippets.

@dchest
Created January 8, 2010 23:20
Show Gist options
  • Save dchest/272559 to your computer and use it in GitHub Desktop.
Save dchest/272559 to your computer and use it in GitHub Desktop.
Converting Sellme.ru to text files
#!/usr/bin/ruby
# Runs html2text.py over every *.txt file found (case-insensitively) below the
# current directory and stores the converter's output at the same relative
# path under ../md/, creating intermediate directories as needed.
require 'fileutils' # the stdlib feature name is lowercase; 'FileUtils' fails on case-sensitive filesystems

Dir.glob('**/*.txt', File::FNM_CASEFOLD).each do |filename|
  txtname = "../md/#{filename}"
  puts "#{filename} -> #{txtname}"
  FileUtils.mkdir_p(File.dirname(txtname))

  # Invoke the converter directly (no shell) with stdout redirected to the
  # target file, so file names containing spaces, quotes or other shell
  # metacharacters are passed through verbatim instead of being re-parsed.
  converted = system('html2text.py', filename, out: txtname)
  warn "html2text.py failed for #{filename}" unless converted
end
#!/usr/bin/ruby
# For every *.html page below the current directory (except files named
# index.html) this script pulls out the post title, body, extended body,
# publication date and comments with Hpricot, assembles them into a simplified
# HTML fragment and writes it next to the source page as "<slug>.txt".
#
# The slug is built from the alphanumeric runs of the title after passing it
# through Translate.t(title, "Russian", "English"), lowercased and limited to
# 57 characters; an empty slug becomes "untitled".
#
# NOTE(review): two pages in the same directory that produce the same slug
# (including "untitled") overwrite each other — confirm this is acceptable.
# NOTE(review): the glob is case-insensitive but the index.html check is not,
# so e.g. "Index.HTML" is processed rather than skipped — confirm intent.
require 'rubygems'
require 'hpricot'
require 'rtranslate'

Dir.glob('**/*.html', File::FNM_CASEFOLD).each do |filename|
  next if File.basename(filename) == "index.html"

  puts "Processing #{filename}"

  # File.open rather than Kernel#open: the latter treats a path starting with
  # "|" as a command to run.
  doc = File.open(filename) { |f| Hpricot(f) }

  title = doc.search("//h1[@id='page-title']").inner_html
  slug = Translate.t(title, "Russian", "English").scan(/[A-Za-z0-9]+/).join("-").downcase[0..56]
  slug = "untitled" if slug == ""

  body = doc.search("//div[@class='asset-body']").inner_html
  more = doc.search("//div[@class='asset-more']").inner_html
  date = doc.search("//span[@class='byline']/abbr[@class='published']").inner_html

  # Collect the fragments and join once instead of growing a string with +=.
  parts = [
    "<h1>#{title}</h1>\n",
    "<p>#{body}</p>\n",
    "<p>#{more}</p>\n",
    "<p><em>#{date}</em></p>\n"
  ]

  (doc / "//div.comment").each_with_index do |comment, index|
    # A single rule separates the post from its comments, emitted only when
    # there is at least one comment.
    parts << "<hr>\n" if index.zero?
    parts << "<em>#{(comment / 'span.author').inner_html}:</em>\n"
    parts << "<blockquote>#{(comment / 'div.comment-content').inner_html}</blockquote>\n"
  end
  out = parts.join

  # Swap only the trailing base name for the slug. A plain gsub on the whole
  # path would also rewrite any directory component containing the same text.
  basename = File.basename(filename)
  txt_path = filename[0, filename.length - basename.length] + slug + ".txt"

  # The output is still HTML despite the .txt extension; no html2text
  # conversion happens in this script.
  File.open(txt_path, "w") { |f| f.write(out) }
  puts "-> #{txt_path}\n\n"
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment