Skip to content

Instantly share code, notes, and snippets.

@vikhyat
Created June 16, 2014 17:37
Show Gist options
  • Select an option

  • Save vikhyat/5569625983b95f99bcc3 to your computer and use it in GitHub Desktop.

Select an option

Save vikhyat/5569625983b95f99bcc3 to your computer and use it in GitHub Desktop.
Manga Information Importer
require 'nokogiri'
require 'date'
require 'json'
# Patches up two kinds of damage in raw page markup before it is handed to
# the HTML parser.
#
# @param html [String] raw page markup
# @return [String] a repaired copy; +html+ itself is never mutated
def repair_broken_html(html)
  rep = html
  # Broken UTF-8 encoding: reinterpret every byte as its own code point and
  # re-encode it as UTF-8, so the regexp substitution below cannot fail on
  # invalid byte sequences.
  rep = rep.unpack('C*').pack('U*') unless rep.valid_encoding?
  # "<" is not escaped. Any "<" that does not open a tag, a closing tag or a
  # "<!" declaration is turned into an entity. A lookahead is used instead of
  # a consumed capture so that consecutive "<" characters and a "<" at the
  # very end of the input are escaped as well.
  rep.gsub(/<(?![A-Za-z!\/])/, '&lt;')
end
# Returns the first element of +nodes+ whose text contains +label+, or nil
# when there is none.
def mal_labelled_div(nodes, label)
  nodes.find { |node| node.text.include?(label) }
end

# Returns the text of the second child (index 1) of the element labelled
# +label+, or nil when the element or that child is missing.
def mal_labelled_value(nodes, label)
  div = mal_labelled_div(nodes, label)
  value_node = div && div.children[1]
  value_node && value_node.text
end

# Returns the link texts found under the element labelled +label+, or an
# empty array when that element is missing.
def mal_labelled_link_texts(nodes, label)
  div = mal_labelled_div(nodes, label)
  div ? div.css("a").map(&:text) : []
end

# Converts +text+ to an integer, mapping nil and 0 to nil.
def mal_positive_count(text)
  count = text.to_i
  count.zero? ? nil : count
end

# Collects unique [name, id] pairs from the links under +node+ whose href
# contains "http://myanimelist.net/<kind>/". Names written as "Last, First"
# are flipped to "First Last"; the id is the run of digits following
# "/<kind>/" in the href. Returns [] when +node+ is nil.
def mal_linked_entities(node, kind)
  return [] if node.nil?

  url_pattern = %r{http://myanimelist\.net/#{kind}/}
  id_pattern = %r{/#{kind}/(\d+)}
  links = node.css("a").map { |anchor| [anchor.text, anchor.attr(:href)] }
  links = links.select { |text, href| !text.strip.empty? && href =~ url_pattern }
  links.map do |text, href|
    name = text.split(',').map(&:strip).reverse.join(' ')
    [name, href[id_pattern, 1].to_i]
  end.uniq
end

# Parses one end of a "Published:" range: "?" becomes nil, a bare year
# becomes January 1st of that year, anything else goes through Date.parse
# (which raises ArgumentError on text it cannot understand).
def mal_parse_date(text)
  text = text.strip
  return nil if text == "?"
  return Date.new(text.to_i) if text =~ /\A\d+\z/

  Date.parse(text)
end

# Reads the "Published:" entry and returns a hash with :from and, for a
# range, :to. Returns {} when the entry is missing or cannot be parsed.
def mal_published_dates(nodes)
  div = mal_labelled_div(nodes, "Published:")
  return {} if div.nil?

  published = div.text.gsub("Published:", '')
  if published.include?("to")
    from, to = published.split(" to ").map { |part| mal_parse_date(part) }
    { from: from, to: to }
  else
    { from: mal_parse_date(published) }
  end
rescue ArgumentError, RangeError
  # Dates are best-effort: an unparseable value leaves them empty.
  {}
end

# Extracts manga metadata from the page stored at "data/<mal_id>"
# (presumably a saved MyAnimeList manga page -- the links it looks for
# point at myanimelist.net) and serialises it as JSON.
#
# @param mal_id [#to_s, #to_i] id used both as the file name under data/
#   and as the :mal_id value in the result
# @return [String] a JSON object, or "" when the page says that no manga
#   was found
# @raise [SystemCallError] when the data file cannot be read
# @raise [NoMethodError] when the page lacks the title heading, the sidebar
#   cell or the cover image; these are treated as mandatory
def get_json(mal_id)
  # File.read closes the file; File.open(...).read left the handle open.
  html = File.read("data/#{mal_id}")
  return "" if html.include?("No manga found, check the manga id and try again.")

  noko = Nokogiri::HTML(repair_broken_html(html))
  meta = { mal_id: mal_id.to_i }

  # Get title and alternate title.
  title = noko.css("h1").children[1].text.strip
  english = mal_labelled_div(noko.css(".spaceit_pad"), "English:")
  aka = english && english.text.gsub("English: ", "").strip
  meta[:romaji_title] = title
  # An English title identical to the main title carries no information.
  meta[:english_title] = title != aka ? aka : nil

  # Synopsis
  synopsis_cell = noko.css("td").find { |cell| cell.css("h2").text == "Synopsis" }
  meta[:synopsis] = synopsis_cell && synopsis_cell.text.gsub("Synopsis", '').strip

  # Characters
  characters_cell = noko.css("td td").find do |cell|
    cell.css("h2").any? { |heading| heading.text.include?("Characters") }
  end
  meta[:characters] = mal_linked_entities(characters_cell, "character")

  sidebar = noko.css('table tr td.borderClass')[0]
  # Every labelled field below is looked up in the same set of divs.
  sidebar_divs = sidebar.css("div")

  # Cover image URL
  meta[:poster_image] = sidebar.css("img")[0].attribute('src').value
  # Genres
  meta[:genres] = mal_labelled_link_texts(sidebar_divs, "Genres:")
  # Volumes
  meta[:volumes] = mal_positive_count(mal_labelled_value(sidebar_divs, "Volumes:"))
  # Chapters
  meta[:chapters] = mal_positive_count(mal_labelled_value(sidebar_divs, "Chapters:"))
  # Status
  status = mal_labelled_value(sidebar_divs, "Status:")
  meta[:status] = status && status.strip
  # Authors
  meta[:authors] = mal_linked_entities(mal_labelled_div(sidebar_divs, "Authors:"), "people")
  # Publication dates
  meta[:dates] = mal_published_dates(sidebar_divs)
  # Serialization
  meta[:serialization] = mal_labelled_link_texts(sidebar_divs, "Serialization:")

  meta.to_json
end
# Command-line entry point: prints the parsed metadata for the manga id
# given as the first argument.
if __FILE__ == $0
  abort "usage: #{$0} MAL_ID" if ARGV.empty?

  json = get_json(ARGV[0].to_i)
  # get_json returns "" for a "No manga found" page, and JSON.parse raises
  # on an empty document, so report that case explicitly instead.
  abort "No manga found for id #{ARGV[0]}" if json.empty?

  p JSON.parse(json)
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment