Created
June 16, 2014 17:37
-
-
Save vikhyat/5569625983b95f99bcc3 to your computer and use it in GitHub Desktop.
Manga Information Importer
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| require 'nokogiri' | |
| require 'date' | |
| require 'json' | |
# Repairs two kinds of damage commonly found in scraped HTML so that it can be
# handed to an HTML parser.
#
# 1. Invalid byte sequences: when the string is not valid in its current
#    encoding, every byte is reinterpreted as a Unicode code point of the same
#    value (i.e. the bytes are treated as Latin-1) and re-encoded as UTF-8.
# 2. Unescaped "<": any "<" that cannot start a tag, closing tag, comment or
#    doctype (it is not followed by a letter, "!" or "/") is replaced with the
#    "&lt;" entity. This includes a "<" at the very end of the input.
#
# @param html [String] raw HTML text
# @return [String] the repaired HTML; the argument itself is not modified
def repair_broken_html(html)
  rep = html
  # Broken UTF-8 encoding.
  rep = rep.unpack('C*').pack('U*') unless rep.valid_encoding?
  # "<" is not escaped. The replacement must be the entity; substituting the
  # literal "<" back in would leave the markup unchanged.
  rep.gsub(/<(?![A-Za-z!\/])/, '&lt;')
end
# Returns the first <div> inside +sidebar+ whose text contains +label+, or nil.
def mal_sidebar_row(sidebar, label)
  sidebar.css('div').find { |div| div.text.include?(label) }
end

# Returns the text of the second child node of the labelled sidebar row, or nil.
def mal_sidebar_value(sidebar, label)
  row = mal_sidebar_row(sidebar, label)
  node = row && row.children[1]
  node && node.text
end

# Returns a numeric sidebar field (e.g. volumes) as an Integer, or nil when absent or zero.
def mal_sidebar_count(sidebar, label)
  count = mal_sidebar_value(sidebar, label).to_i
  count.zero? ? nil : count
end

# Returns the link texts of the labelled sidebar row (e.g. genres), or [] when the row is missing.
def mal_sidebar_links(sidebar, label)
  row = mal_sidebar_row(sidebar, label)
  row ? row.css('a').map(&:text) : []
end

# Collects unique [name, id] pairs from links to myanimelist.net/<kind>/<id> under +container+.
# Names written as "Family, Given" are flipped to "Given Family".
def mal_linked_names(container, kind)
  return [] unless container

  url = %r{https?://myanimelist\.net/#{kind}/(\d+)}
  pairs = container.css('a').map do |link|
    label = link.text
    id = link.attr(:href).to_s[url, 1]
    # Skip image-only links (blank text) and links that are not of this kind.
    next if label.strip.empty? || id.nil?

    [label.split(',').map(&:strip).reverse.join(' '), id.to_i]
  end
  pairs.compact.uniq
end

# Parses one date token: "?" gives nil, a bare year gives Jan 1 of that year, anything else goes to Date.parse.
def mal_parse_date(text)
  token = text.strip
  return nil if token == '?'
  return Date.new(token.to_i) if token =~ /\A\d+\z/

  Date.parse(token)
end

# Builds the publication-date hash from the "Published:" sidebar row.
# A range ("A to B") yields {from:, to:}; a single date yields {from:};
# a missing row or an unparseable date yields {}.
def mal_publication_dates(sidebar)
  row = mal_sidebar_row(sidebar, 'Published:')
  return {} unless row

  published = row.text.gsub('Published:', '')
  if published.include?(' to ')
    from, to = published.split(' to ').map { |part| mal_parse_date(part) }
    { from: from, to: to }
  else
    { from: mal_parse_date(published) }
  end
rescue ArgumentError, RangeError
  # Dates are best-effort: an unparseable value leaves the field empty
  # instead of aborting the whole import.
  {}
end

# Reads the saved manga page "data/<mal_id>" and extracts its metadata.
#
# The resulting JSON object has the keys mal_id, romaji_title, english_title,
# synopsis, characters, poster_image, genres, volumes, chapters, status,
# authors, dates and serialization. Optional fields that cannot be found are
# nil (scalars), [] (lists) or {} (dates).
#
# @param mal_id [Integer, String] id used both as the file name under data/
#   and (via to_i) as the mal_id field
# @return [String] the metadata serialized as JSON, or "" when the page
#   contains the "No manga found" message
# @raise [SystemCallError] when the data file cannot be read
# @raise [NoMethodError] when the title, sidebar or cover image is missing
#   from the page
def get_json(mal_id)
  html = File.read("data/#{mal_id}")
  return "" if html.include? "No manga found, check the manga id and try again."

  noko = Nokogiri::HTML(repair_broken_html(html))
  meta = { mal_id: mal_id.to_i }

  # Title and alternate title.
  # NOTE(review): the title is taken from the second child node of the <h1>;
  # presumably the first child is not part of the title. Verify against the page markup.
  title = noko.css('h1').children[1].text.strip
  english_row = noko.css('.spaceit_pad').find { |node| node.text.include?('English:') }
  aka = english_row && english_row.text.gsub('English: ', '').strip
  meta[:romaji_title] = title
  # Only report an English title when it differs from the main title.
  meta[:english_title] = title != aka ? aka : nil

  # Synopsis
  synopsis_cell = noko.css('td').find { |td| td.css('h2').text == 'Synopsis' }
  meta[:synopsis] = synopsis_cell && synopsis_cell.text.gsub('Synopsis', '').strip

  # Characters
  characters_cell = noko.css('td td').find do |td|
    td.css('h2').any? { |heading| heading.text.include?('Characters') }
  end
  meta[:characters] = mal_linked_names(characters_cell, 'character')

  sidebar = noko.css('table tr td.borderClass')[0]

  # Cover image URL
  meta[:poster_image] = sidebar.css('img')[0].attribute('src').value

  # Genres
  meta[:genres] = mal_sidebar_links(sidebar, 'Genres:')

  # Volumes and chapters (a value of 0 is reported as nil)
  meta[:volumes] = mal_sidebar_count(sidebar, 'Volumes:')
  meta[:chapters] = mal_sidebar_count(sidebar, 'Chapters:')

  # Status
  status = mal_sidebar_value(sidebar, 'Status:')
  meta[:status] = status && status.strip

  # Authors
  meta[:authors] = mal_linked_names(mal_sidebar_row(sidebar, 'Authors:'), 'people')

  # Publication dates
  meta[:dates] = mal_publication_dates(sidebar)

  # Serialization
  meta[:serialization] = mal_sidebar_links(sidebar, 'Serialization:')

  meta.to_json
end
# Command-line entry point: prints the parsed metadata for the manga id given
# as the first argument.
if __FILE__ == $0
  arg = ARGV[0]
  # Reject a missing or non-numeric id up front; to_i would silently turn it into 0.
  abort "usage: #{$0} MAL_ID" unless arg =~ /\A\d+\z/

  json = get_json(arg.to_i)
  # get_json returns "" when the page says the manga does not exist;
  # JSON.parse would raise on that, so report it explicitly.
  abort "No manga found for id #{arg}" if json.empty?

  p JSON.parse(json)
end
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment