Last active
September 30, 2015 12:27
-
-
Save venj/1789006 to your computer and use it in GitHub Desktop.
!!!!!DEPRECATED!!!!! Torrent spider. No longer maintained — the site's page layout changes constantly, and tracking those changes became too tedious to keep up with.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env ruby
# Download torrents from SOME WEBSITE.
#
# Dependencies: open-uri / uri / net-http (stdlib), hpricot + colorize (gems).
# NOTE: the original required 'uri' twice; the duplicate is removed.
require "rubygems"
require "open-uri"
require "hpricot"
require "uri"
require "colorize"
require "net/http"

# --- Global configuration ---
website = "http://javjunkies.com"  # Site root.
download_base = nil                # Torrent download prefix, scraped from each daily page.
base_uri = "#{website}/main/page/" # Paginated post-list URL prefix.
mainpage_index = 1                 # Current post-list page (1-based).
IMG_PROXY = nil                    # Optional proxy for image downloads (nil = direct).
PG_PROXY = nil                     # Optional proxy for page fetches (nil = direct).
MAX_RETRY = 5                      # Retry budget for flaky fetches.
# --- Exception classes ---
# Each one maps a known failure message or condition reported by the
# site to a distinct error type, so the main loop can react to it.
class DownloadLimitError < StandardError
end

class ServerOverLoadError < StandardError
end

class TorrentServerError < StandardError
end

class TorrentNotFoundException < StandardError
end

class LinkRedirectException < StandardError
end

class SiteMaintenanceException < StandardError
end
# Expands short URLs through a personal URL-expander web service.
class Expander
  # Returns the expanded (final) URL for +url+ as a stripped string.
  #
  # Performs one HTTP GET via open-uri; raises the usual OpenURI /
  # Net::HTTP errors on failure (callers retry with a bounded budget).
  def self.expand(url)
    # Block form closes the underlying IO handle; the original
    # `open(url).read.strip` leaked it.
    open("http://venj.me/url.php?shorturl=#{url}") { |io| io.read }.strip
  end
end
# True when +page+ (a URL string) has already been recorded as finished
# in the .finished bookkeeping file; false when it is unseen or when the
# file does not exist yet. (The original raised Errno::ENOENT on a fresh
# run with no .finished file, and used a redundant `? true : false`.)
def skip_page?(page)
  File.exist?(".finished") && File.readlines(".finished").include?(page + "\n")
end
# Appends +page+ (followed by a newline) to the .finished bookkeeping
# file, creating the file on first use.
def add_page(page)
  File.open(".finished", "a+") do |file|
    file.puts(page)
  end
end
# --- Main crawl loop ---
# Walks the paginated post list day by day until the site reports
# maintenance. For each day: walks its pagination, downloads every
# torrent plus its cover image, and records finished pages in .finished
# so an interrupted run can resume where it left off.
loop do
  # Fetch the post-list page (daily list). Page 1 lives at /main/ rather
  # than /main/page/1/. (The original fetched both URLs for page 1 and
  # discarded one response, wasting a request.)
  print "\nOpening posts page: #{mainpage_index}..."
  html = if mainpage_index == 1
    open("#{website}/main/", :proxy => PG_PROXY).read
  else
    open("#{base_uri}#{mainpage_index}/", :proxy => PG_PROXY).read
  end
  begin
    raise SiteMaintenanceException.new if html.index("Site Maintenance")
  rescue SiteMaintenanceException
    puts "Maintenance.\n".yellow
    break
  end
  puts
  maindoc = Hpricot(html)
  # Each real post entry carries the daily-list URL in its onclick.
  maindoc.search("//div[@class='post']/div[@class='entry']/a").each do |entry|
    onclick = entry.attributes["onclick"]
    # Fake daily lists open a popup instead of navigating; skip them.
    # (Dot escaped: the original /window.open/ matched any character.)
    next if onclick =~ /window\.open/
    page_uri = onclick.match(/location.href='([^']+)'/)[1]
    begin
      # Grab the daily page and the torrent download prefix embedded in it.
      print "Opening #{URI::parse(page_uri).path}..."
      (puts "skip day!".green; next) if skip_page?(page_uri)
      puts
      page_content = open(page_uri, :proxy => PG_PROXY).read
      dlbaseregex = /window\.open\("(http:\/\/javjunkies\.com\/[^"]+)"/
      download_base = page_content.match(dlbaseregex)[1]
      page_doc = Hpricot(page_content)
    rescue StandardError => ex # was `rescue Exception`, which also swallowed SystemExit
      puts "Finished or Unknown error...".red
      puts ex.message
      exit 1
    end
    # Page count comes from the last pagination link; pages without
    # pagination have none, so default to a single page.
    begin
      lastpage_index = page_doc.search("//div[@class='post']/div[@class='entry']/p/font/a").last.inner_html.to_i
    rescue StandardError
      lastpage_index = 1
    end
    filename_base = File.basename(File.dirname(page_uri)) + "_" + File.basename(page_uri)
    print "Processing #{filename_base}, "; puts "#{lastpage_index} pages...".green
    # Fetch the daily list, pagination page by pagination page.
    (1..lastpage_index).each do |i|
      refererLink = "#{page_uri}#{i}/"
      print " parse page #{i}: torrent "
      (puts "...skip page!".green; next) if skip_page?(refererLink)
      pagination_doc = if i == 1
        page_doc # Already fetched above.
      else
        Hpricot(open(refererLink, :proxy => PG_PROXY).read)
      end
      # Fetch torrents and images.
      tr_count = 1
      pagination_doc.search("//div[@class='post']/div[@class='entry']/div[@class='image']").each do |e|
        next if e.search("div").size < 1 # Skip fake torrent post, 2013-03-13.
        print "#{tr_count}"; tr_count += 1
        # Try to skip unwanted container formats (disc images).
        skip_torrent = false
        e.search("span") do |span|
          html_text = span.inner_html
          begin
            # Alternation, not a character class: the original
            # /\.[iso|mds]/ matched any single one of the chars "iso|mds"
            # after a dot, wrongly skipping e.g. ".mp4". Allow wmv torrents.
            skip_torrent = true if html_text =~ /\.(iso|mds)/
          rescue StandardError # was `=> e`, which clobbered the element above
            skip_torrent = true
          end
        end
        (print "f".yellow; print ", "; next) if skip_torrent
        # This is where the real download begins.
        e.search("a") do |a|
          # Build the torrent link from the scraped download prefix.
          link_regx = /JavJ\('([^']+)'\)/
          (print "e".magenta; print ", "; next) if a.attributes["onclick"] !~ link_regx
          tr_link = download_base + a.attributes["onclick"].match(link_regx)[1]
          # Build the (shortened) cover-image link from the inline style.
          image_link_regx = /url\('([^']+)'\)/
          image_link_t = e.search("//div")[0].attributes["style"].match(image_link_regx)[1]
          # Expand the image short link, with a bounded retry budget.
          image_link = nil
          retry_counts = 0
          while retry_counts < MAX_RETRY
            begin
              image_link = Expander.expand(image_link_t)
            rescue StandardError # was `=> e`, which clobbered the element above
              retry_counts += 1
            end
            break if image_link
          end
          # All expansion attempts failed: skip this entry instead of
          # crashing on a nil image_link below (the original raised
          # NoMethodError here).
          (print "x".red; print ", "; next) if image_link.nil?
          # Build image and torrent file names for saving to disk.
          image_filename = filename_base + "." + image_link.split("/").last
          # Name the torrent after the link when the link already ends in
          # ".torrent"; otherwise derive the name from the image. (The
          # original tested a just-emptied string here, so its first
          # branch was dead code.)
          if tr_link.split(".").last == "torrent"
            tr_filename = filename_base + "_" + tr_link.split("/").last
          else
            tr_filename = filename_base + "_" + image_link.split("/").last.split(".")[0] + ".torrent"
          end
          # Skip the pair entirely if both files already exist.
          (print "s".blue; print ", "; next) if (File.exist?(tr_filename) && File.exist?(image_filename))
          ftext = ""
          begin
            print ", "
            # Download the torrent unless it is already on disk.
            unless File.exist? tr_filename
              open(tr_link, "referer" => refererLink, :proxy => PG_PROXY) do |inf|
                ftext = inf.read
                # The server reports failures as HTML bodies; map each
                # known message to a typed exception before writing.
                raise DownloadLimitError.new if ftext.index "You reached your download limit."
                raise ServerOverLoadError if ftext.index "Server under heavy load, please try again later!"
                raise TorrentServerError.new if ftext.index "Internal Server Error"
                raise TorrentNotFoundException.new if ftext.index "Error 404"
                raise LinkRedirectException.new if ftext[0..20].index("html")
                open(tr_filename, "w+") do |ouf|
                  ouf.write(ftext)
                end
              end
            end
          rescue DownloadLimitError
            print "gocha!\n".red
            puts "Daily download limit reached. Please retry tomorrow.\n".red
            exit
          rescue ServerOverLoadError
            print "damn!\n".red
            puts "Server is under heavy load. Really?!\n".red
            exit
          rescue TorrentServerError
            print "ouch!\n".red
            puts "Torrent server 500 internal error.\n".red
            exit
          rescue TorrentNotFoundException
            # Missing torrent: erase the count marker and flag with "?".
            print "\b\b"
            print "?".yellow
            print ", "
          rescue LinkRedirectException
            print "fuck!\n".red
            puts "Script broken!!!".red
            exit
          rescue StandardError => err # was `rescue Exception`
            puts "Error downloading torrent.\n".red
            puts err.message
            exit
          end
          # Download the cover image for the torrent.
          next if File.exist? image_filename # Skip it if the image exists.
          retry_counts = 0
          while retry_counts < MAX_RETRY
            begin
              open(image_link, :proxy => IMG_PROXY) do |inf|
                open(image_filename, "w+") do |ouf|
                  ouf.write(inf.read)
                end
              end
            rescue StandardError => err
              retry_counts += 1
              puts "Error downloading image: #{image_filename} (#{err.message}), retry.".red
              puts "#{image_link} , retry...".red
            else
              break # Success if we reach here.
            end
          end
        end
      end
      add_page(refererLink)
      puts "done.".green # One daily-list pagination finished.
    end
    add_page(page_uri)
  end
  mainpage_index += 1 # One day finished.
end
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment