Skip to content

Instantly share code, notes, and snippets.

@mopemope
Created February 17, 2009 02:26
Show Gist options
  • Save mopemope/65539 to your computer and use it in GitHub Desktop.
Save mopemope/65539 to your computer and use it in GitHub Desktop.
from twisted.web import client
from twisted.internet import reactor, defer
from lxml import etree
from StringIO import StringIO
import re
import os
# Directory that downloaded files are written into.  The "~" has to be
# expanded explicitly: os.path.join() and file-opening calls treat it as an
# ordinary directory name, which would create a literal "./~" tree.
save_dir = os.path.expanduser("~/.video/xhamster/")

# Site root; hrefs scraped from result pages are appended to it.
base_url = "http://www.xhamster.com"

# Capture the value passed to so.addVariable('file', '...') in a movie page.
# Raw strings keep the regex escapes (\( \s \w ...) out of Python's own
# string-escape processing; the compiled patterns are unchanged.
download_re = re.compile(r"so.addVariable\('file',\s*'([\w\d\.:/_\-\?=]*)'\);", re.M)
# Capture the value passed to so.addVariable('srv', '...') in a movie page.
download_re2 = re.compile(r"so.addVariable\('srv',\s*'([\w\d.:/_]*)'\);", re.M)

# Pending downloads as (url, save_path, partial) tuples.
q = []
# Every Deferred created by download(); used to detect overall completion.
dlist = []
# True until reactor.stop() has been requested, so it is only called once.
is_run = True
def error(e, url):
    """Report a failed fetch.

    Prints a "<url> is failure." line followed by the failure object itself.

    Args:
        e: The failure/exception describing what went wrong.
        url: The URL whose retrieval failed.
    """
    for line in ("%s is failure." % url, e):
        print(line)
# DeferredList that waits for every download Deferred once the queue has
# drained.  Defined at module level so that finish() and dl_error() can test
# it without raising NameError.
dl = None


def _stop_reactor(res):
    """Stop the reactor once; later calls are no-ops thanks to ``is_run``."""
    global is_run
    if is_run:
        reactor.stop()
        is_run = False


def _download_next_or_finish():
    """Start the next queued download, or arrange shutdown if none is left.

    When the queue is empty a single DeferredList over all download
    Deferreds is created; the reactor is stopped when that list fires.
    """
    global dl
    if q:
        download(q.pop())
        return
    if dl is None:
        dl = defer.DeferredList(dlist)
        dl.addCallback(_stop_reactor)


def dl_error(e, url, data):
    """Errback for a download: report the failure and keep the queue moving.

    Failures that wrap a ValueError are not reported (the original retry
    logic for that case is disabled); every other failure is printed.  In
    both cases the next queued download is started, or shutdown is scheduled
    when the queue is empty, so the reactor cannot be left running with
    nothing to do.

    Args:
        e: The failure object; must provide ``check()``.
        url: URL of the download that failed.
        data: The ``(url, save_path, partial)`` tuple given to download().
    """
    if not e.check(ValueError):
        print("%s is failure. retain %d" % (url, len(q)))
        print(e)
    _download_next_or_finish()


def finish(result, url):
    """Callback for a completed download: log it and keep the queue moving.

    Args:
        result: Value the download Deferred fired with (unused).
        url: URL of the download that finished.
    """
    print("finish %s retain %d" % (url, len(q)))
    _download_next_or_finish()
def download(data):
    """Begin downloading one queued item and wire up its completion handlers.

    Args:
        data: A ``(url, save_path, partial)`` tuple; ``partial`` is passed
            to ``client.downloadPage`` as ``supportPartial``.
    """
    target_url, destination, resume = data
    deferred = client.downloadPage(target_url, destination, supportPartial=resume)
    # Remember the Deferred so overall completion can be detected later.
    dlist.append(deferred)
    # The errback is attached after the callback, so it also receives
    # exceptions raised inside finish().
    deferred.addCallback(finish, target_url)
    deferred.addErrback(dl_error, target_url, data)
def contents(data, href):
    """Extract the video location from a movie page and queue its download.

    The page is scanned for the ``file`` and ``srv`` player variables.  If
    the ``file`` variable is absent, or the file name contains one of the
    excluded tags, nothing is queued.  A page that has ``file`` but no
    ``srv`` is reported and skipped instead of raising AttributeError.

    Args:
        data: Source text of the movie page.
        href: URL of the movie page; used only in the skip message.
    """
    file_match = download_re.search(data)
    if not file_match:
        return

    fl = file_match.group(1)
    name = fl.split(".")[0]
    excluded_tags = ('girl_alone', 'girls_kiss', '_massage')
    if any(tag in name for tag in excluded_tags):
        return

    srv_match = download_re2.search(data)
    if srv_match is None:
        # Without the server id the download URL cannot be built.
        print("%s is failure. no 'srv' variable found" % href)
        return

    filename = name + "_xhamster" + ".flv"
    save_path = os.path.join(save_dir, filename)
    url = "http://dl%s.xhamster.com/flv2/%s" % (srv_match.group(1), fl)
    # The trailing 1 is the supportPartial flag handed to download().
    q.append((url, save_path, 1))
# hrefs of "Next" result pages that still have to be visited.
pages = set()


def get_list(data, page_link):
    """Parse one search-result page and fetch every movie page it links to.

    "Next" pagination links are collected into ``pages``.  Each movie link
    is fetched and handed to contents().  Once all movie pages of this
    result page have been processed, either the next result page is
    requested or, when no pages remain, the downloads are started.

    Args:
        data: HTML source of the search-result page.
        page_link: Unused; kept for interface compatibility.
    """
    parser = etree.HTMLParser()
    root = etree.parse(StringIO(data), parser)

    pending = []
    for anchor in root.findall(".//a[@href]"):
        href = anchor.attrib['href']
        if "/search.php?page=" in href and anchor.text == "Next":
            pages.add(href)
        elif anchor.text and href.find("movies") > 0:
            print(href)
            movie_url = base_url + href
            d = client.getPage(movie_url)
            pending.append(d)
            d.addCallback(contents, movie_url).addErrback(error, movie_url)

    def _on_movies_done(result):
        """Continue with the next result page or start the downloads."""
        global is_run
        if pages:
            page = pages.pop()
            print("-" * 80)
            print(base_url + page)
            get_search_result(base_url + page, True)
            return
        if not q:
            # Nothing was queued, so no download callback will ever stop
            # the reactor; stop it here instead of hanging forever.
            if is_run:
                reactor.stop()
                is_run = False
            return
        # Reverse so that pop() hands out items in the order they were found.
        q.reverse()
        # Start at most five downloads, and never more than are queued;
        # each finished download pulls the next item itself.
        for _ in range(min(5, len(q))):
            download(q.pop())

    all_movies = defer.DeferredList(pending)
    all_movies.addCallback(_on_movies_done)
def search(word):
    """Start crawling the search results for *word* from the first page.

    The word is URL-encoded before being placed in the query string, so
    terms containing spaces, ``&`` or other reserved characters produce a
    valid request.

    Args:
        word: The search term.
    """
    try:
        from urllib import quote_plus  # Python 2
    except ImportError:
        from urllib.parse import quote_plus  # Python 3

    url = base_url + "/search.php?page=1&q=%s&s=da" % quote_plus(word)
    get_search_result(url, True)
def get_search_result(url, page_link):
    """Fetch a search-result page and pass its body on to get_list().

    Args:
        url: Address of the search-result page.
        page_link: Forwarded unchanged to get_list().
    """
    fetch = client.getPage(url)
    fetch.addCallback(get_list, page_link)
    # Attached after the callback, so errors raised in get_list() are
    # reported through error() as well.
    fetch.addErrback(error, url)
# Script entry point: queue the initial search request for the hard-coded
# term, then start the Twisted reactor.  The request is only issued once the
# reactor is running; the callbacks above call reactor.stop() when done.
search('japanese')
reactor.run()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment