Skip to content

Instantly share code, notes, and snippets.

@mopemope
Created February 17, 2009 02:26
Show Gist options
  • Save mopemope/65540 to your computer and use it in GitHub Desktop.
from twisted.web import client
from twisted.internet import reactor, defer
from lxml import etree
from StringIO import StringIO
import re
import os
# --- configuration ------------------------------------------------------
# Destination directory for downloaded files.  expanduser() is required:
# twisted's downloadPage writes to the literal path it is given, so a
# leading "~" would otherwise create a directory actually named "~".
save_dir = os.path.expanduser("~/.video/xnxx/")

# Site root used to turn relative pagination hrefs into absolute URLs.
base_url = "http://video.xnxx.com"

# Extracts the FLV url from the player bootstrap javascript, e.g.
#   so.addVariable('flv_url', 'http://host/path/video.flv');
# Raw string so the backslashes are unambiguous; the dot in
# "so\.addVariable" is escaped so it matches a literal dot only.
download_re = re.compile(r"so\.addVariable\('flv_url',\s*'([\w\d.:/_]*)'\);", re.M)

q = []           # pending (url, save_path) pairs, consumed LIFO by download()
dlist = []       # all in-flight download Deferreds, gathered before shutdown
is_run = True    # guards against calling reactor.stop() twice
def error(e, url):
    """Errback for page fetches: report the failing URL and the failure."""
    msg = "%s is failure." % url
    print(msg)
    print(e)
def dl_error(e, url):
    # Errback for one file download: log it, then keep the pipeline busy.
    print("%s is failure. retain %d" % (url, len(q)))
    print(e)
    if q:
        # More work queued: reuse this download slot for the next item.
        download(q.pop())
    else:
        # Queue drained: wait for every outstanding download Deferred,
        # then stop the reactor exactly once (is_run guards double-stop).
        global dl
        dl = None
        def stop(res):
            global is_run
            if is_run:
                reactor.stop()
            is_run = False
        # NOTE(review): dl was just set to None above, so this condition
        # is always true here; the guard looks vestigial.
        if not dl:
            dl = defer.DeferredList(dlist)
            dl.addCallback(stop)
def finish(result, url):
    # Callback for one completed file download; mirrors dl_error's flow.
    print("finish %s retain %d" % (url, len(q)))
    if q:
        # More work queued: reuse this download slot for the next item.
        download(q.pop())
    else:
        # Queue drained: gather all outstanding Deferreds and stop the
        # reactor once they settle (is_run guards double-stop).
        global dl
        dl = None
        def stop(res):
            global is_run
            if is_run:
                reactor.stop()
            is_run = False
        # NOTE(review): dl was just set to None above, so this condition
        # is always true here; the guard looks vestigial.
        if not dl:
            dl = defer.DeferredList(dlist)
            dl.addCallback(stop)
def download(data):
    """Start fetching one (url, save_path) pair and register its callbacks.

    The Deferred is also recorded in the module-level ``dlist`` so the
    shutdown logic in finish/dl_error can wait on it.
    """
    url, save_path = data
    deferred = client.downloadPage(url, save_path, supportPartial=1)
    dlist.append(deferred)
    # addCallback returns the same Deferred, so separate statements are
    # equivalent to the original chained call.
    deferred.addCallback(finish, url)
    deferred.addErrback(dl_error, url)
def contents(data, href):
    """Parse one video page body and queue its FLV url for download.

    ``href`` supplies the basename used to build the local filename;
    pages without a flv_url player variable are silently skipped.
    """
    name = href[href.rfind('/') + 1:]
    match = download_re.search(data)
    if match is None:
        return
    url = match.group(1)
    ext = url[url.rfind("."):]
    save_path = os.path.join(save_dir, name + "_xnxx" + ext)
    q.append((url, save_path))
def get_list(data, page_link):
    """Scan one search-result page of HTML.

    Queues a getPage for every video link found, remembers the "Next"
    pagination link, and once all per-video fetches settle either follows
    the next page or starts the first batch of downloads.

    ``page_link`` is accepted for signature compatibility with the
    callback chain but is not used here.
    """
    parser = etree.HTMLParser()
    root = etree.parse(StringIO(data), parser)
    links = root.findall(".//a[@href]")
    defList = []
    pages = set()
    for e in links:
        href = e.attrib['href']
        if href.startswith("/tags/") and e.text == "Next":
            # Relative pagination link; resolved against base_url below.
            pages.add(href)
        elif "video.xnxx.com/video" in href:
            print(href)
            d = client.getPage(href)
            defList.append(d)
            d.addCallback(contents, href).addErrback(error, href)

    # Renamed from `next` to avoid shadowing the builtin.
    def proceed(result):
        # Runs after every per-video fetch on this page has settled.
        if pages:
            page = pages.pop()
            url = base_url + page
            print("-" * 80)
            print(url)
            get_search_result(url, False)
        else:
            # No more pages: start up to 5 parallel downloads.  The
            # original popped exactly 5, raising IndexError when fewer
            # than 5 videos had been queued.
            q.reverse()
            for _ in xrange(min(5, len(q))):
                download(q.pop())

    dl = defer.DeferredList(defList)
    dl.addCallback(proceed)
def search(word):
    """Kick off the crawl with a tag search for ``word``."""
    get_search_result(base_url + "/tags/%s" % word, True)
def get_search_result(url, page_link):
    """Fetch a search-result page and hand its body to get_list."""
    d = client.getPage(url)
    d.addCallback(get_list, page_link)
    d.addErrback(error, url)
if __name__ == "__main__":
    # Seed the crawl, then run the reactor until the download pipeline
    # stops it (see finish/dl_error).  The guard keeps importing this
    # module from starting a crawl as a side effect.
    search('japanese')
    reactor.run()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment