#!/usr/bin/python
'''
Grab threads from a 2ch.hk board whose opening post matches GRAB_RULE
(here: "General" threads on /mlp/) and save the thread pages and their
full-size images under SAVEDIR.

BeautifulSoup 3 is required:
http://www.crummy.com/software/BeautifulSoup/download/3.x/BeautifulSoup-3.2.1.tar.gz
'''
from BeautifulSoup import BeautifulSoup as soup
from urllib import urlopen as uopen, urlretrieve
import re, os, errno, subprocess as sp, threading
from time import sleep
from urlparse import urljoin
from contextlib import closing

ROOT_PAGE = 'http://2ch.hk/'
GRAB_RULE = re.compile('General')
BOARD = 'mlp'
SAVEDIR = '/media/storage/dev/mlp_general/'

def mkdir_p(path):
    '''Create a directory; ignore the error if it already exists.'''
    try:
        os.makedirs(path)
    except OSError as exc:
        if exc.errno == errno.EEXIST and os.path.isdir(path):
            pass
        else:
            raise

def get_threads():
    '''Return the numeric ids of threads on the board page whose opening post matches GRAB_RULE.'''
    s = soup(uopen(urljoin(ROOT_PAGE, BOARD)))
    def finder(tag):
        return tag(True, 'oppost', text=GRAB_RULE)
    return [int(i['id'].split('_')[-1]) for i in s(finder, 'thread')]

def get_links(thread):
    '''Collect the full-size image links from a thread page, plus the page itself.'''
    url = urljoin(ROOT_PAGE, '/'.join([BOARD, 'res', str(thread) + '.html']))
    page = uopen(url)
    if page.code != 200:
        print "ERROR", page.code, "in thread", url
    s = soup(page)
    links = [a['href'] for post in s('div', 'oppost') + s('table', 'post')
             for a in post('a', {'name': 'expandfunc'})]
    links.append('/'.join([BOARD, 'res', str(thread) + '.html']))
    return links

def dwnl(end_url, end_fname):
    '''Download one file to end_fname, skipping URLs the server answered with an HTML error page.'''
    print "Downloading...", end_url, end_fname
    with closing(uopen(end_url)) as rem_file:
        data = rem_file.read()
        if '<html' in data and not end_url.endswith('html'):
            print "FAILED!", end_url
            return
    # download to a .part file first, then rename, so interrupted files are not kept
    urlretrieve(end_url, end_fname + '.part')
    os.rename(end_fname + '.part', end_fname)
    print "DONE", end_fname, "LEFT:", threading.active_count() - 2

def download(thread, links):
    '''Download all new links of a thread into SAVEDIR/<thread>/, at most 30 downloads at a time.'''
    thread_fs_root = os.path.join(SAVEDIR, str(thread))
    mkdir_p(thread_fs_root)
    new_links = []
    for l in links:
        fname = l.split('/')[-1]
        end_fname = os.path.join(thread_fs_root, fname)
        # always re-fetch the thread page itself, but skip images already on disk
        if not os.path.exists(end_fname) or end_fname.endswith('html'):
            new_links.append(l)
    print "Thread:", thread, "({}/{})".format(len(new_links), len(links))
    for l in new_links:
        end_url = urljoin(ROOT_PAGE, l)
        end_fname = os.path.join(thread_fs_root, l.split('/')[-1])
        t = threading.Thread(target=dwnl, args=(end_url, end_fname))
        t.start()
        while threading.active_count() > 30:
            sleep(0.5)

if __name__ == '__main__':
    threads = get_threads()
    mkdir_p(SAVEDIR)
    print "Working on", threads
    for t in threads:
        l = get_links(t)
        download(t, l)
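A quick usage sketch, not part of the original gist: the script name grab_generals.py below is an assumption, and BOARD, GRAB_RULE and SAVEDIR are meant to be edited at the top of the file before running. The script targets Python 2 and BeautifulSoup 3 (either from the tarball linked in the docstring or from PyPI).

# hypothetical invocation; script filename is an assumption
pip install BeautifulSoup==3.2.1    # BeautifulSoup 3.x for Python 2
python2 grab_generals.py            # saves matching threads under SAVEDIR/<thread_id>/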