NIPS paper downloader
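A short script that checks connectivity through a local Tor SOCKS proxy, then crawls the papers.nips.cc proceedings index and mirrors the linked paper PDFs with wget.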
import os

import requests
from bs4 import BeautifulSoup

# Tor SOCKS proxy: requires a local Tor client listening on port 9050 and
# requests installed with SOCKS support (`pip install requests[socks]`).
# Use 'socks5h://' instead to resolve DNS through Tor as well.
local_proxy = 'socks5://127.0.0.1:9050'
socks_proxy = {
    'http': local_proxy,
    'https': local_proxy,
}

# Compare the direct IP with the IP seen through Tor.
current_ip = requests.get('http://icanhazip.com/')
print('original IP: ', current_ip.text.strip())

current_ip = requests.get(
    url='http://icanhazip.com/',
    proxies=socks_proxy,
)
print('Tor IP: ', current_ip.text.strip())

# Start crawling the proceedings index.
# Note: the crawl below goes direct, not through the Tor proxy.
base_url = 'https://papers.nips.cc/'
rq = requests.get(base_url)
sp = BeautifulSoup(rq.text, 'html5lib')

# Collect the per-year proceedings links from the index page.
year_links = []
for ul in sp.find_all('ul'):
    for li in ul.find_all('li'):
        a = li.find('a')
        if a is None:
            continue
        link = a.get('href')
        if link != '/':
            year_links.append(link)

# Visit each year's page and mirror every linked PDF with the external
# `wget` command (must be available on PATH).
for year in year_links:
    year_url = base_url + year.replace('/', '', 1)
    print(year_url)
    rq = requests.get(year_url)
    sp = BeautifulSoup(rq.text, 'html5lib')
    for ul in sp.find_all('ul'):
        for li in ul.find_all('li'):
            a = li.find('a')
            if a is None:
                continue
            link = a.get('href')
            if link != '/':
                paper_url = base_url + link.replace('/', '', 1)
                os.system('wget -r -nc -A pdf ' + paper_url)
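Rough usage notes, assuming a typical setup: the proxy check expects a Tor client listening on 127.0.0.1:9050 and requests installed with SOCKS support (pip install requests[socks]); the crawler additionally needs beautifulsoup4, html5lib, and the wget command-line tool on PATH. With the flags used here (-r -nc -A pdf), wget mirrors each paper page into a host-named directory under the current working directory, keeping only the PDF files and skipping any it has already downloaded.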