multithreading proxy scraper: reads source URLs from url.txt and appends every scraped ip:port pair to proxy.txt
# -*- coding: utf-8 -*-
# Multithreaded proxy scraper (Python 2): fetches every URL listed in
# url.txt concurrently, extracts ip:port pairs, and appends them to
# proxy.txt next to the script.
from multiprocessing.dummy import Pool as ThreadPool  # thread-backed Pool
import os
import re
from urllib2 import urlopen

PATH = os.path.dirname(os.path.realpath(__file__))

# One source URL per line.
with open(os.path.join(PATH, 'url.txt'), 'r') as f:
    urls = f.readlines()

def parseproxy(url):
    """Fetch one URL and return the ip:port strings found in the page."""
    try:
        source = urlopen(url).read()
    except Exception:
        return []  # unreachable source: contributes nothing
    # Ports only go up to 65535, so five digits at most.
    proxies = re.findall(r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}:\d{1,5}', source)
    print '[PARSED] - ', url.strip(), '[' + str(len(proxies)) + ']'
    return proxies

# Fan the downloads out over 100 worker threads.
pool = ThreadPool(100)
results = pool.map(parseproxy, urls)
pool.close()
pool.join()

# Write from the main thread so concurrent appends cannot interleave.
with open(os.path.join(PATH, 'proxy.txt'), 'a') as f:
    for proxies in results:
        for proxy in proxies:
            f.write(proxy + '\n')
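
For modern interpreters, here is a minimal Python 3 sketch of the same fan-out, using only stdlib equivalents: urllib.request.urlopen in place of urllib2.urlopen, and concurrent.futures.ThreadPoolExecutor in place of the multiprocessing.dummy pool. The 20-worker count and 10-second timeout are arbitrary assumptions, not values from the original gist.

# Python 3 sketch of the same technique; worker count and timeout are
# assumptions, not taken from the original script.
import re
from concurrent.futures import ThreadPoolExecutor
from urllib.request import urlopen

PROXY_RE = re.compile(r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}:\d{1,5}')

def parse_proxy(url):
    # Fetch one page; treat any network error as "no proxies found".
    try:
        source = urlopen(url, timeout=10).read().decode('utf-8', 'replace')
    except Exception:
        return []
    return PROXY_RE.findall(source)

with open('url.txt') as f:
    urls = [line.strip() for line in f if line.strip()]

# Executor.map fans the fetches out over the worker threads and yields
# results in input order; the with-block waits for all of them to finish.
with ThreadPoolExecutor(max_workers=20) as pool:
    results = list(pool.map(parse_proxy, urls))

with open('proxy.txt', 'a') as f:
    for proxies in results:
        for proxy in proxies:
            f.write(proxy + '\n')

Either version expects a url.txt next to the script with one proxy-list URL per line.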