asanakoy · October 12, 2018 19:11
diff --git a/Scrape with proxy b/Scrape with proxy


 # Proxy list scraper
 # https://github.com/abdallahelsokary/Proxy-Collector-/blob/master/Proxy_Collector.py
 import urllib.request
 import urllib.error
 import time

 def proxy_list():
    """Scrape ``ip:port`` proxy addresses from https://free-proxy-list.net/.

    Every ``<tr>`` element of the page is collected; the first and the last
    row are discarded (presumably a header and a footer row -- verify
    against the live page) and the text of the first two cells of each
    remaining row is joined as ``"ip:port"``.

    Returns:
        list[str]: the scraped ``"ip:port"`` strings. An empty list is
        returned when the download or the parsing fails, so callers always
        get a list back.
    """
    # Bound before the try block so that the final return cannot hit an
    # unbound local when the download or the parsing fails.
    Pr_list = []
    try:
        time.sleep(1)  # small pause before hitting the source site
        url = "https://free-proxy-list.net/" # the source
        req = urllib.request.Request(url,headers={'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.47 Safari/537.36'})
        # The context manager closes the HTTP response even if read() fails.
        with urllib.request.urlopen(req) as open_url:
            data = open_url.read()
        soup = BeautifulSoup(data,'html.parser')
        proxy_data = soup.find_all('tr')

        # Drop the first and last row by position; removing by value could
        # delete an earlier duplicate and raises on an empty list.
        for row in proxy_data[1:-1]:
            cells = row.find_all(['td', 'th'], recursive=False)
            if len(cells) < 2:
                # A row without two cells carries no address; skip it
                # instead of aborting the whole scrape.
                continue
            Pr_list.append("{0}:{1}".format(cells[0].get_text(), cells[1].get_text()))

        print('Find ', len(Pr_list), 'proxies')

    except Exception as err:
        # Best effort: report the problem and hand back whatever was
        # collected rather than crashing the caller. Unlike a bare except,
        # this lets KeyboardInterrupt / SystemExit through.
        print('Could not fetch proxy list:', err)

    return Pr_list

 def getPage(CheckPage, proxyDict, prxList, timeout=15):
    """Download ``CheckPage``, switching proxies until the server answers 200.

    Args:
        CheckPage: URL of the page to download.
        proxyDict: ``proxies`` mapping passed to ``requests.get`` for the
            first attempt, e.g. ``{'http': None, 'https': None}``.
        prxList: list of ``"ip:port"`` strings still to try. Proxies are
            taken from the end with ``pop()``, so a non-empty list passed in
            by the caller is modified in place. When it is empty it is
            refilled from ``proxy_list()`` (reversed, first 100 entries).
        timeout: seconds to wait for each request before treating the
            attempt as failed; without it a dead proxy can block forever.

    Returns:
        tuple: ``(soup, r, proxyDict, prxList)`` -- the parsed page, the
        ``requests`` response, the proxy mapping that worked and the
        remaining proxy list, so the caller can pass the last two back in on
        the next call.

    Raises:
        IndexError: when the list is empty and ``proxy_list()`` yields no
            proxies either (same exception type ``list.pop()`` raised here
            before).
    """
    header = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'}

    # A loop instead of recursion: a long run of dead proxies must not end
    # in RecursionError, and the page is parsed once rather than once per
    # nesting level.
    while True:
        if not prxList:
            prxList = list(reversed(proxy_list()))[0:100]

        try:
            r = requests.get(CheckPage, proxies=proxyDict, headers=header, timeout=timeout)
        except requests.RequestException:
            # Connection, proxy and timeout errors all mean "try the next one".
            r = None

        if r is not None and r.status_code == 200:
            break

        if not prxList:
            raise IndexError('getPage: no proxies available for ' + str(CheckPage))
        prx = prxList.pop()
        print('Changing proxy. New one:', prx, 'Proxies left:', len(prxList))
        proxyDict = {'http': prx, 'https': prx}

    soup = BeautifulSoup(r.text, 'html.parser')
    return soup, r, proxyDict, prxList
    
    
 # Download the page of every ID and retry, resetting to a direct connection,
 # until the main table has been located in the parsed HTML.
 for ID in tqdm(IDsAll):

    # 'http://' needs both slashes; with 'http:/' the URL has no host and
    # requests rejects it before any proxy is even tried.
    # NOTE(review): blabla.com looks like a placeholder host -- confirm.
    Page = 'http://blabla.com/page?id=' + ID

    MainTable = ''
    while (MainTable==''):
        try:
            soup, r, proxyDict, prxList = getPage(Page, proxyDict, prxList)
            MainTable = soup.body.findAll('table', {'class':'hdr14'})[0].tr.find('table', {'cellspacing':'0'})
        except Exception:
            # Any failure (download or unexpected page layout) falls back to
            # a direct connection and retries. Catching Exception rather than
            # using a bare except keeps Ctrl+C able to stop the retry loop.
            proxyDict = {'http': None, 'https': None}


	# Proxy list scraper
	# https://github.com/abdallahelsokary/Proxy-Collector-/blob/master/Proxy_Collector.py
	import urllib.request
	import urllib.error
	import time

	def proxy_list():
	    """Scrape ``ip:port`` proxy addresses from https://free-proxy-list.net/.

	    Every ``<tr>`` element of the page is collected; the first and the last
	    row are discarded (presumably a header and a footer row -- verify
	    against the live page) and the text of the first two cells of each
	    remaining row is joined as ``"ip:port"``.

	    Returns:
	        list[str]: the scraped ``"ip:port"`` strings. An empty list is
	        returned when the download or the parsing fails, so callers always
	        get a list back.
	    """
	    # Bound before the try block so that the final return cannot hit an
	    # unbound local when the download or the parsing fails.
	    Pr_list = []
	    try:
	        time.sleep(1)  # small pause before hitting the source site
	        url = "https://free-proxy-list.net/" # the source
	        req = urllib.request.Request(url,headers={'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.47 Safari/537.36'})
	        # The context manager closes the HTTP response even if read() fails.
	        with urllib.request.urlopen(req) as open_url:
	            data = open_url.read()
	        soup = BeautifulSoup(data,'html.parser')
	        proxy_data = soup.find_all('tr')

	        # Drop the first and last row by position; removing by value could
	        # delete an earlier duplicate and raises on an empty list.
	        for row in proxy_data[1:-1]:
	            cells = row.find_all(['td', 'th'], recursive=False)
	            if len(cells) < 2:
	                # A row without two cells carries no address; skip it
	                # instead of aborting the whole scrape.
	                continue
	            Pr_list.append("{0}:{1}".format(cells[0].get_text(), cells[1].get_text()))

	        print('Find ', len(Pr_list), 'proxies')

	    except Exception as err:
	        # Best effort: report the problem and hand back whatever was
	        # collected rather than crashing the caller. Unlike a bare except,
	        # this lets KeyboardInterrupt / SystemExit through.
	        print('Could not fetch proxy list:', err)

	    return Pr_list

	def getPage(CheckPage, proxyDict, prxList, timeout=15):
	    """Download ``CheckPage``, switching proxies until the server answers 200.

	    Args:
	        CheckPage: URL of the page to download.
	        proxyDict: ``proxies`` mapping passed to ``requests.get`` for the
	            first attempt, e.g. ``{'http': None, 'https': None}``.
	        prxList: list of ``"ip:port"`` strings still to try. Proxies are
	            taken from the end with ``pop()``, so a non-empty list passed in
	            by the caller is modified in place. When it is empty it is
	            refilled from ``proxy_list()`` (reversed, first 100 entries).
	        timeout: seconds to wait for each request before treating the
	            attempt as failed; without it a dead proxy can block forever.

	    Returns:
	        tuple: ``(soup, r, proxyDict, prxList)`` -- the parsed page, the
	        ``requests`` response, the proxy mapping that worked and the
	        remaining proxy list, so the caller can pass the last two back in on
	        the next call.

	    Raises:
	        IndexError: when the list is empty and ``proxy_list()`` yields no
	            proxies either (same exception type ``list.pop()`` raised here
	            before).
	    """
	    header = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'}

	    # A loop instead of recursion: a long run of dead proxies must not end
	    # in RecursionError, and the page is parsed once rather than once per
	    # nesting level.
	    while True:
	        if not prxList:
	            prxList = list(reversed(proxy_list()))[0:100]

	        try:
	            r = requests.get(CheckPage, proxies=proxyDict, headers=header, timeout=timeout)
	        except requests.RequestException:
	            # Connection, proxy and timeout errors all mean "try the next one".
	            r = None

	        if r is not None and r.status_code == 200:
	            break

	        if not prxList:
	            raise IndexError('getPage: no proxies available for ' + str(CheckPage))
	        prx = prxList.pop()
	        print('Changing proxy. New one:', prx, 'Proxies left:', len(prxList))
	        proxyDict = {'http': prx, 'https': prx}

	    soup = BeautifulSoup(r.text, 'html.parser')
	    return soup, r, proxyDict, prxList


	# Download the page of every ID and retry, resetting to a direct connection,
	# until the main table has been located in the parsed HTML.
	for ID in tqdm(IDsAll):

	    # 'http://' needs both slashes; with 'http:/' the URL has no host and
	    # requests rejects it before any proxy is even tried.
	    # NOTE(review): blabla.com looks like a placeholder host -- confirm.
	    Page = 'http://blabla.com/page?id=' + ID

	    MainTable = ''
	    while (MainTable==''):
	        try:
	            soup, r, proxyDict, prxList = getPage(Page, proxyDict, prxList)
	            MainTable = soup.body.findAll('table', {'class':'hdr14'})[0].tr.find('table', {'cellspacing':'0'})
	        except Exception:
	            # Any failure (download or unexpected page layout) falls back to
	            # a direct connection and retries. Catching Exception rather than
	            # using a bare except keeps Ctrl+C able to stop the retry loop.
	            proxyDict = {'http': None, 'https': None}