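"""Scrape company profiles from Clutch.co's mobile-application-developers
directory (filtered to US companies), rotating through free proxies with
retry/backoff, and accumulate the results in clutch_data.json.

Gist by @sumitsk20, created January 20, 2020.
"""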
import traceback
from itertools import cycle
import bs4 as bs  # pip install bs4
import re
import requests  # pip install requests
import json
from random import randint
from time import sleep
from lxml.html import fromstring  # pip install lxml
from retrying import retry  # pip install retrying
headers = {
    'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.117 Safari/537.36',
    # Note: the standard header name is "Referer" (requests sends keys verbatim).
    'Referer': 'https://clutch.co/directory/mobile-application-developers',
    # 'Accept': '*/*',
    # 'Accept-Encoding': 'gzip, deflate, br',
    # 'Accept-Language': 'en-IN,en-GB;q=0.9,en-US;q=0.8,en;q=0.7,hi;q=0.6',
    # 'Pragma': 'no-cache',
}
count = 0
company_data = []

# Resume from previously scraped data; start fresh if the file does not exist yet.
try:
    with open('clutch_data.json') as json_file:
        company_data = json.load(json_file)
except FileNotFoundError:
    pass

urls = [
    "https://clutch.co/directory/mobile-application-developers",
]
def get_proxies():
    """Scrape up to ten HTTPS-capable proxies ("ip:port") from free-proxy-list.net."""
    sleep(randint(5, 10))  # be polite to the proxy-list site
    url = 'https://free-proxy-list.net/'
    response = requests.get(url)
    parser = fromstring(response.text)
    proxies = set()
    for row in parser.xpath('//tbody/tr')[:10]:
        # Column 7 ("Https") must read "yes"; columns 1 and 2 are IP and port.
        if row.xpath('.//td[7][contains(text(),"yes")]'):
            proxy = ":".join([row.xpath('.//td[1]/text()')[0],
                              row.xpath('.//td[2]/text()')[0]])
            proxies.add(proxy)
    return proxies
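# Quick sanity check (hypothetical output -- free proxies churn constantly):
#   print(get_proxies())  # e.g. {'203.0.113.5:8080', '198.51.100.7:3128'}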
def retry_if_connection_error(exception):
    """Retry predicate for @retry: log the exception and retry on everything.

    Narrow it by returning e.g. isinstance(exception, requests.ConnectionError)
    to retry only on connection errors.
    """
    print('EXCEPTION: ', exception)
    return True


# On exception, retry with exponential backoff: ~2s, 4s, 8s, ... capped at 30s.
@retry(retry_on_exception=retry_if_connection_error,
       wait_exponential_multiplier=2000, wait_exponential_max=30000)
def safe_request(url):
    """GET `url` through a freshly scraped free proxy; @retry re-invokes on failure."""
    proxies = get_proxies()
    proxy_pool = cycle(proxies)
    proxy = next(proxy_pool)  # each attempt scrapes a new list and picks a proxy
    return requests.get(url, headers=headers,
                        proxies={"http": proxy, "https": proxy})
try:
    for url in urls:
        # -------------------------------------------------------------- #
        pageNo = 0  # 37 <- to resume a crashed run, set the last page fetched
        while True:
            # Filter the directory to US companies, default sort order.
            queries = "sort_by=0&location%5Bcountry%5D=US"
            if pageNo == 0:
                pageUrl = url + "?" + queries
            else:
                pageUrl = url + "?page=" + str(pageNo) + "&" + queries
            print('Fetching data from :: ', pageUrl)
            try:
                response = safe_request(url=pageUrl)
            except Exception as ex:
                print('\n Current URL:: ', url, ', page:: ', pageNo)
                print("Error occurred: {}".format(str(ex)))
                break
            soup = bs.BeautifulSoup(response.content, 'lxml')
            directory_ul = soup.find("ul", {"class": "directory-list"})
            companies = directory_ul.find_all("li", {"class": "provider-row"})
            if not companies:
                break  # past the last page -- nothing more to scrape
            for company in companies:
                link = company.find('a', href=re.compile(
                    "^https://clutch.co/profile"))
                if link is None:
                    continue  # skip rows that have no profile link
                comp_url = link.get('href')
                name = company.find("h3", {"class": "company-name"})
                tagline = company.find("p", {"class": "tagline"})
                info_list = company.find_all('div', {"class": "list-item"})
                project_size = info_list[0]  # minimum project size (unused below)
                # Hourly rate renders like "$25 - $49 / hr", "< $25 / hr" or
                # "> $200 / hr"; normalise it into a min/max pair.
                hourly_rate = info_list[1].text.replace(
                    '\n', '').replace('/ hr', '').split('-')
                min_hourly_rate = hourly_rate[0].replace('>', '').strip()
                max_hourly_rate = None
                if '<' in hourly_rate[0]:
                    max_hourly_rate = hourly_rate[0].replace('<', '').strip()
                    min_hourly_rate = '0'
                elif len(hourly_rate) > 1:
                    max_hourly_rate = hourly_rate[1].strip()
                # Employee count renders like "10 - 49".
                employee = info_list[2].text.replace('\n', '').split('-')
                min_employee = employee[0].strip()
                max_employee = employee[1].strip()
                location = info_list[3]
                locality = location.find('span', {"class": "locality"})
                region = location.find('span', {"class": "region"})
                website = company.find('li', {"class": "website-link"})
                # Service focus is a carousel of "<score>%<tag>" items.
                services_object = company.find(
                    'div', {"class": "carousel-inner"})
                services_html_array = services_object.find_all(
                    'div', {"class": "item"})
                services = []
                for service in services_html_array:
                    parts = service.text.replace('\n', '').split('%')
                    services.append(
                        {'tag': parts[1].strip(), 'score': parts[0].strip()})
                company_json = {
                    "name": name.text.replace('\n', '').strip(),
                    "clutch_profile": comp_url.strip(),
                    "tagline": tagline.text.replace('\n', '').strip(),
                    "min_hourly_rate": min_hourly_rate,
                    "max_hourly_rate": max_hourly_rate,
                    "min_employee": min_employee,
                    "max_employee": max_employee,
                    "location": {
                        "locality": locality.text.replace('\n', '').strip(),
                        "region": region.text.replace('\n', '').strip()
                    },
                    "website": website.a['href'].strip(),
                    "services": services
                }
                # Whole-record comparison keeps re-runs from duplicating rows.
                if company_json not in company_data:
                    company_data.append(company_json)
                    count += 1
                    print('Company number:: ', count)
            sleep(randint(20, 60))  # throttle between pages
            pageNo += 1
except Exception:
    traceback.print_exc()
finally:
    # Persist whatever was collected, whether the run finished or crashed.
    print('\nAdded data for', count, 'companies.')
    with open('clutch_data.json', 'w') as outfile:
        json.dump(company_data, outfile, indent=4)
    print('\nDONE')
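# For reference, one saved record in clutch_data.json looks roughly like this
# (values are hypothetical; keys match company_json above):
#   {
#       "name": "Example Apps Inc.",
#       "clutch_profile": "https://clutch.co/profile/example-apps",
#       "tagline": "We build mobile apps",
#       "min_hourly_rate": "$25", "max_hourly_rate": "$49",
#       "min_employee": "10", "max_employee": "49",
#       "location": {"locality": "Austin", "region": "TX"},
#       "website": "https://exampleapps.example.com",
#       "services": [{"tag": "Mobile App Development", "score": "60"}]
#   }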