Last active
May 21, 2019 12:16
-
-
Save sairoopb/55d219aa87ca65f8ddfb41cf23786596 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import csv
import json
from concurrent.futures import ThreadPoolExecutor

import requests
from bs4 import BeautifulSoup
from requests import ConnectTimeout, ConnectionError, HTTPError, ReadTimeout, Timeout
from requests.exceptions import InvalidURL, RequestException, TooManyRedirects
def get_html_array(js_url, timeout=10):
    """Download a JavaScript file and return its embedded ``html`` array.

    The script is expected to contain ``var html = [ ... ].join(...)``; the
    array literal between those two markers is decoded as JSON.

    Args:
        js_url: URL of the JavaScript file to download.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The decoded JSON value (a list for the expected input).

    Raises:
        requests.HTTPError: If the server answers with an error status.
        ValueError: If either marker is missing, or the literal is not valid
            JSON (``json.JSONDecodeError`` is a ``ValueError`` subclass).
    """
    response = requests.get(js_url, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of trying to parse an error page.
    response.raise_for_status()
    script = response.text

    prefix = "var html = ["
    prefix_at = script.find(prefix)
    if prefix_at == -1:
        raise ValueError("'var html = [' not found in " + js_url)
    # Step back one character so the slice starts at the opening '['.
    start = prefix_at + len(prefix) - 1

    # Search only after the array start so an earlier '.join(' is ignored.
    end = script.find(".join(", start)
    if end == -1:
        raise ValueError("'.join(' not found after the html array in " + js_url)

    return json.loads(script[start:end])
def url_get(var):
    """Collect the link targets in an HTML string, excluding Crunchbase links.

    Args:
        var: HTML markup to scan.

    Returns:
        A list with the ``href`` value of every ``<a>`` tag whose href does
        not contain ``www.crunchbase.com``, in document order.
    """
    soup = BeautifulSoup(var, 'html.parser')
    list_urls = []
    for anchor in soup.find_all("a"):
        href = anchor.get('href')
        # Anchors without an href yield None; a substring test on None would
        # raise TypeError, so skip them.
        if href is None:
            continue
        if "www.crunchbase.com" not in href:
            list_urls.append(href)
    return list_urls
# Download the Techstars portfolio-statistics widget script, join the HTML
# fragments it embeds into one document, and pull the non-Crunchbase links
# out of it.
html_array = get_html_array("https://connect.techstars.com/widgets/portfolio-statistics.js")
html_final = "\n".join(html_array)
list_of_url = url_get(html_final)
def app_get(comp_url, timeout=2):
    """Fetch a page and return the first Google Play link found on it.

    Args:
        comp_url: URL of the page to download.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The ``href`` of the first ``<a>`` tag containing ``play.google.com``,
        or the string ``'NULL'`` when the page has no such link or the
        request fails.
    """
    # Browser-like headers. The Crunchbase session cookie and 'authority'
    # header were dropped: the script filters Crunchbase links out before
    # fetching, so those credentials were being sent to third-party sites.
    # 'accept-encoding' is left to requests so it only advertises encodings
    # it can actually decode.
    headers = {
        'cache-control': 'max-age=0',
        'upgrade-insecure-requests': '1',
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.157 Safari/537.36',
        'dnt': '1',
        'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3',
        'accept-language': 'en-US,en;q=0.9',
    }
    try:
        response = requests.get(comp_url, headers=headers, timeout=timeout)
    except RequestException:
        # RequestException is the base class of every error requests raises
        # (timeouts, connection errors, invalid URLs, redirect loops, ...).
        # Catching it keeps one bad site from aborting the whole batch.
        return 'NULL'

    soup = BeautifulSoup(response.text, 'html.parser')
    for link in soup.select("a"):
        href = link.get("href", "")
        if "play.google.com" in href:
            return href
    return 'NULL'
def check(url):
    """Make sure a URL carries an explicit http(s) scheme.

    Args:
        url: A link target, possibly without a scheme, or ``None``.

    Returns:
        ``None`` when *url* is ``None``, empty or only whitespace; otherwise
        the whitespace-stripped URL, prefixed with ``http://`` if it did not
        already start with ``http://`` or ``https://``.
    """
    if url is None:
        return None
    url = url.strip()
    if not url:
        return None
    # Match the scheme as a case-insensitive prefix: a substring test would
    # be fooled by URLs that merely contain another URL in their query.
    if url.lower().startswith(("http://", "https://")):
        return url
    # Protocol-relative link ("//host/path"): only the scheme name is missing.
    if url.startswith("//"):
        return "http:" + url
    return "http://" + url
def remove_values_from_list(the_list, val):
    """Remove every occurrence of *val* from *the_list*, in place.

    Args:
        the_list: List to modify.
        val: Value to remove; an item matches when it is *val* or equals it,
            the same rule ``list.remove`` uses.

    Returns:
        None. The list is modified in place.
    """
    # One pass with slice assignment replaces the repeated scan-and-remove,
    # which was quadratic, while still mutating the caller's list object.
    the_list[:] = [item for item in the_list if not (item is val or item == val)]
# Normalise every collected link; check() yields None for empty links, and
# those entries are dropped before any request is made.
list_of_url_up = list(map(check, list_of_url))
remove_values_from_list(list_of_url_up, None)

# Look up the Play Store link of each site, ten requests at a time.
# executor.map returns results in input order, so both lists stay aligned.
with ThreadPoolExecutor(max_workers=10) as executor:
    list_of_playstore_links = list(executor.map(app_get, list_of_url_up))

# Put the column titles first so they become the first CSV row.
list_of_playstore_links.insert(0, "Playstore Link")
list_of_url_up.insert(0, "Company Website")
# NOTE: a dict keeps one entry per website, so a URL that appears more than
# once produces a single row (the last result wins).
info = dict(zip(list_of_url_up, list_of_playstore_links))

# newline='' is what the csv module requires to avoid blank rows on Windows;
# an explicit encoding keeps the output independent of the system locale.
with open('info.csv', 'w', newline='', encoding='utf-8') as csv_file:
    writer = csv.writer(csv_file)
    writer.writerows(info.items())
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment