A Python program to download all public LinkedIn info about members of ICE. All of this was made by https://twitter.com/sam_lavigne
headers = {}
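The script below imports this module and passes headers.headers to every request it makes, so the empty dict above is only a placeholder: it has to be filled with the request headers of a logged-in LinkedIn Sales Navigator session, typically copied from the browser's network inspector. A minimal sketch of what a populated headers.py might look like follows; the field names and values here are illustrative placeholders, not the exact set LinkedIn requires:

# headers.py -- sketch only; copy real values from an authenticated browser session
headers = {
    'user-agent': 'Mozilla/5.0 ...',          # your browser's user agent string
    'cookie': 'li_at=...; JSESSIONID=...',    # session cookies (placeholders)
    'csrf-token': 'ajax:...',                 # placeholder CSRF token
    'accept': 'application/json',
}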
import time
import json
import csv
import os
import requests
from bs4 import BeautifulSoup
from jinja2 import Template
import headers  # local headers.py above, supplying the LinkedIn session headers
# these represent different job functions
FUNCTION_FACETS = [17, 18, 14, 2, 4, 20, 5, 13, 12, 26]  # FA
SENIORITY_FACETS = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]  # SE
LOCATION_FACETS = [  # G
    'us:8-2-0-1-2',
    'us:97',
    'us:va',
    'us:dc',
    'us:tx',
    'us:ca',
    'us:md',
    'us:70',
    'us:31',
    'us:ny',
    'us:8-8-0-8-1',
    'us:8-8-0-3-1',
    'us:ga',
    'us:52',
    'us:7',
    'us:8-8-0-95-11',
    'us:nj',
    'us:3-2-0-31-1',
]

FACETS = [
    ('FA', FUNCTION_FACETS),
    ('SE', SENIORITY_FACETS),
    ('G', LOCATION_FACETS),
]
def download_file(url, local_filename=None):
    '''Downloads a file with requests
    from: https://stackoverflow.com/a/16696317
    '''
    if local_filename is None:
        local_filename = url.split('/')[-1]
    print('saving to', local_filename)
    r = requests.get(url, stream=True)
    with open(local_filename, 'wb') as f:
        for chunk in r.iter_content(chunk_size=1024):
            if chunk:
                f.write(chunk)
    return local_filename
def get_page(company_id, facet=None, facet_id=None, start=0, count=50):
    '''Gets a single page of results from linkedin for a particular job function at a company'''
    params = {
        'facet': ['CC'],
        'facet.CC': company_id,
        'count': count,
        'start': start,
    }
    if facet is not None and facet_id is not None:
        params['facet'] = ['CC', facet]
        params['facet.' + facet] = facet_id
    response = requests.get('https://www.linkedin.com/sales/search/results', headers=headers.headers, params=params)
    return response.json()
def get_company(company_id, outname):
    '''Gets all employees from a company using particular job functions'''
    people = []
    for facet, facet_ids in FACETS:
        for facet_id in facet_ids:
            print('getting facet', facet, facet_id, 'for company', company_id)
            count = 50
            start = 0
            results = get_page(company_id, facet, facet_id)
            total = results['pagination']['total']
            people += results['searchResults']
            start += count
            while start < total:
                print('getting', start, 'of', total)
                time.sleep(1)
                results = get_page(company_id, facet, facet_id, start)
                people += results['searchResults']
                start += count
    with open(outname, 'w') as outfile:
        json.dump(people, outfile, indent=2)
    return outname
def get_images(datafile):
    '''Downloads profile images'''
    with open(datafile, 'r') as infile:
        people = json.load(infile)
    people = [p['member'] for p in people]
    for p in people:
        if 'vectorImage' not in p:
            continue
        pid = p['memberId']
        outname = 'images/{}.jpg'.format(pid)
        if os.path.exists(outname):
            print('skipping')
            continue
        url = p['vectorImage']['rootUrl']
        url += sorted(p['vectorImage']['artifacts'], key=lambda x: x['width'])[-1]['fileIdentifyingUrlPathSegment']
        print(url)
        download_file(url, outname)
        time.sleep(1)
def get_profile(pid):
    '''Downloads individual profiles'''
    outname = 'profiles/{}.json'.format(pid)
    if os.path.exists(outname):
        return outname
    out = {}
    url = 'https://www.linkedin.com/sales/people/{},NAME_SEARCH'.format(pid)
    print(url)
    response = requests.get(url, headers=headers.headers)
    soup = BeautifulSoup(response.text, 'html.parser')
    codes = soup.select('code')
    for c in codes:
        try:
            d = json.loads(c.text)
            if 'contactInfo' in d:
                out = d
                break
        except Exception:
            continue
    with open(outname, 'w') as outfile:
        json.dump(out, outfile)
    time.sleep(1)
    return outname
def get_profiles(datafile):
    '''Gets all profiles'''
    with open(datafile, 'r') as infile:
        data = json.load(infile)
    for d in data:
        pid = d['member']['profileId']
        get_profile(pid)
def clean_and_parse(datafile, outname):
    '''Outputs csv, json and html from employee listings'''
    out = []
    mids = []
    with open(datafile, 'r') as infile:
        data = json.load(infile)
    for d in data:
        mid = d['member']['memberId']
        pid = d['member']['profileId']
        imgpath = 'images/{}.jpg'.format(mid)
        if not os.path.exists(imgpath):
            imgpath = None
        item = {
            'name': d['member'].get('formattedName', ''),
            'title': d['member'].get('title', ''),
            'img': imgpath,
            'company': d['company'].get('companyName', ''),
            'location': d['member'].get('location', ''),
            'id': d['member']['memberId'],
            'linkedin': 'https://linkedin.com/in/' + pid,
        }
        if mid not in mids:
            out.append(item)
            mids.append(mid)
    with open(outname + '.json', 'w') as jsonfile:
        json.dump(out, jsonfile, indent=2)
    with open(outname + '.csv', 'w') as csvfile:
        fieldnames = list(out[0].keys())
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        for row in out:
            writer.writerow(row)
    with open('template.html', 'r') as templatefile:
        template = Template(templatefile.read())
    html = template.render(people=out)
    with open('index.html', 'w') as htmlout:
        htmlout.write(html)
if __name__ == '__main__':
    ICE = '533534'  # LinkedIn company id for ICE
    datafile = 'ice_raw.json'
    get_company(ICE, datafile)
    get_profiles(datafile)
    get_images(datafile)
    clean_and_parse(datafile, 'ice')
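With headers.py filled in and the pinned dependencies below installed, the whole pipeline runs from the __main__ block: it pages through every facet of the company search into ice_raw.json, then fetches individual profiles and profile images, and finally writes ice.json, ice.csv, and index.html. One thing to note is that get_profile and get_images write into profiles/ and images/ directories that the script never creates; a small guard like the following (a hypothetical addition, not part of the original gist) avoids the resulting open() errors:

import os

# create the output directories the scraper writes into, if they don't already exist
for d in ('images', 'profiles'):
    os.makedirs(d, exist_ok=True)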
beautifulsoup4==4.6.0
certifi==2018.4.16
chardet==3.0.4
idna==2.7
Jinja2==2.10
MarkupSafe==1.0
requests==2.19.1
urllib3==1.23
<html>
  <head>
    <title>ICE @ LinkedIn</title>
    <style>
      body, table {
        font: 14px sans-serif;
      }
      #container {
        max-width: 1100px;
        margin: auto;
      }
      table {
        border-collapse: collapse;
        width: 100%;
      }
      th {
        text-align: left;
      }
      td {
        padding: 3px;
        border: 1px solid #ccc;
      }
      img {
        max-width: 50px;
        display: block;
      }
      a {
        color: #000;
      }
    </style>
  </head>
  <body>
    <div id="container">
      <h1>People on LinkedIn who work for ICE</h1>
      <p>collected by <a href="http://lav.io">Sam Lavigne</a></p>
      <p><a href="https://raw.githubusercontent.com/antiboredom/ice-linkedin/master/ice.csv">Download as a CSV</a>, or explore <a href="https://github.com/antiboredom/ice-linkedin">the more detailed dataset here</a>.</p>
      <table>
        <tr>
          <th></th>
          <th>Name</th>
          <th>Title</th>
          <th>Location</th>
        </tr>
        {% for p in people %}
        <tr>
          <td>
            {% if p.img %}
            <a href="{{p.linkedin}}"><img src="{{p.img}}"></a>
            {% endif %}
          </td>
          <td><a href="{{p.linkedin}}">{{p.name}}</a></td>
          <td>{{p.title}}</td>
          <td>{{p.location}}</td>
        </tr>
        {% endfor %}
      </table>
    </div>
  </body>
</html>