A Python program to download all public LinkedIn info about members of ICE. All of this was made by https://twitter.com/sam_lavigne
headers = {}
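The script below imports this module and passes headers.headers to every request it makes, so the empty dict above is only a placeholder: it has to be filled with the request headers of a logged-in LinkedIn Sales Navigator session, typically copied from the browser's network inspector. A minimal sketch of what a populated headers.py might look like follows; the field names and values here are illustrative placeholders, not the exact set LinkedIn requires:

# headers.py -- sketch only; copy real values from an authenticated browser session
headers = {
    'user-agent': 'Mozilla/5.0 ...',          # your browser's user agent string
    'cookie': 'li_at=...; JSESSIONID=...',    # session cookies (placeholders)
    'csrf-token': 'ajax:...',                 # placeholder CSRF token
    'accept': 'application/json',
}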
import time
import json
import csv
import os
import requests
from bs4 import BeautifulSoup
from jinja2 import Template
import headers  # local headers.py above, supplying the LinkedIn session headers
# these represent different job functions
FUNCTION_FACETS = [17, 18, 14, 2, 4, 20, 5, 13, 12, 26]  # FA
SENIORITY_FACETS = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]  # SE
LOCATION_FACETS = [  # G
    'us:8-2-0-1-2',
    'us:97',
    'us:va',
    'us:dc',
    'us:tx',
    'us:ca',
    'us:md',
    'us:70',
    'us:31',
    'us:ny',
    'us:8-8-0-8-1',
    'us:8-8-0-3-1',
    'us:ga',
    'us:52',
    'us:7',
    'us:8-8-0-95-11',
    'us:nj',
    'us:3-2-0-31-1',
]

FACETS = [
    ('FA', FUNCTION_FACETS),
    ('SE', SENIORITY_FACETS),
    ('G', LOCATION_FACETS),
]
def download_file(url, local_filename=None):
    '''Downloads a file with requests
    from: https://stackoverflow.com/a/16696317
    '''
    if local_filename is None:
        local_filename = url.split('/')[-1]
    print('saving to', local_filename)
    r = requests.get(url, stream=True)
    with open(local_filename, 'wb') as f:
        for chunk in r.iter_content(chunk_size=1024):
            if chunk:
                f.write(chunk)
    return local_filename
def get_page(company_id, facet=None, facet_id=None, start=0, count=50):
    '''Gets a single page of results from linkedin for a particular job function at a company'''
    params = {
        'facet': ['CC'],
        'facet.CC': company_id,
        'count': count,
        'start': start,
    }
    if facet is not None and facet_id is not None:
        params['facet'] = ['CC', facet]
        params['facet.' + facet] = facet_id
    response = requests.get('https://www.linkedin.com/sales/search/results', headers=headers.headers, params=params)
    return response.json()
def get_company(company_id, outname):
    '''Gets all employees from a company using particular job functions'''
    people = []
    for facet, facet_ids in FACETS:
        for facet_id in facet_ids:
            print('getting facet', facet, facet_id, 'for company', company_id)
            count = 50
            start = 0
            results = get_page(company_id, facet, facet_id)
            total = results['pagination']['total']
            people += results['searchResults']
            start += count
            while start < total:
                print('getting', start, 'of', total)
                time.sleep(1)
                results = get_page(company_id, facet, facet_id, start)
                people += results['searchResults']
                start += count
    with open(outname, 'w') as outfile:
        json.dump(people, outfile, indent=2)
    return outname
def get_images(datafile):
    '''Downloads profile images'''
    with open(datafile, 'r') as infile:
        people = json.load(infile)
    people = [p['member'] for p in people]
    for p in people:
        if 'vectorImage' not in p:
            continue
        pid = p['memberId']
        outname = 'images/{}.jpg'.format(pid)
        if os.path.exists(outname):
            print('skipping')
            continue
        url = p['vectorImage']['rootUrl']
        url += sorted(p['vectorImage']['artifacts'], key=lambda x: x['width'])[-1]['fileIdentifyingUrlPathSegment']
        print(url)
        download_file(url, outname)
        time.sleep(1)
def get_profile(pid):
    '''Downloads individual profiles'''
    outname = 'profiles/{}.json'.format(pid)
    if os.path.exists(outname):
        return outname
    out = {}
    url = 'https://www.linkedin.com/sales/people/{},NAME_SEARCH'.format(pid)
    print(url)
    response = requests.get(url, headers=headers.headers)
    soup = BeautifulSoup(response.text, 'html.parser')
    codes = soup.select('code')
    for c in codes:
        try:
            d = json.loads(c.text)
            if 'contactInfo' in d:
                out = d
                break
        except Exception:
            continue
    with open(outname, 'w') as outfile:
        json.dump(out, outfile)
    time.sleep(1)
    return outname
def get_profiles(datafile):
    '''Gets all profiles'''
    with open(datafile, 'r') as infile:
        data = json.load(infile)
    for d in data:
        pid = d['member']['profileId']
        get_profile(pid)
def clean_and_parse(datafile, outname):
    '''Outputs csv, json and html from employee listings'''
    out = []
    mids = []
    with open(datafile, 'r') as infile:
        data = json.load(infile)
    for d in data:
        mid = d['member']['memberId']
        pid = d['member']['profileId']
        imgpath = 'images/{}.jpg'.format(mid)
        if not os.path.exists(imgpath):
            imgpath = None
        item = {
            'name': d['member'].get('formattedName', ''),
            'title': d['member'].get('title', ''),
            'img': imgpath,
            'company': d['company'].get('companyName', ''),
            'location': d['member'].get('location', ''),
            'id': d['member']['memberId'],
            'linkedin': 'https://linkedin.com/in/' + pid,
        }
        if mid not in mids:
            out.append(item)
            mids.append(mid)
    with open(outname + '.json', 'w') as jsonfile:
        json.dump(out, jsonfile, indent=2)
    with open(outname + '.csv', 'w') as csvfile:
        fieldnames = list(out[0].keys())
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        for row in out:
            writer.writerow(row)
    with open('template.html', 'r') as templatefile:
        template = Template(templatefile.read())
    html = template.render(people=out)
    with open('index.html', 'w') as htmlout:
        htmlout.write(html)
if __name__ == '__main__':
    ICE = '533534'  # LinkedIn company id for ICE
    datafile = 'ice_raw.json'
    get_company(ICE, datafile)
    get_profiles(datafile)
    get_images(datafile)
    clean_and_parse(datafile, 'ice')
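With headers.py filled in and the pinned dependencies below installed, the whole pipeline runs from the __main__ block: it pages through every facet of the company search into ice_raw.json, then fetches individual profiles and profile images, and finally writes ice.json, ice.csv, and index.html. One thing to note is that get_profile and get_images write into profiles/ and images/ directories that the script never creates; a small guard like the following (a hypothetical addition, not part of the original gist) avoids the resulting open() errors:

import os

# create the output directories the scraper writes into, if they don't already exist
for d in ('images', 'profiles'):
    os.makedirs(d, exist_ok=True)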
beautifulsoup4==4.6.0
certifi==2018.4.16
chardet==3.0.4
idna==2.7
Jinja2==2.10
MarkupSafe==1.0
requests==2.19.1
urllib3==1.23
<html>
  <head>
    <title>ICE @ LinkedIn</title>
    <style>
      body, table {
        font: 14px sans-serif;
      }
      #container {
        max-width: 1100px;
        margin: auto;
      }
      table {
        border-collapse: collapse;
        width: 100%;
      }
      th {
        text-align: left;
      }
      td {
        padding: 3px;
        border: 1px solid #ccc;
      }
      img {
        max-width: 50px;
        display: block;
      }
      a {
        color: #000;
      }
    </style>
  </head>
  <body>
    <div id="container">
      <h1>People on LinkedIn who work for ICE</h1>
      <p>collected by <a href="http://lav.io">Sam Lavigne</a></p>
      <p><a href="https://raw.githubusercontent.com/antiboredom/ice-linkedin/master/ice.csv">Download as a CSV</a>, or explore <a href="https://github.com/antiboredom/ice-linkedin">the more detailed dataset here</a>.</p>
      <table>
        <tr>
          <th></th>
          <th>Name</th>
          <th>Title</th>
          <th>Location</th>
        </tr>
        {% for p in people %}
        <tr>
          <td>
            {% if p.img %}
            <a href="{{p.linkedin}}"><img src="{{p.img}}"></a>
            {% endif %}
          </td>
          <td><a href="{{p.linkedin}}">{{p.name}}</a></td>
          <td>{{p.title}}</td>
          <td>{{p.location}}</td>
        </tr>
        {% endfor %}
      </table>
    </div>
  </body>
</html>