kylemanna · September 6, 2015 17:19
diff --git a/al-search.py b/al-search.py
 #!/usr/bin/env python3
 '''
 Angellist Web Search Scraper

 Author: Kyle Manna

 The goal of htis tool is to scrape the Angellist search results becaues the
 Search API [1] only returns 20 results with no pagination at the time of
 this writing (2015.09.06).

 It's super hacky but works.  Pass in query arguments on the command line.  The
 script will return a massive json array with all the results (de-paginated).

 It's recommended to test the search query strings ahead of time using a web
 browser @ https://angel.co/search?q=query1

 Currently it's hardcoded to only search for people matching the queries.

 Usage: ./al-search.py query1 query2

 [1] https://angel.co/api/spec/search

 '''

 import json
 import urllib.request
 from pyquery import PyQuery as pq
 import argparse

 def search(query):
    result = []
    url = 'https://angel.co/search'
    values = {
            'page':0,
            'per_page':40,
            'skip_loading':'true',
            'include_ids':'',
            'q':query,
            'type':'people',
            }

    headers = { 'Accept': '*/*' }

    while True:
        values['page'] = values['page'] + 1
        full_url = url + '?' + urllib.parse.urlencode(values)
        req = urllib.request.Request(full_url, headers=headers, method='GET')

        with urllib.request.urlopen(req) as response:
            html_doc = json.loads(response.read().decode('utf-8'))['html']

        entries = pq(html_doc)('.result')
        if len(entries) == 0: break

        for entry in entries:
            obj = {}

            entry = pq(entry)

            title = entry('.title')
            a = title('a')
            obj['href'] = a.attr['href']
            obj['name'] = a.text()
            obj['slug'] = obj['href'].rsplit('/',1)[-1]

            obj['type'] = entry('.type').text().strip()
            
            obj['pic'] = entry('img').attr['src']
            
            bio = entry('.excerpt')
            if bio:
                obj['bio'] = bio.text().strip()[1:-2]

            result.append(obj)

    return result

 if __name__ == '__main__':

    p = argparse.ArgumentParser()
    p.add_argument('query', nargs='+')
    args = p.parse_args()


    result = []

    for q in args.query:
        result.extend(search(q))

    print(json.dumps(result))
	#!/usr/bin/env python3
	'''
	Angellist Web Search Scraper

	Author: Kyle Manna

	The goal of htis tool is to scrape the Angellist search results becaues the
	Search API [1] only returns 20 results with no pagination at the time of
	this writing (2015.09.06).

	It's super hacky but works. Pass in query arguments on the command line. The
	script will return a massive json array with all the results (de-paginated).

	It's recommended to test the search query strings ahead of time using a web
	browser @ https://angel.co/search?q=query1

	Currently it's hardcoded to only search for people matching the queries.

	Usage: ./al-search.py query1 query2

	[1] https://angel.co/api/spec/search

	'''

	import json
	import urllib.request
	from pyquery import PyQuery as pq
	import argparse

	def search(query):
	result = []
	url = 'https://angel.co/search'
	values = {
	'page':0,
	'per_page':40,
	'skip_loading':'true',
	'include_ids':'',
	'q':query,
	'type':'people',
	}

	headers = { 'Accept': '/' }

	while True:
	values['page'] = values['page'] + 1
	full_url = url + '?' + urllib.parse.urlencode(values)
	req = urllib.request.Request(full_url, headers=headers, method='GET')

	with urllib.request.urlopen(req) as response:
	html_doc = json.loads(response.read().decode('utf-8'))['html']

	entries = pq(html_doc)('.result')
	if len(entries) == 0: break

	for entry in entries:
	obj = {}

	entry = pq(entry)

	title = entry('.title')
	a = title('a')
	obj['href'] = a.attr['href']
	obj['name'] = a.text()
	obj['slug'] = obj['href'].rsplit('/',1)[-1]

	obj['type'] = entry('.type').text().strip()

	obj['pic'] = entry('img').attr['src']

	bio = entry('.excerpt')
	if bio:
	obj['bio'] = bio.text().strip()[1:-2]

	result.append(obj)

	return result

	if __name__ == '__main__':

	p = argparse.ArgumentParser()
	p.add_argument('query', nargs='+')
	args = p.parse_args()


	result = []

	for q in args.query:
	result.extend(search(q))

	print(json.dumps(result))