Created
September 6, 2015 17:19
-
-
Save kylemanna/b48d3dce447927377531 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
'''
Angellist Web Search Scraper
Author: Kyle Manna
The goal of this tool is to scrape the Angellist search results because the
Search API [1] only returns 20 results with no pagination at the time of
this writing (2015.09.06).
It's super hacky but works. Pass in query arguments on the command line. The
script will print a massive JSON array with all the results (de-paginated).
It's recommended to test the search query strings ahead of time using a web
browser @ https://angel.co/search?q=query1
Currently it's hardcoded to only search for people matching the queries.
Usage: ./al-search.py query1 query2
[1] https://angel.co/api/spec/search
'''
import json | |
import urllib.request | |
from pyquery import PyQuery as pq | |
import argparse | |
def _parse_entry(entry):
    '''Convert one `.result` element into a plain dict of its displayed fields.

    The returned dict always has the keys `href`, `name`, `slug`, `type` and
    `pic`; `bio` is added only when the entry contains an `.excerpt` element.
    '''
    entry = pq(entry)
    link = entry('.title')('a')
    href = link.attr['href']
    obj = {
        'href': href,
        'name': link.text(),
        # The slug is the last path component of the link. A result without
        # a link yields None here instead of raising on None.rsplit().
        'slug': href.rsplit('/', 1)[-1] if href else None,
        'type': entry('.type').text().strip(),
        'pic': entry('img').attr['src'],
    }

    bio = entry('.excerpt')
    if bio:
        # Drops the first character and the last two characters of the
        # excerpt text -- presumably decoration around the bio; verify
        # against the live markup before changing the slice.
        obj['bio'] = bio.text().strip()[1:-2]
    return obj


def search(query):
    '''Return every "people" search result for `query` as a list of dicts.

    Requests https://angel.co/search page by page (starting at page 1, with
    per_page set to 40) and stops at the first page whose HTML contains no
    `.result` elements. Each response body is decoded as JSON and the markup
    is read from its `html` key.

    Args:
        query: Search string sent as the `q` parameter.

    Returns:
        A list with one dict per result, in the order the pages returned
        them. See `_parse_entry` for the keys.

    Errors from the HTTP request, the JSON decoding or a missing `html` key
    are not handled here and propagate to the caller.
    '''
    # Imported explicitly: the file only imports `urllib.request`, and
    # relying on it to pull in `urllib.parse` is an implementation detail.
    from urllib.parse import urlencode

    url = 'https://angel.co/search'
    values = {
        'page': 0,
        'per_page': 40,
        'skip_loading': 'true',
        'include_ids': '',
        'q': query,
        'type': 'people',
    }
    headers = {'Accept': '*/*'}

    result = []
    while True:
        # Incremented before the request, so the first page fetched is 1.
        values['page'] += 1
        full_url = url + '?' + urlencode(values)
        req = urllib.request.Request(full_url, headers=headers, method='GET')

        with urllib.request.urlopen(req) as response:
            html_doc = json.loads(response.read().decode('utf-8'))['html']

        entries = pq(html_doc)('.result')
        if not entries:
            # An empty page marks the end of the result set.
            break

        result.extend(_parse_entry(entry) for entry in entries)

    return result
if __name__ == '__main__':
    # One or more search terms are required on the command line.
    parser = argparse.ArgumentParser()
    parser.add_argument('query', nargs='+')
    terms = parser.parse_args().query

    # Run each term in the order given and flatten all hits into one list,
    # then emit the combined list as a single JSON array on stdout.
    combined = [hit for term in terms for hit in search(term)]
    print(json.dumps(combined))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment