Skip to content

Instantly share code, notes, and snippets.

@mjumbewu
Created February 27, 2013 18:39
Show Gist options
  • Save mjumbewu/5050388 to your computer and use it in GitHub Desktop.
A scraper to collect Legistar subdomain names from Google
#!/usr/bin/env python
#-*- coding:utf-8 -*-
import sys
from requests import get
import re
import time
# Captures the subdomain portion of any *.legistar.com hostname found in a
# page's HTML. Compiled once at import time because it is applied to every
# fetched search-results page.
pattern = re.compile(r'(\w+)\.legistar\.com')
def main():
    """Scrape Google result pages for *.legistar.com subdomain names.

    Fetches up to 50 pages of Google results for ``site:legistar.com``,
    extracts subdomains with the module-level ``pattern`` regex, and writes
    each newly seen subdomain to stdout (one per line), with per-page
    progress on stderr.

    Returns ``None`` (exit status 0). Stops early if Google answers with a
    non-200 status (typically a captcha/throttle page).
    """
    count = 0
    sites = set()  # every subdomain seen so far, for de-duplication
    while count < 50:
        search = get(
            ('http://www.google.com/search?hl=en&tbo=d&output=search'
             '&sclient=psy-ab&q=site:legistar.com&gbv=1&sei=CxbFUMOHLoe70QGVyYGoAg'
             '&sa=N&start=') + str(count * 10),
            headers={'User-agent': ('Mozilla/5.0 (X11; Linux x86_64) '
                                    'AppleWebKit/537.11 (KHTML, like Gecko) '
                                    'Chrome/23.0.1271.95 Safari/537.11')},
            # requests has no default timeout; without one a stalled
            # connection would hang the scraper forever.
            timeout=30)

        # Google returns a non-200 (captcha / rate-limit) once it detects
        # scraping; give up rather than keep hammering it.
        if search.status_code != 200:
            break

        found_sites = set(pattern.findall(search.text))
        new_sites = found_sites - sites  # only subdomains not yet printed
        sites |= found_sites
        count += 1

        sys.stderr.write('finished %s pages, with %s matching entries...\n'
                         % (count, len(new_sites)))
        sys.stdout.write('\n'.join(new_sites))
        if new_sites:
            sys.stdout.write('\n')

        # Take your time, don't get captcha'd.
        time.sleep(2)


if __name__ == '__main__':
    sys.exit(main())
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment