Created
February 18, 2011 19:22
-
-
Save ulope/834249 to your computer and use it in GitHub Desktop.
This is an attempt to make the code from http://blog.jgc.org/2011/02/how-could-i-have-coded-this-better.html a bit more pythonic
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
import csv
import itertools
import sys
import urllib
import urllib2
from collections import defaultdict
from urlparse import urlparse

from BeautifulSoup import BeautifulSoup
__doc__ = """ | |
Script to perform Google searches and extract the domain names of | |
the returned results. The order in which the domain names are | |
returned is used to determine a ranking between different companies. | |
""" | |
# This is the list of domains to look for in Google searches. To | |
# search for more or different domains simply alter this list. The | |
# order of this list determines the order in which the results are | |
# saved. | |
DOMAINS = ['apple.com', 'microsoft.com', 'engadget.com', 'wired.com', 'cnet.com', ] | |
# All possible search strings to be used are generated from this list by using | |
# 'product'. This can be modified to create other search terms by | |
# altering the lists (add/delete elements, or add/delete lists). Each | |
# of these terms will have "whizz bang" appended below | |
SEARCH_TERMS = ( | |
('apple', 'microsoft', 'gadget', ), | |
('iphone', 'xbox', 'news', ), | |
('4', '360', ), | |
) | |
# All search terms are passed through this format string (so you can add | |
# static pre-/post-fixes) | |
SEARCH_MODIFIER = "%s" | |
GOOGLE_URL = "http://google.com/search?q=%s&num=100&hl=en&start=0" | |
def google(q): # Query string (will be URL quoted by this function) | |
""" | |
Performs a Google search and returns a BeautifulSoup object | |
containing the parsed, returned page. | |
""" | |
# This attempts to ask for 100 results from Google. This does not | |
# always seem to work correctly. Note that the fake User-Agent is | |
# required otherwise Google will reject the search | |
url = GOOGLE_URL % urllib.quote_plus(q) | |
try: | |
req = urllib2.Request(url, None, { | |
'User-Agent':'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_5_8; ' | |
'en-US) AppleWebKit/534.13 (KHTML, like Gecko) Chrome/9.0.597.94 ' | |
'Safari/534.13'}) | |
res = urllib2.urlopen(req) | |
dat = res.read() | |
# optional add error handling (e.g. return None on broken request or some such) | |
finally: | |
try: res.close() | |
except: pass | |
return BeautifulSoup(dat) | |
def product(*args): # List to x together | |
""" | |
Cartesian product function (similar to itertools.product but joins | |
the #elements together as a space separated string rather than | |
returning a tuple) | |
""" | |
pools = map(tuple, args) | |
result = [[]] | |
for pool in pools: | |
result = [x+[y] for x in result for y in pool] | |
for prod in result: | |
yield " ".join(prod) | |
def main(): | |
writer = csv.writer(sys.stdout) | |
writer.writerow(['term'] + DOMAINS) | |
for t in product(*SEARCH_TERMS): | |
# Format search term | |
qu = SEARCH_MODIFIER % t | |
# This performs a Google query using the helper function. | |
so = google(qu) | |
# Then extracts all the <a> tags that have class "l". If Google | |
# changes the structure of their pages this is where this code | |
# will break. Currently class=l grabs all the appropriate links | |
# (displayed in green in the search results). | |
# The urlparse(u['href'])[1] works by extracting the href from the | |
# <a> tag, parsing it into component parts and extracting the 1th | |
# element of the returned tuple which contains the netloc (the | |
# domain name) | |
hrefs = [ urlparse(u['href'])[1] for u in so.findAll("a", {"class":"l"}) ] | |
# Used to hold the rank of the domains a result was found on. | |
# Uses defaultdict(int) to have a default of 0 for not found domains | |
ranks = defaultdict(int) | |
# Here we iterate through the returned domain names in hrefs and | |
# match them up with the domain names we are looking for. | |
for i, href in enumerate(hrefs): | |
for domain in DOMAINS: | |
# Note that the comparison here deals with two cases. The | |
# domain is entirely 'foo.com' (for example), or the | |
# domain ends with '.foo.com' (for example). | |
if href == domain or href.endswith(".%s" % domain): | |
ranks[domain] = i | |
writer.writerow([qu] + [ranks[domain] for domain in DOMAINS]) | |
if __name__ == '__main__': | |
main() |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment