Skip to content

Instantly share code, notes, and snippets.

@arsatiki
Created February 18, 2011 17:44
Show Gist options
  • Save arsatiki/834063 to your computer and use it in GitHub Desktop.
jgc's google script
# Script to perform Google searches and extract the domain names of
# the returned results. The order in which the domain names are
# returned is used to determine a ranking between different companies.
# Domains to look for in the Google results. Add to or edit this
# list to track more or different companies; the order here fixes
# the column order of the saved output.
domains = ["foo.com", "bar.com", "baz.com"]
from BeautifulSoup import BeautifulSoup
import urllib, urllib2
from urlparse import urlparse
import itertools
# Fake browser User-Agent sent with every request (Google rejects
# searches without one). Built from adjacent string literals rather
# than a triple-quoted string: a triple-quoted literal spanning
# several source lines embeds real newlines in the value, and HTTP
# header values must not contain newlines.
AGENT = ("Mozilla/5.0 (Macintosh; U; "
         "Intel Mac OS X 10_5_8; en-US) AppleWebKit/534.13 (KHTML, like Gecko) "
         "Chrome/9.0.597.94 Safari/534.13")
# Performs a Google search and returns a BeautifulSoup object
# containing the parsed result page.
def google(q):
    """Run a Google search for q and return the parsed page.

    q is the raw query string; it is URL-quoted here before being
    substituted into the search URL.
    """
    # Ask for 100 results in a single page. This does not always
    # seem to work correctly. Note that the fake User-Agent is
    # required, otherwise Google will reject the search.
    url = ("http://google.com/search?q=%s&num=100&hl=en&start=0"
           % urllib.quote_plus(q))
    req = urllib2.Request(url, None, {'User-Agent': AGENT})
    res = urllib2.urlopen(req)
    # try/finally so the connection is released even if read() raises.
    try:
        dat = res.read()
    finally:
        res.close()
    return BeautifulSoup(dat)
# This could be replaced with the csv module, especially if needs grow
# in the future.
# Small helper function to output a single line of a CSV file. Note
# that this does not do quoting of arguments so assumes there are not
# " or , present.
def csv(f, l): # First elements to print followed by list of elements
# to print
print "%s," % f, ", ".join(l)
# Cartesian product helper (similar to itertools.product but yields
# each combination joined into a single space-separated string
# rather than a tuple), e.g. the pair ("a", "b") becomes 'a b'.
# Giving the joiner a name, although it is not strictly necessary.
spacejoiner = " ".join

def product(*args):
    """Yield each element of itertools.product(*args) as one
    space-joined string. Lazy, like the original.

    Portability fix: the original used itertools.imap, which exists
    only in Python 2; a generator expression behaves identically and
    works on Python 2 and 3 alike.
    """
    return (spacejoiner(combo) for combo in itertools.product(*args))
# Build every search phrase as the cartesian product of these word
# lists. Modify the searches by adding/deleting elements or whole
# lists; "whizz bang" is appended to each phrase in the loop below.
terms = product(["a", "b", "c"],
                ["foo", "bar", "baz"],
                ["one", "two", "three", "four"])

# Header row: the term column followed by one column per tracked
# domain.
csv("term", domains)
for t in terms:
    # All the queries have 'whizz bang' appended to the end of them.
    qu = "%s whizz bang" % t
    # Perform the Google query and extract all the <a> tags with
    # class "l" (the green result links). If Google changes the
    # structure of their pages, this is where the code will break.
    so = google(qu)
    # urlparse(href)[1] is the netloc element of the parsed URL,
    # i.e. the result's domain name.
    a = [urlparse(u['href'])[1] for u in so.findAll("a", {"class": "l"})]
    # rank[j] records the lowest (best) position at which domains[j]
    # was seen in the results; "0" means 'not found'. Elements
    # correspond one-to-one with the domains list.
    rank = ["0"] * len(domains)
    # Match the returned domain names against the ones we track.
    for i, href in enumerate(a):
        for j, domain in enumerate(domains):
            # Two cases: the result domain is exactly 'foo.com', or
            # it is a subdomain ending in '.foo.com'.
            if (domain == href) or href.endswith(".%s" % domain):
                # Bug fix: only record the FIRST match. The original
                # overwrote rank[j] on every match, so it kept the
                # highest (worst) position instead of the lowest,
                # contradicting the stated intent.
                if rank[j] == "0":
                    rank[j] = "%d" % (i + 1)  # Count ranks from 1 not 0
    csv(qu, rank)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment