jgc's google script
# Script to perform Google searches and extract the domain names of
# the returned results. The order in which the domain names are
# returned is used to determine a ranking between different companies.

# This is the list of domains to look for in Google searches. To
# search for more or different domains simply alter this list. The
# order of this list determines the order in which the results are
# saved.
domains = [ 'foo.com', 'bar.com', 'baz.com' ]
from BeautifulSoup import BeautifulSoup
import urllib, urllib2
from urlparse import urlparse
import itertools
# The fake User-Agent string. Implicit string concatenation is used
# (rather than a triple-quoted string) so that no literal newlines end
# up embedded in the HTTP header value.
AGENT = ("Mozilla/5.0 (Macintosh; U; "
         "Intel Mac OS X 10_5_8; en-US) AppleWebKit/534.13 (KHTML, like Gecko) "
         "Chrome/9.0.597.94 Safari/534.13")
# Performs a Google search and returns a BeautifulSoup object
# containing the parsed, returned page.
def google(q): # Query string (will be URL quoted by this function)
    # This attempts to ask for 100 results from Google. This does not
    # always seem to work correctly. Note that the fake User-Agent is
    # required otherwise Google will reject the search
    url = ("http://google.com/search?q=%s&num=100&hl=en&start=0" %
           urllib.quote_plus(q))
    req = urllib2.Request(url, None, {'User-Agent': AGENT})
    res = urllib2.urlopen(req)
    dat = res.read()
    res.close()
    return BeautifulSoup(dat)
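# Example usage (the query is hypothetical; this needs network access,
# and Google may block or CAPTCHA automated requests):
#   soup = google('foo whizz bang')
#   print soup.find('title').string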
# Small helper function to output a single line of a CSV file. Note
# that this does not do quoting of arguments so it assumes there are
# no " or , present. This could be replaced with the csv module,
# especially if needs grow in the future.
def csv(f, l): # First element to print followed by list of elements
               # to print
    print "%s," % f, ", ".join(l)
# Cartesian product function (similar to itertools.product but joins
# the elements together as a space separated string rather than
# returning a tuple)

# Giving this callable a name, although it is not strictly necessary
# spacejoiner(("a", "b")) -> 'a b'
spacejoiner = " ".join
def product(*args):
    return itertools.imap(spacejoiner, itertools.product(*args))
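# A quick illustration of product (the lists are hypothetical):
#   >>> list(product(['a', 'b'], ['x', 'y']))
#   ['a x', 'a y', 'b x', 'b y']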
# This generates all possible search strings to be used using
# product. This can be modified to create other search terms by
# altering the lists (add/delete elements, or add/delete lists). Each
# of these terms will have "whizz bang" appended below
terms = product( [ 'a', 'b', 'c' ],
                 [ 'foo', 'bar', 'baz' ],
                 [ 'one', 'two', 'three', 'four' ] )

# Print the CSV header row: the term column followed by one column
# per domain.
csv('term', domains)
for t in terms:
    # All the queries have 'whizz bang' appended to the end of
    # them
    qu = "%s whizz bang" % t

    # This performs a Google query using the helper function and then
    # extracts all the <a> tags that have class "l". If Google
    # changes the structure of their pages this is where this code
    # will break. Currently class=l grabs all the appropriate links
    # (displayed in green in the search results).
    so = google(qu)

    # The urlparse(u['href'])[1] works by extracting the href from the
    # <a> tag, parsing it into component parts and extracting element 1
    # of the returned tuple, which contains the netloc (the domain
    # name)
    a = [ urlparse(u['href'])[1] for u in so.findAll("a", {"class":"l"}) ]
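    # A quick illustration of the netloc extraction (hypothetical URL):
    #   >>> urlparse('http://www.foo.com/page?x=1')[1]
    #   'www.foo.com'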
    # The rank list stores the lowest (i.e. first seen) position at
    # which a specific domain appears in the results from Google.
    # Each element corresponds to an element of domains. Initially,
    # these are set to "0" to indicate 'not found' in the results.
    rank = ["0"] * len(domains)

    # Here we iterate through the returned domain names in a and match
    # them up with the domain names we are looking for.
    for i, href in enumerate(a):
        for j, domain in enumerate(domains):
            # Note that the comparison here deals with two cases. The
            # domain is entirely 'foo.com' (for example), or the
            # domain ends with '.foo.com' (for example). Only the
            # first match is recorded so that rank[j] really is the
            # lowest position at which the domain was seen.
            if rank[j] == "0" and \
               ((domain == href) or href.endswith(".%s" % domain)):
                rank[j] = "%d" % (i+1) # Count ranks from 1 not 0

    csv(qu, rank)
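# The script emits one CSV row per search term. The shape of the
# output looks like this (the numbers are illustrative only, not
# real results):
#   term, foo.com, bar.com, baz.com
#   a foo one whizz bang, 3, 0, 12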