Created
February 18, 2011 19:22
-
-
Save ulope/834249 to your computer and use it in GitHub Desktop.
This is an attempt to make the code from http://blog.jgc.org/2011/02/how-could-i-have-coded-this-better.html a bit more pythonic
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
import csv
import itertools
import sys
import urllib
import urllib2
from collections import defaultdict
from urlparse import urlparse

from BeautifulSoup import BeautifulSoup
__doc__ = """ | |
Script to perform Google searches and extract the domain names of | |
the returned results. The order in which the domain names are | |
returned is used to determine a ranking between different companies. | |
""" | |
# This is the list of domains to look for in Google searches. To | |
# search for more or different domains simply alter this list. The | |
# order of this list determines the order in which the results are | |
# saved. | |
DOMAINS = ['apple.com', 'microsoft.com', 'engadget.com', 'wired.com', 'cnet.com', ] | |
# All possible search strings to be used are generated from this list by using | |
# 'product'. This can be modified to create other search terms by | |
# altering the lists (add/delete elements, or add/delete lists). Each | |
# of these terms will have "whizz bang" appended below | |
SEARCH_TERMS = ( | |
('apple', 'microsoft', 'gadget', ), | |
('iphone', 'xbox', 'news', ), | |
('4', '360', ), | |
) | |
# All search terms are passed through this format string (so you can add | |
# static pre-/post-fixes) | |
SEARCH_MODIFIER = "%s" | |
GOOGLE_URL = "http://google.com/search?q=%s&num=100&hl=en&start=0" | |
def google(q): # Query string (will be URL quoted by this function) | |
""" | |
Performs a Google search and returns a BeautifulSoup object | |
containing the parsed, returned page. | |
""" | |
# This attempts to ask for 100 results from Google. This does not | |
# always seem to work correctly. Note that the fake User-Agent is | |
# required otherwise Google will reject the search | |
url = GOOGLE_URL % urllib.quote_plus(q) | |
try: | |
req = urllib2.Request(url, None, { | |
'User-Agent':'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_5_8; ' | |
'en-US) AppleWebKit/534.13 (KHTML, like Gecko) Chrome/9.0.597.94 ' | |
'Safari/534.13'}) | |
res = urllib2.urlopen(req) | |
dat = res.read() | |
# optional add error handling (e.g. return None on broken request or some such) | |
finally: | |
try: res.close() | |
except: pass | |
return BeautifulSoup(dat) | |
def product(*args): # List to x together | |
""" | |
Cartesian product function (similar to itertools.product but joins | |
the #elements together as a space separated string rather than | |
returning a tuple) | |
""" | |
pools = map(tuple, args) | |
result = [[]] | |
for pool in pools: | |
result = [x+[y] for x in result for y in pool] | |
for prod in result: | |
yield " ".join(prod) | |
def main(): | |
writer = csv.writer(sys.stdout) | |
writer.writerow(['term'] + DOMAINS) | |
for t in product(*SEARCH_TERMS): | |
# Format search term | |
qu = SEARCH_MODIFIER % t | |
# This performs a Google query using the helper function. | |
so = google(qu) | |
# Then extracts all the <a> tags that have class "l". If Google | |
# changes the structure of their pages this is where this code | |
# will break. Currently class=l grabs all the appropriate links | |
# (displayed in green in the search results). | |
# The urlparse(u['href'])[1] works by extracting the href from the | |
# <a> tag, parsing it into component parts and extracting the 1th | |
# element of the returned tuple which contains the netloc (the | |
# domain name) | |
hrefs = [ urlparse(u['href'])[1] for u in so.findAll("a", {"class":"l"}) ] | |
# Used to hold the rank of the domains a result was found on. | |
# Uses defaultdict(int) to have a default of 0 for not found domains | |
ranks = defaultdict(int) | |
# Here we iterate through the returned domain names in hrefs and | |
# match them up with the domain names we are looking for. | |
for i, href in enumerate(hrefs): | |
for domain in DOMAINS: | |
# Note that the comparison here deals with two cases. The | |
# domain is entirely 'foo.com' (for example), or the | |
# domain ends with '.foo.com' (for example). | |
if href == domain or href.endswith(".%s" % domain): | |
ranks[domain] = i | |
writer.writerow([qu] + [ranks[domain] for domain in DOMAINS]) | |
if __name__ == '__main__': | |
main() |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment