jgc's google script
# Script to perform Google searches and extract the domain names of
# the returned results. The order in which the domain names are
# returned is used to determine a ranking between different companies.

# This is the list of domains to look for in Google searches. To
# search for more or different domains simply alter this list. The
# order of this list determines the order in which the results are
# saved.
domains = [ 'foo.com', 'bar.com', 'baz.com' ]
from BeautifulSoup import BeautifulSoup
import urllib, urllib2
from urlparse import urlparse
import itertools
# The fake User-Agent string. Implicit string concatenation is used
# (rather than a triple-quoted string) so that no literal newlines end
# up embedded in the HTTP header value.
AGENT = ("Mozilla/5.0 (Macintosh; U; "
         "Intel Mac OS X 10_5_8; en-US) AppleWebKit/534.13 (KHTML, like Gecko) "
         "Chrome/9.0.597.94 Safari/534.13")
# Performs a Google search and returns a BeautifulSoup object
# containing the parsed, returned page.
def google(q): # Query string (will be URL quoted by this function)
    # This attempts to ask for 100 results from Google. This does not
    # always seem to work correctly. Note that the fake User-Agent is
    # required otherwise Google will reject the search
    url = ("http://google.com/search?q=%s&num=100&hl=en&start=0" %
           urllib.quote_plus(q))
    req = urllib2.Request(url, None, {'User-Agent': AGENT})
    res = urllib2.urlopen(req)
    dat = res.read()
    res.close()
    return BeautifulSoup(dat)
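# Example usage (the query is hypothetical; this needs network access,
# and Google may block or CAPTCHA automated requests):
#   soup = google('foo whizz bang')
#   print soup.find('title').string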
# Small helper function to output a single line of a CSV file. Note
# that this does not do quoting of arguments so it assumes there are
# no " or , present. This could be replaced with the csv module,
# especially if needs grow in the future.
def csv(f, l): # First element to print followed by list of elements
               # to print
    print "%s," % f, ", ".join(l)
# Cartesian product function (similar to itertools.product but joins
# the elements together as a space separated string rather than
# returning a tuple)

# Giving this callable a name, although it is not strictly necessary
# spacejoiner(("a", "b")) -> 'a b'
spacejoiner = " ".join
def product(*args):
    return itertools.imap(spacejoiner, itertools.product(*args))
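# A quick illustration of product (the lists are hypothetical):
#   >>> list(product(['a', 'b'], ['x', 'y']))
#   ['a x', 'a y', 'b x', 'b y']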
# This generates all possible search strings to be used using
# product. This can be modified to create other search terms by
# altering the lists (add/delete elements, or add/delete lists). Each
# of these terms will have "whizz bang" appended below
terms = product( [ 'a', 'b', 'c' ],
                 [ 'foo', 'bar', 'baz' ],
                 [ 'one', 'two', 'three', 'four' ] )

# Print the CSV header row: the term column followed by one column
# per domain.
csv('term', domains)
for t in terms:
    # All the queries have 'whizz bang' appended to the end of
    # them
    qu = "%s whizz bang" % t

    # This performs a Google query using the helper function and then
    # extracts all the <a> tags that have class "l". If Google
    # changes the structure of their pages this is where this code
    # will break. Currently class=l grabs all the appropriate links
    # (displayed in green in the search results).
    so = google(qu)

    # The urlparse(u['href'])[1] works by extracting the href from the
    # <a> tag, parsing it into component parts and extracting element 1
    # of the returned tuple, which contains the netloc (the domain
    # name)
    a = [ urlparse(u['href'])[1] for u in so.findAll("a", {"class":"l"}) ]
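    # A quick illustration of the netloc extraction (hypothetical URL):
    #   >>> urlparse('http://www.foo.com/page?x=1')[1]
    #   'www.foo.com'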
    # The rank list stores the lowest (i.e. first seen) position at
    # which a specific domain appears in the results from Google.
    # Each element corresponds to an element of domains. Initially,
    # these are set to "0" to indicate 'not found' in the results.
    rank = ["0"] * len(domains)

    # Here we iterate through the returned domain names in a and match
    # them up with the domain names we are looking for.
    for i, href in enumerate(a):
        for j, domain in enumerate(domains):
            # Note that the comparison here deals with two cases. The
            # domain is entirely 'foo.com' (for example), or the
            # domain ends with '.foo.com' (for example). Only the
            # first match is recorded so that rank[j] really is the
            # lowest position at which the domain was seen.
            if rank[j] == "0" and \
               ((domain == href) or href.endswith(".%s" % domain)):
                rank[j] = "%d" % (i+1) # Count ranks from 1 not 0

    csv(qu, rank)
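# The script emits one CSV row per search term. The shape of the
# output looks like this (the numbers are illustrative only, not
# real results):
#   term, foo.com, bar.com, baz.com
#   a foo one whizz bang, 3, 0, 12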