Functions to index the web. To use, just modify seedPage and n: the seed page and the depth of the index you want. The output is written to a text file as a Python dictionary dumped as JSON.
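To read the dumped index back later, a minimal sketch (assuming the webIndex.txt file name used at the end of the script):

import json

with open('webIndex.txt') as fHand:
    savedIndex = json.load(fHand)
print(len(savedIndex), 'words in the saved index')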
from collections import OrderedDict
from operator import itemgetter
import json
import pprint
import re
import time

import bs4
import requests
# returns a BS object of a webpage
def makeSoup(pageUrl):
    connTimeout = 10.0
    readTimeout = 10.0
    errorTrack = {}
    errorTrack['CECount'] = 0    # connection errors
    errorTrack['TECount'] = 0    # connect timeouts
    errorTrack['HECount'] = 0    # HTTP errors
    errorTrack['MSECount'] = 0   # missing schema
    errorTrack['IUECount'] = 0   # invalid urls
    errorTrack['RTECount'] = 0   # read timeouts
    errorTrack['TMRECount'] = 0  # too many redirects
    try:
        r = requests.get(pageUrl, timeout=(connTimeout, readTimeout))
        r.raise_for_status()
        return bs4.BeautifulSoup(r.text, 'html.parser'), errorTrack
    except requests.exceptions.ConnectTimeout:
        errorTrack['TECount'] += 1
    except requests.exceptions.ConnectionError:
        errorTrack['CECount'] += 1
    except requests.exceptions.HTTPError:
        errorTrack['HECount'] += 1
    except requests.exceptions.MissingSchema:
        errorTrack['MSECount'] += 1
    except requests.exceptions.InvalidURL:
        errorTrack['IUECount'] += 1
    except requests.exceptions.ReadTimeout:
        errorTrack['RTECount'] += 1
    except requests.exceptions.TooManyRedirects:
        errorTrack['TMRECount'] += 1
    return None
# returns all href links on a page
def allLinks(pageUrl):
    try:
        pageSoup, errorTrack = makeSoup(pageUrl)
        return pageSoup.findAll('a', href=True)
    except TypeError:
        # makeSoup returned None, so unpacking failed
        return None
# returns all http links on a page
def crawlableLinks(pageUrl):
    links = allLinks(pageUrl)
    goodLinks = []
    try:
        for link in links:
            dest = link.get('href')
            if dest.startswith('http'):
                goodLinks.append(dest)
        return goodLinks
    except TypeError:
        return None
# returns all http links on a page
# together with their associated contents
def crawlableLinksWithCont(pageUrl):
    links = allLinks(pageUrl)
    goodLinks = []
    goodLinksText = []
    for link in links:
        cont = link.contents
        dest = link.get('href')
        if dest.startswith('http'):
            goodLinks.append(dest)
            goodLinksText.append(cont)
    return goodLinks, goodLinksText
# returns the index of a word in a list, or -1 if absent
def findWord(someList, word):
    try:
        return someList.index(word)
    except ValueError:
        return -1
# merges two lists: common elements are not replicated
def Union(list1, list2):
    try:
        for elem in list2:
            if elem not in list1:
                list1.append(elem)
        return list1
    except TypeError:
        return list1
# returns a list of all words in paragraph tags
def wordList(pageUrl):
    try:
        pageSoup, errorTrack = makeSoup(pageUrl)
        parText = pageSoup.findAll('p')
        pars = []
        for par in parText:
            indText = par.text
            for word in re.split(r'[;,*\n. ]', indText):
                # for word in indText.split():
                pars.append(word)
        return pars
    except TypeError:
        return None
# Indexing a word: dictionary implementation
# no replication of links
def addToIndex(webIndex, keyword, pageUrl):
    if keyword in webIndex:
        if pageUrl in webIndex[keyword]:
            return
        webIndex[keyword].append(pageUrl)
        return
    webIndex[keyword] = [pageUrl]
# Indexing the contents of a webpage
def addPageToIndex(webIndex, pageUrl):
    allWords = wordList(pageUrl)
    try:
        for word in allWords:
            addToIndex(webIndex, word, pageUrl)
    except TypeError:
        # wordList returned None for an unreachable page
        return None
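# Breadth-first crawl starting from pageUrl for n rounds:
# each round indexes every page in the current frontier, collects the
# outgoing http links into the next frontier, and records each page's
# out-links in linkGraph so that ranks can be computed afterwards.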
def buildIndex(pageUrl, n):
    toCrawl = [pageUrl]
    crawled = []
    webIndex = {}
    linkGraph = {}
    badLinks = []
    for j in range(n):
        nextToCrawl = []
        print(len(toCrawl), ' links being indexed')
        for i in range(len(toCrawl)):
            print('step: ', i + 1, ' of run ', j + 1)
            if toCrawl[i] in crawled:
                continue
            outLinks = crawlableLinks(toCrawl[i])
            if outLinks is None:
                badLinks.append(toCrawl[i])
                continue
            Union(nextToCrawl, outLinks)
            linkGraph[toCrawl[i]] = outLinks
            addPageToIndex(webIndex, toCrawl[i])
            crawled.append(toCrawl[i])
        toCrawl = nextToCrawl
        print(len(nextToCrawl), 'links available for next crawl')
    return webIndex, linkGraph, badLinks, crawled
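# Simplified PageRank over linkGraph, run for a fixed number of loops:
# every page starts at 1/N, then on each pass
#     rank(page) = (1 - d)/N + d * sum(rank(node) / outdegree(node))
# summed over every crawled node that links to page, with damping constant d.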
def computeRanks(linkGraph):
    d = 0.8  # damping constant
    numLoops = 10  # I will modify this
    ranks = {}
    nPages = len(linkGraph)
    for page in linkGraph:
        ranks[page] = 1.0 / nPages
    for i in range(numLoops):
        newRanks = {}
        for page in linkGraph:
            newRank = (1 - d) / nPages
            for node in linkGraph:
                if page in linkGraph[node]:
                    newRank = newRank + d * (ranks[node] / len(linkGraph[node]))
            newRanks[page] = newRank
        ranks = newRanks
    return ranks
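# returns the urls indexed under keyword, best-ranked first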
def lookUpBest(webIndex, ranks, keyword):
    matches = {}
    if keyword in webIndex:
        for url in webIndex[keyword]:
            matches[url] = ranks[url]
        return dictSortByValue(matches)
    return None
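# returns the keys of someDict sorted by value, highest value first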
def dictSortByValue(someDict):
    if someDict is None:
        return None
    order = OrderedDict(sorted(someDict.items(), key=itemgetter(1)))
    keyList = list(order.keys())
    valueList = list(order.values())
    valueList.reverse()
    keyList.reverse()
    return keyList
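# ---- build the index, rank the pages, dump both to disk, then search ----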
t1 = time.perf_counter()
seedPage = 'http://coursera.org'
n = 2
Index, Graph, BadLink, crawledUrls = buildIndex(seedPage, n)
print('Done Indexing')
print('Indexing time: ', time.perf_counter() - t1, 'sec')
print(len(Index), ' words indexed')
print(len(crawledUrls), ' good links indexed')
print(len(BadLink), 'Bad Links: ')
print('Now computing url ranks')
t2 = time.perf_counter()
ranking = computeRanks(Graph)
print('Ranks computed in ', time.perf_counter() - t2, 'sec')
with open('webIndex.txt', 'w') as fHand:
    json.dump(Index, fHand)
with open('linkGraph.txt', 'w') as fHand2:
    json.dump(Graph, fHand2)
while True:
    searchTerm = input('\nEnter search term: ')
    if not searchTerm:
        break
    t3 = time.perf_counter()
    results = lookUpBest(Index, ranking, searchTerm)
    if results is None:
        print('No results found for your search')
        continue
    print('\n')
    print(len(results), 'matches found in ', time.perf_counter() - t3, ' sec')
    pprint.pprint(results)