@uchiiii
Last active December 20, 2018 05:03
This is a web crawler written in Python 3.
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
#
# This program crawls the specified site and generates a text representation
# of the resulting link graph.
#
# Only pages on the same host as the start URL are actually visited.
#
# Usage: crawler.py url [limit] [i|d|e]
# The argument 'url' is the start page of the crawl.
# The crawl continues while unvisited pages can be found on the same host.
# The argument 'limit' specifies the maximum depth of link jumps.
# (-1 means unlimited depth)
# The option i, d, or e selects which nodes appear in the output graph:
# i: internal nodes only
# d: descendant nodes only
# e: external nodes included as well, labeled 'Out of domain'
#
# Example usage:
# python crawler.py http://www.ipl.t.u-tokyo.ac.jp/ > result.txt
# The resulting web graph contains all pages on www.ipl.t.u-tokyo.ac.jp
# that can be reached from the top page by hyper-links.
#
# python crawler.py http://www.ipl.t.u-tokyo.ac.jp/ 3 > result.txt
# The resulting web graph contains all pages on www.ipl.t.u-tokyo.ac.jp
# that can be reached from the top page by at most 3 hyper-link jumps.
#
# python crawler.py http://www.ipl.t.u-tokyo.ac.jp/~emoto/ > result.txt
# The resulting web graph contains all pages on www.ipl.t.u-tokyo.ac.jp
# that can be reached from the page "/~emoto/", which may include the top
# page.
#
# python crawler.py http://www.ipl.t.u-tokyo.ac.jp/~emoto/ -1 d > result.txt
# The resulting web graph contains all pages under the page "/~emoto/".
#
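# Output format (as produced by printout() below):
#   line 1        : the start URL
#   line 2        : N, the number of nodes in the graph
#   next N lines  : <index> <url> "<title or content-type>"
#   next line     : M, the number of edges
#   next M lines  : <i> <j>  (node i links to node j)
#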
from urllib.parse import urlparse,urlunparse,urljoin,unquote,quote
from urllib.request import urlopen
from urllib.error import HTTPError, URLError
import re
import sys
if len(sys.argv) < 2:
    print("crawler.py url [limit] [i|d|e]")
    sys.exit(0)
url = sys.argv[1]
limit = -1
if len(sys.argv) > 2:
    limit = int(sys.argv[2])
fDescentOnly = len(sys.argv) > 3 and sys.argv[3] == "d"
fInternalOnly = len(sys.argv) > 3 and sys.argv[3] == "i"
re_href = re.compile('<\\s*[\\w]+\\s+href\\s*=\\s*(["\'][^>\'"]+["\']|[^ >]+)')
re_src = re.compile('<\\s*[\\w]+\\s+src\\s*=\\s*(["\'][^>\'"]+["\']|[^ >]+)')
re_refresh = re.compile('<[^>]*refresh[^>]*CONTENT=["\'][0-9]*;URL=([^\'"><\\s]*)') #matches meta refresh
re_quote = re.compile('["\'\\s]*([^>\'"]+)["\'\\s]*')
re_mailto = re.compile('^mailto') #matches 'mailto' at the beginning of the string.
re_title = re.compile('<\\s*[Tt][iI][Tt][Ll][Ee]\\s*>([^<]*)') #matches '< title >', '<TITLE>', and so on.
re_error = re.compile('^error') #matches 'error' at the beginning of the string.
def clean(href):
    #extract the value between quotes (single or double)
    m = re_quote.search(href)
    if m: return href[m.start(1):m.end(1)]
    return href
def valid(href):
    if re_mailto.match(href): return False
    return True
def normalize(base, href):
    parsedurl = urlparse(href)
    href = urlunparse((parsedurl[0], parsedurl[1], quote(parsedurl[2]), quote(parsedurl[3]), quote(parsedurl[4]), "")) # remove fragment
    if parsedurl[1] == '':
        href = urljoin(base, href)
    elif parsedurl[2] == '':
        href = urlunparse((parsedurl[0], parsedurl[1], "/", parsedurl[3], parsedurl[4], ""))
    return unquote(href)
def quoting(url):
    parsedurl = urlparse(url)
    href = urlunparse((parsedurl[0], parsedurl[1], quote(parsedurl[2]), quote(parsedurl[3]), quote(parsedurl[4]), ""))
    return href
def ishtml(mtype):
    return mtype == 'text/html' or mtype == 'application/xhtml+xml'
def stripNewline(s):
    return ''.join([c for c in s if not (c in '\n\r')])
def extractURLs(url):
    url = quoting(url)
    try:
        with urlopen(url) as f:
            if f.geturl() != url:
                return (url, [], 'error {}'.format('redirected'), '')
            info = f.info()
            mtype = info.get_content_type()
            if ishtml(mtype):
                cont = f.read().decode('utf-8', 'ignore')
                hrefs = [clean(href) for href in re_href.findall(cont) + re_src.findall(cont) + re_refresh.findall(cont)]
                urls = [normalize(url, href) for href in hrefs if valid(href)]
                m = re_title.search(cont)
                if m:
                    title = cont[m.start(1):m.end(1)]
                else:
                    title = ''
            else:
                urls = []
                title = ''
            return (url, urls, mtype, stripNewline(title))
    except HTTPError as e:
        return (url, [], 'error {:d}'.format(e.code), '')
    except URLError:
        return (url, [], 'error', '')
def visit(baseurl, allowedhosts, url):
    parsedu = urlparse(url)
    if fDescentOnly:
        if isDescent(baseurl, url):
            sys.stderr.write('visiting {}\n'.format(url))
            ret = extractURLs(url)
        else:
            sys.stderr.write('skipping {}\n'.format(url))
            ret = (url, [], "Out of range", "Out of range")
    elif fInternalOnly:
        if isInternal(allowedhosts, url):
            sys.stderr.write("visiting {}\n".format(url))
            ret = extractURLs(url)
        else:
            sys.stderr.write("skipping {}\n".format(url))
            ret = (url, [], "Out of range", "Out of range")
    else:
        if parsedu[1] in allowedhosts:
            sys.stderr.write("visiting {}\n".format(url))
            ret = extractURLs(url)
        else:
            sys.stderr.write("skipping {}\n".format(url))
            ret = (url, [], "Out of domain", "Out of domain")
    return ret
def isInternal(allowedhosts, url):
    parsedu = urlparse(url)
    return parsedu[1] in allowedhosts
def isDescent(beginurl, url):
    return url.startswith(beginurl)
# allurls :: index -> (url, [indices of linked pages], content-type, title)
# indices :: url -> index
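# For example (URLs are illustrative only):
#   indices == {'http://www.example.com/': 0, 'http://www.example.com/a.html': 1}
#   allurls == [('http://www.example.com/', [1], 'text/html', 'Top page'),
#               ('http://www.example.com/a.html', [], 'text/html', 'Page A')]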
def crawl(url, maxdepth):
    parsedurl = urlparse(url)
    allowedhosts = [parsedurl[1]]
    indices = {}
    allurls = []
    queue = [normalize(url, url)]
    depth = 0
    visited = {}
    indices[queue[0]] = 0
    allurls += [(queue[0], [], "Out of Range", "")]
    while len(queue) > 0 and not (depth == maxdepth):
        nextqueue = []
        for i, u in enumerate(queue):
            if not (u in visited):
                visited[u] = len(visited)
                ret = visit(url, allowedhosts, u)
                if not re_error.match(ret[2]):
                    for v in ret[1]:
                        if not (v in indices):
                            indices[v] = len(indices)
                            allurls += [(v, [], "Out of Range", "")]
                            nextqueue += [v]
                allurls[indices[u]] = (ret[0], [indices[v] for v in ret[1]], ret[2], ret[3])
        queue = nextqueue
        depth += 1
    return (allurls, indices)
def myquote(url):
    pd = urlparse(url)
    return urlunparse((pd[0], quote(pd[1]), quote(pd[2]), quote(pd[3]), quote(pd[4]), quote(pd[5])))
def printout(url, allurls, indices, indexmap):
    print(url)
    s = 0
    for i, ret in enumerate(allurls):
        if indexmap[i] >= 0:
            s = s + 1
    print(s)
    for i, ret in enumerate(allurls):
        if indexmap[i] >= 0:
            if ishtml(ret[2]):
                print('{:d} {} "{}"'.format(indexmap[i], myquote(ret[0]), ret[3]))
            else:
                print('{:d} {} "{}"'.format(indexmap[i], myquote(ret[0]), ret[2]))
    s = 0
    for i, ret in enumerate(allurls):
        if indexmap[i] >= 0:
            for u in ret[1]:
                if indexmap[u] >= 0:
                    s = s + 1
    print(s)
    for i, ret in enumerate(allurls):
        if indexmap[i] >= 0:
            for u in ret[1]:
                if indexmap[u] >= 0:
                    print('{:d} {:d}'.format(indexmap[i], indexmap[u]))
def allUrls(url, allurls, indices):
    indexmap = [-1] * len(allurls)
    for i, ret in enumerate(allurls):
        indexmap[i] = i
    return indexmap
def internalOnly(url, allurls, indices):
    parsedu = urlparse(url)
    allowedhosts = [parsedu[1]]
    indexmap = [-1] * len(allurls)
    k = 0
    for i, ret in enumerate(allurls):
        if isInternal(allowedhosts, ret[0]):
            indexmap[i] = k
            k = k + 1
        else:
            indexmap[i] = -1
    return indexmap
def descentOnly(url, allurls, indices):
    parsedu = urlparse(url)
    allowedhosts = [parsedu[1]]
    indexmap = [-1] * len(allurls)
    nurl = normalize(url, url)
    k = 0
    for i, ret in enumerate(allurls):
        if isDescent(nurl, ret[0]):
            indexmap[i] = k
            k = k + 1
        else:
            indexmap[i] = -1
    return indexmap
(allurls, indices) = crawl(normalize(url, url), limit)
#print(allurls)
#print(indices)
if fDescentOnly:
    indexmap = descentOnly(url, allurls, indices)
elif fInternalOnly:
    indexmap = internalOnly(url, allurls, indices)
else:
    indexmap = allUrls(url, allurls, indices)
printout(url, allurls, indices, indexmap)
# To convert the output into a Graphviz drawing:
# cat result.txt | sed -e '1{s/.*/digraph "&" {/p;d};2d;/"/{s/^\([0-9]*\) [^ ]* \(".*"\)$/\1 [label=\2]/p;d};s/ / -> /p;${s/.*/}/p};d ' > result.dot
# dot -Tps < result.dot > result.ps
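#
# The same conversion can also be done in Python. The helper below is an
# illustrative sketch only: it is not called anywhere in this script, the name
# convert_to_dot is not part of the original gist, and it assumes the output
# format produced by printout() above.
def convert_to_dot(infile, outfile):
    with open(infile) as f:
        lines = [line.rstrip('\n') for line in f]
    with open(outfile, 'w') as g:
        g.write('digraph "{}" {{\n'.format(lines[0]))  # line 1: the start URL
        n = int(lines[1])                              # line 2: number of nodes
        for line in lines[2:2 + n]:                    # node lines: <index> <url> "<label>"
            idx, _, label = line.split(' ', 2)
            g.write('{} [label={}]\n'.format(idx, label))
        # lines[2 + n] holds the edge count; the remaining lines are <i> <j> pairs
        for line in lines[3 + n:]:
            i, j = line.split()
            g.write('{} -> {}\n'.format(i, j))
        g.write('}\n')
# Example use: convert_to_dot('result.txt', 'result.dot'), then
#   dot -Tps < result.dot > result.ps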