lewis-carson · November 6, 2017 17:16
diff --git a/crawler.py b/crawler.py
 from bs4 import BeautifulSoup
 import random
 import requests
 import re

 class crawler():
     """Breadth-first link crawler built on ``requests`` and ``BeautifulSoup``.

     Starting from a list of seed URLs, pages are fetched in the order they
     were discovered and every ``<a href=...>`` on a fetched page is queued
     for a later visit.
     """

     # Seconds to wait on a single request; without a timeout ``requests``
     # can block forever on an unresponsive server.
     REQUEST_TIMEOUT = 10

     def _crawl(self, limit, seed):
         """Yield ``(url, html)`` for each page fetched, at most ``limit`` pages.

         Args:
             limit: Maximum number of pages to fetch. Values <= 0 fetch nothing.
             seed: Iterable of starting URLs. It is copied, never mutated.

         Yields:
             Tuples of the fetched URL and the response body text.
         """
         from collections import deque
         from urllib.parse import urljoin, urlparse

         frontier = deque()
         seen = set()  # every URL ever queued, so nothing is fetched twice
         for url in seed:
             if url not in seen:
                 seen.add(url)
                 frontier.append(url)

         fetched = 0
         while frontier and fetched < limit:
             current = frontier.popleft()
             try:
                 response = requests.get(current, timeout=self.REQUEST_TIMEOUT)
             except requests.RequestException:
                 # Best effort: an unreachable or malformed URL is skipped and
                 # does not count towards ``limit``.
                 continue

             html = response.text
             soup = BeautifulSoup(html, "lxml")
             for anchor in soup.find_all('a', href=True):
                 try:
                     # Resolve relative hrefs against the page they came from.
                     link = urljoin(current, anchor['href'])
                     scheme = urlparse(link).scheme
                 except ValueError:
                     continue  # unparseable href
                 if scheme not in ("http", "https"):
                     continue  # mailto:, javascript:, etc. cannot be fetched
                 if link not in seen:
                     seen.add(link)
                     frontier.append(link)

             fetched += 1
             yield current, html

     def findurls(self, limit, seed):
         """Return the URLs of the pages visited while crawling from ``seed``.

         Args:
             limit: Maximum number of pages to visit.
             seed: Iterable of starting URLs (left unmodified).

         Returns:
             List of visited URLs in visit order. It holds fewer than ``limit``
             entries when the crawl runs out of links first.
         """
         return [url for url, _ in self._crawl(limit, seed)]

     def searchwithseed(self, limit, seed, word):
         """Return the visited URLs whose page text contains ``word``.

         Args:
             limit: Maximum number of pages to visit.
             seed: Iterable of starting URLs (left unmodified).
             word: Substring to look for in each page's response text.

         Returns:
             List of matching URLs in visit order; empty when nothing matched.
         """
         return [url for url, html in self._crawl(limit, seed) if word in html]
	from bs4 import BeautifulSoup
	import random
	import requests
	import re

	class crawler():
	    """Breadth-first link crawler built on ``requests`` and ``BeautifulSoup``.

	    Starting from a list of seed URLs, pages are fetched in the order they
	    were discovered and every ``<a href=...>`` on a fetched page is queued
	    for a later visit.
	    """

	    # Seconds to wait on a single request; without a timeout ``requests``
	    # can block forever on an unresponsive server.
	    REQUEST_TIMEOUT = 10

	    def _crawl(self, limit, seed):
	        """Yield ``(url, html)`` for each page fetched, at most ``limit`` pages.

	        Args:
	            limit: Maximum number of pages to fetch. Values <= 0 fetch nothing.
	            seed: Iterable of starting URLs. It is copied, never mutated.

	        Yields:
	            Tuples of the fetched URL and the response body text.
	        """
	        from collections import deque
	        from urllib.parse import urljoin, urlparse

	        frontier = deque()
	        seen = set()  # every URL ever queued, so nothing is fetched twice
	        for url in seed:
	            if url not in seen:
	                seen.add(url)
	                frontier.append(url)

	        fetched = 0
	        while frontier and fetched < limit:
	            current = frontier.popleft()
	            try:
	                response = requests.get(current, timeout=self.REQUEST_TIMEOUT)
	            except requests.RequestException:
	                # Best effort: an unreachable or malformed URL is skipped and
	                # does not count towards ``limit``.
	                continue

	            html = response.text
	            soup = BeautifulSoup(html, "lxml")
	            for anchor in soup.find_all('a', href=True):
	                try:
	                    # Resolve relative hrefs against the page they came from.
	                    link = urljoin(current, anchor['href'])
	                    scheme = urlparse(link).scheme
	                except ValueError:
	                    continue  # unparseable href
	                if scheme not in ("http", "https"):
	                    continue  # mailto:, javascript:, etc. cannot be fetched
	                if link not in seen:
	                    seen.add(link)
	                    frontier.append(link)

	            fetched += 1
	            yield current, html

	    def findurls(self, limit, seed):
	        """Return the URLs of the pages visited while crawling from ``seed``.

	        Args:
	            limit: Maximum number of pages to visit.
	            seed: Iterable of starting URLs (left unmodified).

	        Returns:
	            List of visited URLs in visit order. It holds fewer than ``limit``
	            entries when the crawl runs out of links first.
	        """
	        return [url for url, _ in self._crawl(limit, seed)]

	    def searchwithseed(self, limit, seed, word):
	        """Return the visited URLs whose page text contains ``word``.

	        Args:
	            limit: Maximum number of pages to visit.
	            seed: Iterable of starting URLs (left unmodified).
	            word: Substring to look for in each page's response text.

	        Returns:
	            List of matching URLs in visit order; empty when nothing matched.
	        """
	        return [url for url, html in self._crawl(limit, seed) if word in html]