Created
November 6, 2017 17:16
-
-
Save lewis-carson/90c81f7fa5dbd11985a680dc182dbfef to your computer and use it in GitHub Desktop.
lightweight web crawler
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import random
import re
from collections import deque
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup
class crawler:
    """Lightweight breadth-first web crawler.

    Starting from one or more seed URLs, pages are fetched with
    ``requests``, their ``<a href>`` links are extracted with
    BeautifulSoup (``lxml`` parser) and queued for later visits.
    """

    # Seconds to wait on a server before giving up on a URL. Without a
    # timeout a single unresponsive host would stall the whole crawl.
    REQUEST_TIMEOUT = 10

    def findurls(self, limit, seed):
        """Return the URLs of the pages that were fetched successfully.

        Args:
            limit: Maximum number of pages to fetch. Values <= 0 fetch
                nothing.
            seed: Iterable of starting URLs (a single URL string is also
                accepted). The caller's object is never modified.

        Returns:
            A list of fetched URLs in visit order. The list is shorter than
            ``limit`` when the crawl runs out of links first.
        """
        return [url for url, _ in self._crawl(limit, seed)]

    def searchwithseed(self, limit, seed, word):
        """Return the URLs of fetched pages whose text contains ``word``.

        Args:
            limit: Maximum number of pages to fetch. Values <= 0 fetch
                nothing.
            seed: Iterable of starting URLs (a single URL string is also
                accepted). The caller's object is never modified.
            word: Substring looked up (case-sensitively) in the raw
                response text of every fetched page.

        Returns:
            A list of matching URLs in visit order; possibly empty.
        """
        return [url for url, html in self._crawl(limit, seed) if word in html]

    def _crawl(self, limit, seed):
        """Yield ``(url, html)`` for up to ``limit`` fetched pages, breadth-first."""
        if isinstance(seed, str):
            seed = [seed]
        # Work on a de-duplicated copy so the caller's seed stays untouched.
        frontier = deque(dict.fromkeys(seed))
        # Every URL ever queued; prevents fetching the same page twice.
        seen = set(frontier)
        fetched = 0
        while frontier and fetched < limit:
            url = frontier.popleft()
            try:
                response = requests.get(url, timeout=self.REQUEST_TIMEOUT)
            except requests.RequestException:
                # Best-effort crawl: unreachable hosts, malformed URLs and
                # non-HTTP schemes (mailto:, javascript:) are skipped and
                # do not count toward the limit.
                continue
            html = response.text
            soup = BeautifulSoup(html, "lxml")
            for anchor in soup.find_all("a", href=True):
                try:
                    # Resolve relative hrefs against the final (post-redirect)
                    # page URL so they can actually be fetched later.
                    link = urljoin(response.url, anchor["href"])
                except ValueError:
                    # Malformed href (e.g. broken IPv6 literal); ignore it.
                    continue
                if link not in seen:
                    seen.add(link)
                    frontier.append(link)
            fetched += 1
            yield url, html
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment