Created
August 8, 2019 02:37
-
-
Save davidlares/65e1ec53bd6dd404182e43090ed64970 to your computer and use it in GitHub Desktop.
Python script that checks a site's robots.txt for 'Disallow' directives using urllib2
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/python | |
import urllib2 | |
class DisallowPresent(Exception):
    """Raised when a robots.txt line declares a 'Disallow' rule.

    Attributes:
        disallowed: the value given as ``path`` when the error was created.
    """

    def __init__(self, path):
        # Forward the path to Exception so that ``args`` is populated
        # (Python 2 leaves it empty when the base initializer is skipped),
        # which keeps pickling and default formatting working.
        super(DisallowPresent, self).__init__(path)
        self.disallowed = path

    def __str__(self):
        # Use the quoted repr so empty or whitespace-only paths stay visible.
        return repr(self.disallowed)
# Fetch the site's robots.txt and report every line that mentions 'Disallow'.
link = urllib2.urlopen('http://davidlares.now.sh/robots.txt')
try:
    for line in link.readlines():
        # checking for disallow (case-insensitive substring match)
        if line.lower().find('disallow') == -1:
            continue
        print(line.strip())
        # partition() never raises when the colon is missing and keeps any
        # further colons in the value, unlike split(':')[1].
        _, sep, path = line.partition(':')
        if not sep:
            # No "field: value" separator, so there is no path to report.
            continue
        try:
            # raising custom error carrying the disallowed path
            raise DisallowPresent(path.strip())
        except DisallowPresent as e:
            print("Exeption occurred for path: %s" % e.disallowed)
finally:
    # Release the HTTP connection even if processing fails part-way.
    link.close()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Gist does not preserve indentation in raw Python source very well, so check the script for indentation errors and correct them before running it.
Happy coding :)