Last active
September 21, 2015 17:46
-
-
Save peterk/4e1a0974f891a5f8724c to your computer and use it in GitHub Desktop.
Checks Swedish Wikipedia for pages containing links to a specified domain, then checks whether each link is still available.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import requests | |
import argparse | |
from lxml import html | |
class bcolors:
    """ANSI terminal escape sequences used to colorize console output.

    Concatenate one of the color/style codes before a piece of text and
    append ``ENDC`` after it to reset the terminal back to its default style.
    """

    # Foreground colors (bright variants).
    OKBLUE = '\033[94m'
    OKGREEN = '\033[92m'
    WARNING = '\033[93m'
    FAIL = '\033[91m'
    # Resets every attribute set by the codes above.
    ENDC = '\033[0m'
    # Bold / increased intensity.
    BOLD = '\033[1m'
def main():
    """Check Swedish Wikipedia for links to a domain and test each link.

    Reads one positional command-line argument, ``domain``, asks the Swedish
    Wikipedia external-link search page (``Special:Länksökning``) for every
    link pointing at it, then requests each link target in turn.

    One line is printed per link: the HTTP status code in brackets (blue for
    200, red for anything else), the target URL, and the Wikipedia page the
    link was found on. If the target cannot be requested at all, ``[ERR]`` is
    printed in red together with the exception class name, and the run
    continues with the next link.

    Raises:
        requests.RequestException: if the Wikipedia search request itself
            fails or returns an HTTP error status.
    """
    # Seconds to wait on any single request; without a timeout one stalled
    # host would hang the whole run.
    timeout = 30

    parser = argparse.ArgumentParser()
    parser.add_argument("domain", help="The domain to check, e.g. http://www.example.com")
    args = parser.parse_args()

    print("Working on %s" % args.domain)

    # The domain goes through ``params`` so requests percent-encodes it;
    # concatenating it raw would corrupt the query if it contains '&' or '#'.
    page = requests.get(
        "https://sv.wikipedia.org/w/index.php?title=Special:L%C3%A4nks%C3%B6kning&limit=20000&offset=0",
        params={"target": args.domain},
        timeout=timeout
    )
    # Fail loudly rather than reporting a misleading "Found 0 links".
    page.raise_for_status()

    tree = html.fromstring(page.text)
    listitems = tree.xpath('//ol[@class="special"]/li')
    print("Found %s links" % len(listitems))

    for item in listitems:
        anchors = item.xpath("a")
        if len(anchors) < 2:
            # Each entry is expected to hold the external link followed by
            # the linking page; skip anything that does not have both.
            continue
        targetel, sourceel = anchors[0], anchors[1]
        target = targetel.attrib["href"]
        source = "https://sv.wikipedia.org%s" % sourceel.attrib["href"]

        try:
            status = requests.get(target, timeout=timeout).status_code
        except requests.RequestException as exc:
            # An unreachable link is exactly what this tool is looking for:
            # report it and keep going instead of aborting the whole run.
            print(bcolors.FAIL + "[ERR]" + bcolors.ENDC
                  + " %s from %s (%s)" % (target, source, exc.__class__.__name__))
            continue

        color = bcolors.OKBLUE if status == 200 else bcolors.FAIL
        print(color + "[" + str(status) + "]" + bcolors.ENDC + " %s from %s" % (target, source))
# Run the link check only when executed as a script, not when imported.
if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment