Skip to content

Instantly share code, notes, and snippets.

@peterk
Last active September 21, 2015 17:46
Show Gist options
  • Save peterk/4e1a0974f891a5f8724c to your computer and use it in GitHub Desktop.
Save peterk/4e1a0974f891a5f8724c to your computer and use it in GitHub Desktop.
Checks Swedish Wikipedia for pages containing links to the specified domain, then checks whether each of those links is still available.
import requests
import argparse
from lxml import html
class bcolors:
    """ANSI terminal escape sequences used to colour the report output.

    Prefix a string with one of the colour/style codes and append ``ENDC``
    to reset the terminal afterwards.
    """

    # Bright foreground colours.
    OKBLUE = "\033[94m"
    OKGREEN = "\033[92m"
    WARNING = "\033[93m"
    FAIL = "\033[91m"

    # Reset all attributes.
    ENDC = "\033[0m"

    # Bold text.
    BOLD = "\033[1m"
def main():
    """Report the availability of external links used on Swedish Wikipedia.

    Reads one positional command-line argument, ``domain``, and asks the
    Swedish Wikipedia ``Special:Länksökning`` page for up to 20000 entries
    whose link target matches it. Every target URL found is then requested
    and one line is printed per link: the HTTP status code (coloured with
    ``bcolors.OKBLUE`` for 200 and ``bcolors.FAIL`` otherwise), the target
    URL and the Wikipedia page that links to it.

    A link that cannot be fetched at all (connection error, timeout,
    malformed URL, ...) is reported as ``[ERR]`` and the run continues with
    the next link.
    """
    # Seconds to wait on any single HTTP request; without a timeout one
    # stalled server would hang the whole run.
    timeout = 30

    parser = argparse.ArgumentParser()
    parser.add_argument("domain", help="The domain to check, e.g. http://www.example.com")
    args = parser.parse_args()

    # Single-argument print(...) behaves the same as a Python 2 print
    # statement and as the Python 3 print function.
    print("Working on %s" % args.domain)

    # Hand the domain over via ``params`` so that requests URL-encodes it and
    # appends it to the existing query string; plain concatenation breaks for
    # values containing characters such as '&', '#' or spaces.
    page = requests.get(
        "https://sv.wikipedia.org/w/index.php?title=Special:L%C3%A4nks%C3%B6kning&limit=20000&offset=0",
        params={"target": args.domain},
        timeout=timeout
    )
    tree = html.fromstring(page.text)
    listitems = tree.xpath('//ol[@class="special"]/li')
    print("Found %s links" % len(listitems))

    for item in listitems:
        anchors = item.xpath("a")
        if len(anchors) != 2:
            # The code below relies on exactly two anchors per entry: the
            # external link first, the linking wiki page second. Anything
            # else is reported and skipped instead of raising ValueError.
            print(bcolors.WARNING + "[SKIP]" + bcolors.ENDC
                  + " result entry with %d links instead of 2" % len(anchors))
            continue

        targetel, sourceel = anchors
        target = targetel.attrib["href"]
        source = "https://sv.wikipedia.org%s" % sourceel.attrib["href"]

        try:
            status = requests.get(target, timeout=timeout).status_code
        except requests.RequestException as exc:
            # An unreachable link is a result worth reporting, not a reason
            # to abort checking the remaining links.
            print(bcolors.FAIL + "[ERR]" + bcolors.ENDC
                  + " %s from %s (%s)" % (target, source, exc))
            continue

        colour = bcolors.OKBLUE if status == 200 else bcolors.FAIL
        print(colour + "[" + str(status) + "]" + bcolors.ENDC + " %s from %s" % (target, source))
# Run the link check only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment