Last active
June 2, 2018 18:17
-
-
Save mineta/459826554adce463e88a to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"""Check every URL listed in a sitemap and record where each one ends up.

Reads ``sitemap.xml`` and writes one tab-separated line per URL to
``urls_sitemap.txt`` with three columns:

- status: the HTTP error code (e.g. 404) when the server answers with an
  error, ``ERR`` when no answer could be obtained at all, or a single
  space when the request succeeded
- initial URL, as listed in the sitemap
- final URL (after following redirects); empty when no answer was obtained
"""
import re
from contextlib import closing

try:  # Python 2
    from urllib2 import HTTPError, Request, URLError, urlopen
except ImportError:  # Python 3 moved the same names into urllib.*
    from urllib.error import HTTPError, URLError
    from urllib.request import Request, urlopen

# Non-greedy and stopping at "<" so several <loc> elements on one physical
# line (minified sitemaps) are matched individually.
LOC_PATTERN = re.compile(r"<loc>\s*(https?://[^<]+?)\s*</loc>")


def extract_urls(text):
    """Return the http(s) URLs found in the ``<loc>`` elements of *text*."""
    return LOC_PATTERN.findall(text)


def check_url(url, opener=urlopen):
    """Request *url* and return a ``(status, initial_url, final_url)`` tuple.

    *opener* is called with the prepared request and defaults to
    ``urlopen``; it exists so the network call can be replaced.

    ``status`` is ``" "`` on success, the HTTP status code as a string when
    the server replied with an error, and ``"ERR"`` when the request failed
    without any HTTP response (``final_url`` is then empty).
    """
    request = Request(url)
    try:
        # Python 2 responses are not context managers, hence closing().
        with closing(opener(request)) as response:
            return (" ", url, response.geturl())
    except HTTPError as error:
        # Must come before URLError: HTTPError is a subclass of it and is
        # the only one carrying ``code``. ``filename`` holds the URL that
        # actually produced the error response.
        return (str(error.code), url, error.filename or url)
    except URLError:
        # DNS failure, refused connection, ...: no status code and no
        # final URL exist for this request.
        return ("ERR", url, "")


def main(sitemap_path="sitemap.xml", report_path="urls_sitemap.txt"):
    """Check every URL in *sitemap_path* and write the report file."""
    with open(sitemap_path, "r") as sitemap, open(report_path, "w") as report:
        # The sitemap is scanned line by line, so a <loc> element has to
        # sit on a single line to be picked up.
        for line in sitemap:
            for url in extract_urls(line):
                report.write("\t".join(check_url(url)) + "\n")


if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment