Skip to content

Instantly share code, notes, and snippets.

@cageyjames
Created December 12, 2012 17:18
Show Gist options
  • Select an option

  • Save cageyjames/4269713 to your computer and use it in GitHub Desktop.

Select an option

Save cageyjames/4269713 to your computer and use it in GitHub Desktop.
""" This is the code I used to find the URLs that were 404 in Disqus """
import os,sys
#import feedparser
import urllib
import Levenshtein
def read_atom():
    """Dump the link of every entry in the blog's Atom feed to a file.

    Fetches http://cageyjames.webfactional.com/atom.xml with feedparser and
    writes each entry's ``link`` on its own line to ``./cageyjames_out.csv``
    (the file the script's main section later reads back).

    Raises:
        ImportError: if the third-party ``feedparser`` package is missing.
    """
    # The module-level ``import feedparser`` is commented out, so the name
    # would be undefined here; importing locally keeps this function usable
    # without making feedparser mandatory for the rest of the script.
    import feedparser

    feed = feedparser.parse("http://cageyjames.webfactional.com/atom.xml")
    # ``with`` guarantees the file is closed even if a write fails part-way.
    with open("./cageyjames_out.csv", "w") as out_file:
        for entry in feed.entries:
            out_file.write("%s\n" % entry.link)
def _read_urls(path):
    """Return the non-blank lines of *path* with surrounding whitespace removed."""
    with open(path) as handle:
        stripped = (line.strip() for line in handle)
        return [line for line in stripped if line]


def _http_status(url):
    """Return the HTTP status code obtained by requesting *url*.

    Error statuses (e.g. 404) are returned as a value on both Python 2 and
    Python 3, because finding those statuses is the whole point of the script.
    Connection-level failures (unreachable host, bad URL) still propagate.
    """
    try:
        from urllib.request import urlopen  # Python 3
        from urllib.error import HTTPError
    except ImportError:
        # Python 2: urllib.urlopen returns error responses instead of raising.
        response = urllib.urlopen(url)
        try:
            return response.getcode()
        finally:
            response.close()

    try:
        response = urlopen(url)
    except HTTPError as err:
        # Python 3 raises on 4xx/5xx; the status is carried on the exception.
        return err.code
    try:
        return response.getcode()
    finally:
        response.close()


def main(source_path="./cageyjames_out.csv",
         mapping_path="spatiallyadjusted_mapping.csv",
         match_path="./match.csv",
         no_match_path="./no_match.csv",
         threshold=.90):
    """Pair each source URL with its first sufficiently similar mapped URL.

    For every URL in *source_path*, the URLs in *mapping_path* are scanned in
    order; the first one whose case-insensitive Levenshtein ratio exceeds
    *threshold* is taken as the match. Both URLs are then requested and a row
    ``source,source_status,mapped,mapped_status,ratio`` is written to
    *match_path*. Source URLs with no match are written, one per line, to
    *no_match_path*.

    Blank input lines are ignored, and URLs are stripped of surrounding
    whitespace before being compared or requested, so a trailing newline
    (or its absence on the last line) no longer affects the similarity.

    Args:
        source_path: file with one source URL per line.
        mapping_path: file with one candidate URL per line.
        match_path: output file for matched pairs (overwritten).
        no_match_path: output file for unmatched source URLs (overwritten).
        threshold: ratio a candidate must exceed to count as a match.
    """
    sources = _read_urls(source_path)
    # Lower-case every candidate once instead of once per source URL.
    candidates = [(url, url.lower()) for url in _read_urls(mapping_path)]

    with open(match_path, "w") as matched, open(no_match_path, "w") as unmatched:
        for source in sources:
            source_key = source.lower()
            for candidate, candidate_key in candidates:
                ratio = Levenshtein.ratio(source_key, candidate_key)
                if ratio > threshold:
                    # Same request order as before: mapped URL first.
                    candidate_status = _http_status(candidate)
                    source_status = _http_status(source)
                    matched.write("%s,%s,%s,%s,%s\n" % (
                        source, source_status,
                        candidate, candidate_status, ratio))
                    break  # first match wins; could be far more efficient
            else:
                # Inner loop finished without a break: nothing was similar.
                unmatched.write("%s\n" % source)


if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment