Created
December 19, 2012 13:02
-
-
Save DmitrySandalov/4336528 to your computer and use it in GitHub Desktop.
unshorten links in file with Python
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
# This is written for Python 2. For Python 3, use http.client and urllib.parse
# instead of httplib and urlparse, and use // instead of / for the division.
import sys | |
import httplib | |
import urlparse | |
import re | |
def unshorten_url(url, max_redirects=10):
    """Follow HTTP redirects starting at *url* and return the final address.

    A ``HEAD`` request is sent for each hop; whenever the response has a
    3xx status and a ``Location`` header, that location becomes the next
    URL to probe.

    Args:
        url: The URL to resolve, either as a string or as a regex match
            object (anything with a ``group`` method), so the function can
            be passed directly as the replacement callback of ``re.sub``.
        max_redirects: Upper bound on the number of hops that are probed.
            When the limit is reached, the last URL obtained is returned.

    Returns:
        The URL reached after following the redirect chain. URLs that are
        not ``http``/``https`` or that have no host are returned unchanged
        without any network access.

    Raises:
        Whatever the underlying HTTP connection raises on network failure;
        such errors are deliberately not swallowed.
    """
    # Function-scope imports so the block works on both Python 2 and 3
    # regardless of which module names the surrounding file imported.
    try:
        import httplib
        from urlparse import urljoin as join_url, urlparse as parse_url
    except ImportError:
        import http.client as httplib
        from urllib.parse import urljoin as join_url, urlparse as parse_url

    if hasattr(url, 'group'):
        url = url.group(0)

    # Iterate instead of recursing so a redirect loop cannot exhaust the stack.
    for _ in range(max_redirects):
        parts = parse_url(url)
        scheme = parts.scheme.lower()
        if scheme == 'http':
            connection_class = httplib.HTTPConnection
        elif scheme == 'https':
            connection_class = httplib.HTTPSConnection
        else:
            return url
        if not parts.netloc:
            return url

        # An empty path would produce an invalid request line; use the root.
        resource = parts.path or '/'
        if parts.query:
            resource += '?' + parts.query

        connection = connection_class(parts.netloc)
        try:
            connection.request('HEAD', resource)
            response = connection.getresponse()
            status = response.status
            location = response.getheader('Location')
        finally:
            # Always release the socket, even when the request fails.
            connection.close()

        # Floor division keeps the status-class check correct on Python 3.
        if status // 100 != 3 or not location:
            return url
        # Location may be relative; resolve it against the current URL.
        url = join_url(url, location)
    return url
def unshorten_file(file_in, file_out, resolver=None):
    """Copy *file_in* to *file_out*, replacing every ``http://`` link.

    Each link (``http://`` followed by characters up to the next newline,
    space or comma) is replaced by whatever *resolver* returns for it.

    Args:
        file_in: Path of the text file to read.
        file_out: Path of the file to write. May be the same path as
            *file_in*: the input is read completely before the output is
            opened.
        resolver: Callable receiving the regex match object of a link and
            returning the replacement string. Defaults to ``unshorten_url``.

    Returns:
        Always ``0``.
    """
    if resolver is None:
        resolver = unshorten_url
    link_pattern = re.compile(r'http://([^\n ,]+)')

    # Resolve everything before touching the output file: opening it for
    # writing truncates it, which would destroy the input when both paths
    # are the same and would leave a partial file if resolution failed.
    with open(file_in) as f_in:
        resolved_lines = [link_pattern.sub(resolver, line) for line in f_in]

    with open(file_out, 'w') as f_out:
        f_out.writelines(resolved_lines)
    return 0
# Script entry point: expects exactly two arguments, the input path and the
# output path; anything else prints a usage message and exits.
if __name__ == '__main__':
    cli_args = sys.argv[1:]
    if len(cli_args) == 2:
        source_path, target_path = cli_args
        unshorten_file(source_path, target_path)
    else:
        sys.exit('Usage: %s <input> <output>' % sys.argv[0])
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment