Created
June 24, 2013 02:00
-
-
Save ericdfields/5847312 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python
"""
Extract all image links from a Typepad HTML export file.
For use with wget
=====================================================
Author: Martin Sauter (http://www.wirelessmoves.com)
May 2013
Usage:
------
./get_links.py filename blogURL
"""
import sys | |
def _image_url(url):
    """Return the downloadable image URL for a blog URL, or None.

    The checks run in a fixed order and the first match wins:

    * URLs containing "/.a/" are pictures, or "-popup" links that wrap a
      picture; for the latter the image name is derived by swapping
      "-popup" for "-800wi".
    * URLs containing "images" are used unchanged.
    * URLs containing ".shared/image.html?" are indirect; the
      ".shared/image.html?/" part is removed to get the direct URL.
    * URLs containing "photos/uncategorized" are used unchanged.

    Any other URL is not treated as an image and yields None.
    """
    if "/.a/" in url:
        # replace() is a no-op when there is no "-popup", so this covers
        # both the popup-link case and the plain-image case.
        return url.replace("-popup", "-800wi")
    if "images" in url:
        return url
    if ".shared/image.html?" in url:
        return url.replace(".shared/image.html?/", "")
    if "photos/uncategorized" in url:
        return url
    return None


def process(filename, blogUrl):
    """Print every image URL found in a Typepad HTML export file.

    Each line of the file is scanned for occurrences of ``blogUrl``. A URL
    is taken to run from the start of that occurrence up to the next double
    quote, or to the end of the line when no closing quote follows. Image
    URLs are printed one per line, followed by two summary lines with the
    total number of URLs seen and the number of image URLs printed.

    Args:
        filename: Path of the HTML export file to read.
        blogUrl: URL prefix that identifies links belonging to the blog,
            e.g. "http://mobilesociety.typepad.com".

    Returns:
        None. All results are written to standard output.

    Raises:
        IOError/OSError: If the file cannot be opened or read.
    """
    num_links = 0
    num_image_links = 0

    # The context manager closes the file even if reading fails part-way.
    with open(filename, "r") as searchfile:
        for line in searchfile:
            url_start = line.find(blogUrl)
            while url_start > -1:
                url_end = line.find('"', url_start)
                if url_end == -1:
                    # No closing quote: take the rest of the line without its
                    # newline, instead of slicing with -1, which would drop
                    # the last character of the line.
                    url_end = len(line.rstrip("\r\n"))

                image_url = _image_url(line[url_start:url_end])
                if image_url is not None:
                    print(image_url)
                    num_image_links += 1
                num_links += 1

                # Resume one character further on so the same occurrence is
                # not matched again.
                url_start = line.find(blogUrl, url_start + 1)

    # Single-argument print() behaves the same on Python 2 and 3; the double
    # space reproduces the output of the original `print "label: ", value`.
    print("Number of links:  %d" % num_links)
    print("Number of image links:  %d" % num_image_links)
def main():
    """Command-line entry point: ``get_links.py filename blogURL``.

    Reads the export file name and the blog URL from ``sys.argv`` and hands
    them to ``process``. When fewer than two arguments are given, a short
    usage message is printed and the script exits with status 1.
    """
    if len(sys.argv) < 3:
        # Single-argument print() is valid on both Python 2 and Python 3.
        print("Martin's Typepad Image Link Extractor v0.1")
        print("Usage: %s filename blogURL (e.g. http://mobilesociety.typepad.com)" % sys.argv[0])
        sys.exit(1)
    process(sys.argv[1], sys.argv[2])
############################################################################# | |
# Run the extractor only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment