ericdfields · June 24, 2013 02:00
diff --git a/get_links.py b/get_links.py
 #!/usr/bin/env python

 """
 Extract all image links from a Typepad HTML export file
 For use with wget
 =====================================================
 Author: Martin Sauter (http://www.wirelessmoves.com)
 May 2013

 Usage:
 ------
 ./get_links.py filename blogURL
 """

 import sys

 def _image_url(url):
     """Return the direct image URL for *url*, or None when it is not an image.

     The checks run in a fixed order and the first one that matches wins.
     """
     if "/.a/" in url:
         # A "-popup" URL is a link rather than the picture itself; the image
         # name is derived by swapping that suffix for "-800wi". URLs without
         # "-popup" come back unchanged because replace() finds nothing.
         return url.replace("-popup", "-800wi")
     if "images" in url:
         return url
     if ".shared/image.html?" in url:
         # Indirect link: removing this part leaves a direct picture URL.
         return url.replace(".shared/image.html?/", "")
     if "photos/uncategorized" in url:
         return url
     return None


 def process(filename, blogUrl):
     """Print the image URLs found in a Typepad HTML export file.

     Every occurrence of *blogUrl* in a line starts a candidate URL that runs
     up to the next double quote, or to the end of the line when there is no
     closing quote. Candidates recognised as images are printed one per line
     on standard output, followed by two summary lines with the number of
     links and the number of image links that were seen.

     Args:
         filename: path of the export file to scan.
         blogUrl: URL prefix of the blog,
             e.g. http://mobilesociety.typepad.com
     """
     num = 0
     numImageLinks = 0

     # The context manager closes the file even if reading or printing fails.
     with open(filename, "r") as searchfile:
         for line in searchfile:
             start = line.find(blogUrl)
             while start > -1:
                 end = line.find("\"", start)
                 if end == -1:
                     # No closing quote: find() returned -1, which as a slice
                     # bound would cut off the last character of the line.
                     # Use the end of the line (minus its terminator) instead.
                     end = len(line.rstrip("\r\n"))

                 image = _image_url(line[start:end])
                 if image is not None:
                     print(image)
                     numImageLinks += 1

                 num += 1
                 # Resume one character further so the same hit is not found again.
                 start = line.find(blogUrl, start + 1)

     # Single-argument print() calls behave the same on Python 2 and 3; the
     # double space matches what the former "print text, value" form emitted.
     print("Number of links:  %d" % num)
     print("Number of image links:  %d" % numImageLinks)

 # end of "process" function


 def main():
     """Command-line entry point: ``get_links.py filename blogURL``.

     Prints a usage message and exits with status 1 when fewer than two
     arguments are given; otherwise hands both arguments to process().
     """
     if len(sys.argv) < 3:
         # Single-argument print() calls run unchanged on Python 2 and 3.
         print("Martin's Typepad Image Link Extractor v0.1")
         print("Usage: %s filename blogURL (e.g. http://mobilesociety.typepad.com)" % sys.argv[0])
         sys.exit(1)

     process(sys.argv[1], sys.argv[2])
 # end of "main" function

 #############################################################################

 # Run the extractor only when this file is executed as a script, so that
 # importing the module has no side effects.
 if __name__ == "__main__":
    main()
	#!/usr/bin/env python

	"""
	Extract all image links from a Typepad HTML export file
	For use with wget
	=====================================================
	Author: Martin Sauter (http://www.wirelessmoves.com)
	May 2013

	Usage:
	------
	./get_links.py filename blogURL
	"""

	import sys

	def _image_url(url):
	    """Return the direct image URL for *url*, or None when it is not an image.

	    The checks run in a fixed order and the first one that matches wins.
	    """
	    if "/.a/" in url:
	        # A "-popup" URL is a link rather than the picture itself; the image
	        # name is derived by swapping that suffix for "-800wi". URLs without
	        # "-popup" come back unchanged because replace() finds nothing.
	        return url.replace("-popup", "-800wi")
	    if "images" in url:
	        return url
	    if ".shared/image.html?" in url:
	        # Indirect link: removing this part leaves a direct picture URL.
	        return url.replace(".shared/image.html?/", "")
	    if "photos/uncategorized" in url:
	        return url
	    return None


	def process(filename, blogUrl):
	    """Print the image URLs found in a Typepad HTML export file.

	    Every occurrence of *blogUrl* in a line starts a candidate URL that runs
	    up to the next double quote, or to the end of the line when there is no
	    closing quote. Candidates recognised as images are printed one per line
	    on standard output, followed by two summary lines with the number of
	    links and the number of image links that were seen.

	    Args:
	        filename: path of the export file to scan.
	        blogUrl: URL prefix of the blog,
	            e.g. http://mobilesociety.typepad.com
	    """
	    num = 0
	    numImageLinks = 0

	    # The context manager closes the file even if reading or printing fails.
	    with open(filename, "r") as searchfile:
	        for line in searchfile:
	            start = line.find(blogUrl)
	            while start > -1:
	                end = line.find("\"", start)
	                if end == -1:
	                    # No closing quote: find() returned -1, which as a slice
	                    # bound would cut off the last character of the line.
	                    # Use the end of the line (minus its terminator) instead.
	                    end = len(line.rstrip("\r\n"))

	                image = _image_url(line[start:end])
	                if image is not None:
	                    print(image)
	                    numImageLinks += 1

	                num += 1
	                # Resume one character further so the same hit is not found again.
	                start = line.find(blogUrl, start + 1)

	    # Single-argument print() calls behave the same on Python 2 and 3; the
	    # double space matches what the former "print text, value" form emitted.
	    print("Number of links:  %d" % num)
	    print("Number of image links:  %d" % numImageLinks)

	# end of "process" function


	def main():
	    """Command-line entry point: ``get_links.py filename blogURL``.

	    Prints a usage message and exits with status 1 when fewer than two
	    arguments are given; otherwise hands both arguments to process().
	    """
	    if len(sys.argv) < 3:
	        # Single-argument print() calls run unchanged on Python 2 and 3.
	        print("Martin's Typepad Image Link Extractor v0.1")
	        print("Usage: %s filename blogURL (e.g. http://mobilesociety.typepad.com)" % sys.argv[0])
	        sys.exit(1)

	    process(sys.argv[1], sys.argv[2])
	# end of "main" function

	#############################################################################

	# Run the extractor only when this file is executed as a script, so that
	# importing the module has no side effects.
	if __name__ == "__main__":
	    main()