Skip to content

Instantly share code, notes, and snippets.

@navinpai
Last active December 23, 2015 07:29
Show Gist options
  • Select an option

  • Save navinpai/6601312 to your computer and use it in GitHub Desktop.

Select an option

Save navinpai/6601312 to your computer and use it in GitHub Desktop.
Downloading Pintos from the vt.edu source tree through the Wayback Machine
from bs4 import BeautifulSoup
import requests
import os
import re
def downloader(filepage):
    """Save the file behind the ``id="download"`` anchor of *filepage*.

    Fetches *filepage*, looks up the first ``<a id="download">`` element and
    writes its target (resolved against ``http://web.archive.org``) into the
    current working directory.  The local file name is the last path segment
    of the anchor's ``href``.  Pages without a usable download anchor are
    skipped.

    Args:
        filepage: URL of the page that carries the download link.
    """
    print("\n\n\n\nCURRENT" + os.getcwd())
    page = BeautifulSoup(requests.get(filepage).text)
    anchor = page.find('a', {'id': "download"})
    href = anchor.get('href') if anchor is not None else None
    if not href:
        # No download anchor (or it has no target): nothing to fetch here.
        return
    print("got link")
    print("Link" + href)
    # File name and content are both derived from the same anchor, so they
    # can never disagree.
    filename = href[href.rfind('/') + 1:]
    if not filename:
        # href ends in '/', so there is no file name to write to.
        return
    # stream=True avoids buffering the whole body in memory, and an explicit
    # chunk size avoids iter_content()'s 1-byte default.
    response = requests.get('http://web.archive.org' + href, stream=True)
    with open(filename, 'wb') as f:
        for chunk in response.iter_content(chunk_size=8192):
            f.write(chunk)
# Adapted from a Stack Overflow answer.
def find_occ(a_str, sub):
    """Yield the start index of every non-overlapping occurrence of *sub*.

    The search runs left to right; after a match it resumes just past the
    end of that match, so overlapping occurrences are not reported.

    Args:
        a_str: string to search in.
        sub: substring to look for.

    Yields:
        Indices into *a_str* where *sub* starts, in increasing order.
        Nothing is yielded when *sub* is empty or does not occur.
    """
    if not sub:
        # An empty needle matches at every position without ever advancing
        # the cursor, which would make this generator yield 0 forever.
        return
    step = len(sub)
    start = a_str.find(sub)
    while start != -1:
        yield start
        start = a_str.find(sub, start + step)
def getfile(parentlink, count, currentdir):
    """Recursively mirror the directory listing found at *parentlink*.

    Every anchor on the page except the first *count* and the very last one
    is treated as a listing entry.  An entry whose ``href`` ends in ``/`` is
    a sub-directory: a directory of the same name is created below
    *currentdir* and the crawl recurses into it with that directory as the
    working directory.  Any other entry is passed to ``downloader``, which
    saves into the current working directory.

    Args:
        parentlink: URL of the listing page to crawl.
        count: number of leading anchors on the page to skip.
        currentdir: local directory under which sub-directories are created.
    """
    archive = 'http://web.archive.org'
    root = ('http://web.archive.org/web/20090426004848/'
            'http://gback.cs.vt.edu:8080/source/xref/')
    print("COUNT" + str(count))
    anchors = BeautifulSoup(requests.get(parentlink).text).find_all('a')
    for link in anchors[count:-1]:
        href = link.get('href')
        if not href:
            # Anchors without a target cannot be followed.
            continue
        # Compare the link target, not the Tag object (a Tag never equals a
        # string), so a link back to the crawl root is really skipped.
        if href == root or archive + href == root:
            print('I Made a Boo Boo')
            continue
        print(link)
        if href[-1] == '/':
            # Last path component of the href, with its leading '/'.
            folder = currentdir + href[href[:-1].rfind('/'):-1]
            print("FOLDER: " + folder)
            if not os.path.exists(folder):
                os.makedirs(folder)
            depth = href.count('/')
            print("LENGTH" + str(depth))
            previous_dir = os.getcwd()
            os.chdir(folder)
            try:
                getfile(archive + href, depth + 3, folder)
            finally:
                # Restore the working directory so files listed after this
                # sub-directory are not saved inside it.
                os.chdir(previous_dir)
        else:
            downloader(archive + href)
# Start the crawl at the archived xref root and mirror it into the current
# working directory.  11 is the number of leading anchors on the root page
# that getfile skips before it starts treating anchors as listing entries.
start_url = 'http://web.archive.org/web/20090426004848/http://gback.cs.vt.edu:8080/source/xref/'
getfile(start_url, 11, os.getcwd())
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment