# Source: GitHub Gist okiwan/66984d2ffddc02676c967e58ba937182
# Author: @okiwan -- created August 7, 2018 18:45
#
# NOTE: this header replaces GitHub page navigation text ("Skip to content",
# "Show Gist options", "Save ... to your computer and use it in GitHub
# Desktop.") that was captured together with the script and is not code.
import io # Operating system I/O
import re # Regular expression module
import pycurl # Curl module
def log(text: object, end: str = '\n') -> None:
    """Print *text* to standard output and flush immediately.

    Flushing on every call makes progress output written with ``end=''``
    (partial lines such as the per-document dots) appear right away
    instead of waiting for a newline.

    Args:
        text: Object to print; converted with ``str()`` by ``print``.
        end: String appended after *text*. Defaults to a newline.
    """
    print(text, end=end, flush=True)
# Download every PDF linked from the MSDN "free Microsoft e-book giveaway"
# blog post and save each one in the current directory as "<title>.pdf".

# Characters that are not allowed in a file name on at least one common
# platform (path separators, Windows-reserved punctuation, line breaks).
# Titles come straight from scraped HTML, so they are sanitised before use.
unsafe_chars = re.compile(r'[\\/:*?"<>|\r\n]')

c = pycurl.Curl()
counter = 0  # number of documents downloaded completely

try:
    # Follow HTTP redirects for every transfer, including the index page.
    c.setopt(c.FOLLOWLOCATION, True)

    log('* Retrieving URL data... ', end='')
    # Retrieving original URL; the page body is collected in memory.
    buffer = io.BytesIO()
    c.setopt(c.URL, 'https://blogs.msdn.microsoft.com/mssmallbiz/2017/07/11/largest-free-microsoft-ebook-giveaway-im-giving-away-millions-of-free-microsoft-ebooks-again-including-windows-10-office-365-office-2016-power-bi-azure-windows-8-1-office-2013-sharepo/')
    c.setopt(c.WRITEDATA, buffer)
    c.perform()
    body = buffer.getvalue()
    log('done.')

    log('* Parsing data... ', end='')
    # We use a regular expression to easily extract titles and URLs.
    # Each match is a (title, url) tuple.
    rexp = re.compile(r'<td valign="top" width="673"><font size="2" face="Arial">(.{1,120})</font></td>.<td width="77"><a href="(http://ligman.me/.{7})" target="_blank"><font size="2" face="Arial">PDF</font></a><font size="2" face="Arial">', re.DOTALL)
    result = rexp.findall(body.decode('utf-8', errors='ignore'))
    log('done.')

    log('* Extracting documents... ', end='')
    try:
        for title, url in result:
            log('.', end='')
            path = unsafe_chars.sub('_', title) + '.pdf'
            # Stream the response straight into the target file. Reusing the
            # shared in-memory buffer would prepend the index page and all
            # previously downloaded documents to every file.
            with open(path, 'wb') as output:
                c.setopt(c.URL, url)
                c.setopt(c.WRITEDATA, output)
                c.perform()
            # Count only documents whose transfer finished without error.
            counter += 1
    except KeyboardInterrupt:
        log(' ABORTED!', end='')
    except Exception as e:
        # Best effort: report the failure and fall through to the summary.
        log('\n')
        log(e)
finally:
    # Always release the curl handle, even if the index fetch fails.
    c.close()

log('')
log('Process finished (extracted ' + str(counter) + ' document/s).')
# (GitHub page footer captured with the script, not code: "Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment")