Created
August 7, 2018 18:45
-
-
Save okiwan/66984d2ffddc02676c967e58ba937182 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import io # Operating system I/O | |
| import re # Regular expression module | |
| import pycurl # Curl module | |
def log(text, end='\n'):
    """Print *text* to standard output, terminated by *end*, and flush.

    The stream is flushed on every call so that progress messages written
    without a trailing newline (``end=''``) become visible immediately.
    """
    print(text, flush=True, end=end)
def _safe_filename(title):
    """Return a '<title>.pdf' file name that is safe to create on disk."""
    # Titles are taken verbatim from the HTML page; they may contain path
    # separators, newlines (the pattern below is compiled with re.DOTALL) or
    # other characters that are not valid in file names on common platforms.
    cleaned = re.sub(r'[\\/:*?"<>|\x00-\x1f]+', '_', title).strip(' .')
    return (cleaned or 'document') + '.pdf'


# Script body: fetch the blog page, extract (title, short URL) pairs with a
# regular expression and download every referenced PDF into the current
# working directory.
c = pycurl.Curl()
counter = 0  # number of documents downloaded successfully

try:
    log('* Retrieving URL data... ', end='')

    # Retrieving original URL; the page is collected in memory for parsing.
    buffer = io.BytesIO()
    c.setopt(c.URL, 'https://blogs.msdn.microsoft.com/mssmallbiz/2017/07/11/largest-free-microsoft-ebook-giveaway-im-giving-away-millions-of-free-microsoft-ebooks-again-including-windows-10-office-365-office-2016-power-bi-azure-windows-8-1-office-2013-sharepo/')
    c.setopt(c.WRITEDATA, buffer)
    c.perform()
    body = buffer.getvalue()

    log('done.')
    log('* Parsing data... ', end='')

    # We use a regular expression to easily extract titles and URLs.
    # Group 1 is the document title, group 2 the short download URL.
    rexp = re.compile(
        '<td valign="top" width="673"><font size="2" face="Arial">(.{1,120})</font></td>.'
        '<td width="77"><a href="(http://ligman.me/.{7})" target="_blank">'
        '<font size="2" face="Arial">PDF</font></a><font size="2" face="Arial">',
        re.DOTALL,
    )
    result = rexp.findall(body.decode('utf-8', errors='ignore'))

    log('done.')
    log('* Extracting documents... ', end='')

    try:
        # The short URLs redirect to the real file, so redirects must be
        # followed. This is loop-invariant and therefore set only once.
        c.setopt(c.FOLLOWLOCATION, True)

        for title, url in result:
            log('.', end='')

            # Stream each download straight into its own file. Reusing the
            # page buffer would prepend the HTML page and every previously
            # downloaded document to each PDF.
            with open(_safe_filename(title), 'wb') as output:
                c.setopt(c.URL, url)
                c.setopt(c.WRITEDATA, output)
                c.perform()

            # Count a document only once its transfer has completed.
            counter += 1
    except KeyboardInterrupt:
        log(' ABORTED!', end='')
    except Exception as e:
        # Top-level boundary of the script: report the problem and fall
        # through to the summary instead of printing a traceback.
        log('\n')
        log(e)
finally:
    # Release the curl handle even if the initial page fetch fails.
    c.close()

log('')
log('Process finished (extracted ' + str(counter) + ' document/s).')
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment