okiwan · August 7, 2018 18:45
diff --git a/bookextractor.py b/bookextractor.py
 import io      # Operating system I/O
 import re      # Regular expression module
 import pycurl  # Curl module

 def log(text, end='\n'):
     """Write *text* to standard output and flush immediately.

     text: the value to print (anything ``print`` accepts).
     end:  terminator written after the text; a newline by default. Pass
           '' to stay on the same line, e.g. for progress dots.
     """
     # flush=True makes partial-line progress output appear right away
     # instead of waiting for the stream buffer to fill.
     options = {'end': end, 'flush': True}
     print(text, **options)

 # In-memory sink that receives the page body, and the curl handle used
 # to perform the transfer.
 buffer = io.BytesIO()
 c = pycurl.Curl()

 log('* Retrieving URL data... ', end='')

 # Fetch the blog post that lists the e-books; the response body is
 # written into `buffer` by curl.
 c.setopt(pycurl.URL, 'https://blogs.msdn.microsoft.com/mssmallbiz/2017/07/11/largest-free-microsoft-ebook-giveaway-im-giving-away-millions-of-free-microsoft-ebooks-again-including-windows-10-office-365-office-2016-power-bi-azure-windows-8-1-office-2013-sharepo/')
 c.setopt(pycurl.WRITEDATA, buffer)
 c.perform()

 # Raw bytes of the downloaded page.
 body = buffer.getvalue()

 log('done.')
 log('* Parsing data... ', end='')

 # Each table row of interest has a title cell followed by a cell holding
 # the "PDF" link. Group 1 captures the title (1 to 120 characters),
 # group 2 the ligman.me short URL (7 characters after the host).
 rexp = re.compile(
     '<td valign="top" width="673"><font size="2" face="Arial">'
     '(.{1,120})'
     '</font></td>'
     '.'  # exactly one character of any kind (DOTALL lets it be a newline)
     '<td width="77"><a href="'
     '(http://ligman.me/.{7})'
     '" target="_blank"><font size="2" face="Arial">PDF</font></a>'
     '<font size="2" face="Arial">',
     re.DOTALL,
 )

 # Undecodable bytes are dropped rather than aborting the run.
 page_text = body.decode('utf-8', errors='ignore')

 # List of (title, url) tuples, one per matched row.
 result = rexp.findall(page_text)

 log('done.')
 log('* Extracting documents... ', end='')

 # Number of documents completely downloaded so far. Incremented only
 # after a transfer succeeds, so the final report is accurate even when
 # the run is interrupted or fails part-way.
 counter = 0
 try:
     # Redirects must be followed for the document URLs; the option is
     # loop-invariant, so it is set once on the handle.
     c.setopt(c.FOLLOWLOCATION, True)

     # `result` holds (title, url) tuples produced by the regular expression.
     for title, url in result:
         log('.', end='')

         # Titles come straight from the HTML: replace characters that are
         # not usable in file names (path separators, newlines, ...) so
         # that open() cannot fail on them and abort the whole run.
         name = re.sub(r'[\\/:*?"<>|\r\n\t]+', ' ', title).strip() or 'untitled'
         path = name + '.pdf'

         # Stream directly into the file. The shared `buffer` still holds
         # the page HTML and is never reset, so writing its contents would
         # prepend the page and every earlier download to each document.
         # The `with` block closes the file even if the transfer raises.
         with open(path, 'wb') as output:
             c.setopt(c.URL, url)
             c.setopt(c.WRITEDATA, output)
             c.perform()

         counter += 1
 except KeyboardInterrupt:
     log(' ABORTED!', end='')
 except Exception as e:
     # Any failure stops the extraction; report it and fall through to
     # the summary below.
     log('\n')
     log(e)
 finally:
     c.close()

 log('')
 log('Process finished (extracted ' + str(counter) + ' document/s).')
	import io # Operating system I/O
	import re # Regular expression module
	import pycurl # Curl module

	def log(text, end='\n'):
	    """Print *text* to standard output and flush immediately.

	    text: the value to print (anything ``print`` accepts).
	    end:  terminator written after the text; a newline by default. Pass
	          '' to stay on the same line, e.g. for progress dots.
	    """
	    # The body must be indented under the def; flush=True makes
	    # partial-line progress output appear right away.
	    print(text, end=end, flush=True)

	# In-memory sink that receives the page body, and the curl handle used
	# to perform the transfer.
	buffer = io.BytesIO()
	c = pycurl.Curl()

	log('* Retrieving URL data... ', end='')

	# Fetch the blog post that lists the e-books; the response body is
	# written into `buffer` by curl.
	c.setopt(pycurl.URL, 'https://blogs.msdn.microsoft.com/mssmallbiz/2017/07/11/largest-free-microsoft-ebook-giveaway-im-giving-away-millions-of-free-microsoft-ebooks-again-including-windows-10-office-365-office-2016-power-bi-azure-windows-8-1-office-2013-sharepo/')
	c.setopt(pycurl.WRITEDATA, buffer)
	c.perform()

	# Raw bytes of the downloaded page.
	body = buffer.getvalue()

	log('done.')
	log('* Parsing data... ', end='')

	# Each table row of interest has a title cell followed by a cell holding
	# the "PDF" link. Group 1 captures the title (1 to 120 characters),
	# group 2 the ligman.me short URL (7 characters after the host).
	rexp = re.compile(
	    '<td valign="top" width="673"><font size="2" face="Arial">'
	    '(.{1,120})'
	    '</font></td>'
	    '.'  # exactly one character of any kind (DOTALL lets it be a newline)
	    '<td width="77"><a href="'
	    '(http://ligman.me/.{7})'
	    '" target="_blank"><font size="2" face="Arial">PDF</font></a>'
	    '<font size="2" face="Arial">',
	    re.DOTALL,
	)

	# Undecodable bytes are dropped rather than aborting the run.
	page_text = body.decode('utf-8', errors='ignore')

	# List of (title, url) tuples, one per matched row.
	result = rexp.findall(page_text)

	log('done.')
	log('* Extracting documents... ', end='')

	# Number of documents completely downloaded so far. Incremented only
	# after a transfer succeeds, so the final report is accurate even when
	# the run is interrupted or fails part-way.
	counter = 0
	try:
	    # Redirects must be followed for the document URLs; the option is
	    # loop-invariant, so it is set once on the handle.
	    c.setopt(c.FOLLOWLOCATION, True)

	    # `result` holds (title, url) tuples produced by the regular expression.
	    for title, url in result:
	        log('.', end='')

	        # Titles come straight from the HTML: replace characters that are
	        # not usable in file names (path separators, newlines, ...) so
	        # that open() cannot fail on them and abort the whole run.
	        name = re.sub(r'[\\/:*?"<>|\r\n\t]+', ' ', title).strip() or 'untitled'
	        path = name + '.pdf'

	        # Stream directly into the file. The shared `buffer` still holds
	        # the page HTML and is never reset, so writing its contents would
	        # prepend the page and every earlier download to each document.
	        # The `with` block closes the file even if the transfer raises.
	        with open(path, 'wb') as output:
	            c.setopt(c.URL, url)
	            c.setopt(c.WRITEDATA, output)
	            c.perform()

	        counter += 1
	except KeyboardInterrupt:
	    log(' ABORTED!', end='')
	except Exception as e:
	    # Any failure stops the extraction; report it and fall through to
	    # the summary below.
	    log('\n')
	    log(e)
	finally:
	    c.close()

	log('')
	log('Process finished (extracted ' + str(counter) + ' document/s).')
No results found