Last active
August 29, 2015 14:19
-
-
Save asfaltboy/abaf2eb65d6da0727bc9 to your computer and use it in GitHub Desktop.
Mine URLs from POSA-15 slides
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
https://d396qusza40orc.cloudfront.net/posaconcurrency/slides/S0-P1-MOOC-organization-and-topics.pdf
https://d396qusza40orc.cloudfront.net/posaconcurrency/slides/S0-P3-MOOC-prereqs-and-learning-strategies.pdf
https://d396qusza40orc.cloudfront.net/posaconcurrency/2014-PDFs/S1-M1-P1-concurrency-motivations.pdf
https://d396qusza40orc.cloudfront.net/posaconcurrency/2014-PDFs/S1-M1-P2-concurrency-challenges.pdf
https://d396qusza40orc.cloudfront.net/posaconcurrency/2014-PDFs/S0-P4-overview-of-patterns-and-frameworks.pdf
https://d396qusza40orc.cloudfront.net/posaconcurrency/2014-PDFs/S0-P5-overview-of-patterns-and-frameworks-pt2.pdf
https://d396qusza40orc.cloudfront.net/posaconcurrency/lecture_slides/android-layers-s1.pdf
https://d396qusza40orc.cloudfront.net/posaconcurrency/lecture_slides/android-layers-s2.pdf
https://d396qusza40orc.cloudfront.net/posaconcurrency/lecture_slides/android-layers-s3.pdf
https://d396qusza40orc.cloudfront.net/posaconcurrency/lecture_slides/android-layers-s4.pdf
https://d396qusza40orc.cloudfront.net/posaconcurrency/lecture_slides/android-layers-s5.pdf
https://d396qusza40orc.cloudfront.net/posaconcurrency/lecture_slides/android-layers-s6.pdf
https://d396qusza40orc.cloudfront.net/posaconcurrency/lecture_slides/android-layers-s7.pdf
https://d396qusza40orc.cloudfront.net/posaconcurrency/lecture_slides/java-threads-s1.pdf
https://d396qusza40orc.cloudfront.net/posaconcurrency/lecture_slides/java-threads-s2.pdf
https://d396qusza40orc.cloudfront.net/posaconcurrency/lecture_slides/java-threads-s3.pdf
https://d396qusza40orc.cloudfront.net/posaconcurrency/lecture_slides/java-threads-s4.pdf
https://d396qusza40orc.cloudfront.net/posaconcurrency/lecture_slides/thread-lifecycle-s1.pdf
https://d396qusza40orc.cloudfront.net/posaconcurrency/lecture_slides/thread-lifecycle-s2.pdf
https://d396qusza40orc.cloudfront.net/posaconcurrency/lecture_slides/thread-lifecycle-s3.pdf
https://d396qusza40orc.cloudfront.net/posaconcurrency/lecture_slides/thread-lifecycle-s4.pdf
https://d396qusza40orc.cloudfront.net/posaconcurrency/2014-PDFs/S1-M2-P1-overview-of-Java-threads-pt1.pdf
https://d396qusza40orc.cloudfront.net/posaconcurrency/2014-PDFs/S1-M2-P2-overview-of-Java-threads-pt2.pdf
https://d396qusza40orc.cloudfront.net/posaconcurrency/2014-PDFs/S1-M2-P12-Java-built-in-monitor-objects.pdf
https://d396qusza40orc.cloudfront.net/posaconcurrency/lecture_slides/and-con-fwks-s1.pdf
https://d396qusza40orc.cloudfront.net/posaconcurrency/lecture_slides/and-con-fwks-s2.pdf
https://d396qusza40orc.cloudfront.net/posaconcurrency/lecture_slides/and-con-fwks-s3.pdf
https://d396qusza40orc.cloudfront.net/posaconcurrency/lecture_slides/thr-downloads-app.pdf
https://d396qusza40orc.cloudfront.net/posaconcurrency/lecture_slides/looper-s1.pdf
https://d396qusza40orc.cloudfront.net/posaconcurrency/lecture_slides/looper-s2.pdf
https://d396qusza40orc.cloudfront.net/posaconcurrency/lecture_slides/looper-s3.pdf
https://d396qusza40orc.cloudfront.net/posaconcurrency/lecture_slides/handler-seg1.pdf
https://d396qusza40orc.cloudfront.net/posaconcurrency/lecture_slides/handler-seg2.pdf
https://d396qusza40orc.cloudfront.net/posaconcurrency/lecture_slides/posting-seg1.pdf
https://d396qusza40orc.cloudfront.net/posaconcurrency/lecture_slides/posting-seg2.pdf
https://d396qusza40orc.cloudfront.net/posaconcurrency/lecture_slides/posting-seg3.pdf
https://d396qusza40orc.cloudfront.net/posaconcurrency/lecture_slides/sending-seg1.pdf
https://d396qusza40orc.cloudfront.net/posaconcurrency/lecture_slides/sending-seg2.pdf
https://d396qusza40orc.cloudfront.net/posaconcurrency/2014-PDFs/S1-M3-P1-Android-concurrency-frameworks-and-idioms.pdf
https://d396qusza40orc.cloudfront.net/posaconcurrency/2014-PDFs/S1-M3-P2-Android-Looper.pdf
https://d396qusza40orc.cloudfront.net/posaconcurrency/2014-PDFs/S1-M3-P3-Overview-of-Android-Handler.pdf
https://d396qusza40orc.cloudfront.net/posaconcurrency/2014-PDFs/S1-M3-P4-Posting-and-Processing-Runnables-with-Android-Handler.pdf
https://d396qusza40orc.cloudfront.net/posaconcurrency/2014-PDFs/S1-M3-P5-Sending-and-Handling-Messages-with-Android-Handler.pdf
https://d396qusza40orc.cloudfront.net/posaconcurrency/lecture_slides/asynctask-s1.pdf
https://d396qusza40orc.cloudfront.net/posaconcurrency/lecture_slides/asynctask-s2.pdf
https://d396qusza40orc.cloudfront.net/posaconcurrency/lecture_slides/asynctask-s3.pdf
https://d396qusza40orc.cloudfront.net/posaconcurrency/lecture_slides/asynctask-s4.pdf
https://d396qusza40orc.cloudfront.net/posaconcurrency/lecture_slides/bbwb-frameworks-s1.pdf
https://d396qusza40orc.cloudfront.net/posaconcurrency/lecture_slides/bbwb-frameworks-s2.pdf
https://d396qusza40orc.cloudfront.net/posaconcurrency/lecture_slides/evaluating-Android-concy-fwks.pdf
https://d396qusza40orc.cloudfront.net/posaconcurrency/2014-PDFs/S1-M3-P6-the-AsyncTask-framework.pdf
https://d396qusza40orc.cloudfront.net/posaconcurrency/2014-PDFs/S1-M3-P7-the-AsyncTask-framework.pdf
https://d396qusza40orc.cloudfront.net/posaconcurrency/2014-PDFs/S3-M1-P1-Monitor-Object-pattern-pt1.pdf
https://d396qusza40orc.cloudfront.net/posaconcurrency/2014-PDFs/S3-M1-P2-Monitor-Object-pattern-pt2.pdf
https://d396qusza40orc.cloudfront.net/posaconcurrency/2014-PDFs/TSS-pattern.pdf
https://d396qusza40orc.cloudfront.net/posaconcurrency/2014-PDFs/Command-Processor-pattern-pt1.pdf
https://d396qusza40orc.cloudfront.net/posaconcurrency/2014-PDFs/Command-Processor-pattern-pt2.pdf
https://d396qusza40orc.cloudfront.net/posaconcurrency/2014-PDFs/HS-HA-pattern.pdf
https://d396qusza40orc.cloudfront.net/posaconcurrency/2014-PDFs/Active-Object-pattern.pdf
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" | |
This script downloads (if required) and parses a list of PDF files and | |
extracts any URLs which follow the pattern "See X". | |
It optionally validates if urls are broken and stores all results | |
(good and bad) in an markdown formatted result file. | |
FILE_LIST was initially populated from the course page with jQuery: | |
$('.icon-file.resource').map(function(){return $(this).parent().attr('href')}); | |
WARNINGS: | |
Currently we rely on `pdfminer`'s page detection mechanism to detect | |
where a URL ends. This is due to some long urls occasionally being split into | |
several lines. | |
This, unfortunately, causes many errors in cases where additional notes, not | |
part of the URL, are added after it; and also in cases where `pdfminer` wrongly | |
breaks a page before the URL. This means we might have many invalid links | |
Additionally, our validation mechanism knows nothing of the content, so if a | |
target link responds with a 200 OK http response to a HEAD/GET, we assume it | |
is fine. | |
""" | |
from collections import defaultdict | |
from urlparse import urlparse | |
from StringIO import StringIO | |
import logging | |
import os | |
import re | |
import sys | |
# main | |
from pdfminer.pdfpage import PDFPage | |
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter | |
from pdfminer.converter import HTMLConverter, TextConverter | |
import requests | |
# Module-level logger; configured via logging.basicConfig in the
# __main__ block at the bottom of the file.
logger = logging.getLogger(__name__)

# Some Settings:
FILE_LIST = 'lecture_slides_list.txt'  # default input list of slide paths/URLs
VERIFY_URL = True  # when True, issue HEAD/GET requests to detect broken links
RESULT_FILE = 'result.md'  # markdown report written by write_results()
SLIDES_DIR = 'slides/'  # download directory for slides fetched by URL
DEFAULT_LOG_LEVEL = logging.WARN

# Matches "... See <url>" up to the form feed ('\x0c') that pdfminer's
# TextConverter emits at each page break, capturing everything after
# "See " as the (possibly line-wrapped) URL text.
pattern = re.compile('.*\\sSee\\s(.*)\x0c')
def extract_links(filename, rsrcmgr, device, **kwargs):
    """
    Take <filename>, a path to a PDF file, and return a list of all URLs
    which appear in the file after the word "See".

    Each link is a dictionary with the following keys:
        {'filename': <basename of filename>, 'page_no': <page index>,
         'url': <url>, 'broken': <bool, only meaningful if VERIFY_URL>}

    Takes optional kwargs which are passed through to PDFPage.get_pages.
    """
    extract_text(filename, rsrcmgr, device, **kwargs)  # populates device.outfp
    links = []
    text = device.outfp.getvalue()
    # NOTE(review): the [1:] slice drops the first split chunk even though
    # the page separator is written *after* each page, so the first page's
    # text is never scanned -- presumably fine because title slides carry
    # no "See <url>" lines, but confirm if page numbers look shifted.
    for page, content in enumerate(text.split('~new~page~sep~')[1:]):
        match = pattern.match(content)
        if not match:
            continue
        # Long URLs may be wrapped across lines; drop all whitespace.
        url = ''.join(match.groups()[0].split())
        broken = False
        if VERIFY_URL:
            parts = urlparse(url)
            if not parts.scheme:
                url = 'http://%s' % url
            logger.debug('Validating extracted url %s', url)
            try:
                res = requests.head(url)
                if res.status_code == 405:  # server does not support HEAD
                    res = requests.get(url)
                logger.debug('Raising for status %s', res.status_code)
                res.raise_for_status()
            except requests.exceptions.RequestException as e:
                # BUG FIX: a Response with a 4xx/5xx status is *falsy*
                # (requests.Response.__bool__ returns `ok`), so the old
                # `if e.response:` reported '???' for every HTTP error.
                # Compare against None explicitly instead.
                if e.response is not None:
                    code = e.response.status_code
                else:
                    code = '???'
                # BUG FIX: the old format string rendered the filename as
                # the page number ("Page <filename>: <page>").
                logger.error('File %s page %s has invalid or broken URL'
                             ' (%s): %s - %s', filename, page, code, url, e)
                broken = True
        link = {
            'filename': os.path.basename(filename),
            'page_no': page,
            'url': url,
            'broken': broken,
        }
        logger.debug('Adding link %s', link)
        links.append(link)
    return links
def extract_text(filename, rsrcmgr, device, **kwargs):
    """
    Wrap pdfminer calls to interpret and iterate over the pages of
    <filename>, writing each page's content into the device followed by
    a '~new~page~sep~' marker (consumed by extract_links).

    Takes optional kwargs which are passed through to PDFPage.get_pages.
    """
    logger.info('Processing file %s', filename)
    # BUG FIX: the bare Py2 `file()` call leaked the handle whenever
    # process_page raised; use open() in a context manager, matching the
    # `with open(...)` style used elsewhere in this script.
    with open(filename, 'rb') as fp:
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        for page in PDFPage.get_pages(fp, **kwargs):
            interpreter.process_page(page)
            # Page-break sentinel so callers can split per-page text.
            device.outfp.write('~new~page~sep~')
def get_files_from_list(filenames):
    """
    Examine <filenames>, which is either a path to a text file containing
    file paths or URLs (one per line), or an iterable of such strings.
    If an entry is a URL, the file is downloaded into a subdirectory
    (SLIDES_DIR, "slides/" by default) unless already present. Finally a
    list of local file paths is returned.
    """
    if isinstance(filenames, basestring):
        with open(filenames) as f:
            filenames = list(f)
    slides = []
    for item in filenames:
        uri = item.strip()
        if not uri:
            # Skip blank lines -- previously they produced '' entries
            # that downstream code would try to open as files.
            continue
        url = urlparse(uri)
        if not url.scheme:
            # No scheme: treat the line as a local file path.
            slides.append(url.path)
            continue
        local_path = os.path.join(SLIDES_DIR, url.path.split('/')[-1])
        if not os.path.exists(local_path):
            logger.debug('Downloading missing slide from %s', uri)
            res = requests.get(uri)
            if not res.ok:
                logger.warn("Could not download file from url %s", uri)
                continue
            # BUG FIX: PDFs are binary -- write in 'wb' mode so the
            # content is not corrupted by newline translation.
            with open(local_path, 'wb') as f:
                f.write(res.content)
        else:
            logger.debug('Using existing local file %s', local_path)
        slides.append(local_path)
    return slides
def get_all_links(filenames):
    """
    Run extract_links over every path in <filenames> and return the
    combined list of link dicts.

    A fresh StringIO/TextConverter pair is created per file so page text
    does not accumulate across files; the single PDFResourceManager is
    shared so cached PDF resources are reused.
    """
    rsrcmgr = PDFResourceManager(caching=True)
    links = []
    for fname in filenames:
        outfp = StringIO()
        device = TextConverter(rsrcmgr, outfp)
        try:
            links.extend(extract_links(fname, rsrcmgr, device, caching=True))
        finally:
            # BUG FIX: close the device *before* its output buffer (the
            # converter may still touch outfp while closing), and do both
            # in a finally block so a parse error does not leak them.
            device.close()
            outfp.close()
    return links
def write_results(links):
    """
    Split <links> into working and broken sets, group each set by source
    PDF filename, and write a markdown report to RESULT_FILE.
    """
    # Bucket links per filename, keyed first by their broken flag.
    grouped = {False: defaultdict(list), True: defaultdict(list)}
    for link in links:
        grouped[link['broken']][link['filename']].append(link)

    out_lines = ['# Working Links\n']
    for heading, per_file in ((None, grouped[False]),
                              ('\n\n# Broken Links\n', grouped[True])):
        if heading is not None:
            out_lines.append(heading)
        for sfile, file_links in per_file.items():
            out_lines.append('\n## %s\n\n' % sfile)
            out_lines.extend('* Page %s - %s\n' % (fl['page_no'], fl['url'])
                             for fl in file_links)

    with open(RESULT_FILE, 'w') as f:
        f.writelines(out_lines)
if __name__ == '__main__':
    logging.basicConfig(
        level=DEFAULT_LOG_LEVEL,
        format='%(asctime)s %(levelname)-8s %(message)s',
        datefmt='%a, %d %b %Y %H:%M:%S')
    # Command-line arguments take precedence over the default list file.
    filenames = get_files_from_list(sys.argv[1:] or FILE_LIST)
    if not filenames:
        # BUG FIX: `assert` is stripped when run with `python -O`, so the
        # guard could silently vanish; exit explicitly instead.
        sys.exit("At least one filename should be specified (either"
                 " as argument or in file %s)" % FILE_LIST)
    links = get_all_links(filenames)
    write_results(links)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment