Created
December 21, 2010 19:17
-
-
Save starenka/750404 to your computer and use it in GitHub Desktop.
Respekt - PDF links extraction
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Pulls out links to PDFs from the Respekt user section.
# NOTE: As the site allows only one logged-in user at a time,
# you need to log out in your browser in order to use this script.
# usage:
#   $ ./norespekt.py > respekt_links
#   $ cat respekt_links | xargs -P 10 -r -n 1 wget -nv
#
# There's also a JS version over here: https://gist.github.com/870509
#
# @author: starenka
# @email: 'moc]tod[liamg].T.E[0aknerats'[::-1]
# @version: 1.0.3
# @since Dec 22, 2010
import getpass
import re
import sys

import mechanize
# Formats the user may choose from (compared case-insensitively).
FILE_FORMATS = ('pdf', 'txt')

LOGIN_URL = 'http://service.ihned.cz/respekt/respekt-v-pdf.php?year=2004&number=37'
LOGOUT_URL = 'http://respekt.ihned.cz/index.php?p=RRL000&login[logout]=1'
USER_AGENT = 'Opera/9.80 (X11; Linux i686; U; en) Presto/2.7.62 Version/11.00'
LINK_TEMPLATE = 'http://81.95.101.5/download/pdf_respekt/%s/Respekt_%s_%s_%s%s.%s'

# JavaScript array literals looked up in the page after login.  'ynrs' gives,
# per year, how many consecutive entries of 'nrs' and 'hashes' belong to it.
ARRAY_PATTERNS = {
    'years': re.compile(r'var ay = new Array\((.*?)\)'),
    'ynrs': re.compile(r'var ayn = new Array\((.*?)\)'),
    'nrs': re.compile(r'var an = new Array\((.*?)\)'),
    'hashes': re.compile(r'var ah = new Array\((.*?)\)'),
}


def prompt(message):
    """Ask *message* on stderr and return the line typed on stdin.

    stdout is reserved for the generated links (the script is meant to be run
    as ``./norespekt.py > respekt_links``), so the question must not go there.

    Raises:
        EOFError: if stdin is closed before an answer is given.
    """
    sys.stderr.write(message)
    sys.stderr.flush()
    line = sys.stdin.readline()
    if not line:
        # An empty read means EOF; without this the callers' retry loops
        # would spin forever.
        raise EOFError('stdin closed while waiting for an answer')
    return line.rstrip('\r\n')


def fetch_index_page(login, passwd):
    """Log in, download the page that follows the login, log out, return it.

    The logout request is sent before the page is parsed, so the account is
    released as early as possible (the site allows one session at a time).
    """
    br = mechanize.Browser()
    br.set_handle_refresh(mechanize._http.HTTPRefreshProcessor(), max_time=1)
    br.addheaders = [('User-agent', USER_AGENT)]
    br.open(LOGIN_URL)
    br.select_form(nr=1)  # the second form on the page is filled in as the login form
    br.form['username'] = login
    br.form['password'] = passwd
    br.submit()
    page = br.response().read()
    br.open(LOGOUT_URL)  # logout
    return page


def extract_arrays(page):
    """Return the items of the four JavaScript arrays found in *page*.

    Args:
        page: page source, as text or as raw bytes.

    Returns:
        dict mapping 'years', 'ynrs', 'nrs' and 'hashes' to lists of strings
        (double quotes removed, surrounding whitespace stripped).

    Raises:
        ValueError: if one of the arrays is not present in the page.
    """
    # On Python 3 the HTTP body arrives as bytes; on Python 2 ``bytes is str``
    # and the page is used untouched.
    if isinstance(page, bytes) and not isinstance(page, str):
        page = page.decode('utf-8', 'replace')

    parsed = {}
    for name, pattern in ARRAY_PATTERNS.items():
        match = pattern.search(page)
        if match is None:
            raise ValueError(
                "Array '%s' not found in the page - the login may have failed "
                "or the page layout has changed." % name
            )
        items = match.group(1).replace('"', '').split(',')
        parsed[name] = [item.strip() for item in items]
    return parsed


def build_links(parsed, file_format):
    """Build the download URLs described by *parsed*.

    Args:
        parsed: dict as returned by ``extract_arrays``.
        file_format: 'pdf' or 'txt', in any letter case.

    Returns:
        list of URL strings, in the order of the 'hashes' array.
    """
    extension = file_format.lower()
    label = file_format.upper()
    links = []
    offset = 0
    # NOTE(review): the last 'ynrs' entry is skipped, exactly as the original
    # script did - presumably a terminator or an unfinished year; confirm
    # against the live page.
    for year_index, count_text in enumerate(parsed['ynrs'][:-1]):
        count = int(count_text)
        issue_hashes = parsed['hashes'][offset:offset + count]
        for issue_index, issue_hash in enumerate(issue_hashes, offset):
            year = parsed['years'][year_index]
            links.append(LINK_TEMPLATE % (
                year,
                label,
                year,
                parsed['nrs'][issue_index],
                issue_hash,
                extension,
            ))
        offset += count
    return links


def main():
    """Ask for credentials and format, then print one link per line on stdout."""
    login, passwd, file_format = None, None, ''
    while not login:
        login = prompt('Login? ')
    while not passwd:
        # getpass does not echo the password and prompts on the terminal
        # (or stderr), never on stdout.
        passwd = getpass.getpass('Heslo? ')
    while file_format not in FILE_FORMATS:
        file_format = prompt('Odkazy na PDF nebo TXT? ').strip().lower()

    page = fetch_index_page(login, passwd)
    try:
        parsed = extract_arrays(page)
    except ValueError as err:
        sys.exit(str(err))  # message goes to stderr, exit status 1

    for link in build_links(parsed, file_format):
        sys.stdout.write(link + '\n')


if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment