Skip to content

Instantly share code, notes, and snippets.

@starenka
Created December 21, 2010 19:17
Show Gist options
  • Save starenka/750404 to your computer and use it in GitHub Desktop.
Save starenka/750404 to your computer and use it in GitHub Desktop.
Respekt - PDF links extraction
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Pulls out links to PDFs from Respekt user section.
# NOTE: As the site allows only one logged-in user at a time,
# you need to log out in your browser before using this script.
# usage:
# $ ./norespekt.py > respekt_links
# $ cat respekt_links | xargs -P 10 -r -n 1 wget -nv
#
# There is also a JS version at https://gist.github.com/870509
#
# @author: starenka
# @email: 'moc]tod[liamg].T.E[0aknerats'[::-1]
# @version: 1.0.3
# @since Dec 22, 2010
import re,mechanize
# Endpoints and request settings used by the scraper.
LOGIN_URL = 'http://service.ihned.cz/respekt/respekt-v-pdf.php?year=2004&number=37'
LOGOUT_URL = 'http://respekt.ihned.cz/index.php?p=RRL000&login[logout]=1'
USER_AGENT = 'Opera/9.80 (X11; Linux i686; U; en) Presto/2.7.62 Version/11.00'
LINK_TEMPLATE = 'http://81.95.101.5/download/pdf_respekt/%s/Respekt_%s_%s_%s%s.%s'
FILE_FORMATS = ('pdf', 'txt')

# JavaScript array declarations embedded in the page returned after login,
# keyed by the name their contents are stored under once parsed.
# NOTE(review): judging by how build_links() consumes them, 'ynrs' holds the
# number of issues per year and 'nrs'/'hashes' hold one entry per issue --
# confirm against a real page.
ARRAY_PATTERNS = {
    'years': re.compile(r'var ay = new Array\((.*?)\)'),
    'ynrs': re.compile(r'var ayn = new Array\((.*?)\)'),
    'nrs': re.compile(r'var an = new Array\((.*?)\)'),
    'hashes': re.compile(r'var ah = new Array\((.*?)\)'),
}


def fetch_archive_page(login, passwd):
    """Log in, return the body of the page served after login, then log out.

    The logout request is issued even when reading the response fails,
    because the site allows only one logged-in user at a time.
    """
    br = mechanize.Browser()
    br.set_handle_refresh(mechanize._http.HTTPRefreshProcessor(), max_time=1)
    br.addheaders = [('User-agent', USER_AGENT)]
    br.open(LOGIN_URL)
    br.select_form(nr=1)  # the login form is the second form on the page
    br.form['username'] = login
    br.form['password'] = passwd
    br.submit()
    try:
        return br.response().read()
    finally:
        br.open(LOGOUT_URL)  # logout


def parse_arrays(page):
    """Extract the JavaScript arrays listed in ARRAY_PATTERNS from *page*.

    Returns a dict mapping each ARRAY_PATTERNS key to a list of strings
    (double quotes removed, split on commas).

    Raises ValueError when any of the arrays is missing, e.g. because the
    login was rejected and a different page was served.
    """
    parsed = {}
    for target, pattern in ARRAY_PATTERNS.items():
        match = pattern.search(page)
        if match is None:
            raise ValueError(
                "Array '%s' not found in the page - wrong login/password "
                "or the page layout has changed." % target
            )
        parsed[target] = match.group(1).replace('"', '').split(',')
    return parsed


def build_links(parsed, file_format):
    """Return the list of download URLs for every issue described by *parsed*.

    *parsed* is the dict produced by parse_arrays(). *file_format* is 'pdf'
    or 'txt' in any letter case; it is upper-cased inside the file name and
    lower-cased for the extension, so 'PDF' and 'pdf' give the same links.
    """
    extension = file_format.lower()
    label = file_format.upper()
    links = []
    offset = 0  # index of the first issue belonging to the current year
    # The last 'ynrs' entry is deliberately skipped, as in the original script.
    for year_index, issue_count in enumerate(parsed['ynrs'][:-1]):
        issue_count = int(issue_count)
        year_hashes = parsed['hashes'][offset:offset + issue_count]
        for i, issue_hash in enumerate(year_hashes):
            year = parsed['years'][year_index]
            links.append(LINK_TEMPLATE % (year,
                                          label,
                                          year,
                                          parsed['nrs'][offset + i],
                                          issue_hash,
                                          extension))
        offset += issue_count
    return links


def main():
    """Prompt for credentials and link format, then print one URL per line."""
    file_format, login, passwd = '', None, None
    while not login:
        login = raw_input('Login? ')
    while not passwd:
        passwd = raw_input('Heslo? ')
    while file_format.lower() not in FILE_FORMATS:
        file_format = raw_input('Odkazy na PDF nebo TXT? ')

    page = fetch_archive_page(login, passwd)
    try:
        parsed = parse_arrays(page)
    except ValueError as err:
        # Exit with a readable message instead of a traceback.
        raise SystemExit(str(err))

    for link in build_links(parsed, file_format):
        print(link)


if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment