Created
September 15, 2011 14:18
-
-
Save starenka/1219347 to your computer and use it in GitHub Desktop.
Aktuální Respekt v HTML
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Scrapes Respekt.cz and dumps the current issue as HTML on standard output.
# Images are not downloaded, but they still show up because their URLs are
# absolute; fetching them locally would only take a few extra lines.
#
# Output is written as UTF-8 and declares its charset, so Czech diacritics
# survive redirection to a file. The fetched pages are handed to the parser
# as raw bytes, so input decoding still relies on the charset each page
# declares -- NOTE(review): confirm against the live site if diacritics
# still come out broken.
import codecs
import os
import sys
from xml.sax.saxutils import escape

import mechanize
from pyquery import PyQuery as pq

# Credentials may be supplied through the environment so they do not have to
# be edited into the file; the original placeholders remain the defaults.
usr = os.environ.get("RESPEKT_USER", "username")
pwd = os.environ.get("RESPEKT_PASS", "password")

# Always emit UTF-8, independent of the console/pipe encoding.  On Python 3
# the byte stream lives at sys.stdout.buffer; on Python 2 sys.stdout itself
# is the byte stream.
out = codecs.getwriter("utf-8")(getattr(sys.stdout, "buffer", sys.stdout))


def _render_article(doc):
    """Return one article page as an HTML fragment.

    doc is the PyQuery document of an article page.  The fragment holds the
    heading, author, lead paragraph and body, one per line.  Parts whose
    selector matches nothing are rendered empty.
    """
    # .text() yields plain text, so it has to be escaped before being put
    # back into markup; .html() already is markup and is passed through.
    nadpis = escape(doc("div#detail h1").text() or "")
    autor = escape(doc("div.d-date-author a:last-child").text() or "")
    perex = escape(doc("div.d-perex").text() or "")
    obsah = doc("div.d-text").html() or ""
    return "\n".join([
        "<h1>" + nadpis + "</h1>",
        "<em>" + autor + "</em>",
        "<p><strong>" + perex + "</strong></p>",
        obsah,
    ]) + "\n"


# Log in through the site's login form.
br = mechanize.Browser()
br.open("http://www.respekt.cz")
br.select_form(name="login")
br["login[nick]"] = usr
br["login[pass]"] = pwd
br.submit()

# Collect the article links from the current-issue index page.
br.open("http://respekt.ihned.cz/aktualni-cislo/")
html = br.response().read()
doc = pq(html)
linky = doc("div.ow-enclose div.ow h2 a:first-child")

# Tell the browser which encoding the emitted bytes use.
out.write('<meta charset="utf-8">\n')

for l in linky:
    link = pq(l).attr("href")
    if not link:
        # An anchor without an href gives us nothing to fetch.
        continue
    br.open(link)
    htmlArt = br.response().read()
    out.write(_render_article(pq(htmlArt)))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment