Last active
February 1, 2021 20:08
-
-
Save Wikinaut/2231c23f36d4d9ad52f1bc5c02f26e4c to your computer and use it in GitHub Desktop.
Erstellt lokal eine kompakte Webseite aller "Langen Nächte" von DLF/DLFKultur
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env bash
#
# Builds a compact local web page listing all "Lange Nacht" broadcasts of
# DLF/DLFKultur.
#
# Files written to the current directory:
#   lange-naechte.html        listing fragments of every overview page, with
#                             relative links turned into absolute URLs
#   allurls.txt               every deutschlandfunkkultur.de URL found in it
#   lange-naechte.local.html  the listing with URLs rewritten so that they
#                             point at the files wget saved locally
#   ...plus every page and image downloaded by wget.
#
# Requires: bash, wget, grep, sed, awk.
# License: WTFPL

set -euo pipefail

readonly BASE_URL='https://www.deutschlandfunkkultur.de/'
# Same URL as a regular expression (dots escaped) for grep/sed patterns.
readonly BASE_URL_RE='https://www\.deutschlandfunkkultur\.de/'
readonly LIST_URL="${BASE_URL}lange-nacht.1023.de.html"
readonly OUT_HTML='lange-naechte.html'
readonly URL_LIST='allurls.txt'
readonly LOCAL_HTML='lange-naechte.local.html'

warn() { printf 'warning: %s\n' "$*" >&2; }
die() { printf 'error: %s\n' "$*" >&2; exit 1; }

#######################################
# Reads the number of overview pages from the pagination marker
# ("... Seite 1/N") of the first overview page.
# Globals:   LIST_URL (read)
# Outputs:   the page count as a decimal number on stdout
# Returns:   0 on success, 1 if the page cannot be fetched or no valid
#            count is found
#######################################
fetch_page_count() {
  local listing count
  listing=$(wget -q -O - "${LIST_URL}") || return 1
  # grep exits non-zero when the marker is missing; that case is caught by
  # the numeric validation below, so the pipeline status is ignored here.
  count=$(grep -oE 'paginationanzahl.*Seite ./[0-9]+' <<<"${listing}" \
    | sed -nE 's|.*/([0-9]+).*|\1|p') || true
  # Keep only the first hit in case the marker occurs on several lines.
  count=${count%%$'\n'*}
  [[ "${count}" =~ ^[0-9]+$ ]] || return 1
  # Force base 10 so a value with leading zeros is not read as octal.
  printf '%d\n' "$((10#${count}))"
}

#######################################
# Downloads one overview page and appends its content lines to OUT_HTML.
# Globals:   BASE_URL, OUT_HTML (read)
# Arguments: $1 - URL of the overview page
# Returns:   non-zero if the download or the filtering failed
#######################################
append_listing_page() {
  local page_url=$1
  # Only lines containing "pageContent" are kept. On those lines:
  #  - heading and image links get the site's base URL prepended,
  #  - a "</br>" is inserted after every "/span>",
  #  - the pagination span and the rest of its line are removed.
  wget -q -O - "${page_url}" \
    | sed \
      -e '/pageContent/!d' \
      -e "s|<h3><a href=\"|<h3><a href=\"${BASE_URL}|g" \
      -e "s|a class=\"image\" href=\"|a class=\"image\" href=\"${BASE_URL}|g" \
      -e 's|/span>|/span></br>|g' \
      -e 's|<span class="drk-paginationanzahl".*||g' \
      >>"${OUT_HTML}"
}

main() {
  local page_count page page_url

  page_count=$(fetch_page_count) \
    || die "could not determine the number of overview pages from ${LIST_URL}"
  printf '%s Webseiten "Lange Nacht"\n' "${page_count}"

  # Start from an empty listing; this also guarantees that the file exists
  # for the later steps even if no page could be appended.
  : >"${OUT_HTML}"

  # For a quick trial run, temporarily cap page_count here (e.g. to 3).
  for ((page = 1; page <= page_count; page++)); do
    page_url="${LIST_URL}?drbm:page=${page}"
    printf 'fetching... %s\n' "${page_url}"
    # Best effort: a single failed page must not abort the whole run.
    append_listing_page "${page_url}" || warn "could not fetch ${page_url}"
  done

  # Extract all URLs. Each article is linked from both its heading and its
  # image, so duplicates are dropped while keeping first-seen order.
  grep -oE "${BASE_URL_RE}[^\"]*" "${OUT_HTML}" \
    | awk '!seen[$0]++' >"${URL_LIST}" \
    || warn "no URLs found in ${OUT_HTML}"

  # Download all referenced pages and images (-N: skip files that are
  # already up to date). wget reports a failure if any single URL fails.
  wget -N -i "${URL_LIST}" \
    || warn "some downloads failed; see the wget output above"

  # Patch URLs so that they point at the local file names: strip the base
  # URL and the thumbnail directory prefix, and percent-encode the "?" that
  # is part of the saved file names.
  sed \
    -e "s|${BASE_URL_RE}||g" \
    -e 's|[?]dram:article_id|%3Fdram:article_id|g' \
    -e 's|[?]key=|%3Fkey=|g' \
    -e 's|media/thumbs/./||g' \
    "${OUT_HTML}" >"${LOCAL_HTML}"
}

main "$@"
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment