Skip to content

Instantly share code, notes, and snippets.

@hclivess
Created April 21, 2021 12:51
Show Gist options
  • Save hclivess/0744bc3ec53dc0fb0b2a87ab777995fc to your computer and use it in GitHub Desktop.
Save hclivess/0744bc3ec53dc0fb0b2a87ab777995fc to your computer and use it in GitHub Desktop.
librivox old
# Imports and module-level setup for the LibriVox/archive.org scraper.
# NOTE(review): Thread, glob, os.path, http.cookiejar, time, datetime, date
# and lock are imported/created but never used in this script.
from threading import Thread
import threading
lock = threading.Lock()
import glob
import os.path
#import cookielib
import http.cookiejar
import re
import time
import datetime
from datetime import date
import socket
import requests
# NOTE(review): `global` at module scope is a no-op — these three statements
# have no effect; the names are never assigned or read anywhere below.
global run_length
global i_controller
global matches_id_controller
def download_file(url, i, page):
    """Stream-download *url* into the current directory.

    The local filename is "<i> - <page> - <basename of url>" so files from
    repeated runs stay ordered by crawl position.

    Args:
        url:  Direct download URL.
        i:    Item index within the listing page (used only in the filename).
        page: Listing page number (used only in the filename).

    Returns:
        The local filename the content was written to.
    """
    local_filename = str(i) + " - " + str(page) + " - " + url.split('/')[-1]
    # stream=True avoids loading the (possibly multi-GB) archive into memory.
    # Fix: use the response as a context manager so the connection is closed —
    # the original never closed it, leaking a connection per download.
    with requests.get(url, stream=True, headers=hdr) as r:
        with open(local_filename, 'wb') as f:
            for chunk in r.iter_content(chunk_size=1024):
                if chunk:  # filter out keep-alive new chunks
                    f.write(chunk)
    return local_filename
# timeout in seconds
timeout = 6
# Applies to every socket the script opens (including those used by requests
# when no explicit timeout is passed).
socket.setdefaulttimeout(timeout)
# timeout in seconds
# set header
# Browser-like headers sent with every request so archive.org serves the
# normal HTML listing/detail pages.
hdr = {
'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36',
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
'Accept-Encoding': 'gzip, deflate, sdch',
'Accept-Language': 'cs,sk;q=0.8,en;q=0.6,en-GB;q=0.4',
'Connection': 'keep-alive'}
# set header
limit = 100000000  # effectively "run forever": far beyond the real page count
page_init = 9  # listing page to resume from
index = 72  # on the resume page, skip items with a 1-based index below this
page = page_init
# Crawl the archive.org LibriVox listing page by page: scrape each album's
# detail page for a whole-album ZIP link (falling back to a single MP3 link)
# and download it via download_file().
while page < limit:
    print("page: " + str(page))
    request = requests.get("https://archive.org/details/librivoxaudio?&sort=-downloads&page=" + str(page) + "",
                           headers=hdr)
    print(request)
    geturl_readable = request.text
    # Fix: raw strings — the original patterns relied on invalid string
    # escapes such as "\-" (DeprecationWarning, a SyntaxError in future
    # Python). The regexes themselves are unchanged.
    album_pages = re.findall(r"item\-ttl C C2\">[\n\r\t\w]+ \<a href\=\"([\w\_\/\-\.]+)", geturl_readable)
    album_pages_urls = []
    for match in album_pages:
        album_pages_urls.append("https://archive.org" + match)
    print(album_pages_urls)
    # i is 1-based at the point download_file() is called (the original
    # incremented its counter before downloading), so enumerate from 1.
    for i, album_url in enumerate(album_pages_urls, start=1):
        print(album_url)
        request2 = requests.get(album_url, headers=hdr)
        geturl_readable2 = request2.text
        # Prefer the whole-album ZIP; fall back to any MP3 link.
        # Fix: the original used a bare `except:` around album_zip[0], which
        # also swallowed unrelated errors; test the cheap condition instead.
        album_zip = re.findall(r"\"(\/compress.*zip)", geturl_readable2)
        if not album_zip:
            album_zip = re.findall(r"\"(\/.*mp3)", geturl_readable2)
        print(album_zip[0])
        dl_this = "https://archive.org" + album_zip[0]
        print(dl_this)
        if page == page_init:
            # On the resume page, skip items already downloaded.
            print(index)
            print(i)
            if i >= index:
                download_file(dl_this, i, page)
        else:
            download_file(dl_this, i, page)
    page = page + 1
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment