jwoglom · October 13, 2020 02:35 · dxmar03 · Sep 11, 2024 · aarontbk · Apr 24, 2025
diff --git a/data.py b/data.py
 # Template for data.py: replace both values with the console.info output of
 # the get_urls.py browser-console script.
 # title: download_perusall.py derives the working folder name from it
 # (spaces become '-') and uses it as the base name of the final PDF.
 title = "The title of the article"
 # urls: one chunk-image URL per line; download_perusall.py downloads the
 # lines in order and skips empty ones.
 urls="""
 <image URLs scraped from the page>
 """
diff --git a/download_perusall.py b/download_perusall.py
 # dependencies: imagemagick (provides `convert`), img2pdf
 #
 # Downloads every chunk image listed in data.py into a folder named after the
 # title, stacks the chunks into page images with ImageMagick, then bundles the
 # pages into "<title>.pdf" with img2pdf.
 import os
 import subprocess

 import requests

 from data import title, urls

 # Number of consecutive chunk images stacked vertically to form one page.
 CHUNKS_PER_PAGE = 6
 # Seconds to wait on a single image request before giving up.
 REQUEST_TIMEOUT = 60

 folder = title.replace(' ', '-')
 os.makedirs(folder, exist_ok=True)

 # Strip before testing for emptiness so whitespace-only lines are skipped
 # instead of being sent to requests as an empty URL.
 chunk_urls = [line.strip() for line in urls.splitlines() if line.strip()]
 if not chunk_urls:
     raise SystemExit('No image URLs found in data.py')

 chunk_paths = []
 for i, url in enumerate(chunk_urls):
     print('Downloading chunk', i, 'of', title)
     response = requests.get(url, timeout=REQUEST_TIMEOUT)
     # Fail loudly rather than saving an HTTP error page as a .png.
     response.raise_for_status()
     path = '{}/{:0>2}.png'.format(folder, i)
     with open(path, 'wb') as out:
         out.write(response.content)
     chunk_paths.append(path)

 page_paths = []
 for start in range(0, len(chunk_paths), CHUNKS_PER_PAGE):
     pgno = len(page_paths) + 1
     print('Converting page', pgno)
     page_path = '{}/page_{}.png'.format(folder, pgno)
     page_chunks = chunk_paths[start:start + CHUNKS_PER_PAGE]
     # Argument lists (no shell) keep names containing spaces or shell
     # metacharacters intact; check=True stops on a failed conversion.
     subprocess.run(['convert', '-append'] + page_chunks + [page_path], check=True)
     page_paths.append(page_path)

 print('Converting to pdf')
 subprocess.run(['img2pdf'] + page_paths + ['-o', '{}.pdf'.format(title)], check=True)
 print('Done')
diff --git a/get_urls.py b/get_urls.py
 /*
 * Click on a reading in the Perusall web interface,
 * and run this script in the developer console.
 * Copy-and-paste the console.info output to data.py.
 *
 * Every 2 seconds the script scrolls the last img.chunk element into view.
 * Once the number of img.chunk elements has stopped growing for several
 * consecutive checks, it prints the image URLs and the last segment of the
 * page path as Python assignments and stops.
 * To abort early, run clearInterval(i).
 */
 var len = 0;    // highest number of chunk images seen so far
 var times = 0;  // consecutive checks without any new chunk image
 var i = setInterval(() => {
   var img = document.querySelectorAll("img.chunk");
   // Nothing may be rendered yet; indexing an empty list would throw.
   if (img.length > 0) {
     img[img.length-1].scrollIntoView();
   }
   if (len < img.length) {
     len = img.length;
     // New chunks appeared: restart the stall count so earlier slow
     // stretches cannot end the scrape before the whole reading is loaded.
     times = 0;
   } else if (times > 3) {
     clearInterval(i);
     if (img.length === 0) {
       console.warn('No img.chunk elements found; open a reading before running this script.');
       return;
     }
     var urls = [];
     img.forEach((e) => urls.push(e.src));
     var spl = location.pathname.split('/');
     // JSON.stringify double-quotes the title and escapes quotes and
     // backslashes, so the printed line stays a valid Python string literal.
     console.info('urls = """\n'+urls.join('\n')+'\n"""\n\ntitle='+JSON.stringify(spl[spl.length-1])+'\n');
   } else {
     times++;
   }
 }, 2000);
	# Template for data.py: replace both values with the console.info output of
	# the get_urls.py browser-console script.
	# title: download_perusall.py derives the working folder name from it
	# (spaces become '-') and uses it as the base name of the final PDF.
	title = "The title of the article"
	# urls: one chunk-image URL per line; download_perusall.py downloads the
	# lines in order and skips empty ones.
	urls="""
	<image URLs scraped from the page>
	"""
	# dependencies: imagemagick (provides `convert`), img2pdf
	#
	# Downloads every chunk image listed in data.py into a folder named after the
	# title, stacks the chunks into page images with ImageMagick, then bundles the
	# pages into "<title>.pdf" with img2pdf.
	import os
	import subprocess

	import requests

	from data import title, urls

	# Number of consecutive chunk images stacked vertically to form one page.
	CHUNKS_PER_PAGE = 6
	# Seconds to wait on a single image request before giving up.
	REQUEST_TIMEOUT = 60

	folder = title.replace(' ', '-')
	os.makedirs(folder, exist_ok=True)

	# Strip before testing for emptiness so whitespace-only lines are skipped
	# instead of being sent to requests as an empty URL.
	chunk_urls = [line.strip() for line in urls.splitlines() if line.strip()]
	if not chunk_urls:
	    raise SystemExit('No image URLs found in data.py')

	chunk_paths = []
	for i, url in enumerate(chunk_urls):
	    print('Downloading chunk', i, 'of', title)
	    response = requests.get(url, timeout=REQUEST_TIMEOUT)
	    # Fail loudly rather than saving an HTTP error page as a .png.
	    response.raise_for_status()
	    path = '{}/{:0>2}.png'.format(folder, i)
	    with open(path, 'wb') as out:
	        out.write(response.content)
	    chunk_paths.append(path)

	page_paths = []
	for start in range(0, len(chunk_paths), CHUNKS_PER_PAGE):
	    pgno = len(page_paths) + 1
	    print('Converting page', pgno)
	    page_path = '{}/page_{}.png'.format(folder, pgno)
	    page_chunks = chunk_paths[start:start + CHUNKS_PER_PAGE]
	    # Argument lists (no shell) keep names containing spaces or shell
	    # metacharacters intact; check=True stops on a failed conversion.
	    subprocess.run(['convert', '-append'] + page_chunks + [page_path], check=True)
	    page_paths.append(page_path)

	print('Converting to pdf')
	subprocess.run(['img2pdf'] + page_paths + ['-o', '{}.pdf'.format(title)], check=True)
	print('Done')
	/*
	* Click on a reading in the Perusall web interface,
	* and run this script in the developer console.
	* Copy-and-paste the console.info output to data.py.
	*
	* Every 2 seconds the script scrolls the last img.chunk element into view.
	* Once the number of img.chunk elements has stopped growing for several
	* consecutive checks, it prints the image URLs and the last segment of the
	* page path as Python assignments and stops.
	* To abort early, run clearInterval(i).
	*/
	var len = 0;    // highest number of chunk images seen so far
	var times = 0;  // consecutive checks without any new chunk image
	var i = setInterval(() => {
	  var img = document.querySelectorAll("img.chunk");
	  // Nothing may be rendered yet; indexing an empty list would throw.
	  if (img.length > 0) {
	    img[img.length-1].scrollIntoView();
	  }
	  if (len < img.length) {
	    len = img.length;
	    // New chunks appeared: restart the stall count so earlier slow
	    // stretches cannot end the scrape before the whole reading is loaded.
	    times = 0;
	  } else if (times > 3) {
	    clearInterval(i);
	    if (img.length === 0) {
	      console.warn('No img.chunk elements found; open a reading before running this script.');
	      return;
	    }
	    var urls = [];
	    img.forEach((e) => urls.push(e.src));
	    var spl = location.pathname.split('/');
	    // JSON.stringify double-quotes the title and escapes quotes and
	    // backslashes, so the printed line stays a valid Python string literal.
	    console.info('urls = """\n'+urls.join('\n')+'\n"""\n\ntitle='+JSON.stringify(spl[spl.length-1])+'\n');
	  } else {
	    times++;
	  }
	}, 2000);