Skip to content

Instantly share code, notes, and snippets.

@sguzman
Last active June 5, 2022 15:52
Show Gist options
  • Save sguzman/d7865e3139d86ff9639419ddb4df9826 to your computer and use it in GitHub Desktop.
Save sguzman/d7865e3139d86ff9639419ddb4df9826 to your computer and use it in GitHub Desktop.
Download pdfs
const fs = require('node:fs');
const https = require('node:https');
const u = require('node:url');
const p = require('node:path');
const cheerio = require('cheerio');
const winston = require('winston');
// --------------------------------------------------
const { createLogger, format, transports } = winston;
const { combine, timestamp, printf } = format;

// Line layout for every record: "<timestamp> <level>: <message>".
// The timestamp field is populated by the timestamp() format combined in below.
const myFormat = printf((entry) => `${entry.timestamp} ${entry.level}: ${entry.message}`);

// Single console-backed logger shared by the info/debug/error helpers.
// NOTE(review): the threshold is 'warn'; under winston's default level
// ordering that presumably suppresses info and debug records — confirm
// this is intended.
const log = createLogger({
  level: 'warn',
  format: combine(timestamp(), myFormat),
  transports: [new transports.Console()],
});
/**
 * Logs `message` at info level on the shared logger.
 *
 * The call is deferred with a timer, so the record is emitted after the
 * currently running code has finished rather than synchronously.
 *
 * @param {*} message - Value handed to `log.info`.
 */
function info(message) {
  const emit = () => log.info(message);
  setTimeout(emit, 0);
}
/**
 * Logs `message` at debug level on the shared logger.
 *
 * Like the other logging helpers in this file, the actual `log.debug` call
 * is pushed onto a timer instead of running inline.
 *
 * @param {*} message - Value handed to `log.debug`.
 */
function debug(message) {
  const emit = () => log.debug(message);
  setTimeout(emit, 0);
}
/**
 * Logs `message` at error level on the shared logger.
 *
 * The `log.error` call runs from a timer callback, not synchronously.
 *
 * @param {*} message - Value handed to `log.error` (callers pass both
 *   strings and Error objects).
 */
function error(message) {
  const emit = () => log.error(message);
  setTimeout(emit, 0);
}
// --------------------------------------------------
// Startup marker, routed through the deferred info() helper.
info('hi');
/**
 * Creates a directory beneath the local `./data` folder.
 *
 * @param {string} path - Directory path, resolved relative to `./data`.
 * @param {Function} func - Callback passed straight through to `fs.mkdir`.
 */
function mkdir(path, func) {
  info(`mkdir ${path}`);
  fs.mkdir(p.join('./data', path), func);
}
/**
 * Returns the last '/'-separated segment of `path`.
 *
 * A path with no '/' is returned unchanged; a path ending in '/' yields ''.
 *
 * @param {string} path - Slash-separated path.
 * @returns {string} The trailing segment.
 */
function filename(path) {
  const segments = path.split('/');
  return segments[segments.length - 1];
}
/**
 * Returns everything in `path` before its last '/'.
 *
 * A path containing no '/' has no base and yields ''.
 *
 * @param {string} path - Slash-separated path.
 * @returns {string} The leading portion, without the trailing separator.
 */
function basePath(path) {
  const lastSlash = path.lastIndexOf('/');
  if (lastSlash === -1) {
    return '';
  }
  return path.slice(0, lastSlash);
}
/**
 * Extracts the `path` component of a URL via the legacy `url.parse` API.
 *
 * @param {string|object} url - URL to inspect; callers in this file also pass
 *   an object previously returned by `url.parse`.
 * @returns {string} The `path` property of the parsed URL.
 */
function path(url) {
  const { path: resourcePath } = u.parse(url);
  return resourcePath;
}
/**
 * Fetches `url` over HTTPS and hands the complete response body to `func`.
 *
 * @param {string|object} url - Target accepted by `https.get`. Only `url.path`
 *   is read here, for log messages; callers in this file pass the object
 *   returned by `url.parse`.
 * @param {(body: string) => void} func - Called once with the whole body as
 *   text after the response ends. It is not called when the request or the
 *   response stream fails; the failure is logged instead.
 */
function get(url, func) {
  debug(`get ${url.path}`);
  https.get(url, (resp) => {
    // Decode at the stream level: appending raw Buffer chunks to a string
    // would corrupt any multi-byte character split across two chunks.
    resp.setEncoding('utf8');
    let data = '';
    // A chunk of data has been received.
    resp.on('data', (chunk) => {
      data += chunk;
    });
    // The whole response has been received.
    resp.on('end', () => {
      func(data);
    });
    // Without a listener, a mid-stream failure would be an uncaught 'error' event.
    resp.on('error', (err) => {
      error(`get ${url.path}: ${err.message}`);
    });
  }).on('error', (err) => {
    // Use the shared logger rather than console.log so failures do not get
    // mixed into the script's regular output.
    error(`get ${url.path}: ${err.message}`);
  });
}
/**
 * Read-through file cache in front of `get`.
 *
 * The body for `url` is looked up at `./data/<url path>`. On a hit the stored
 * text is passed to `func`; on a miss the page is fetched with `get`, passed
 * to `func`, and written to that same location for next time.
 *
 * @param {object} url - Parsed URL object (as returned by `url.parse`);
 *   `url.path` is used for logging and to derive the cache file location.
 * @param {(body: string) => void} func - Receives the page body as text.
 */
function cache(url, func) {
  debug(`cache ${url.path}`);
  // Read and write must use the same location; reading the bare URL path
  // (an absolute filesystem path) could never find what was written under ./data.
  const fullPath = p.join('./data', path(url));
  // Decode as UTF-8 so a hit yields a string, matching what a miss yields.
  fs.readFile(fullPath, 'utf8', (readErr, cached) => {
    if (!readErr) {
      debug(`cache hit ${url.path}`);
      func(cached);
      return;
    }
    // Any read failure is treated as a miss.
    info(`cache miss ${url.path}`);
    get(url, (body) => {
      debug(`cache write ${url.path}`);
      // The parent directory may not exist yet; create it before writing.
      fs.mkdir(p.dirname(fullPath), { recursive: true }, (mkdirErr) => {
        if (mkdirErr) {
          error(mkdirErr);
          return;
        }
        fs.writeFile(fullPath, body, (writeErr) => {
          if (writeErr) {
            error(writeErr);
          }
        });
      });
      // The caller does not wait for the cache write to complete.
      func(body);
    });
  });
}
/**
 * Prints the PDF download URL for each archive.org item listed below.
 *
 * Every details page is loaded through `cache`, parsed with cheerio, and the
 * `href` of its PDF download link is printed to stdout prefixed with
 * `https://archive.org`. Pages on which no such link is found are reported
 * through `error` instead of printing a malformed URL.
 */
function main() {
  const urls = [
    "https://archive.org/details/bruinlife9091univ",
    "https://archive.org/details/bruinlife1984univ",
    "https://archive.org/details/southerncampus1963univ",
    "https://archive.org/details/bruinlife1990univ",
    "https://archive.org/details/bruinlife1988univ",
    "https://archive.org/details/bruinlife1986univ",
    "https://archive.org/details/bruinlife1985univ",
    "https://archive.org/details/southerncampus1982univ",
    "https://archive.org/details/bruinlife9495univ",
    "https://archive.org/details/southerncampus1972pt1univ",
    "https://archive.org/details/southerncampus1970univ",
    "https://archive.org/details/southerncampus1968univ",
    "https://archive.org/details/southerncampus1947univ",
    "https://archive.org/details/bruinlife9192univ",
    "https://archive.org/details/southerncampus1964univ",
    "https://archive.org/details/southerncampus1948univ",
    "https://archive.org/details/bruinlife1996univ",
    "https://archive.org/details/southerncampus1969univ",
    "https://archive.org/details/bruinlife1983univ",
    "https://archive.org/details/southerncampus1965univ",
    "https://archive.org/details/southerncampus1981univ",
    "https://archive.org/details/southerncampus1955univ",
    "https://archive.org/details/southerncampus1976univ",
    "https://archive.org/details/southerncampus1953univ",
    "https://archive.org/details/southerncampus1949univ",
    "https://archive.org/details/southerncampus1958univ",
    "https://archive.org/details/southerncampus1957univ",
    "https://archive.org/details/southerncampus1980univ",
    "https://archive.org/details/bruinlife1997univ",
    "https://archive.org/details/southerncampus1966univ",
    "https://archive.org/details/southerncampus1928univ",
    "https://archive.org/details/bruinlife1998univ",
    "https://archive.org/details/southerncampus1935univ",
    "https://archive.org/details/southerncampus1960univ",
    "https://archive.org/details/southerncampus1946univ",
    "https://archive.org/details/bruinlife1999univ",
    "https://archive.org/details/southerncampus1930univ",
    "https://archive.org/details/southerncampus1959univ",
    "https://archive.org/details/southerncampus1934univ",
    "https://archive.org/details/southerncampus1950univ",
    "https://archive.org/details/southerncampus1977univ",
    "https://archive.org/details/southerncampus1939univ",
    "https://archive.org/details/southerncampus1967univ",
    "https://archive.org/details/southerncampus1937univ",
    "https://archive.org/details/southerncampus1942univ",
    "https://archive.org/details/southerncampus1941univ",
    "https://archive.org/details/bruinlife9293univ",
    "https://archive.org/details/southerncampus1932univ",
    "https://archive.org/details/southerncampushon1954univ",
    "https://archive.org/details/southerncampus1943univ",
    "https://archive.org/details/southerncampus1924univ",
    "https://archive.org/details/southerncampus1952univ",
    "https://archive.org/details/southerncampus1944univ",
    "https://archive.org/details/southerncampus1978univ",
    "https://archive.org/details/southerncampus1929univ",
    "https://archive.org/details/southerncampushon1931univ",
    "https://archive.org/details/southerncampus1940univ",
    "https://archive.org/details/southerncampus1956univ",
    "https://archive.org/details/southerncampus1933univ",
    "https://archive.org/details/southerncampus1962univ",
    "https://archive.org/details/southerncampus1938univ",
    "https://archive.org/details/southerncampus1951univ",
    "https://archive.org/details/southerncampus1979univ",
    "https://archive.org/details/southerncampus1975univ",
    "https://archive.org/details/southerncampus1921univ",
    "https://archive.org/details/southerncampus1954univ",
    "https://archive.org/details/southerncampus1972pt2univ",
    "https://archive.org/details/southerncampus1925univ",
    "https://archive.org/details/exponentlosangel1906stat",
    "https://archive.org/details/southerncampushon1959univ",
    "https://archive.org/details/exponentlosangel1907stat",
    "https://archive.org/details/southerncampus1926univ",
    "https://archive.org/details/southerncampus1961univ",
    "https://archive.org/details/southerncampus1931univ",
    "https://archive.org/details/exponentlosangel1903stat",
    "https://archive.org/details/southerncampu1920univ",
    "https://archive.org/details/southerncampus1974univ",
    "https://archive.org/details/southerncampus1927univ",
    "https://archive.org/details/southerncampus1922univ",
    "https://archive.org/details/southerncampus1945univ",
    "https://archive.org/details/exponentlosangel1902sumrstat",
    "https://archive.org/details/exponentlosangel1910stat",
    "https://archive.org/details/southerncampus1971pt2univ",
    "https://archive.org/details/southerncampus1923univ",
    "https://archive.org/details/exponentlosangel1900stat",
    "https://archive.org/details/exponentlosangel1902wintstat",
    "https://archive.org/details/exponentlosangel1908stat",
    "https://archive.org/details/exponentlosangel1899stat",
    "https://archive.org/details/exponentlosangel1901sumrstat",
    "https://archive.org/details/exponentlosangel1901wintstat",
    "https://archive.org/details/exponentlosangel1911stat",
    "https://archive.org/details/southerncampus1966suppuniv",
    "https://archive.org/details/southerncampus1936univ",
    "https://archive.org/details/exponentlosangel1913stat",
    "https://archive.org/details/exponentlosangel1904stat",
    "https://archive.org/details/southerncampus1973univ",
    "https://archive.org/details/exponentlosangel1917stat",
    "https://archive.org/details/exponentlosangel1905stat",
    "https://archive.org/details/exponentlosangel1909stat",
    "https://archive.org/details/exponentlosangel1914stat",
    "https://archive.org/details/southerncampus1971pt1univ",
    "https://archive.org/details/exponentlosangel1916stat",
    "https://archive.org/details/exponentlosangel1915stat",
    "https://archive.org/details/exponentlosangel1912stat"
  ];
  urls.forEach((url) => {
    debug(url);
    // cache() and get() read `url.path`, so pass the parsed object, not the string.
    const parsedUrl = u.parse(url);
    cache(parsedUrl, (data) => {
      const doc = cheerio.load(data);
      const href = doc('a.format-summary.download-pill[href$="pdf"]').attr('href');
      // No matching anchor (error page, changed markup, item without a PDF):
      // report it rather than printing "https://archive.orgundefined".
      if (href == null) {
        error(`no pdf link found on ${url}`);
        return;
      }
      const pdf = `https://archive.org${href}`;
      console.log(pdf);
    });
  });
}
// Kick off the scrape. main() only starts callback-based work, so it returns
// before any page has been fetched or parsed.
main();
// Queued right after main() returns; it does not signal that the downloads finished.
info('bye');
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment