Last active
May 28, 2021 13:54
-
-
Save cboulanger/6be3e5aedb198d4a50e9320e373f02f5 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import {default as fetch} from 'node-fetch'; | |
const { pdf } = require("pdf-to-img"); | |
import {tmpdir} from "os"; | |
import {createWriteStream, createReadStream} from 'fs'; | |
import * as fsp from 'fs/promises' | |
import * as archiver from 'archiver'; | |
import {ArchiverError} from "archiver"; | |
import * as path from "path"; | |
import {Parser, Builder} from "xml2js"; | |
/**
 * JSON payload the login service answers with.
 * Only `auth` is read by the client in this file; the remaining fields are
 * declared for completeness.
 */
interface LoginResponse {
  /** Token that is appended to subsequent service URLs. */
  auth: string;
  /** Expiry of the token — NOTE(review): unit/epoch is not visible here, confirm against the server docs. */
  expires: number;
  /** Account the token was issued for. */
  user: {
    id: number;
    name: string;
    email: string;
    institute: string;
    admin: boolean;
  };
}
/**
 * Optional descriptive attributes of a book.
 * Every field may be omitted; an empty object is a valid value.
 */
export type BookMetadata = {
  author?: string;
  title?: string;
  description?: string;
  language?: string;
  profilerUrl?: string;
  histPatterns?: string[];
  year?: number;
  pooled?: boolean;
};
/**
 * Minimal client for the PocoWeb REST API: logs in, packages a PDF plus its
 * ABBYY FineReader XML into a project archive, and uploads that archive.
 */
export class PocowebApi {
  /** Base URL of the server, stored without trailing slashes. */
  private readonly endpoint: string;
  /** Token returned by the login service; unset until `authenticate` succeeds. */
  private token?: string;

  /**
   * @param endpoint The API endpoint, without the /rest postfix
   */
  constructor(endpoint: string) {
    // Trailing slashes would otherwise produce ".../​/rest/..." URLs.
    this.endpoint = endpoint.replace(/\/+$/, "");
  }

  /**
   * Given a service path of the REST API (without the "rest/" prefix), return
   * the final URL containing the authentication token.
   *
   * The passed `params` object is not modified. Entries whose value is
   * `undefined` or `null` are left out of the query string; all other values
   * are converted with `String()`.
   *
   * @param path Service path below `/rest/`
   * @param params Optional query parameters
   * @returns The absolute service URL including the `auth` parameter
   * @throws Error if the client has not been authenticated yet
   * @protected
   */
  protected serviceUrl(path: string, params: {[key: string]: any} = {}): string {
    if (!this.token) {
      throw new Error(`Client is not authenticated yet`);
    }
    const query = new URLSearchParams();
    // Work on a copy so the token never leaks into the caller's object.
    for (const [key, value] of Object.entries({...params, auth: this.token})) {
      if (value === undefined || value === null) {
        continue;
      }
      query.append(key, String(value));
    }
    const url = new URL(`${this.endpoint}/rest/${path}`);
    url.search = query.toString();
    return url.toString();
  }

  /**
   * Authenticates with the PocoWeb server and stores the returned token for
   * subsequent calls.
   *
   * @param email
   * @param password
   * @throws Error if the request fails, the server answers with a non-2xx
   *   status, or the response carries no token
   */
  public async authenticate(email: string, password: string): Promise<void> {
    const response = await fetch(`${this.endpoint}/rest/login`, {
      method: "POST",
      // The body is JSON, so declare it as such.
      headers: {"Content-Type": "application/json"},
      body: JSON.stringify({email, password})
    });
    let loginResponse: Partial<LoginResponse> | undefined;
    try {
      loginResponse = (await response.json()) as Partial<LoginResponse>;
    } catch {
      // A non-JSON body is reported as a failed login below.
    }
    const token = loginResponse?.auth;
    if (!response.ok || !token) {
      throw new Error(`Could not authenticate (HTTP ${response.status} ${response.statusText})`);
    }
    this.token = token;
  }

  /**
   * Creates a PocoWeb-compliant project archive that can be uploaded
   * from a source PDF and its corresponding ABBYY FineReader XML document.
   *
   * Each PDF page is rendered to `page-NNN.png` and paired with a
   * `page-NNN.xml` containing the XML document reduced to that page. The zip
   * is written to the OS temp directory and named after the PDF; rendered
   * images are staged in a private temp directory that is removed afterwards.
   *
   * @param pdfPath
   * @param xmlPath
   * @returns Path to the created zip document
   * @throws Error if the XML cannot be read/parsed, the PDF cannot be
   *   rendered, or the archive cannot be written
   */
  public async createArchive(pdfPath: string, xmlPath: string): Promise<string> {
    // Parse the XML first so a bad input fails before any file is created.
    const xmlDoc = await new Parser().parseStringPromise(await fsp.readFile(xmlPath, "utf8"));
    if (!xmlDoc?.document) {
      throw new Error(`${xmlPath} does not contain a <document> root element`);
    }
    const pages = xmlDoc.document.page ?? [];

    const zipPath = path.join(tmpdir(), `${path.parse(pdfPath).name}.zip`);
    // Private staging directory: fixed file names in the shared temp dir
    // would collide when two archives are built at the same time.
    const workDir = await fsp.mkdtemp(path.join(tmpdir(), "pocoweb-"));

    const archive = archiver('zip', {
      zlib: { level: 9 }
    });
    const writeStream = createWriteStream(zipPath);
    const written = new Promise<void>((resolve, reject) => {
      writeStream.on('close', () => resolve());
      writeStream.on('error', reject);
      archive.on('warning', (err: ArchiverError) => {
        if (err.code === 'ENOENT') {
          console.warn(err.message);
        } else {
          reject(err);
        }
      });
      archive.on('error', (err: ArchiverError) => {
        reject(err);
      });
    });
    // Mark as handled; the rejection is still observed where `written` is awaited.
    written.catch(() => undefined);
    archive.pipe(writeStream);

    try {
      const xmlSerializer = new Builder();
      let counter = 1;
      const pdfDoc = await pdf(pdfPath, {scale: 3});
      for await (const page of pdfDoc) {
        const fileId = `page-${String(counter).padStart(3, "0")}`;
        const imgFileName = `${fileId}.png`;
        const imgFilePath = path.join(workDir, imgFileName);
        await fsp.writeFile(imgFilePath, page);
        console.log(`Adding ${imgFileName}`);
        archive.file(imgFilePath, { name: imgFileName });
        // Select the XML description of this page without altering the parsed document.
        const pageXml = pages[counter - 1];
        if (pageXml === undefined) {
          console.warn(`No XML page description found for ${imgFileName}`);
        }
        const xml = xmlSerializer.buildObject({
          ...xmlDoc,
          document: {...xmlDoc.document, page: pageXml}
        });
        const xmlFileName = `${fileId}.xml`;
        console.log(`Adding ${xmlFileName}`);
        archive.append(xml, { name: xmlFileName });
        counter++;
      }
      // Rejects as soon as either the archiver or the output stream fails.
      await Promise.all([archive.finalize(), written]);
    } catch (err) {
      archive.abort();
      writeStream.destroy();
      throw err;
    } finally {
      await fsp.rm(workDir, {recursive: true, force: true});
    }
    console.log(`Finished writing archive ${zipPath} (${Math.round(archive.pointer() / 1024)} kb).`);
    return zipPath;
  }

  /**
   * Uploads an archive to the `books` service. The metadata is sent as query
   * parameters.
   *
   * @param archivePath
   * @param meta
   * @throws Error if the client is not authenticated, the archive cannot be
   *   read, the request fails, or the server response contains a `message`,
   *   `status` or `code` field
   */
  public async uploadArchive(archivePath: string, meta: BookMetadata = {}): Promise<void> {
    const url = this.serviceUrl("books", meta);
    console.log(`Uploading ${path.basename(archivePath)} to ${this.endpoint}`);
    const body = createReadStream(archivePath);
    // Turn read errors (e.g. missing file) into a rejection of this call;
    // throwing from inside the listener would be an uncaught exception.
    const streamFailed = new Promise<never>((_resolve, reject) => {
      body.on('error', reject);
    });
    const response = await Promise.race([
      fetch(url, {
        method: "POST",
        headers: {
          "Content-Type": "application/zip"
        },
        body
      }),
      streamFailed
    ]);
    const result = await response.json();
    // error?
    if (result.message || result.status || result.code) {
      throw new Error(`${result.code} ${result.status}: ${result.message}`);
    }
    console.log(result);
  }
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/**
 * Logs in to the PocoWeb server configured in the environment, builds a
 * project archive from the given PDF and XML, and uploads it. The file name
 * of the PDF without its extension is used as the title.
 *
 * Reads `POCOWEB_URL`, `POCOWEB_EMAIL` and `POCOWEB_PASSWD` from
 * `process.env`.
 *
 * @param pdfPath Path to the source PDF
 * @param xmlPath Path to the XML document that belongs to the PDF
 * @throws Error if one of the required environment variables is missing, or
 *   if authentication, archive creation or the upload fails
 */
async function uploadPdf(pdfPath: string, xmlPath: string): Promise<void> {
  const {POCOWEB_URL, POCOWEB_EMAIL, POCOWEB_PASSWD} = process.env;
  if (!POCOWEB_URL || !POCOWEB_EMAIL || !POCOWEB_PASSWD) {
    throw new Error("POCOWEB_URL, POCOWEB_EMAIL and POCOWEB_PASSWD must be set in the environment");
  }
  const pw = new PocowebApi(POCOWEB_URL);
  // SECURITY: this disables TLS certificate verification for the whole
  // process, not only for the PocoWeb host. Prefer trusting the server's
  // certificate (e.g. via NODE_EXTRA_CA_CERTS) where possible.
  process.env.NODE_TLS_REJECT_UNAUTHORIZED = "0"; // to get around self-signed certificate error
  await pw.authenticate(POCOWEB_EMAIL, POCOWEB_PASSWD);
  const archivePath = await pw.createArchive(pdfPath, xmlPath);
  // Strip the extension by position, not by the first ".pdf" substring.
  const title = path.parse(pdfPath).name;
  await pw.uploadArchive(archivePath, {title});
  console.log(`${title} uploaded.`);
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment