Skip to content

Instantly share code, notes, and snippets.

@limitedeternity
Last active June 11, 2019 15:12
Show Gist options
  • Select an option

  • Save limitedeternity/fd08d6a988aa4acc58b50f58c7bcab58 to your computer and use it in GitHub Desktop.

Select an option

Save limitedeternity/fd08d6a988aa4acc58b50f58c7bcab58 to your computer and use it in GitHub Desktop.
Utility to fetch scanned PDF documents from https://opop.herzen.spb.ru/upload/scanned_docs/ and keep only those that mention a given faculty code
// FileSaver.min.js
// Vendored, minified FileSaver build. The outer wrapper is a UMD-style loader: it registers
// the factory through AMD `define` when available, just runs the factory when `exports`
// is defined, and otherwise runs the factory and assigns an (empty) `FileSaver` object on
// the outer `this`.
//
// Inside the factory (minified names):
//   b(blob, opts)     - returns the blob prefixed with a "\uFEFF" BOM when `opts.autoBom` is
//                       set and the blob type is a UTF-8 text/XML type; otherwise returns the
//                       blob unchanged. A non-object `opts` triggers a console warning and is
//                       converted to `{autoBom: !opts}`.
//   c(url, name, opts) - fetches `url` as a blob via XMLHttpRequest and hands the response to
//                       saveAs; logs "could not download file" on error.
//   d(url)            - issues a *synchronous* HEAD request and returns true for a 2xx status.
//   e(node)           - dispatches a synthetic "click" on `node`, falling back to
//                       `document.createEvent`/`initMouseEvent` when `new MouseEvent` throws.
//   f                 - the detected global object (`window`, `self` or `global`).
//   a                 - the selected saveAs implementation:
//                         * anchor with the `download` attribute (blobs go through an object
//                           URL that is revoked after 4E4 ms; string URLs are clicked directly
//                           when same-origin, otherwise HEAD-checked and downloaded via c(),
//                           otherwise opened with target "_blank");
//                         * `navigator.msSaveOrOpenBlob` when present;
//                         * a fallback that opens a "downloading..." popup and navigates it to
//                           a FileReader data URL or an object URL.
// NOTE(review): `f.saveAs || typeof window != "object" || window !== f ? function(){} : ...`
// parses as `(A || B || C) ? noop : impl`, so a pre-existing `f.saveAs` also selects the
// no-op function - confirm this is intended before relying on an already-defined saveAs.
//
// Finally the chosen function is published as `f.saveAs` (and `a.saveAs`), and as
// `module.exports` when `module` is defined.
(function(a,b){if("function"==typeof define&&define.amd)define([],b);else if("undefined"!=typeof exports)b();else{b(),a.FileSaver={exports:{}}.exports}})(this,function(){"use strict";function b(a,b){return"undefined"==typeof b?b={autoBom:!1}:"object"!=typeof b&&(console.warn("Depricated: Expected third argument to be a object"),b={autoBom:!b}),b.autoBom&&/^\s*(?:text\/\S*|application\/xml|\S*\/\S*\+xml)\s*;.*charset\s*=\s*utf-8/i.test(a.type)?new Blob(["\uFEFF",a],{type:a.type}):a}function c(b,c,d){var e=new XMLHttpRequest;e.open("GET",b),e.responseType="blob",e.onload=function(){a(e.response,c,d)},e.onerror=function(){console.error("could not download file")},e.send()}function d(a){var b=new XMLHttpRequest;return b.open("HEAD",a,!1),b.send(),200<=b.status&&299>=b.status}function e(a){try{a.dispatchEvent(new MouseEvent("click"))}catch(c){var b=document.createEvent("MouseEvents");b.initMouseEvent("click",!0,!0,window,0,0,0,80,20,!1,!1,!1,!1,0,null),a.dispatchEvent(b)}}var f="object"==typeof window&&window.window===window?window:"object"==typeof self&&self.self===self?self:"object"==typeof global&&global.global===global?global:void 0,a=f.saveAs||"object"!=typeof window||window!==f?function(){}:"download"in HTMLAnchorElement.prototype?function(b,g,h){var i=f.URL||f.webkitURL,j=document.createElement("a");g=g||b.name||"download",j.download=g,j.rel="noopener","string"==typeof b?(j.href=b,j.origin===location.origin?e(j):d(j.href)?c(b,g,h):e(j,j.target="_blank")):(j.href=i.createObjectURL(b),setTimeout(function(){i.revokeObjectURL(j.href)},4E4),setTimeout(function(){e(j)},0))}:"msSaveOrOpenBlob"in navigator?function(f,g,h){if(g=g||f.name||"download","string"!=typeof f)navigator.msSaveOrOpenBlob(b(f,h),g);else if(d(f))c(f,g,h);else{var i=document.createElement("a");i.href=f,i.target="_blank",setTimeout(function(){e(i)})}}:function(a,b,d,e){if(e=e||open("","_blank"),e&&(e.document.title=e.document.body.innerText="downloading..."),"string"==typeof a)return c(a,b,d);var 
g="application/octet-stream"===a.type,h=/constructor/i.test(f.HTMLElement)||f.safari,i=/CriOS\/[\d]+/.test(navigator.userAgent);if((i||g&&h)&&"object"==typeof FileReader){var j=new FileReader;j.onloadend=function(){var a=j.result;a=i?a:a.replace(/^data:[^;]*;/,"data:attachment/file;"),e?e.location.href=a:location=a,e=null},j.readAsDataURL(a)}else{var k=f.URL||f.webkitURL,l=k.createObjectURL(a);e?e.location=l:location.href=l,e=null,setTimeout(function(){k.revokeObjectURL(l)},4E4)}};f.saveAs=a.saveAs=a,"undefined"!=typeof module&&(module.exports=a)});
// when-dom-ready.min.js
// Vendored, minified when-dom-ready build wrapped in a UMD loader (CommonJS `module.exports`,
// AMD `define`, or `whenDomReady` on the outer `this`).
//
// whenDomReady(cb, doc) returns a Promise that resolves (with undefined) once `doc.readyState`
// is "interactive" or "complete" - immediately if it already is, otherwise on the
// "DOMContentLoaded" event. `doc` defaults to `window.document`. If the first argument is
// truthy but not a function it is treated as the document instead of a callback. A callback,
// when given, is scheduled with `setTimeout` at the moment the promise resolves.
//
// whenDomReady.resume(doc) returns a function `value => Promise<value>` that waits for the DOM
// and then passes its input through unchanged (usable inside a promise chain).
!function(e,n){"object"==typeof exports&&"undefined"!=typeof module?module.exports=n():"function"==typeof define&&define.amd?define(n):e.whenDomReady=n()}(this,function(){"use strict";var i=["interactive","complete"],t=function(t,o){return new Promise(function(e){t&&"function"!=typeof t&&(o=t,t=null),o=o||window.document;var n=function(){return e(void(t&&setTimeout(t)))};-1!==i.indexOf(o.readyState)?n():o.addEventListener("DOMContentLoaded",n)})};return t.resume=function(n){return function(e){return t(n).then(function(){return e})}},t});
/**
 * Collects the URLs of every anchor on the page whose visible text has the form
 * "<year><something>.pdf".
 *
 * @param {number|string} [year] - Prefix the link text must start with. Defaults to
 *   the current calendar year, which is what the function always used before this
 *   parameter existed.
 * @returns {string} The matching `href` values joined with "\n" (empty string when
 *   nothing matches).
 */
function createLinkList(year = new Date().getFullYear()) {
  // The backslash must be doubled: inside a template literal "\." collapses to a
  // bare ".", which would match ANY character in front of "pdf" instead of a dot.
  // Built once here rather than once per anchor.
  const pdfNamePattern = new RegExp(`^${year}(.+)\\.pdf$`);

  return Array.from(document.querySelectorAll('a[href]'))
    .filter((anchor) => pdfNamePattern.test(anchor.innerText))
    .map((anchor) => anchor.href)
    .join('\n')
    .trim();
}
/**
 * Wraps the text produced by createLinkList() in a UTF-8 plain-text Blob and
 * hands it to saveAs() so the browser offers it as "result.txt".
 */
function downloadResult() {
  const blobOptions = { type: 'text/plain;charset=utf-8;' };
  const linkListBlob = new Blob([createLinkList()], blobOptions);
  saveAs(linkListBlob, 'result.txt');
}
// Entry point: wait until the DOM is ready, scroll to the very bottom of the
// page, then trigger the download of the collected link list.
(async function exportLinksWhenReady() {
  await whenDomReady();
  window.scrollTo(0, document.body.scrollHeight);
  downloadResult();
})();
from uuid import uuid4
from os import unlink
import asyncio
from tqdm import tqdm
from tika import parser
from aiohttp import ClientSession
from aiofiles import open as aiopen
def analyzePdf(fname, code='09.03.02'):
    """Keep the PDF at ``fname`` only if its extracted text mentions ``code``.

    The file is parsed with Tika; when no text could be extracted, or the text
    does not contain ``code``, the file is deleted from disk.

    :param fname: path of the downloaded PDF file.
    :param code: faculty/direction code to look for; defaults to the value that
        used to be hard-coded ('09.03.02').
    :returns: None.
    """
    parsed = parser.from_file(fname)
    # .get(): presumably Tika can return a result without a 'content' entry when
    # parsing fails - treat that like "no text" instead of raising KeyError.
    text = parsed.get('content')
    if not text or code not in text:
        unlink(fname)
async def retrieveAndProcess(url, semaphore):
    """Download ``url`` to a randomly named ``.pdf`` file and analyse it.

    The whole operation runs while holding ``semaphore``, which bounds the
    number of concurrent downloads. After a successful download the file is
    passed to ``analyzePdf``, which deletes it when it is not relevant.

    :param url: address of the document to download.
    :param semaphore: ``asyncio.Semaphore`` shared by all download tasks.
    :returns: None.
    :raises: any exception raised by the HTTP request or the file write is
        re-raised after the partially written file has been removed.
    """
    async with semaphore:
        fname = uuid4().hex[:15] + '.pdf'
        try:
            async with ClientSession() as session:
                # NOTE(review): ssl=False turns off certificate verification;
                # kept as in the original - confirm it is still required.
                async with session.get(url, ssl=False) as response:
                    if response.status >= 400:
                        # An HTTP error body is not a PDF: skip it instead of
                        # writing it to disk and sending it through Tika.
                        return
                    async with aiopen(fname, mode='wb') as pdfWriteObj:
                        async for chunk in response.content.iter_chunked(1 << 15):
                            await pdfWriteObj.write(chunk)
        except BaseException:
            # Do not leave a truncated download behind when the transfer fails
            # or the task is cancelled.
            try:
                unlink(fname)
            except FileNotFoundError:
                pass
            raise
        analyzePdf(fname)
async def main():
    """Read URLs from ``result.txt`` and process them with bounded concurrency.

    Each non-empty line of ``result.txt`` is treated as one URL. At most four
    downloads run at a time (limited by a shared semaphore); progress is shown
    with a tqdm bar that advances as tasks complete.

    :returns: None.
    """
    semaphore = asyncio.Semaphore(4)
    async with aiopen('result.txt', mode='r') as lines:
        urls = [line.strip() async for line in lines]
    # Skip blank lines (e.g. a trailing newline at the end of the file): an
    # empty URL would presumably make the request raise and abort the run.
    processTasks = [retrieveAndProcess(url, semaphore) for url in urls if url]
    for f in tqdm(asyncio.as_completed(processTasks), total=len(processTasks)):
        await f
if __name__ == '__main__':
    # Create the loop explicitly: calling asyncio.get_event_loop() without a
    # running loop is deprecated in recent Python versions.
    loop = asyncio.new_event_loop()
    asyncio.set_event_loop(loop)
    try:
        loop.run_until_complete(main())
        # Grace period before closing the loop - presumably lets the HTTP
        # client's underlying connections shut down cleanly (TODO confirm the
        # 5.25 s value is still needed).
        loop.run_until_complete(asyncio.sleep(5.250))
    finally:
        # Always release the loop, even when main() raised.
        loop.close()
aiofiles
aiohttp
tika
tqdm
@limitedeternity
Copy link
Copy Markdown
Author

JRE is required for Tika to work properly.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment