Last active
June 11, 2019 15:12
-
-
Save limitedeternity/fd08d6a988aa4acc58b50f58c7bcab58 to your computer and use it in GitHub Desktop.
Utility to fetch data from https://opop.herzen.spb.ru/upload/scanned_docs/ by faculty code
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// FileSaver.min.js (minified third-party library, kept verbatim).
//
// UMD-style wrapper whose factory builds a `saveAs(blobOrUrl, name, opts)`
// function and publishes it as `saveAs` on the detected global object
// (window / self / global), and as `module.exports` when `module` exists.
//
// Internal helpers (single-letter names come from the minifier):
//   b(blob, opts) - when `opts.autoBom` is set and the blob type declares a
//                   UTF-8 text/XML charset, returns a new Blob prefixed with
//                   "\uFEFF"; otherwise returns the blob unchanged.
//   c(url, name, opts) - downloads `url` as a blob via XMLHttpRequest and
//                   passes the response to saveAs.
//   d(url)        - synchronous HEAD request; true for a 2xx status.
//   e(node)       - dispatches a synthetic "click" on `node`, falling back to
//                   `document.createEvent` when `new MouseEvent` throws.
//
// The saveAs implementation is selected once, in this order:
//   1. a no-op when `saveAs` already exists on the global or the global is
//      not the window;
//   2. an <a download> click when HTMLAnchorElement supports `download`
//      (object URLs are revoked after 4E4 ms);
//   3. `navigator.msSaveOrOpenBlob` when available;
//   4. otherwise a popup / FileReader data-URL / object-URL fallback.
(function(a,b){if("function"==typeof define&&define.amd)define([],b);else if("undefined"!=typeof exports)b();else{b(),a.FileSaver={exports:{}}.exports}})(this,function(){"use strict";function b(a,b){return"undefined"==typeof b?b={autoBom:!1}:"object"!=typeof b&&(console.warn("Depricated: Expected third argument to be a object"),b={autoBom:!b}),b.autoBom&&/^\s*(?:text\/\S*|application\/xml|\S*\/\S*\+xml)\s*;.*charset\s*=\s*utf-8/i.test(a.type)?new Blob(["\uFEFF",a],{type:a.type}):a}function c(b,c,d){var e=new XMLHttpRequest;e.open("GET",b),e.responseType="blob",e.onload=function(){a(e.response,c,d)},e.onerror=function(){console.error("could not download file")},e.send()}function d(a){var b=new XMLHttpRequest;return b.open("HEAD",a,!1),b.send(),200<=b.status&&299>=b.status}function e(a){try{a.dispatchEvent(new MouseEvent("click"))}catch(c){var b=document.createEvent("MouseEvents");b.initMouseEvent("click",!0,!0,window,0,0,0,80,20,!1,!1,!1,!1,0,null),a.dispatchEvent(b)}}var f="object"==typeof window&&window.window===window?window:"object"==typeof self&&self.self===self?self:"object"==typeof global&&global.global===global?global:void 0,a=f.saveAs||"object"!=typeof window||window!==f?function(){}:"download"in HTMLAnchorElement.prototype?function(b,g,h){var i=f.URL||f.webkitURL,j=document.createElement("a");g=g||b.name||"download",j.download=g,j.rel="noopener","string"==typeof b?(j.href=b,j.origin===location.origin?e(j):d(j.href)?c(b,g,h):e(j,j.target="_blank")):(j.href=i.createObjectURL(b),setTimeout(function(){i.revokeObjectURL(j.href)},4E4),setTimeout(function(){e(j)},0))}:"msSaveOrOpenBlob"in navigator?function(f,g,h){if(g=g||f.name||"download","string"!=typeof f)navigator.msSaveOrOpenBlob(b(f,h),g);else if(d(f))c(f,g,h);else{var i=document.createElement("a");i.href=f,i.target="_blank",setTimeout(function(){e(i)})}}:function(a,b,d,e){if(e=e||open("","_blank"),e&&(e.document.title=e.document.body.innerText="downloading..."),"string"==typeof a)return c(a,b,d);var 
g="application/octet-stream"===a.type,h=/constructor/i.test(f.HTMLElement)||f.safari,i=/CriOS\/[\d]+/.test(navigator.userAgent);if((i||g&&h)&&"object"==typeof FileReader){var j=new FileReader;j.onloadend=function(){var a=j.result;a=i?a:a.replace(/^data:[^;]*;/,"data:attachment/file;"),e?e.location.href=a:location=a,e=null},j.readAsDataURL(a)}else{var k=f.URL||f.webkitURL,l=k.createObjectURL(a);e?e.location=l:location.href=l,e=null,setTimeout(function(){k.revokeObjectURL(l)},4E4)}};f.saveAs=a.saveAs=a,"undefined"!=typeof module&&(module.exports=a)}); | |
// when-dom-ready.min.js (minified third-party library, kept verbatim).
//
// UMD-style wrapper that exposes `whenDomReady([callback], [doc])`:
//   - returns a Promise that resolves (with undefined) immediately when
//     `doc.readyState` is "interactive" or "complete", otherwise on the
//     document's "DOMContentLoaded" event;
//   - `doc` defaults to `window.document`; a truthy non-function first
//     argument is treated as the document instead of a callback;
//   - when a callback is given it is scheduled with `setTimeout` at the
//     moment the promise resolves.
// `whenDomReady.resume(doc)` returns a function that waits for readiness and
// then passes its argument through unchanged (for use inside promise chains).
!function(e,n){"object"==typeof exports&&"undefined"!=typeof module?module.exports=n():"function"==typeof define&&define.amd?define(n):e.whenDomReady=n()}(this,function(){"use strict";var i=["interactive","complete"],t=function(t,o){return new Promise(function(e){t&&"function"!=typeof t&&(o=t,t=null),o=o||window.document;var n=function(){return e(void(t&&setTimeout(t)))};-1!==i.indexOf(o.readyState)?n():o.addEventListener("DOMContentLoaded",n)})};return t.resume=function(n){return function(e){return t(n).then(function(){return e})}},t}); | |
/**
 * Collects the URLs of all links on the current page whose visible text looks
 * like a PDF file name for the given year.
 *
 * A link is selected when its `innerText` matches `^<year>(.+)\.pdf$`, i.e. it
 * starts with the year, has at least one more character, and ends with a
 * literal ".pdf" extension.
 *
 * @param {number|string} [year] - Year prefix to match; defaults to the
 *   current calendar year.
 * @returns {string} The `href` of every matching link, one per line
 *   (an empty string when nothing matches).
 */
function createLinkList(year = new Date().getFullYear()) {
  // The backslash must be doubled: inside a template literal `\.` cooks to a
  // bare `.`, which would make the regex accept ANY character before "pdf".
  const pdfNamePattern = new RegExp(`^${year}(.+)\\.pdf$`);

  return Array.from(document.querySelectorAll('a[href]'))
    .filter((el) => pdfNamePattern.test(el.innerText))
    .map((el) => el.href)
    .join('\n')
    .trim();
}
/**
 * Packs the link list produced by `createLinkList()` into a UTF-8 plain-text
 * Blob and hands it to `saveAs` under the file name "result.txt".
 */
function downloadResult() {
  const mimeType = 'text/plain;charset=utf-8;';
  const fileName = 'result.txt';
  const payload = new Blob([createLinkList()], { type: mimeType });
  saveAs(payload, fileName);
}
// Entry point: wait for the DOM, scroll to the bottom of the page, then
// trigger the download of the collected link list.
(async () => {
  await whenDomReady();
  window.scrollTo(0, document.body.scrollHeight);
  downloadResult();
})();
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from uuid import uuid4 | |
from os import unlink | |
import asyncio | |
from tqdm import tqdm | |
from tika import parser | |
from aiohttp import ClientSession | |
from aiofiles import open as aiopen | |
def analyzePdf(fname, code='09.03.02'):
    """Keep a downloaded PDF only if its extracted text mentions ``code``.

    The file is parsed with Tika (``parser.from_file``). When no text was
    extracted, or the text does not contain ``code``, the file is deleted.

    Args:
        fname: Path of the PDF file to inspect.
        code: Substring that must occur in the extracted text for the file to
            be kept. Defaults to ``'09.03.02'``, the value this script
            originally hard-coded.

    Returns:
        None. The only effect is the removal of non-matching files.
    """
    text = parser.from_file(fname)['content']
    # The falsy check covers a missing/empty 'content' before the substring test.
    if not text or code not in text:
        unlink(fname)
async def retrieveAndProcess(url, semaphore):
    """Download ``url`` into a randomly named ``.pdf`` file and analyze it.

    The whole operation runs while holding ``semaphore``, which is how the
    caller bounds the number of concurrent downloads. The response body is
    streamed to disk in 32 KiB (``1 << 15``) chunks and the finished file is
    then passed to ``analyzePdf``.

    Args:
        url: Address of the document to download.
        semaphore: An ``asyncio.Semaphore`` (or compatible async context
            manager) limiting concurrency.

    Raises:
        Whatever the HTTP client or file layer raises. If the download fails
        or is cancelled, the partially written file is removed before the
        exception is re-raised.
    """
    chunkSize = 1 << 15
    async with semaphore:
        fname = uuid4().hex[:15] + '.pdf'
        try:
            async with ClientSession() as session:
                # ssl=False is kept from the original script (certificate checks are skipped).
                async with session.get(url, ssl=False) as response:
                    async with aiopen(fname, mode='wb') as pdfWriteObj:
                        chunk = await response.content.read(chunkSize)
                        while chunk:
                            await pdfWriteObj.write(chunk)
                            chunk = await response.content.read(chunkSize)
        except BaseException:
            # Do not leave a truncated download behind; BaseException is used
            # so that task cancellation is cleaned up as well.
            try:
                unlink(fname)
            except FileNotFoundError:
                pass  # the failure happened before the file was created
            raise
        analyzePdf(fname)
async def main():
    """Download and analyze every URL listed in ``result.txt``.

    Reads ``result.txt`` line by line (one URL per line), schedules a
    ``retrieveAndProcess`` call per URL with at most 4 running at once, and
    awaits them as they complete while showing a ``tqdm`` progress bar.

    Raises:
        The first exception raised by any download/analysis task.
    """
    semaphore = asyncio.Semaphore(4)
    processTasks = []
    async with aiopen('result.txt', mode='r') as lines:
        async for line in lines:
            url = line.strip()
            # Blank lines would otherwise be scheduled as an empty URL and
            # abort the whole run when the request fails.
            if url:
                processTasks.append(retrieveAndProcess(url, semaphore))
    for f in tqdm(asyncio.as_completed(processTasks), total=len(processTasks)):
        await f
if __name__ == '__main__':
    # Create the loop explicitly: asyncio.get_event_loop() without a running
    # loop is deprecated and no longer creates one on recent Python versions.
    loop = asyncio.new_event_loop()
    asyncio.set_event_loop(loop)
    try:
        loop.run_until_complete(main())
        # NOTE(review): presumably a grace period that lets the HTTP client's
        # underlying connections finish closing before the loop goes away — confirm.
        loop.run_until_complete(asyncio.sleep(5.250))
    finally:
        # Close the loop even when main() raised.
        loop.close()
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
aiofiles | |
aiohttp | |
tika | |
tqdm |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
A Java Runtime Environment (JRE) must be installed for Tika to work properly.