Last active
May 22, 2023 13:10
-
-
Save Hashbrown777/47e45954950c0b07dd00c01de46f50c5 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| (async () => { | |
// Leading part of every URL that is dropped when building a local path:
// the site root plus the alphabetical "Composers X-Y" bucket directory.
const ignore = /^http:\/\/waltercosand.com\/CosandScores\/Composers%20[^/]+\//;

// Stack of URLs still to visit. Entries are taken from the end with pop(),
// so the buckets are listed last-to-first to be visited in A-Z order.
const urls = [
    'http://waltercosand.com/CosandScores/Composers%20Q-Z/',
    'http://waltercosand.com/CosandScores/Composers%20L-P/',
    'http://waltercosand.com/CosandScores/Composers%20E-K/',
    'http://waltercosand.com/CosandScores/Composers%20A-D/'
];

// Turns a list of keys into a plain object usable with `key in lookup`.
const toLookup = (keys) => {
    const lookup = {};
    for (const key of keys)
        lookup[key] = true;
    return lookup;
};

// Directory URLs that are tested with `url in skip` and then not opened.
const skip = toLookup([
    'http://waltercosand.com/CosandScores/Composers%20L-P/Mozart,%20W.%20A/Mozart%20-%20Complete%20Works%20for%20Piano/'
]);

// Directory URLs that are tested with `url in special` and then parsed by
// their headings and links instead of as a file listing.
const special = toLookup([
    'http://waltercosand.com/CosandScores/Composers%20L-P/Mozart,%20W.%20A/Orchestral_Works/'
]);
//I have no idea what charset these are
//but through googling the rest of the text that contains them
//I have the extant codepoints mapped below
// Each pair is [escape as it appears in a URL, hex code unit of the intended
// character]; the table is keyed by the lower-cased escape.
const decoded = [
    ['+%a1', 'ed'],
    ['+%a3', 'dc'],
    ['+%a6', 'f1'],
    ['+%ac', 'ea'],
    ['+%ae', 'e9'],
    ['+%bd', 'eb'],
    ['+%bf', 'e8'],
    ['+%c2', 'f6'],
    ['+%e4', 'c4'],
    ['+%eb', 'c9'],
    ['+%f1', 'e4'],
    ['-%a6', 'b0']
].reduce(
    (table, [code, hex]) => {
        table[code] = String.fromCharCode(parseInt(hex, 16));
        return table;
    },
    {}
);

/**
 * Decodes one escape sequence into the character(s) it stands for.
 *
 * Used as the replacer for matches of the percent-escape pattern, so only the
 * first argument (the matched text) is read.
 *
 * @param {string} code - the matched escape, e.g. `%20`, `%C3%A9`, `%e9` or `+%ae`.
 * @returns {string} the decoded text.
 * @throws {Error} when a `+`/`-` prefixed escape is missing from the table.
 * @throws {URIError} when decodeURIComponent rejects the sequence.
 */
function decodeChar(code) {
    // "+%xx" / "-%xx" are the unidentified encoding: table lookup only.
    if (code[0] === '+' || code[0] === '-') {
        const char = decoded[code.toLowerCase()];
        if (char === undefined)
            throw new Error(`No known character for escape "${code}"`);
        return char;
    }
    // ASCII escapes (first hex digit below 8) and multi-byte sequences go
    // through the standard UTF-8 decoder.
    if (code[1] < '8' || code.length > 3)
        return decodeURIComponent(code);
    // A lone high byte: use the byte value directly as the code unit.
    return String.fromCharCode(parseInt(code.slice(1), 16));
}
/**
 * Builds the regex source for `count` percent-escaped UTF-8 continuation
 * bytes (`%80`-`%bf`).
 *
 * @param {number} count - how many continuation bytes must follow.
 * @param {boolean} [not] - when truthy, wrap the pattern in a negative
 *     lookahead so it asserts that the bytes do NOT follow.
 * @returns {string} regex source, or '' when `count` is below 1.
 */
function utf8Tail(count, not) {
    if (count < 1)
        return '';
    let output = '%[89ab][0-9a-f]';
    if (count > 1)
        output = `(?:${output}){${count}}`;
    if (not)
        output = `(?!${output})`;
    return output;
}

// Whitespace runs that should become one plain space: two or more whitespace
// characters including a space, or any whitespace that is not a space.
const doubleWhitespace = /\s* \s+|\s+ \s*|[^\S ]+/g;
// Characters removed from a path segment, plus leading/trailing spaces and dots.
const unsafe = /[\x00\\/:*?"<>|]+|^[ .]+|[. ]+$/g;
// Matches only the empty string.
const empty = /^$/;

// Matches every percent-escape in a path segment, one whole character at a time.
const customDecode = new RegExp([
    //utf8
    '%(' + [
        '[0-7][0-9a-f]' + utf8Tail(0),
        '[cd][0-9a-f]' + utf8Tail(1),
        'e[0-9a-f]' + utf8Tail(2),
        'f[0-7]' + utf8Tail(3)
    ].join('|') + ')',
    //extended ascii & weird encodings
    '[+-]?%(' + [
        //invalid utf8 starting byte
        '[8-9ab][0-9a-f]',
        'f[89a-f]',
        //invalid utf trailing bytes
        '[cd][0-9a-f]' + utf8Tail(1, true),
        'e[0-9a-f]' + utf8Tail(2, true),
        'f[0-7]' + utf8Tail(3, true)
    ].join('|') + ')'
].join('|'), 'gi');

//Goldenweiser
// Same pattern with every literal `%` written as `%25`, for names whose
// percent signs were themselves escaped. Built from the pattern's source and
// flags rather than by eval-ing its string form.
const doubleEncoding = new RegExp(
    customDecode.source.replace(/%/g, '%25'),
    customDecode.flags
);
/**
 * Converts one URL path segment into a readable file or directory name.
 *
 * @param {string} part - a single segment of the URL path, still escaped.
 * @returns {string} the decoded, cleaned segment; '_' if nothing is left.
 */
function interpret(part) {
    // Undo one level of escaping where the percent signs were escaped too.
    const singleEncoded = part.replace(doubleEncoding, decodeURIComponent);
    // Turn every remaining escape into its character.
    const readable = singleEncoded.replace(customDecode, decodeChar);
    // Collapse whitespace runs, then drop the characters matched by `unsafe`.
    const collapsed = readable.replace(doubleWhitespace, ' ');
    const cleaned = collapsed.replace(unsafe, '');
    // An empty result gets a placeholder name.
    return cleaned.replace(empty, '_');
}
// The results are streamed into the document of a newly opened window as a
// JSON array inside a <pre> element.
const output = open().document;
output.write('<pre>[');

// Whether an entry has been written yet; decides if a separator is needed.
let hasEntries = false;
// Characters that HTML would otherwise interpret inside the <pre>.
const htmlEscapes = {'&': '&amp;', '<': '&lt;', '>': '&gt;'};

/**
 * Appends one entry to the JSON array in the output document.
 *
 * The comma is written before every entry except the first, so the array
 * stays valid no matter how many URLs are still queued when the last entry
 * is written. The JSON text is HTML-escaped so the page displays it verbatim.
 *
 * @param {object} obj - the value to serialise as the next array element.
 */
function write(obj) {
    const json = JSON
        .stringify(obj, undefined, 4)
        .replace(/(?<=^|\n)/g, ' ')
    ;
    if (hasEntries)
        output.write(',');
    hasEntries = true;
    output.write('\n');
    output.write(json.replace(/[&<>]/g, (char) => htmlEscapes[char]));
}
/**
 * Resolves once the document in `win` has readyState 'complete'.
 *
 * Listens for the window's 'load' event and, as a fallback, polls the
 * readyState: first after 5 seconds, then every 100 ms. The pending timer is
 * always cleared before returning.
 *
 * @param {Window} win - a window returned by open().
 */
async function waitUntilLoaded(win) {
    const isComplete = () => win.document && win.document.readyState === 'complete';
    let timer;
    try {
        await new Promise((resolve) => {
            function poll(wait) {
                timer = setTimeout(poll, wait || 100);
                if (isComplete())
                    resolve();
            }
            poll(5000);
            if (isComplete())
                resolve();
            else
                win.addEventListener('load', resolve);
        });
    }
    finally {
        clearTimeout(timer);
    }
}

// Depth-first walk over the URL stack: files are written out, directories are
// opened in a window and their links pushed back onto the stack.
for (let url; (url = urls.pop());) {
    // Anything not ending in '/' is a file: record its local path and move on.
    if (!url.endsWith('/')) {
        let path;
        try {
            path = url
                .replace(ignore, '')
                .split('/')
                .map(interpret)
                .join('/')
            ;
        }
        catch (cause) {
            // Keep the underlying decoding error attached instead of dropping it.
            throw new Error(`Could not derive a file path from ${url}`, {cause});
        }
        write({path, url});
        continue;
    }

    if (url in skip)
        continue;

    const dir = open(url);
    if (!dir)
        throw new Error(`Could not open a window for ${url}`);

    try {
        await waitUntilLoaded(dir);

        if (url in special) {
            // Links on this page are grouped under <h1> headings; an <hr>
            // ends the current group, and links outside a group are ignored.
            const parents = url
                .replace(ignore, '')
                .replace(/\/$/, '')
                .split('/')
            ;
            let heading = null;
            for (
                const {textContent, href, tagName}
                of Array.from(dir.document.querySelectorAll('h1, a, hr'))
            ) {
                if (tagName === 'H1') {
                    heading = textContent;
                    continue;
                }
                if (tagName === 'HR') {
                    heading = null;
                    continue;
                }
                if (!heading)
                    continue;

                // File name is the link text plus the extension of the target.
                const file = textContent + '.' + href.replace(/.*[.]/, '');
                write({
                    path : parents
                        // Escaped first so interpret() treats them like URL segments.
                        .concat([heading, file].map((part) => encodeURIComponent(part)))
                        .map(interpret)
                        .join('/')
                    ,
                    url : href
                });
            }
        }
        else {
            // Queue every link that stays below this directory. They are pushed
            // in reverse so that pop() yields them in page order. A page with
            // no such links is treated as an empty directory.
            for (
                const {href}
                of Array.from(dir.document.querySelectorAll('td a')).reverse()
            ) {
                if (href.startsWith(url))
                    urls.push(href);
            }
        }
    }
    finally {
        // Close the window even when processing the page throws.
        dir.close();
    }
}

output.write('\n]</pre>');
output.close();
| })() |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| . "$PSScriptRoot/_async.ps1" | |
# Load the list of { path, url } entries from _links.json.
$links = Get-Content _links.json | ConvertFrom-Json

# Entries that share the same path are renamed in place: each one is moved
# under _dupes/ and gets _1, _2, ... inserted before its file extension.
$links `
| Group-Object -Property path `
| Where-Object { $_.Count -gt 1 } `
| ForEach-Object {
    $index = 0
    $path = '_dupes/' + $_.Group[0].path
    $_.Group `
    | ForEach-Object {
        ++$index
        # The extension may not contain a path separator; otherwise a dot in a
        # directory name would be taken as the start of the extension whenever
        # the file itself has none.
        $_.path = ([regex]'(?=(\.[^./\\]+)?$)').Replace($path, "_$index", 1)
    }
}
# Download every entry through the Async helper dot-sourced from _async.ps1.
# NOTE(review): the meaning of -Expected and -BatchSize is defined in
# _async.ps1, which is not shown here.
$links `
| Async `
    -Expected $links.Count `
    -BatchSize 10 `
    -Func { Process {
        # Nothing to do when the target file already exists.
        if (Test-Path -LiteralPath $_.path) {
            return
        }

        # Temporary download name: the MD5 of the target path as hex.
        # Every byte is written as exactly two digits so that different
        # hashes can never produce the same string.
        $alias = (
            [System.Security.Cryptography.MD5CryptoServiceProvider]::new().ComputeHash(
                [System.Text.UTF8Encoding]::new().GetBytes($_.path)
            ) `
            | ForEach-Object { $_.ToString('x2') }
        ) -join ''
        $errors = "$alias.error"

        try {
            # The error stream of the whole block is collected in $errors.
            .{
                if (Split-Path -Path $_.path) {
                    New-Item `
                        -Type Directory `
                        -Force `
                        -Path $(Split-Path -Path $_.path) `
                    | Out-Null
                }

                $ProgressPreference = 'SilentlyContinue'
                # Download under the alias first, then move into place.
                Invoke-WebRequest $_.url -OutFile $alias
                Move-Item $alias $_.path
            } 2>$errors
        }
        catch {
            "`r`n---`r`n" >>$errors
            $Error >>$errors
        }
        finally {
            # Keep the error file next to the target only if something was logged.
            if ((Get-ChildItem $errors).Length) {
                "`r`n---`r`n" >>$errors
                $alias >>$errors
                Move-Item $errors "$($_.path).error.log"
            }
            else {
                Remove-Item $errors
            }
        }
    } }
# List every *.error.log file below the current directory in _errors.log.
Get-ChildItem -Recurse -Filter '*.error.log' |
    Out-File '_errors.log'

# Report files with identical content in _dupes.log. Files are grouped by
# length first so that only candidates of equal size are hashed, then grouped
# by hash; paths are written with the current directory prefix removed.
$currentDirectoryPattern = "$([regex]::escape($(pwd)))"
Get-ChildItem -Recurse -File -Exclude '_dupes.log' |
    Group-Object -Property Length |
    Where-Object { $_.Count -gt 1 } |
    ForEach-Object { $_.Group } |
    Get-FileHash |
    Group-Object -Property Hash |
    Where-Object { $_.Count -gt 1 } |
    ForEach-Object { $_.Group } |
    ForEach-Object { $_.Path -replace $currentDirectoryPattern, '' } |
    Out-File '_dupes.log'
Author
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
This generates on-disk filenames from the URLs, making them filesystem-safe and decoding them to Unicode so they are readable. (This was the hardest part, given the frankly broken encoding schemes on this server's filesystem. E.g.
`GOLDENWEISER-Gavotte%2520op7%2520n%25C2%25BA2.pdf` is double-encoded, but the script successfully translates it to `GOLDENWEISER-Gavotte op7 nº2.pdf`, and some unknown encoding schemes like `Alb+%aeniz%20Rapsodia-Espa+%a6ola.pdf` are successfully interpreted as `Albéniz Rapsodia-Española.pdf`.) It also ditches the A-D, E-K, L-P, Q-Z segregation and has a special interpretation rule for Mozart's "Orchestral_Works", which has an actual HTML page instead of a server-generated FTP-esque list.
The resultant text file is a whopping 10 MB!
You'll also notice a PowerShell script to download the list on Windows. It has niceties like simultaneous downloads, filename clash detection, and a scan for duplicate file contents at the end. The full lot is almost 100 GB!