Hashbrown777 · May 22, 2023 13:10 · Hashbrown777 · May 22, 2023
diff --git a/opendirectory.js b/opendirectory.js
 (async () => {
 	// Leading part of every crawled URL (site root plus the "Composers X-Y"
 	// folder) that is removed when a URL is turned into a local path.
 	const ignore = /^http:\/\/waltercosand.com\/CosandScores\/Composers%20[^/]+\//;
 	// Pending URLs, consumed with pop(); listed in reading order and then
 	// reversed so that the first listed folder is the first one popped.
 	const urls = [
 		'http://waltercosand.com/CosandScores/Composers%20A-D/',
 		'http://waltercosand.com/CosandScores/Composers%20E-K/',
 		'http://waltercosand.com/CosandScores/Composers%20L-P/',
 		'http://waltercosand.com/CosandScores/Composers%20Q-Z/'
 	];
 	urls.reverse();
 	// Directory URLs that are never opened (tested with `in`).
 	const skip = {
 		'http://waltercosand.com/CosandScores/Composers%20L-P/Mozart,%20W.%20A/Mozart%20-%20Complete%20Works%20for%20Piano/' : true
 	};
 	// Directory URLs that are parsed by heading instead of as a plain listing
 	// (tested with `in`).
 	const special = {
 		'http://waltercosand.com/CosandScores/Composers%20L-P/Mozart,%20W.%20A/Orchestral_Works/' : true
 	};
 	
 	// The charset behind these prefixed escapes is unknown. Each pair below was
 	// worked out by searching for the rest of the text the escape appears in:
 	// left is the escape as found (lower-case), right is the hexadecimal code
 	// point of the character it stands for.
 	const decoded = {};
 	for (const [escape, hex] of [
 		['+%a1', 'ed'],
 		['+%a3', 'dc'],
 		['+%a6', 'f1'],
 		['+%ac', 'ea'],
 		['+%ae', 'e9'],
 		['+%bd', 'eb'],
 		['+%bf', 'e8'],
 		['+%c2', 'f6'],
 		['+%e4', 'c4'],
 		['+%eb', 'c9'],
 		['+%f1', 'e4'],
 		['-%a6', 'b0']
 	]) {
 		decoded[escape] = String.fromCharCode(parseInt(hex, 16));
 	}
 	/**
 	 * Replacement callback used with `customDecode`: converts one matched
 	 * escape sequence into the text it represents.
 	 *
 	 * @param {string} code - the matched text, e.g. '%41', '%c3%a9', '%e9' or '+%a1'.
 	 * @returns {string} the decoded character(s).
 	 * @throws {Error} when a '+'/'-' prefixed escape has no entry in `decoded`.
 	 */
 	function decodeChar(code) {
 		// Prefixed escapes can only be resolved through the hand-built table.
 		if (code[0] === '+' || code[0] === '-') {
 			const mapped = decoded[code.toLowerCase()];
 			if (mapped === undefined)
 				throw new Error(`unmapped escape sequence: ${code}`);
 			return mapped;
 		}
 		// ASCII escapes (first hex digit below 8) and multi-escape matches are
 		// handed to the standard URI decoder.
 		if (code[1] < '8' || code.length > 3)
 			return decodeURIComponent(code);
 		// A lone high byte is used directly as the character code.
 		return String.fromCharCode(parseInt(code.slice(1), 16));
 	}
 	
 	/**
 	 * Builds the regex source for `count` consecutive percent-escaped bytes in
 	 * the range 0x80-0xBF (the byte range matched by `%[89ab][0-9a-f]`).
 	 *
 	 * @param {number} count - how many such escapes must follow; below 1 yields ''.
 	 * @param {boolean} [not] - when truthy, wrap the result in a negative lookahead.
 	 * @returns {string} regex source fragment.
 	 */
 	function utf8Tail(count, not) {
 		if (count < 1)
 			return '';
 		let output = '%[89ab][0-9a-f]';
 		if (count > 1)
 			output = `(?:${output}){${count}}`;
 		if (not)
 			output = `(?!${output})`;
 		return output;
 	}
 	
 	// Whitespace runs that contain a space plus more whitespace, or any run of
 	// non-space whitespace.
 	const doubleWhitespace = /\s* \s+|\s+ \s*|[^\S ]+/g;
 	// NUL and \ / : * ? " < > | anywhere, leading spaces/dots, trailing dots/spaces.
 	const unsafe           = /[\x00\\/:*?"<>|]+|^[ .]+|[. ]+$/g;
 	const empty            = /^$/;
 	const customDecode     = new RegExp([
 		//utf8
 		'%(' + [
 			'[0-7][0-9a-f]' + utf8Tail(0),
 			 '[cd][0-9a-f]' + utf8Tail(1),
 			    'e[0-9a-f]' + utf8Tail(2),
 			    'f[0-7]'    + utf8Tail(3)
 		].join('|') + ')',
 		//extended ascii & weird encodings
 		'[+-]?%(' + [
 			//invalid utf8 starting byte
 			'[8-9ab][0-9a-f]',
 			'f[89a-f]',
 			//invalid utf trailing bytes
 			'[cd][0-9a-f]' + utf8Tail(1, true),
 			   'e[0-9a-f]' + utf8Tail(2, true),
 			   'f[0-7]'    + utf8Tail(3, true)
 		].join('|') + ')'
 	].join('|'), 'gi');
//Goldenweiser
 	// Same pattern as customDecode but with every '%' itself escaped as '%25'.
 	// Built from the pattern source and flags rather than by eval-ing the
 	// stringified regex.
 	const doubleEncoding = new RegExp(
 		customDecode.source.replace(/%/g, '%25'),
 		customDecode.flags
 	);
 	/**
 	 * Turns one URL path segment into the name used in the local path.
 	 * The steps run in this order: undo double percent-encoding, decode the
 	 * remaining escapes, collapse whitespace runs to one space, strip the
 	 * characters matched by `unsafe`, and substitute '_' if nothing is left.
 	 *
 	 * @param {string} part - a single segment (no '/').
 	 * @returns {string} the cleaned segment, never empty.
 	 */
 	function interpret(part) {
 		const singlyEncoded = part.replace(doubleEncoding, decodeURIComponent);
 		const text = singlyEncoded.replace(customDecode, decodeChar);
 		const spaced = text.replace(doubleWhitespace, ' ');
 		const safe = spaced.replace(unsafe, '');
 		return safe.replace(empty, '_');
 	}
 	
 	// Results are streamed into the document of a newly opened window as a
 	// JSON array wrapped in <pre>.
 	const output = open().document;
 	output.write('<pre>[');
 	// Becomes true once the first entry has been written.
 	let needsComma = false;
 	/**
 	 * Appends one entry to the JSON array in the output document.
 	 *
 	 * The separator is written before every entry except the first. Deciding
 	 * it from `urls.length` after the entry would leave a dangling comma
 	 * whenever the remaining URLs produce no further entries (skipped or
 	 * empty directories), which makes the array invalid JSON.
 	 *
 	 * @param {object} obj - the entry to serialise.
 	 */
 	function write(obj) {
 		if (needsComma)
 			output.write(',');
 		needsComma = true;
 		output.write('\n');
 		// Indent every line of the pretty-printed entry by one extra level.
 		output.write(JSON
 			.stringify(obj, undefined, 4)
 			.replace(/(?<=^|\n)/g, '    ')
 		);
 	}
 	// Crawl loop: `urls` is used as a stack. Each popped URL is either a file
 	// (recorded via write) or a directory (opened in a window and scanned for
 	// further links). The loop ends when pop() returns undefined.
 	for (let url; url = urls.pop();) {
 		// A URL that does not end in '/' is treated as a file.
 		if (url.lastIndexOf('/') + 1 < url.length) {
//console.log(url);
 			let path;
 			try {
 				// Local path: drop the common prefix, then clean each segment.
 				path = url
 					.replace(ignore, '')
 					.split('/')
 					.map(interpret)
 					.join('/')
 				;
 			}
 			catch (e) {
 				// Whatever interpret threw is replaced by the offending URL.
 				throw url;
 			}
 			write({path, url});
 			continue;
 		}
 		
 		if (url in skip)
 			continue;
 		// Load the directory page in its own window so its DOM can be read.
 		const dir = open(url);
 		let timeout;
 		// Wait until the opened document reports readyState 'complete'.
 		// NOTE(review): confirm a freshly opened window cannot report
 		// 'complete' for an initial blank document before `url` has loaded.
 		await new Promise((resolve) => {
 			// Resolve immediately if already loaded, otherwise on the load event.
 			function check() {
 				if (dir.document && dir.document.readyState == 'complete')
 					resolve();
 				else
 					dir.addEventListener('load', resolve);
 			}
 			// Polling fallback: re-arms itself on every call; the first delay
 			// is the one passed in (5000 ms), later ones default to 100 ms.
 			function manual(wait) {
 				timeout = setTimeout(manual, wait || 100);
 				if (dir.document && dir.document.readyState == 'complete')
 					resolve();
 			}
 			manual(5000);
 			check();
 		});
 		// Stop the polling timer that is still pending.
 		clearTimeout(timeout);
//console.log(url);
 		
 		if (url in special) {
 			// Walk headings, links and rules in document order: an <h1> starts
 			// a group named after its text, an <hr> ends it, and links are
 			// recorded only while a group is active.
 			let heading = null;
 			for (
 				const {textContent, href, tagName}
 				of Array.from(dir.document.querySelectorAll('h1, a, hr'))
 			) {
 				switch (tagName) {
 				case 'H1':
 					heading = textContent;
 					continue;
 				case 'HR':
 					heading = null;
 					continue;
 				}
 				if (!heading)
 					continue;

 				// Path = directory segments + heading + "<link text>.<extension
 				// of href>". The two added names are percent-encoded first so
 				// they go through interpret like ordinary URL segments.
 				write({
 					path : url
 						.replace(ignore, '')
 						.replace(/\/$/, '')
 						.split('/')
 						.concat([
 							heading,
 							textContent + '.' + href.replace(/.*[.]/, '')
 						].map(encodeURIComponent))
 						.map(interpret)
 						.join('/')
 					,
 					url : href
 				});
 			}
 		}
 		else {
 			// Plain listing: queue every table link that lies below this
 			// directory. The list is reversed so that pop() later yields the
 			// links in document order.
 			let failed = true;
 			for (
 				const {href}
 				of Array.from(dir.document.querySelectorAll('td a')).reverse()
 			) {
 				if (href.indexOf(url) == 0) {
 					urls.push(href);
 					failed = false;
 				}
 			}
//			if (failed)
//				throw url;
 		}
 		dir.close();
 	}
 	// Close the JSON array and finish the output document.
 	output.write('\n]</pre>');
 	output.close();
 })()
diff --git a/opendirectory.ps1 b/opendirectory.ps1
 . "$PSScriptRoot/_async.ps1"

 # Entries read from _links.json; each is used below through its `path` and
 # `url` properties.
 $links = Get-Content _links.json | ConvertFrom-Json

 # Entries that share the same path would overwrite each other, so every member
 # of such a group is moved under _dupes/ and given a numeric suffix.
 $links `
 | Group-Object -Property path `
 | ?{ $_.Count -gt 1 } `
 | %{
 	$index = 0
 	$path = '_dupes/' + $_.Group[0].path
 	$_.Group `
 	| %{
 		++$index
 		# Insert "_N" before the file extension, or at the very end when there
 		# is none. The extension may not contain '/' or '\', otherwise a dot in
 		# a directory name would be taken for the extension of an
 		# extensionless file and the suffix would land inside the directory.
 		$_.path = ([regex]'(?=(\.[^./\\]+)?$)').replace($path, "_$index", 1)
 	}
 }

 # Download every entry through the Async helper, 10 per batch.
 $links `
 | Async `
 	-Expected $links.Count `
 	-BatchSize 10 `
 	-Func { Process {
 		# Already downloaded on an earlier run.
 		if (Test-Path -LiteralPath $_.path) {
 			return
 		}
 		
 		# Temporary file name: MD5 of the target path as hex. Every byte is
 		# formatted as exactly two digits; without the padding different
 		# hashes could produce the same string.
 		$alias = (
 			[System.Security.Cryptography.MD5CryptoServiceProvider]::new().ComputeHash(
 				[System.Text.UTF8Encoding]::new().GetBytes($_.path)
 			) `
 			| %{ $_.ToString('x2') }
 		) -join ''
 		$errors = "$alias.error"
 		
 		try {
 			# Everything written to the error stream inside this block is
 			# captured in the per-download error file.
 			.{
 				if (Split-Path -Path $_.path) {
 					New-Item `
 						-Type Directory `
 						-Force `
 						-Path $(Split-Path -Path $_.path) `
 					| Out-Null
 				}
 				$ProgressPreference = 'SilentlyContinue'
 				# Download under the alias first, then move into place.
 				Invoke-WebRequest $_.url -OutFile $alias
 				Move-Item $alias $_.path
 			} 2>$errors
 		}
 		catch {
 			"`r`n---`r`n" >>$errors
 			$Error >>$errors
 		}
 		finally {
 			# Keep the error file (next to the target, with the alias appended)
 			# only when something was written to it.
 			if ((Get-ChildItem $errors).Length) {
 				"`r`n---`r`n" >>$errors
 				$alias >>$errors
 				Move-Item $errors "$($_.path).error.log"
 			}
 			else {
 				Remove-Item $errors
 			}
 		}
 	} }
 	
 # List every per-download error log found below the current directory and
 # save that listing to _errors.log.
 Get-ChildItem `
 	-Recurse `
 	-Filter '*.error.log' `
 | Out-File '_errors.log'

 # Report files with identical content: only files that share their length
 # with another file are hashed, and only hashes that occur more than once are
 # kept. The paths, with the current directory prefix removed, go to _dupes.log.
 Get-ChildItem `
 	-Recurse `
 	-File `
 	-Exclude '_dupes.log' `
 | Group-Object -Property Length `
 | ?{ $_.Count -gt 1 } `
 | %{ $_.Group } `
 | Get-FileHash `
 | Group-Object -Property Hash `
 | ?{ $_.Count -gt 1 } `
 | %{ $_.Group } `
 | %{ $_.Path -replace "$([regex]::escape($(pwd)))",'' } `
 | Out-File '_dupes.log'
	(async () => {
	const ignore = /^http:\/\/waltercosand.com\/CosandScores\/Composers%20[^/]+\//;
	const urls = [
	'http://waltercosand.com/CosandScores/Composers%20A-D/',
	'http://waltercosand.com/CosandScores/Composers%20E-K/',
	'http://waltercosand.com/CosandScores/Composers%20L-P/',
	'http://waltercosand.com/CosandScores/Composers%20Q-Z/'
	].reverse();
	const skip = [
	'http://waltercosand.com/CosandScores/Composers%20L-P/Mozart,%20W.%20A/Mozart%20-%20Complete%20Works%20for%20Piano/'
	].reduce((map, key) => { map[key] = true; return map; }, {});
	const special = [
	'http://waltercosand.com/CosandScores/Composers%20L-P/Mozart,%20W.%20A/Orchestral_Works/'
	].reduce((map, key) => { map[key] = true; return map; }, {});

	//I have no idea what charset these are
	//but through googling the rest of the text that contains them
	//I have the extant codepoints mapped below
	const decoded = [
	['+%a1', 'ed'],
	['+%a3', 'dc'],
	['+%a6', 'f1'],
	['+%ac', 'ea'],
	['+%ae', 'e9'],
	['+%bd', 'eb'],
	['+%bf', 'e8'],
	['+%c2', 'f6'],
	['+%e4', 'c4'],
	['+%eb', 'c9'],
	['+%f1', 'e4'],
	['-%a6', 'b0']
	].reduce(
	(decoded, [code, char]) => {
	decoded[code] = String.fromCharCode(parseInt(char, 16));
	return decoded;
	},
	{}
	);
	/**
	 * Replacement callback used with `customDecode`: converts one matched
	 * escape sequence into the text it represents.
	 *
	 * The `\|\|` sequences that were here are not valid JavaScript; they are
	 * restored to the logical-or operator.
	 *
	 * @param {string} code - the matched text, e.g. '%41', '%c3%a9', '%e9' or '+%a1'.
	 * @returns {string} the decoded character(s).
	 * @throws {Error} when a '+'/'-' prefixed escape has no entry in `decoded`.
	 */
	function decodeChar(code) {
		// Prefixed escapes can only be resolved through the hand-built table.
		if (code[0] === '+' || code[0] === '-') {
			const mapped = decoded[code.toLowerCase()];
			if (mapped === undefined)
				throw new Error(`unmapped escape sequence: ${code}`);
			return mapped;
		}
		// ASCII escapes (first hex digit below 8) and multi-escape matches are
		// handed to the standard URI decoder.
		if (code[1] < '8' || code.length > 3)
			return decodeURIComponent(code);
		// A lone high byte is used directly as the character code.
		return String.fromCharCode(parseInt(code.slice(1), 16));
	}

	/**
	 * Builds the regex source for `count` consecutive percent-escaped bytes in
	 * the range 0x80-0xBF (the byte range matched by `%[89ab][0-9a-f]`).
	 *
	 * @param {number} count - how many such escapes must follow; below 1 yields ''.
	 * @param {boolean} [not] - when truthy, wrap the result in a negative lookahead.
	 * @returns {string} regex source fragment.
	 */
	function utf8Tail(count, not) {
		if (count < 1)
			return '';
		let output = '%[89ab][0-9a-f]';
		if (count > 1)
			output = `(?:${output}){${count}}`;
		if (not)
			output = `(?!${output})`;
		return output;
	}

	// The alternation bars in the literals below had been escaped as `\|`,
	// which makes them match a literal pipe; they are plain `|` again.
	// Whitespace runs that contain a space plus more whitespace, or any run of
	// non-space whitespace.
	const doubleWhitespace = /\s* \s+|\s+ \s*|[^\S ]+/g;
	// NUL and \ / : * ? " < > | anywhere, leading spaces/dots, trailing dots/spaces.
	const unsafe = /[\x00\\/:*?"<>|]+|^[ .]+|[. ]+$/g;
	const empty = /^$/;
	const customDecode = new RegExp([
		//utf8
		'%(' + [
			'[0-7][0-9a-f]' + utf8Tail(0),
			'[cd][0-9a-f]' + utf8Tail(1),
			'e[0-9a-f]' + utf8Tail(2),
			'f[0-7]' + utf8Tail(3)
		].join('|') + ')',
		//extended ascii & weird encodings
		'[+-]?%(' + [
			//invalid utf8 starting byte
			'[8-9ab][0-9a-f]',
			'f[89a-f]',
			//invalid utf trailing bytes
			'[cd][0-9a-f]' + utf8Tail(1, true),
			'e[0-9a-f]' + utf8Tail(2, true),
			'f[0-7]' + utf8Tail(3, true)
		].join('|') + ')'
	].join('|'), 'gi');
	//Goldenweiser
	// Same pattern as customDecode but with every '%' itself escaped as '%25'.
	// Built from the pattern source and flags rather than by eval-ing the
	// stringified regex.
	const doubleEncoding = new RegExp(
		customDecode.source.replace(/%/g, '%25'),
		customDecode.flags
	);
	/**
	 * Turns one URL path segment into the name used in the local path.
	 * The steps run in this order: undo double percent-encoding, decode the
	 * remaining escapes, collapse whitespace runs to one space, strip the
	 * characters matched by `unsafe`, and substitute '_' if nothing is left.
	 *
	 * @param {string} part - a single segment (no '/').
	 * @returns {string} the cleaned segment, never empty.
	 */
	function interpret(part) {
		const singlyEncoded = part.replace(doubleEncoding, decodeURIComponent);
		const text = singlyEncoded.replace(customDecode, decodeChar);
		const spaced = text.replace(doubleWhitespace, ' ');
		const safe = spaced.replace(unsafe, '');
		return safe.replace(empty, '_');
	}

	// Results are streamed into the document of a newly opened window as a
	// JSON array wrapped in <pre>.
	const output = open().document;
	output.write('<pre>[');
	// Becomes true once the first entry has been written.
	let needsComma = false;
	/**
	 * Appends one entry to the JSON array in the output document.
	 *
	 * The separator is written before every entry except the first. Deciding
	 * it from `urls.length` after the entry would leave a dangling comma
	 * whenever the remaining URLs produce no further entries, which makes the
	 * array invalid JSON.
	 *
	 * @param {object} obj - the entry to serialise.
	 */
	function write(obj) {
		if (needsComma)
			output.write(',');
		needsComma = true;
		output.write('\n');
		// Indent every line of the pretty-printed entry by one level (four
		// spaces, matching the indent passed to JSON.stringify). The lookbehind
		// needs a real alternation: line start OR just after a newline.
		output.write(JSON
			.stringify(obj, undefined, 4)
			.replace(/(?<=^|\n)/g, '    ')
		);
	}
	for (let url; url = urls.pop();) {
	if (url.lastIndexOf('/') + 1 < url.length) {
	//console.log(url);
	let path;
	try {
	path = url
	.replace(ignore, '')
	.split('/')
	.map(interpret)
	.join('/')
	;
	}
	catch (e) {
	throw url;
	}
	write({path, url});
	continue;
	}

	if (url in skip)
	continue;
	const dir = open(url);
	let timeout;
	await new Promise((resolve) => {
	function check() {
	if (dir.document && dir.document.readyState == 'complete')
	resolve();
	else
	dir.addEventListener('load', resolve);
	}
	function manual(wait) {
	timeout = setTimeout(manual, wait \|\| 100);
	if (dir.document && dir.document.readyState == 'complete')
	resolve();
	}
	manual(5000);
	check();
	});
	clearTimeout(timeout);
	//console.log(url);

	if (url in special) {
	let heading = null;
	for (
	const {textContent, href, tagName}
	of Array.from(dir.document.querySelectorAll('h1, a, hr'))
	) {
	switch (tagName) {
	case 'H1':
	heading = textContent;
	continue;
	case 'HR':
	heading = null;
	continue;
	}
	if (!heading)
	continue;

	write({
	path : url
	.replace(ignore, '')
	.replace(/\/$/, '')
	.split('/')
	.concat([
	heading,
	textContent + '.' + href.replace(/.*[.]/, '')
	].map(encodeURIComponent))
	.map(interpret)
	.join('/')
	,
	url : href
	});
	}
	}
	else {
	let failed = true;
	for (
	const {href}
	of Array.from(dir.document.querySelectorAll('td a')).reverse()
	) {
	if (href.indexOf(url) == 0) {
	urls.push(href);
	failed = false;
	}
	}
	// if (failed)
	// throw url;
	}
	dir.close();
	}
	output.write('\n]</pre>');
	output.close();
	})()
	. "$PSScriptRoot/_async.ps1"

	$links = Get-Content _links.json \| ConvertFrom-Json
	$links `
	\| Group-Object -Property path `
	\| ?{ $_.Count -gt 1 } `
	\| %{
	$index = 0
	$path = '_dupes/' + $_.Group[0].path
	$_.Group `
	\| %{
	++$index
	$_.path = ([regex]'(?=(\.[^.]+)?$)').replace($path, "_$index", 1)
	}
	}

	$links `
	\| Async `
	-Expected $links.Count `
	-BatchSize 10 `
	-Func { Process {
	if (Test-Path -LiteralPath $_.path) {
	return
	}

	$alias = (
	[System.Security.Cryptography.MD5CryptoServiceProvider]::new().ComputeHash(
	[System.Text.UTF8Encoding]::new().GetBytes($_.path)
	) `
	\| %{ [Convert]::ToString($_, 16) }
	) -join ''
	$errors = "$alias.error"

	try {
	.{
	if (Split-Path -Path $_.path) {
	New-Item `
	-Type Directory `
	-Force `
	-Path $(Split-Path -Path $_.path) `
	\| Out-Null
	}
	$ProgressPreference = 'SilentlyContinue'
	Invoke-WebRequest $_.url -OutFile $alias
	Move-Item $alias $_.path
	} 2>$errors
	}
	catch {
	"`r`n---`r`n" >>$errors
	$Error >>$errors
	}
	finally {
	if ((Get-ChildItem $errors).Length) {
	"`r`n---`r`n" >>$errors
	$alias >>$errors
	Move-Item $errors "$($_.path).error.log"
	}
	else {
	Remove-Item $errors
	}
	}
	} }

	Get-ChildItem `
	-Recurse `
	-Filter '*.error.log' `
	\| Out-File '_errors.log'

	Get-ChildItem `
	-Recurse `
	-File `
	-Exclude '_dupes.log' `
	\| Group-Object -Property Length `
	\| ?{ $_.Count -gt 1 } `
	\| %{ $_.Group } `
	\| Get-FileHash `
	\| Group-Object -Property Hash `
	\| ?{ $_.Count -gt 1 } `
	\| %{ $_.Group } `
	\| %{ $_.Path -replace "$([regex]::escape($(pwd)))",'' } `
	\| Out-File '_dupes.log'