Last active
May 22, 2022 15:18
-
-
Save mavaddat/7584836f14e8960f264f2163c631b6e3 to your computer and use it in GitHub Desktop.
This script checks for invalid books in the calibre library by looking for empty covers and then searches for the original book file in the downloads folder
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Collect the Calibre books that have an ISBN but no ebook file attached and cache
# their OPF metadata in .\formatLessBooks.xml, so the expensive calibredb queries
# are only repeated when the set of affected book ids changes.
Set-Location C:\CalibrePortable\Calibre\

# XML namespaces of the OPF documents printed by `calibredb show_metadata --as-opf`.
$calibreNS = @{
    'dc'      = 'http://purl.org/dc/elements/1.1/'
    'calibre' = 'http://calibre.kovidgoyal.net/2009/metadata'
}

# calibredb prints the matching ids as a comma-separated list. Anything that is
# not a number (e.g. an empty line when nothing matches) is dropped so it cannot
# be cast to the bogus id 0.
$formatLessBookIds = [int[]](&.\calibredb.exe search 'formats:false and (identifiers:"=isbn:")' --library-path="..\Calibre Library\" |
        ForEach-Object { $_ -split ',' } |
        Where-Object { $_ -match '^\s*\d+\s*$' })

# Content-based fingerprint of the id list. Array.GetHashCode() is a per-object
# reference hash that differs on every run, so it can never match a cached value.
$idFingerprint = ($formatLessBookIds | Sort-Object) -join ','

# Import-Clixml fails when the cache has never been written, so probe for it first.
$cachePath = '.\formatLessBooks.xml'
$formatLessBooks = $null
if (Test-Path -LiteralPath $cachePath -PathType Leaf)
{
    $formatLessBooks = Import-Clixml -Path $cachePath
}

if ($null -eq $formatLessBooks -or $null -eq $formatLessBooks.Hash -or [string]$formatLessBooks.Hash -ne $idFingerprint)
{
    $books = $formatLessBookIds |
        ForEach-Object { [xml](&.\calibredb.exe show_metadata --as-opf $PSItem --library-path="..\Calibre Library\") } |
        Where-Object {
            $bookTitle = Select-Xml -Xml $PSItem -Namespace $calibreNS -XPath '//dc:title' | ForEach-Object { $_.Node.InnerXml }
            $bookAuthors = (Select-Xml -Xml $PSItem -Namespace $calibreNS -XPath '//dc:creator' | ForEach-Object { $_.Node.InnerXml }) -join ' & '
            # Keep the book only when no other entry with the same title and
            # author already has a format attached.
            $sameBookWithFormat = &.\calibredb.exe search formats:true title:`"$bookTitle`" author:`"$bookAuthors`" --library-path="..\Calibre Library\" 2>$null | ForEach-Object { $_ -split ',' }
            [string]::IsNullOrWhiteSpace($sameBookWithFormat)
        }

    $formatLessBooks = @{ Hash = $idFingerprint; Books = $books }
    Export-Clixml -InputObject $formatLessBooks -Path $cachePath
}
# For every cached book: search Library Genesis by ISBN, download one file per
# distinct format listed in the results, and attach it to the Calibre record with
# `calibredb add_format`. Requires PowerShell 7+ (ForEach-Object -Parallel).

# Named mutex: only one runspace at a time may run calibredb against the library.
$mutex = New-Object System.Threading.Mutex -ArgumentList @($false, 'calibre-portable-update-metadata')
$formatLessBooks.Books | ForEach-Object -ThrottleLimit 16 -Parallel {
    $mutex = $using:mutex
    $calibreNS = $using:calibreNS
    # Upper bound for every retry loop below, so an unreachable server cannot
    # stall a runspace forever.
    $maxAttempts = 5
    Set-Location C:\CalibrePortable\Calibre\

    $title = Select-Xml -Xml $PSItem -Namespace $calibreNS -XPath '//dc:title' | ForEach-Object { $_.Node.InnerXml }
    $isbn = Select-Xml -Xml $PSItem -Namespace $calibreNS -XPath '//dc:identifier' | Where-Object -FilterScript { $_.Node.scheme -eq 'ISBN' } | ForEach-Object { $_.Node.'#text' }
    $id = Select-Xml -Xml $PSItem -Namespace $calibreNS -XPath '//dc:identifier' | Where-Object -FilterScript { $_.Node.scheme -eq 'Calibre' } | ForEach-Object { $_.Node.'#text' }
    if ([String]::IsNullOrWhiteSpace( $isbn ) -or [String]::IsNullOrWhiteSpace( $id ))
    {
        # Without both identifiers there is nothing to search for or attach to.
        return
    }

    # --- Search request, with bounded exponential back-off ---------------------
    $sleepTime = 3
    $searchUri = "http://libgen.is/search.php?req=$isbn&open=0&res=25&view=simple&phrase=1&column=identifier&sort=year&sortmode=DESC"
    $searchResponse = $null
    for ($attempt = 1; $attempt -le $maxAttempts; $attempt++)
    {
        try
        {
            $searchResponse = Invoke-WebRequest -Uri $searchUri -ErrorAction Stop
        }
        catch
        {
            $searchResponse = $null
        }
        if ($null -ne $searchResponse -and $searchResponse.StatusCode -eq 200)
        {
            break
        }
        if ($attempt -lt $maxAttempts)
        {
            $sleepTime *= 2
            Start-Sleep -Seconds $sleepTime
        }
    }
    if ($null -eq $searchResponse -or $searchResponse.StatusCode -ne 200)
    {
        Write-Warning "Search for ISBN $isbn (book $id) failed after $maxAttempts attempts; skipping."
        return
    }

    # Load the AngleSharp HTML parser from the installed package (a no-op error if already loaded).
    Get-Package -Name 'AngleSharp' | ForEach-Object { Split-Path $_.Source } | Get-ChildItem -Filter '*.dll' -Recurse | Where-Object { $_ -Like '*standard*' } | Select-Object -Last 1 | ForEach-Object { Add-Type -Path $_ -ErrorAction SilentlyContinue -Verbose }
    $parser = [AngleSharp.Html.Parser.HtmlParser]::new()
    $parsedContent = $parser.ParseDocument($searchResponse.Content)

    # Formats already handled for this book; only the first row per format is used.
    $formats = New-Object -TypeName 'System.Collections.Generic.HashSet[string]'

    # The first table row is skipped (presumably the header row).
    foreach ($row in ($parsedContent.QuerySelectorAll('body > table.c > tbody > tr') | Select-Object -Skip 1))
    {
        $format = $row.QuerySelector('td:nth-child(9)').InnerHtml
        if ([String]::IsNullOrWhiteSpace($format) -or -not $formats.Add($format))
        {
            continue
        }

        # --- Resolve the direct download link from the mirror page -------------
        $mirrorLink = $row.QuerySelector('td:nth-child(10) > a')
        if ($null -eq $mirrorLink)
        {
            continue
        }
        try
        {
            $requestDL = Invoke-WebRequest -Uri ($mirrorLink.GetAttribute('href')) -ErrorAction Stop
        }
        catch
        {
            Write-Warning "Could not open the mirror page for book $id ($format): $($_.Exception.Message)"
            continue
        }
        $parsedDLContent = $parser.ParseDocument($requestDL.Content)
        $downloadLink = $parsedDLContent.QuerySelector('#download > h2 > a')
        if ($null -eq $downloadLink -or [String]::IsNullOrWhiteSpace($downloadLink.href))
        {
            continue
        }
        $downloadUri = [uri]::new($downloadLink.href)
        # The fourth URL segment is compared against the file's MD5 below.
        $md5 = $downloadUri.Segments[3] -replace '/', ''
        $filename = Join-Path $env:TEMP "$id.$($downloadUri.Segments[-1] -split '\.' | Select-Object -Last 1 )"

        # --- Download until the file verifies, with a bounded number of tries --
        # Invoke-WebRequest -OutFile returns no response object, so success is
        # judged by the MD5 of the file on disk rather than by a status code.
        $verified = $false
        for ($attempt = 0; ; $attempt++)
        {
            $verified = (Test-Path -LiteralPath $filename -PathType Leaf) -and
                ($md5.ToUpperInvariant() -eq (Get-FileHash -LiteralPath $filename -Algorithm MD5).Hash)
            if ($verified -or $attempt -ge $maxAttempts)
            {
                break
            }
            if ($attempt -gt 0)
            {
                $sleepTime *= 2
                Start-Sleep -Seconds $sleepTime
            }
            try
            {
                Invoke-WebRequest -Uri $downloadUri -OutFile $filename -Resume -ErrorAction Stop
            }
            catch
            {
                Write-Warning "Download attempt $($attempt + 1) for book $id ($format) failed: $($_.Exception.Message)"
            }
        }
        if (-not $verified)
        {
            Write-Warning "Could not obtain a verified $format file for book $id; skipping this format."
            continue
        }

        # --- Attach the file; calibredb access is serialized through the mutex -
        # Acquire outside the try so the finally only releases a mutex we own.
        $mutex.WaitOne() | Out-Null
        try
        {
            $ebookFile = Get-Item $filename
            &.\calibredb.exe add_format $id "$ebookFile" --library-path="..\Calibre Library\"
            @{Id=$id; Ebook=$ebookFile; Title=$title; Filename=$filename} | Format-Table -AutoSize -Wrap | Write-Output
        }
        finally
        {
            $mutex.ReleaseMutex()
        }
    }
}
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
$calibreLibraryPath = "$env:USERPROFILE\Calibre Library"

# The log file and Write-Log are set up first: the prerequisite checks below
# already log through them.
$log = New-TemporaryFile

<#
.SYNOPSIS
Writes one timestamped, tab-separated log line.
.PARAMETER Message
Text of the log entry.
.PARAMETER Level
Severity tag: INFO (default), WARN, ERROR, FATAL or DEBUG.
.PARAMETER logfile
File to append the line to. When omitted, the line is written to the output stream instead.
#>
Function Write-Log {
    [CmdletBinding()]
    Param(
        [Parameter(Mandatory=$True)]
        [string]
        $Message,
        [Parameter(Mandatory=$False)]
        [ValidateSet("INFO","WARN","ERROR","FATAL","DEBUG")]
        [String]
        $Level = "INFO",
        [Parameter(Mandatory=$False)]
        [string]
        $logfile
    )
    $Stamp = (Get-Date).toString("yyyy/MM/dd HH:mm:ss")
    $Line = "$Stamp`t$Level`t$Message"
    If($logfile) {
        Add-Content $logfile -Value $Line
    }
    Else {
        Write-Output $Line
    }
}

#Install pre-requisite module Communary.PASM for approximate string matching if necessary
#See https://github.com/gravejester/Communary.PASM/tree/6fe3b5ea01e8f49aeccbde0db0dc777079b9e9cd
if (-not (Get-Module -ListAvailable -Name Communary.PASM)) {
    $m="Installing missing approximate string matching module Communary.PASM"
    Write-Host $m
    Write-Log -Message $m -Level "WARN" -logfile $log

    # Identity of the current user, needed for the elevation check below.
    $myWindowsPrincipal = New-Object System.Security.Principal.WindowsPrincipal([System.Security.Principal.WindowsIdentity]::GetCurrent())
    $adminRole = [System.Security.Principal.WindowsBuiltInRole]::Administrator

    # Check to see if we are currently running "as Administrator"
    if ($myWindowsPrincipal.IsInRole($adminRole))
    {
        # We are running "as Administrator" - so change the title and background color to indicate this
        $Host.UI.RawUI.WindowTitle = $myInvocation.MyCommand.Definition + "(Elevated)"
        $Host.UI.RawUI.BackgroundColor = "DarkBlue"
        clear-host
        Install-Module Communary.PASM
    }
    else
    {
        # We are not running "as Administrator" - so relaunch as administrator
        # Create a new process object that starts PowerShell
        $newProcess = new-object System.Diagnostics.ProcessStartInfo "PowerShell";
        # Specify the current script path and name as a parameter
        # (quoted and passed via -File so that paths containing spaces survive)
        $newProcess.Arguments = "-File `"$($myInvocation.MyCommand.Definition)`"";
        # Indicate that the process should be elevated
        $newProcess.Verb = "runas";
        # Start the new process
        [System.Diagnostics.Process]::Start($newProcess);
        # Exit from the current, unelevated, process
        # exit
        # NOTE(review): with the exit disabled, this unelevated run continues
        # without the module while the elevated copy starts in parallel.
    }
}

#Install pre-requisite pdfinfo by XpdfReader if necessary
#See https://www.xpdfreader.com/pdfinfo-man.html
# Get-Command yields a single path; where.exe can print several lines or noise on stderr.
$pdfInfoCommand = Get-Command -Name pdfinfo.exe -CommandType Application -ErrorAction SilentlyContinue | Select-Object -First 1
if (-not $pdfInfoCommand) {
    $m="Installing missing pdfinfo program by XpdfReader"
    Write-Host $m
    Write-Log -Message $m -Level "WARN" -logfile $log

    # .Links works in both Windows PowerShell and PowerShell 7; .ParsedHtml exists only in the former.
    $pdfinfoZipDl = (Invoke-WebRequest -Uri 'https://www.xpdfreader.com/download.html' -Method Get -UseBasicParsing).Links.href |
        Where-Object { $_ -match 'xpdf-tools-win.*\.zip$' } |
        Select-Object -First 1
    if (-not $pdfinfoZipDl) {
        # Abort: without pdfinfo every replacement candidate would later be judged corrupt.
        $m="Could not find the xpdf-tools download link on https://www.xpdfreader.com/download.html"
        Write-Log -Message $m -Level "FATAL" -logfile $log
        throw $m
    }

    $pdfinfoZip = "$env:TEMP\xpdf-tools-win.zip"
    (New-Object Net.WebClient).DownloadFile($pdfinfoZipDl, $pdfinfoZip)
    Expand-Archive -LiteralPath $pdfinfoZip -DestinationPath $env:TEMP -Force

    # Several extracted versions may coexist in TEMP; use the most recently written one.
    $pdfinfoDir = Get-ChildItem -Path $env:TEMP -Filter "xpdf-tools-win*" -Directory |
        Sort-Object -Property LastWriteTime |
        Select-Object -Last 1
    $pdfInfoLoc = Join-Path $pdfinfoDir.FullName "bin64\pdfinfo.exe"
}
else {
    $pdfInfoLoc = $pdfInfoCommand.Path
}
# Matching threshold used when comparing book names with file names.
$confidence = 0.90

# XML namespaces: OPF metadata emitted by calibredb, and XHTML.
$calibreNS = @{
    "dc"      = "http://purl.org/dc/elements/1.1/"
    "calibre" = "http://calibre.kovidgoyal.net/2009/metadata"
}
$webNS = @{ 'xmlns' = "http://www.w3.org/1999/xhtml" }

# Every ebook-like file below the user's Downloads folder (has $files.Count many elements).
$files = Get-ChildItem -Path $env:USERPROFILE\Downloads -Recurse -File -Include @(
    '*.AZW', '*.AZW3', '*.AZW4', '*.CBZ', '*.CBR', '*.CBC', '*.CHM',
    '*.DJVU', '*.DOCX', '*.EPUB', '*.FB2', '*.FBZ', '*.HTML', '*.HTMLZ',
    '*.LIT', '*.LRF', '*.MOBI', '*.ODT', '*.PDF', '*.PRC', '*.PDB',
    '*.PML', '*.RB', '*.RTF', '*.SNB', '*.TCR', '*.TXT', '*.TXTZ'
)

# Ids of the books calibredb reports for the search `cover:False`, split out of its comma-separated output.
$ids = "$(calibredb search cover:False)".Split(",")

# Candidate replacement files, keyed by their similarity score.
$candidates = [System.Collections.Generic.Dictionary[Float,System.IO.FileSystemInfo]]::new()
# For every broken book id: find the file in $files whose name best matches
# "<title> - <author>", validate it with pdfinfo, then either attach it to the
# book (calibredb add_format) or remove the book from Calibre.
$bookIndex = 0
foreach ($id in $ids) {
    # Running counters replace $ids.IndexOf($id), which rescans the array on every pass.
    $bookPercent = [int](100 * $bookIndex / [Math]::Max($ids.Count, 1))
    $bookIndex++

    # An empty search result splits into one empty id; never hand that to `calibredb remove`.
    if ([string]::IsNullOrWhiteSpace($id)) {
        continue
    }
    Write-Progress -Activity "Trying to find good copies of broken books" -Status "$bookPercent% Complete:" -PercentComplete $bookPercent

    $candidates = New-Object 'System.Collections.Generic.Dictionary[Float,System.IO.FileSystemInfo]'
    $err = New-TemporaryFile
    $out = New-TemporaryFile
    try {
        $calibreBookFilename = Get-ChildItem -Path $calibreLibraryPath -Include "*($id)" -Recurse | Select-Object -First 1
        $book = [xml](calibredb.exe show_metadata --as-opf $id)
        # A book can list several creators; flatten them to one string.
        $author = (Select-Xml -Xml $book -Namespace $calibreNS -XPath "//dc:creator" | ForEach-Object { $_.Node.Innerxml }) -join ' '
        $title = [string](Select-Xml -Xml $book -Namespace $calibreNS -XPath "//dc:title" | ForEach-Object { $_.Node.Innerxml } | Select-Object -First 1)

        # Name to look for: "<title> - <author>". The folder name stands in for a
        # placeholder title, and a placeholder author is left out.
        $namePart = if ($title -ieq 'untitled') { $calibreBookFilename.Name } else { $title }
        $authorPart = if ($author -ieq 'Unknown') { '' } else { " - $author" }
        $compositeName = "$namePart$authorPart"

        # Disabled in the original script: handle only books whose title, author
        # and filename were all placeholders ('untitled' / 'Unknown' /
        # 'untitled - Unknown'), and remove every other book outright.

        $fileIndex = 0
        foreach ($file in $files) {
            $filePercent = [int](100 * $fileIndex / [Math]::Max($files.Count, 1))
            $fileIndex++
            Write-Progress -Activity "Looking for $title by $author" -Status "$filePercent% Complete:" -PercentComplete $filePercent

            # BaseName keeps dots inside the name ("Dr. Seuss.pdf" -> "Dr. Seuss");
            # Split('.')[0] would cut it down to "Dr".
            $baseName = $file.BaseName
            if ([string]::IsNullOrEmpty($baseName)) {
                continue
            }

            $distance = Get-JaccardDistance -a $compositeName -b $baseName -CaseSensitive
            if ($null -eq $distance) {
                # A failed comparison must not count as a perfect match (1 - $null is 1).
                continue
            }
            # Similarity in [0, 1]; higher is better.
            $compar = 1 - $distance

            $key = [float]$compar
            if ($candidates.ContainsKey($key)) {
                $m="'" + $baseName + "' had the same confidence as '" + $candidates[$key].BaseName + "'"
                #Write-Host $m
                Write-Log -Message $m -Level "DEBUG" -logfile $log
            }
            else {
                $candidates.Add($key, $file)
            }

            # Stop scanning once a confident match is found.
            # NOTE(review): the original compared against (1 - $confidence), but the
            # test never fired because `-not X -ceq $null` parses as
            # `(-not X) -ceq $null`. $confidence is used here on the reading that
            # $compar is a similarity - confirm the intended threshold.
            if ($compar -ge $confidence) {
                Write-Progress -Activity "Looking for $title by $author" -Status "Found $($file.Name) at $compar% confidence" -PercentComplete 100
                break
            }
        }

        $m="Looking for $title by $author"
        Write-Host $m
        Write-Log -Message $m -Level "INFO" -logfile $log

        # Highest similarity first; the ascending sort used before picked the worst match.
        $bestGuess = $candidates.GetEnumerator() | Sort-Object -Property Key -Descending | Select-Object -First 1
        $replacement = $null
        $replacementIsValid = $false
        $pdfinfoOutput = $null

        if ($null -ne $bestGuess) {
            $compar = $bestGuess.Key
            $replacement = $bestGuess.Value
            $m="Found $($replacement.Name) at $compar% confidence"
            Write-Host $m -NoNewLine -ForegroundColor DarkGreen
            Write-Log -Message $m -Level "INFO" -logfile $log
            if ($compar -lt $confidence) {
                # NOTE(review): a low-confidence match is still used as the
                # replacement, as in the original - consider skipping it instead.
                $m=" having low confidence `($compar`)"
                Write-Host $m -ForegroundColor DarkYellow
                Write-Log -Message $m -Level "INFO" -logfile $log
            }

            # NOTE(review): pdfinfo validates PDFs only, so non-PDF candidates
            # (EPUB, MOBI, ...) will presumably be reported as corrupt - confirm
            # whether they should be validated differently.
            # -Wait: the redirected files are only complete once pdfinfo has exited.
            Start-Process -FilePath $pdfInfoLoc -ArgumentList "-meta `"$($replacement.FullName)`"" -NoNewWindow -Wait -RedirectStandardError $err -RedirectStandardOutput $out
            $pdfinfoComplained = Select-String -Path $err.FullName -Pattern "(Error|Warning)" -Quiet
            $pdfinfoOutput = Get-Content -LiteralPath $out.FullName -Raw
            $replacementIsValid = (-not $pdfinfoComplained) -and (-not [string]::IsNullOrWhiteSpace($pdfinfoOutput))
        }

        if ($replacementIsValid) {
            $m="`nReplacing old file with new one"
            Write-Host $m -ForegroundColor Green
            Write-Log -Message $m -Level "INFO" -logfile $log
            # Show and log what pdfinfo reported, not the temp file's path.
            Write-Host $pdfinfoOutput -ForegroundColor Green
            Write-Log -Message $pdfinfoOutput -Level "INFO" -logfile $log
            calibredb add_format $id "$($replacement.FullName)"
        } else {
            $m="`nThe file $($replacement.FullName) was corrupt.`nNo valid file found for $title by $author.`nRemoving book with ID $id from Calibre"
            Write-Host $m -ForegroundColor DarkRed -BackgroundColor Black
            Write-Log -Message $m -Level "WARN" -logfile $log
            calibredb.exe remove $id
        }
    }
    finally {
        # Always discard the per-book temp files, even when an error cuts the iteration short.
        Remove-Item $out, $err -Force -ErrorAction SilentlyContinue
    }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment