Last active
November 21, 2024 11:55
-
-
Save 108EAA0A/fbcabaf87e4b09e6e282e42ac64a3689 to your computer and use it in GitHub Desktop.
Download article images for https://ci-en.dlsite.com
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Param(
    # (required) Creator ID: read it off the creator page URL.
    [Parameter(Mandatory, ValueFromPipeline)]
    [ValidateRange(1, [int]::MaxValue)]
    [int]$CreatorId,

    # Page range: first and last page of the creator's article list to download.
    # When PageEnd is not given, every page is processed.
    [ValidateRange(1, [int]::MaxValue)]
    [int]$PageStart = 1,
    [int]$PageEnd,

    # Article ID: when given, only the images of that single article are downloaded.
    [ValidateRange(1, [int]::MaxValue)]
    [int]$ArticleId,

    # Destination path: defaults to the current location.
    [string]$OutPath = ((Get-Location).Path),

    # Pause between requests in milliseconds; 0 would be bad manners toward the site.
    [ValidateRange(1, [int]::MaxValue)]
    [int]$SleepMs = 100,

    # When set, files with the same name are overwritten.
    [switch]$Overwrite,

    # When set, no per-creator directory is created.
    [switch]$NoCreatorDir
)
Set-StrictMode -Version Latest
Add-Type -AssemblyName System.Web

# The script depends on the AngleParse module for HTML selection;
# it is installed for the current user when it is not available yet.
# @see https://github.com/kamome283/AngleParse
if ($null -eq (Get-Module -ListAvailable -Name AngleParse)) {
    Install-Module AngleParse -Scope CurrentUser
}
Import-Module AngleParse

# Base URL of the site every request is built from.
$CIEN_HOST = "https://ci-en.dlsite.com"
# Find out how many article-list pages a creator has.
# Walks the list pages from 1 upward until the "no articles yet" notice is shown
# and returns the number of the last page before it (0 when page 1 is already empty).
# Reads the script-level $CIEN_HOST and $sleepMs.
# Throws when a list page cannot be fetched: Invoke-WebRequest failures are only
# statement-terminating, so without this the unbounded loop would keep requesting forever.
Function GetCreatorsArticlePageEnd([Parameter(Mandatory, ValueFromPipeline)][int]$cid) {
    for ($page = 1; ; ++$page) {
        $listUrl = "$CIEN_HOST/creator/$cid/article?mode=list&page=$page"
        Write-Host "fetching: $listUrl"
        try {
            $response = Invoke-WebRequest $listUrl
        }
        catch {
            throw "Unable to fetch '$listUrl' while counting pages. Error was: $_"
        }
        $textContent = $response | Select-HtmlContent "div.c-notification.is-large.is-light > div.c-notificationItem > p.content"
        if ([string]::Equals($textContent, "まだ記事はありません。")) {
            return $page - 1
        }
        # Be polite to the server between page probes.
        Start-Sleep -Milliseconds $sleepMs
    }
}
# Create a directory if it does not exist yet.
#   $dirPath - directory to create; taken literally.
# Raises a terminating error when the directory cannot be created.
Function CreateDirectory([Parameter(Mandatory, ValueFromPipeline)][ValidateNotNullOrEmpty()][string]$dirPath) {
    # -LiteralPath: article titles may contain '[' and ']', which -Path would treat as
    # wildcard characters and therefore report an existing directory as missing.
    if (-not (Test-Path -LiteralPath $dirPath -PathType Container)) {
        try {
            New-Item -Path $dirPath -ItemType Directory -ErrorAction Stop | Out-Null #-Force
        }
        catch {
            Write-Error -Message "Unable to create directory '$dirPath'. Error was: $_" -ErrorAction Stop
        }
    }
}
# Make a string usable as a Windows file or directory name.
# Trims surrounding whitespace and swaps every character Windows forbids in names
# for a visually similar character that is allowed.
#   $preValidatePath - raw name (not a full path: '/' and '\' are replaced too).
# Returns the sanitized name.
Function ReplaceWinPathInvaildChar([Parameter(Mandatory, ValueFromPipeline)][string]$preValidatePath) {
    # Replacements are given as code points so the mapping survives any re-encoding
    # of this file. Note that PowerShell strings have no backslash escapes: a single
    # backslash is written '\', not '\\'.
    $replacements = [ordered]@{
        '<'  = [char]0xFF1C   # fullwidth less-than sign
        '>'  = [char]0xFF1E   # fullwidth greater-than sign
        '?'  = [char]0xFF1F   # fullwidth question mark
        ':'  = [char]0xFF1A   # fullwidth colon
        '*'  = [char]0xFF0A   # fullwidth asterisk
        '"'  = [char]0x201D   # right double quotation mark
        '|'  = [char]0xFF5C   # fullwidth vertical line
        '/'  = [char]0xFF0F   # fullwidth solidus
        '\'  = [char]0x00A5   # yen sign
    }
    $sanitized = $preValidatePath.Trim()
    foreach ($invalid in $replacements.Keys) {
        $sanitized = $sanitized.Replace([string]$invalid, [string]$replacements[$invalid])
    }
    return $sanitized
}
# Download $url and write the response body to $path, then pause for $sleepMs
# (script-level setting) so consecutive downloads are throttled.
#   $url  - resource to fetch.
#   $path - destination file; may be relative to the current PowerShell location.
Function DownloadFile([ValidateNotNullOrEmpty()][string]$url, [ValidateNotNullOrEmpty()][string]$path) {
    # Works around a bug in Invoke-WebRequest -OutFile.
    # WebClient was not used because downloads sometimes failed when run in parallel.
    # @see http://265.blog3.fc2.com/blog-entry-37.html
    #
    # .NET file APIs resolve relative paths against the process working directory,
    # which can differ from the PowerShell location the directories were created in;
    # convert to a provider path first so both agree.
    $targetPath = $ExecutionContext.SessionState.Path.GetUnresolvedProviderPathFromPSPath($path)
    [System.IO.File]::WriteAllBytes($targetPath, (Invoke-WebRequest $url).content)
    Start-Sleep -Milliseconds $sleepMs
}
# Download one image into $distDir under a sanitized, URL-decoded file name.
#   $url      - image URL.
#   $distDir  - destination directory.
#   $fileName - file name, possibly URL-encoded.
# An existing file is skipped unless the script-level -Overwrite switch is set.
Function DownloadImage([ValidateNotNullOrEmpty()][string]$url, [ValidateNotNullOrEmpty()][string]$distDir, [ValidateNotNullOrEmpty()][string]$fileName) {
    $urlDecodedFileName = [System.Web.HttpUtility]::UrlDecode($fileName) | ReplaceWinPathInvaildChar
    $fullPath = Join-Path $distDir $urlDecodedFileName
    # Test-Path resolves relative paths against the PowerShell location (unlike
    # [System.IO.File]::Exists); -LiteralPath keeps '[' and ']' from acting as wildcards.
    if (!$Overwrite -and (Test-Path -LiteralPath $fullPath -PathType Leaf)) {
        Write-Host "'$fullPath' already exists. The download of this file was skipped."
        return
    }
    Write-Host "downloading: $urlDecodedFileName"
    DownloadFile $url $fullPath
}
# Download an image hosted on Ci-en itself.
# The file name is whatever sits between "/upload/" and the "?" of the query string.
#   $url     - raw image URL.
#   $distDir - destination directory.
Function DownloadCienImage([ValidateNotNullOrEmpty()][string]$url, [ValidateNotNullOrEmpty()][string]$distDir) {
    $cienRawImgFileName = $null
    if ($url -match "/upload/(.+)\?") {
        $cienRawImgFileName = $Matches[1]
    }
    DownloadImage $url $distDir $cienRawImgFileName
}
# Ask the Wayback Machine availability API for the closest archived snapshot of $url.
# Outputs the snapshot URL, or nothing when no snapshot is available.
Function ConvertWeybackUrl([Parameter(Mandatory, ValueFromPipeline)][ValidateNotNullOrEmpty()][string]$url) {
    # Escape the target so '?' and '&' inside it are not read as part of this query string.
    $escapedUrl = [uri]::EscapeDataString($url)
    $availability = (Invoke-WebRequest "http://archive.org/wayback/available?url=$escapedUrl").Content | ConvertFrom-Json

    # Properties are looked up through PSObject.Properties because the script runs under
    # Set-StrictMode -Version Latest, where reading a missing property is an error,
    # and a response may carry an empty "archived_snapshots" object.
    $snapshotsProp = $availability.PSObject.Properties['archived_snapshots']
    if (($null -eq $snapshotsProp) -or ($null -eq $snapshotsProp.Value)) {
        return
    }
    $closestProp = $snapshotsProp.Value.PSObject.Properties['closest']
    if (($null -eq $closestProp) -or ($null -eq $closestProp.Value)) {
        return
    }
    $closest = $closestProp.Value
    $availableProp = $closest.PSObject.Properties['available']
    $urlProp = $closest.PSObject.Properties['url']
    if (($null -ne $availableProp) -and $availableProp.Value -and ($null -ne $urlProp)) {
        $urlProp.Value
    }
}
# Fetch a Wayback Machine page and output the src attribute of its
# "iframe#playback" element(s).
Function ExportWaybackImgHref([Parameter(Mandatory, ValueFromPipeline)][ValidateNotNullOrEmpty()][string]$url) {
    $waybackPage = Invoke-WebRequest $url
    $waybackPage | Select-HtmlContent "iframe#playback", ([AngleParse.Attr]::Src)
}
# Download images that an article links to on dlsite.blogimg.jp, using the
# Wayback Machine copy of each image.
#   $outerLinkImgs - link URLs found in the article; others than dlsite.blogimg.jp are ignored.
#   $articleDir    - destination directory (created only when there is something to save).
# Returns $true when at least one archived image was found, otherwise $false.
Function DownloadDlblogImages($outerLinkImgs, [ValidateNotNullOrEmpty()][string]$articleDir) {
    $fileNamePattern = "/dlsite.blogimg.jp/.+/imgs/[\da-f]/[\da-f]/(.+)\.(jpg|jpeg|png|gif)$"
    $blogArchiveImgs = @()
    $idx = 1
    foreach ($outerLink in @($outerLinkImgs)) {
        # Only links pointing at dlsite.blogimg.jp are looked up in the web archive.
        if ($outerLink -notmatch "/dlsite.blogimg.jp/") {
            continue
        }
        $archiveUrl = ConvertWeybackUrl $outerLink
        if ([string]::IsNullOrEmpty($archiveUrl)) {
            # Nothing archived: do not feed an empty value into the next lookup.
            Write-Host "no archived snapshot: $outerLink"
            continue
        }
        foreach ($src in @(ExportWaybackImgHref $archiveUrl)) {
            if ([string]::IsNullOrEmpty($src)) {
                continue
            }
            $nameMatch = [regex]::Match($src, $fileNamePattern)
            if (-not $nameMatch.Success) {
                # Without a match there is no usable file name; skip instead of indexing missing groups.
                Write-Host "unrecognized archived image url: $src"
                continue
            }
            $blogArchiveImgs += @{
                Src      = $src
                FileName = "{0:D3}_{1}.{2}" -f $idx, $nameMatch.Groups[1].Value, $nameMatch.Groups[2].Value
            }
            ++$idx
        }
    }
    if ($blogArchiveImgs.Count -eq 0) {
        return $false
    }
    CreateDirectory $articleDir
    foreach ($blogArchiveImg in $blogArchiveImgs) {
        DownloadImage $blogArchiveImg.Src $articleDir $blogArchiveImg.FileName
    }
    return $true
}
# FIXME: file names are inconsistent - images fetched from the web archive (dead links)
#        get a numeric index prefix, images from live links do not.
#
# Download every image of one article into "<rootDir>/<creator>/<id>_<title>".
#   $article - object with Id, Href and Title (as produced by GetArticleData).
#   $rootDir - base output directory.
# The creator directory level is left out when the script-level -NoCreatorDir switch is set.
Function DownloadArticleImages([ValidateNotNull()]$article, [ValidateNotNullOrEmpty()][string]$rootDir) {
    Write-Host "fetching: $($article.Href)"
    $html = Invoke-WebRequest $article.Href
    $outerLinkImgs = $html | Select-HtmlContent "div.file-player-image-wrapper > a", ([AngleParse.Attr]::Href)
    $creatorName = if ($NoCreatorDir) { "" } else {
        $html | Select-HtmlContent "div.c-grid-account > div.c-grid-account-info > div.inner-accountInfo-item > h2"
    }
    $articleDir = Join-Path $rootDir $creatorName | Join-Path -ChildPath "$($article.Id)_$(ReplaceWinPathInvaildChar $article.Title)"
    $existsDlblogImg = if ($null -ne $outerLinkImgs) { DownloadDlblogImages $outerLinkImgs $articleDir }

    # Keep only the images in the article that are not small (both sides over 100).
    $imageRawSrcs = $html |
        Select-HtmlContent "div.file-player-image-wrapper > figure > img.file-player-image", ([AngleParse.Attr]::Element) |
        ForEach-Object {
            @{
                RawSrc = $_.Dataset['raw'] # the href is a downscaled image, so take the original
                Width  = $_.Attributes['width'].Value
                Height = $_.Attributes['height'].Value
            }
        } |
        Where-Object { ([int]$_.Width -gt 100) -and ([int]$_.Height -gt 100) } |
        ForEach-Object { $_.RawSrc }

    if ((-not $existsDlblogImg) -and ($null -eq $imageRawSrcs)) {
        Write-Host "empty image: $($article.Href)"
        # 'return', not 'continue': there is no loop in this function, and a stray
        # 'continue' ends the whole script when the function is called outside a loop.
        return
    }
    CreateDirectory $articleDir
    foreach ($imageRawSrc in $imageRawSrcs) {
        DownloadCienImage $imageRawSrc $articleDir
    }
}
# Fetch $url and describe every anchor matched by $titleAnchorSelector.
# Each result carries:
#   Id    - the trailing digits of the anchor's href
#   Href  - the anchor's href
#   Title - the anchor's text content
Function GetArticleData([ValidateNotNullOrEmpty()][string]$url, [ValidateNotNullOrEmpty()][string]$titleAnchorSelector) {
    $articleFields = @{
        Id    = [AngleParse.Attr]::Href, { [regex]::Matches($_, "\d+$")[0] }
        Href  = [AngleParse.Attr]::Href
        Title = [AngleParse.Attr]::TextContent
    }
    $page = Invoke-WebRequest $url
    return $page | Select-HtmlContent $titleAnchorSelector, $articleFields
}
# Download the images of every article listed on pages $pStart..$pEnd of a creator.
#   $creator - creator ID.
#   $dlPath  - base output directory.
#   $pStart  - first list page.
#   $pEnd    - last list page; a value of 0 or less means "detect the last page".
Function DownloadCreatorsAllArticleImages([int]$creator, [ValidateNotNullOrEmpty()][string]$dlPath, [int]$pStart, [int]$pEnd) {
    if ($pEnd -le 0) {
        $pEnd = GetCreatorsArticlePageEnd $creator
    }
    if ($pStart -gt $pEnd) {
        Write-Error "PageStart($pStart) > PageEnd($pEnd)"
        return
    }
    for ($page = $pStart; $page -le $pEnd; $page++) {
        $listUrl = "$CIEN_HOST/creator/$creator/article?mode=list&page=$page"
        $articles = GetArticleData $listUrl "h3.articleTitle.js_wovn_ignore > a"
        foreach ($article in $articles) {
            DownloadArticleImages $article $dlPath
        }
    }
}
# Entry point: without an article ID the creator's list pages are walked;
# with one, only that article is downloaded.
if ($ArticleId -le 0) {
    DownloadCreatorsAllArticleImages $CreatorId $OutPath $PageStart $PageEnd
}
else {
    $articleUrl = "$CIEN_HOST/creator/$CreatorId/article/$ArticleId"
    $articleData = GetArticleData $articleUrl "h1.article-title > a"
    DownloadArticleImages $articleData $OutPath
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment