Skip to content

Instantly share code, notes, and snippets.

@108EAA0A
Last active November 21, 2024 11:55
Show Gist options
  • Save 108EAA0A/fbcabaf87e4b09e6e282e42ac64a3689 to your computer and use it in GitHub Desktop.
Save 108EAA0A/fbcabaf87e4b09e6e282e42ac64a3689 to your computer and use it in GitHub Desktop.
Download article images for https://ci-en.dlsite.com
# Script parameters. Only CreatorId is mandatory; every other parameter is optional.
Param(
# (*) Creator ID: required; work it out from the creator page URL.
[Parameter(Mandatory, ValueFromPipeline)]
[ValidateScript({ $_ -gt 0 })]
[int]$CreatorId,
# Page range: the first and last page of the article list view to download from.
# When unspecified, all pages are downloaded.
[ValidateScript({ $_ -gt 0 })]
[int]$PageStart = 1,
[int]$PageEnd,
# Article ID: when given, only the images of that single article are downloaded.
[ValidateScript({ $_ -gt 0 })]
[int]$ArticleId,
# Output path: defaults to the current directory.
[string]$OutPath = ((Get-Location).Path),
# Sleep time (ms): 0 would be bad manners toward the server.
[ValidateScript({ $_ -gt 0 })]
[int]$SleepMs = 100,
# When set, files with the same name are overwritten.
[switch]$Overwrite,
# When set, no directory named after the creator is created.
[switch]$NoCreatorDir
)
Set-StrictMode -Version Latest
Add-Type -AssemblyName System.Web

# This script requires the AngleParse module; install it for the current user
# when no copy is available, then load it.
# @see https://github.com/kamome283/AngleParse
if (@(Get-Module -ListAvailable -Name AngleParse).Count -eq 0) {
    Install-Module AngleParse -Scope CurrentUser
}
Import-Module AngleParse

# Base URL of the Ci-en site.
$CIEN_HOST = "https://ci-en.dlsite.com"
# Find out how many article-list pages a creator has.
# Requests list pages starting at 1 and returns the number of the page just
# before the first one whose notification text is "まだ記事はありません。".
Function GetCreatorsArticlePageEnd([Parameter(Mandatory, ValueFromPipeline)][int]$cid) {
    $noticeSelector = "div.c-notification.is-large.is-light > div.c-notificationItem > p.content"
    $page = 1
    while ($true) {
        $listUrl = "$CIEN_HOST/creator/$cid/article?mode=list&page=$page"
        Write-Host "fetching: $listUrl"
        $notice = Invoke-WebRequest $listUrl | Select-HtmlContent $noticeSelector
        if ([string]::Equals($notice, "まだ記事はありません。")) {
            return $page - 1
        }
        # Pause between probes so the server is not hammered.
        Start-Sleep -Milliseconds $sleepMs
        ++$page
    }
}
# Create a directory if it does not exist yet.
#   $dirPath - directory to create.
# Writes a terminating error when the directory cannot be created.
Function CreateDirectory([Parameter(Mandatory, ValueFromPipeline)][ValidateNotNullOrEmpty()][string]$dirPath) {
    # -LiteralPath: with plain -Path, '[' and ']' in a name are wildcard
    # characters, so an existing directory such as 'foo[1]' would be reported
    # as missing and the New-Item below would then fail with "already exists".
    if (Test-Path -LiteralPath $dirPath -PathType Container) {
        return
    }
    try {
        New-Item -Path $dirPath -ItemType Directory -ErrorAction Stop | Out-Null #-Force
    }
    catch {
        Write-Error -Message "Unable to create directory '$dirPath'. Error was: $_" -ErrorAction Stop
    }
}
# Make a string safe to use as a Windows file or directory name.
# Trims surrounding whitespace and swaps each character that Windows forbids in
# names for a visually similar character that is allowed.
#   $preValidatePath - raw name (not a full path: '/' and '\' are replaced too).
# Returns the sanitized name.
Function ReplaceWinPathInvaildChar([Parameter(Mandatory, ValueFromPipeline)][string]$preValidatePath) {
    # Replacements are given as code points so the mapping cannot be silently
    # collapsed into a no-op ('<' -> '<') when the file is re-encoded or normalized.
    $substitutions = @(
        @('<', [char]0xFF1C),  # FULLWIDTH LESS-THAN SIGN
        @('>', [char]0xFF1E),  # FULLWIDTH GREATER-THAN SIGN
        @('?', [char]0xFF1F),  # FULLWIDTH QUESTION MARK
        @(':', [char]0xFF1A),  # FULLWIDTH COLON
        @('*', [char]0xFF0A),  # FULLWIDTH ASTERISK
        @('"', [char]0x201D),  # RIGHT DOUBLE QUOTATION MARK
        @('|', [char]0xFF5C),  # FULLWIDTH VERTICAL LINE
        @('/', [char]0xFF0F),  # FULLWIDTH SOLIDUS
        # PowerShell has no backslash escapes: '\' is one backslash, whereas the
        # previous '\\' only ever matched two consecutive backslashes.
        @('\', [char]0x00A5)   # YEN SIGN
    )
    $sanitized = $preValidatePath.Trim()
    foreach ($pair in $substitutions) {
        $sanitized = $sanitized.Replace([string]$pair[0], [string]$pair[1])
    }
    return $sanitized
}
# Download $url and write the response body to $path, then sleep for $sleepMs ms.
#   $url  - resource to fetch.
#   $path - destination file; a relative path is resolved against the current
#           PowerShell location.
Function DownloadFile([ValidateNotNullOrEmpty()][string]$url, [ValidateNotNullOrEmpty()][string]$path) {
    # Works around a bug in Invoke-WebRequest -OutFile.
    # WebClient sometimes failed to download properly when run in parallel.
    # @see http://265.blog3.fc2.com/blog-entry-37.html

    # .NET file APIs resolve relative paths against the process working
    # directory, which Set-Location does not update. Convert to a provider path
    # first so the file lands where the rest of the script expects it.
    $absolutePath = $ExecutionContext.SessionState.Path.GetUnresolvedProviderPathFromPSPath($path)
    [System.IO.File]::WriteAllBytes($absolutePath, (Invoke-WebRequest $url).Content)
    Start-Sleep -Milliseconds $sleepMs
}
# Download one image into $distDir under a sanitized file name.
#   $url      - image URL.
#   $distDir  - destination directory.
#   $fileName - URL-encoded file name; it is decoded and sanitized before use.
# Skips the download when the file already exists, unless $Overwrite is set.
Function DownloadImage([ValidateNotNullOrEmpty()][string]$url, [ValidateNotNullOrEmpty()][string]$distDir, [ValidateNotNullOrEmpty()][string]$fileName) {
    $urlDecodedFileName = [System.Web.HttpUtility]::UrlDecode($fileName) | ReplaceWinPathInvaildChar
    $fullPath = Join-Path $distDir $urlDecodedFileName
    # Test-Path resolves relative paths against the PowerShell location (as
    # Join-Path/New-Item do), whereas [System.IO.File]::Exists uses the process
    # working directory. -LiteralPath keeps '[' and ']' from acting as wildcards.
    if ((-not $Overwrite) -and (Test-Path -LiteralPath $fullPath -PathType Leaf)) {
        Write-Host "'$fullPath' already exists. The download of this file was skipped."
        return
    }
    Write-Host "downloading: $urlDecodedFileName"
    DownloadFile $url $fullPath
}
# Download a Ci-en hosted image.
# The file name is whatever sits between "/upload/" and the last "?" of $url.
#   $url     - image URL.
#   $distDir - destination directory.
Function DownloadCienImage([ValidateNotNullOrEmpty()][string]$url, [ValidateNotNullOrEmpty()][string]$distDir) {
    $cienRawImgFileName = $null
    if ($url -match "/upload/(.+)\?") {
        $cienRawImgFileName = $Matches[1]
    }
    DownloadImage $url $distDir $cienRawImgFileName
}
# Ask the Wayback Machine availability endpoint for an archived copy of $url.
#   $url - original URL to look up.
# Returns the URL of the closest available snapshot, or nothing when the
# response reports no available snapshot.
Function ConvertWeybackUrl([Parameter(Mandatory, ValueFromPipeline)][ValidateNotNullOrEmpty()][string]$url) {
    process {
        # Escape the looked-up URL so its own '?', '&' or '#' cannot be mistaken
        # for part of the availability query.
        $escapedUrl = [uri]::EscapeDataString($url)
        $availability = (Invoke-WebRequest "http://archive.org/wayback/available?url=$escapedUrl").Content | ConvertFrom-Json
        if ($null -eq $availability) {
            return
        }

        # The script runs under Set-StrictMode -Version Latest, where reading a
        # missing property throws. A response without a snapshot has an empty
        # "archived_snapshots" object, so every level is probed via PSObject.Properties.
        $snapshotsProp = $availability.PSObject.Properties['archived_snapshots']
        if (($null -eq $snapshotsProp) -or ($null -eq $snapshotsProp.Value)) {
            return
        }
        $closestProp = $snapshotsProp.Value.PSObject.Properties['closest']
        if (($null -eq $closestProp) -or ($null -eq $closestProp.Value)) {
            return
        }
        $availableProp = $closestProp.Value.PSObject.Properties['available']
        $snapshotUrlProp = $closestProp.Value.PSObject.Properties['url']
        if (($null -ne $availableProp) -and $availableProp.Value -and ($null -ne $snapshotUrlProp)) {
            return $snapshotUrlProp.Value
        }
    }
}
# Fetch a Wayback Machine page and emit the src attribute of its
# "iframe#playback" element.
#   $url - Wayback Machine URL; accepted as an argument or from the pipeline.
Function ExportWaybackImgHref([Parameter(Mandatory, ValueFromPipeline)][ValidateNotNullOrEmpty()][string]$url) {
    # A process block is required here: without one the body still runs once
    # when the upstream pipeline produces nothing (e.g. no archived snapshot
    # was found), and would then request an empty URL.
    process {
        Invoke-WebRequest $url | Select-HtmlContent "iframe#playback", ([AngleParse.Attr]::Src)
    }
}
# Download images that an article links from dlsite.blogimg.jp, using their
# Wayback Machine copies.
#   $outerLinkImgs - link URLs found in the article; only those containing
#                    "/dlsite.blogimg.jp/" are considered.
#   $articleDir    - directory the images are saved into (created on demand).
# Returns $true when at least one image was handed to DownloadImage, else $false.
Function DownloadDlblogImages($outerLinkImgs, [ValidateNotNullOrEmpty()][string]$articleDir) {
    $fileNamePattern = "/dlsite.blogimg.jp/.+/imgs/[\da-f]/[\da-f]/(.+)\.(jpg|jpeg|png|gif)$"

    # For image URLs on dlsite.blogimg.jp, go and look at the Web Archive instead.
    $archivedSrcs = @($outerLinkImgs |
            Where-Object { $_ -match "/dlsite.blogimg.jp/" } |
            ForEach-Object { ConvertWeybackUrl $_ | ExportWaybackImgHref })

    $idx = 1
    $blogArchiveImgs = @(foreach ($src in $archivedSrcs) {
            if ([string]::IsNullOrEmpty($src)) {
                continue
            }
            $nameMatch = [regex]::Match($src, $fileNamePattern)
            if (-not $nameMatch.Success) {
                # An archived URL that does not look like a blog image cannot be
                # named; skip it rather than fail on the missing capture groups.
                Write-Warning "unexpected archived image url, skipped: $src"
                continue
            }
            @{
                Src      = $src
                # Zero-padded running number keeps the files in article order.
                FileName = "{0:D3}_{1}.{2}" -f $idx, $nameMatch.Groups[1].Value, $nameMatch.Groups[2].Value
            }
            ++$idx
        })

    # Decide before touching the file system, so no empty directory is left
    # behind when there is nothing to download.
    if ($blogArchiveImgs.Count -eq 0) {
        return $false
    }
    CreateDirectory $articleDir
    foreach ($blogArchiveImg in $blogArchiveImgs) {
        DownloadImage $blogArchiveImg.Src $articleDir $blogArchiveImg.FileName
    }
    return $true
}
# FIXME: images from live links and images recovered from the Web Archive for dead
#        links are named inconsistently (some get an index prefix, some do not).
#
# Download every sufficiently large image of one article.
#   $article - object with Href, Id and Title, as produced by GetArticleData.
#   $rootDir - root output directory; images go to
#              <rootDir>/<creator name>/<article id>_<title> (no creator level
#              when $NoCreatorDir is set).
Function DownloadArticleImages([ValidateNotNull()]$article, [ValidateNotNullOrEmpty()][string]$rootDir) {
    Write-Host "fetching: $($article.Href)"
    $html = Invoke-WebRequest $article.Href
    # Throttle article page fetches the same way the other requests are throttled.
    Start-Sleep -Milliseconds $SleepMs

    $outerLinkImgs = $html | Select-HtmlContent "div.file-player-image-wrapper > a", ([AngleParse.Attr]::Href)

    $creatorName = ""
    if (-not $NoCreatorDir) {
        $rawCreatorName = $html |
            Select-HtmlContent "div.c-grid-account > div.c-grid-account-info > div.inner-accountInfo-item > h2" |
            Select-Object -First 1
        # The creator name becomes a directory name, so sanitize it just like the title.
        if (-not [string]::IsNullOrWhiteSpace($rawCreatorName)) {
            $creatorName = ReplaceWinPathInvaildChar $rawCreatorName
        }
    }
    $articleDir = Join-Path $rootDir $creatorName | Join-Path -ChildPath "$($article.Id)_$(ReplaceWinPathInvaildChar $article.Title)"

    $existsDlblogImg = $false
    if ($null -ne $outerLinkImgs) {
        $existsDlblogImg = DownloadDlblogImages $outerLinkImgs $articleDir
    }

    # Of the images in the article, keep only those that are not small.
    $imageRawSrcs = $html | Select-HtmlContent "div.file-player-image-wrapper > figure > img.file-player-image", ([AngleParse.Attr]::Element) | ForEach-Object { @{
            RawSrc = $_.Dataset['raw'] # Href is a downscaled image, so fetch the original instead
            Width  = $_.Attributes['width'].Value
            Height = $_.Attributes['height'].Value
        } } | Where-Object { ([int]$_.Width -gt 100) -and ([int]$_.Height -gt 100) } | ForEach-Object { $_.RawSrc }

    if ((-not $existsDlblogImg) -and ($null -eq $imageRawSrcs)) {
        Write-Host "empty image: $($article.Href)"
        # 'return', not 'continue': outside a loop, 'continue' unwinds to the
        # caller's loop, and with no loop (single-article mode) ends the script.
        return
    }
    CreateDirectory $articleDir
    foreach ($imageRawSrc in $imageRawSrcs) {
        DownloadCienImage $imageRawSrc $articleDir
    }
}
# Fetch $url and describe each anchor matched by $titleAnchorSelector.
#   $url                 - page to fetch.
#   $titleAnchorSelector - CSS selector of the article title anchor(s).
# Emits, per matched anchor, a record with Id (trailing digits of the href),
# Href and Title (the anchor's text content).
Function GetArticleData([ValidateNotNullOrEmpty()][string]$url, [ValidateNotNullOrEmpty()][string]$titleAnchorSelector) {
    $articleFields = @{
        Title = [AngleParse.Attr]::TextContent
        Href  = [AngleParse.Attr]::Href
        Id    = [AngleParse.Attr]::Href, { [regex]::Matches($_, "\d+$")[0] }
    }
    $page = Invoke-WebRequest $url
    return $page | Select-HtmlContent $titleAnchorSelector, $articleFields
}
# Download the images of every article on a range of a creator's list pages.
#   $creator - creator ID.
#   $dlPath  - root output directory.
#   $pStart  - first list page to process.
#   $pEnd    - last list page to process; a value of 0 or less means "look it
#              up with GetCreatorsArticlePageEnd".
Function DownloadCreatorsAllArticleImages([int]$creator, [ValidateNotNullOrEmpty()][string]$dlPath, [int]$pStart, [int]$pEnd) {
    if ($pEnd -le 0) {
        $pEnd = GetCreatorsArticlePageEnd $creator
    }
    if ($pStart -gt $pEnd) {
        Write-Error "PageStart($pStart) > PageEnd($pEnd)"
        return
    }
    for ($page = $pStart; $page -le $pEnd; ++$page) {
        $listUrl = "$CIEN_HOST/creator/$creator/article?mode=list&page=$page"
        $articles = GetArticleData $listUrl "h3.articleTitle.js_wovn_ignore > a"
        foreach ($article in $articles) {
            DownloadArticleImages $article $dlPath
        }
    }
}
# Entry point: without an article ID, walk the creator's list pages;
# with one, download just that article.
if ($ArticleId -le 0) {
    DownloadCreatorsAllArticleImages $CreatorId $OutPath $PageStart $PageEnd
}
else {
    $singleArticleUrl = "$CIEN_HOST/creator/$CreatorId/article/$ArticleId"
    $articleData = GetArticleData $singleArticleUrl "h1.article-title > a"
    DownloadArticleImages $articleData $OutPath
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment