Last active
June 4, 2024 12:02
-
-
Save bshishov/db5436eb2dea0e38069525eb44ef1583 to your computer and use it in GitHub Desktop.
Wget recursive download of a MediaWiki website with authorization
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash
#
# Recursively download a MediaWiki site that requires a login, using wget.
# Replace the three "___" placeholders below before running.

WIKI_URL=___ # with trailing slash
WIKI_USERNAME=___
WIKI_PASSWORD=___

# Root of the local dump; pages fetched during the login round-trip are kept
# in the "login" subdirectory.
WIKI_DUMP_DIR=./dump
WIKI_DUMP_DIR_LOGIN="${WIKI_DUMP_DIR}/login"

# Page names relative to WIKI_URL. The active values use localized (Russian)
# page titles; the commented-out lines are alternative spellings kept for
# reference.
WIKI_LOGIN_PAGE="index.php?title=Служебная:Вход"
#WIKI_START_PAGE="index.php?title=Содержание"
WIKI_START_PAGE="index.php/Содержание"
#WIKI_LOGIN_PAGE="index.php?title=Special:Userlogin"
#http://${WIKI_URL}index.php?title=Служебная:Вход
#POST_DATA="wpName=${WIKI_USERNAME}&wpPassword=${WIKI_PASSWORD}&wpRemember=1&wpLoginattempt=Log%20in"
#echo "POST DATA = ${POST_DATA}"
# Get the login page. This opens a session (cookies are stored in cookies.txt)
# and saves the HTML form that carries the hidden login token.
# wget -O does not create missing directories, so create the target first.
mkdir -p -- "${WIKI_DUMP_DIR_LOGIN}" || exit 1
wget --keep-session-cookies \
  --save-cookies cookies.txt \
  --directory-prefix="${WIKI_DUMP_DIR}" \
  "${WIKI_URL}${WIKI_LOGIN_PAGE}" \
  -O "${WIKI_DUMP_DIR_LOGIN}/login_page.html"

# Extract the login token from the login page: the value of a hidden input
# written as type="hidden" value="<lowercase hex/alnum token>".
WIKI_LOGIN_TOKEN=$(grep -oP '(?<=type=\"hidden\" value=\")[a-z0-9]+' "${WIKI_DUMP_DIR_LOGIN}/login_page.html")
if [[ -z "${WIKI_LOGIN_TOKEN}" ]]; then
  # Keep going (best effort), but make the likely cause of a failed login visible.
  echo "warning: no login token found in ${WIKI_DUMP_DIR_LOGIN}/login_page.html" >&2
fi

echo
echo "WIKI_LOGIN_TOKEN = \"${WIKI_LOGIN_TOKEN}\""
echo

# Short pause between fetching the form and submitting it.
sleep 2
#######################################
# Percent-encode a string for use in a URL.
# ASCII letters, digits and "-_.~" are kept as-is; every other byte is
# emitted as %xx (lowercase hex), so multi-byte UTF-8 characters become one
# %xx triplet per byte.
# Globals:   REPLY (written) - receives the encoded string
# Arguments: $1 - string to encode
# Outputs:   the encoded string, followed by a newline, on stdout
#######################################
rawurlencode() {
  # Force the C locale for the duration of the call so that ${#string} and
  # ${string:pos:1} operate on bytes and [a-zA-Z] matches ASCII only. In a
  # UTF-8 locale the loop would see whole characters and emit the Unicode
  # code point (e.g. %421) instead of the UTF-8 bytes (%d0%a1).
  local LC_ALL=C
  local string="${1}"
  local strlen=${#string}
  local encoded=""
  local pos c o code
  for (( pos = 0; pos < strlen; pos++ )); do
    c=${string:pos:1}
    case "$c" in
      [-_.~a-zA-Z0-9])
        o="${c}"
        ;;
      *)
        printf -v code '%d' "'$c"
        # Keep only the low 8 bits so the result is a single byte value even
        # if the C library reports high bytes as a wider or negative number.
        printf -v o '%%%02x' "$(( code & 0xFF ))"
        ;;
    esac
    encoded+="${o}"
  done
  # The result is returned both ways: printed (easier for callers using
  # command substitution) and stored in REPLY (faster, no subshell needed).
  # printf is used instead of echo so results such as "-n" are not swallowed.
  printf '%s\n' "${encoded}"
  REPLY="${encoded}"
}
# Post the login form, reusing the session cookie and the token obtained from
# the login page; the updated cookies are written back to cookies.txt.
#
# The credentials are percent-encoded first: a raw "&", "=", "+" or "%" in the
# user name or password would otherwise corrupt the form body. LC_ALL=C is set
# inside the command substitution (a subshell, so the script's own locale is
# untouched) to make the encoding byte-wise for non-ASCII characters.
# NOTE(review): --post-data exposes the password in the process list while
# wget runs; consider --post-file if that matters on a shared machine.
WIKI_USERNAME_ENCODED=$(LC_ALL=C; rawurlencode "${WIKI_USERNAME}")
WIKI_PASSWORD_ENCODED=$(LC_ALL=C; rawurlencode "${WIKI_PASSWORD}")
wget --keep-session-cookies \
  --load-cookies cookies.txt \
  --save-cookies cookies.txt \
  --post-data "wpName=${WIKI_USERNAME_ENCODED}&wpPassword=${WIKI_PASSWORD_ENCODED}&wpRemember=1&wpLoginattempt=Войти&wpLoginToken=${WIKI_LOGIN_TOKEN}" \
  --directory-prefix="${WIKI_DUMP_DIR_LOGIN}" \
  "${WIKI_URL}${WIKI_LOGIN_PAGE}&action=submitlogin&type=login"

# Percent-encoded form of the localized namespace name; only referenced by the
# commented-out --reject variant further down.
SPECIAL_ENCODED=$(LC_ALL=C; rawurlencode "Служебная")
# DUMP ALL
# Mirror the wiki starting at the start page, using the logged-in session
# from cookies.txt. Localized special/talk/user pages and URLs carrying
# action=, oldid= or diff= are rejected.
#
# --mirror already turns on recursion and timestamping (-N), so --recursive
# is not repeated. --no-clobber was removed: the wget manual states that -nc
# may not be combined with -N, and it has no useful effect together with
# --convert-links.
wget --mirror \
  -e robots=off \
  --page-requisites \
  --convert-links \
  --adjust-extension \
  --reject "*Служебная:*","*Обсуждение:*","*Участник:*","*action=*","*oldid=*","*diff=*","*index.php*" \
  --exclude-directories "/index.php/Служебная*" \
  --keep-session-cookies \
  --restrict-file-names=nocontrol \
  --remote-encoding=utf-8 \
  --load-cookies cookies.txt \
  --user-agent=firefox \
  --directory-prefix="${WIKI_DUMP_DIR}" \
  "${WIKI_URL}${WIKI_START_PAGE}"

# Options that were tried and are currently disabled, kept for reference:
#--no-parent
#-q --show-progress \
#--no-host-directories \
#--adjust-extension
#--html-extension (OBSOLETE)
#--reject "*oldid=*","*action=edit*","*action=history*","*diff=*","*limit=*","*[/=]User:*","*[/=]User_talk:*","*[^p]/Special:*","*=Special:[^R]*","*.php/Special:[^LUA][^onl][^nul]*","*MediaWiki:*","*Search:*","*Help:*’","*[/=]Участник:*","*[/=]Обсуждение:*","*[^p]/${SPECIAL_ENCODED}:*","*=${SPECIAL_ENCODED}:[^R]*","*.php/${SPECIAL_ENCODED}:[^LUA][^onl][^nul]*","*Поиск:*","*Помощь:*" \
#--reject-regex "Служебная|Поиск|Помощь|Участник|index\.php|action=|diff=|limit=|" \
#--reject-regex
#--no-clobber \ prevents download twice
#--cut-dirs=2 \

# Optional single-article test: WIKI_TEST_ARTICLE is not defined anywhere in
# this script, so only encode it when the caller has provided it. The encoded
# name ends up in REPLY for the (disabled) wget command below.
if [[ -n "${WIKI_TEST_ARTICLE:-}" ]]; then
  rawurlencode "${WIKI_TEST_ARTICLE}"
fi
# wget --no-parent --page-requisites --convert-links --no-host-directories --cut-dirs=2 --load-cookies cookies.txt --directory-prefix=. ${WIKI_URL}index.php/${REPLY}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Do you know if this still works on MediaWiki v1.29?