Last active
December 28, 2015 06:29
-
-
Save timjstewart/7457111 to your computer and use it in GitHub Desktop.
Mirror files locally. Now handles gzipped files.
Now alphabetizes Master Index
Is a bit better at finding index.html files in the mirrored content.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash
#
# Mirrors the documentation sets listed in ~/.mirror under a local
# directory and generates a master index page that links to them.

# where to put the mirrored documents
base_dir=~/docs

# the name of the master index file
master_index="${base_dir}/index.html"
# download a single compressed archive and extract it to the current
# directory; the downloaded archive is removed after extraction.
#
# Arguments:
#   $1 - URL of the archive to download
#   $2 - name of the file wget stores the archive under
#   $3 - archive extension: txz, gz, tgz or zip
# Returns:
#   non-zero if the download fails or the extension is not supported
function download_and_extract() {
  local url=$1
  local file_name=$2
  local file_ext=$3

  # nothing to extract if the download did not succeed
  if ! wget -nv -np -nH "${url}"; then
    echo "failed to download ${url}." >&2
    return 1
  fi

  case "${file_ext}" in
    txz)
      tar Jxvf "${file_name}"
      rm -- "${file_name}"
      ;;
    gz | tgz)
      # .tgz is a gzipped tarball just like .tar.gz
      tar xvzf "${file_name}"
      rm -- "${file_name}"
      ;;
    zip)
      unzip "${file_name}"
      rm -- "${file_name}"
      ;;
    *)
      echo "don't know how to extract ${file_ext} files." >&2
      return 1
      ;;
  esac
}
# use wget's recursive download feature to download all content
# referred to by $url (but not going to $url's parent directory).
#
# Arguments:
#   $1 - URL to start the recursive download from
function download_recursively() {
  local url=$1

  # quoted so that '?', '*' or whitespace in the URL reach wget verbatim
  wget -nv -np -nH -r "${url}"
}
# write the first argument to the master index, after truncating the
# file.  Output goes to "${master_index}.tmp", not to the master index
# itself.
#
# Arguments:
#   $1 - text to write as a single line
function write_to_index() {
  # printf with a quoted argument keeps the text intact: an unquoted echo
  # would split it on IFS and expand glob characters
  printf '%s\n' "$1" > "${master_index}.tmp"
}
# append the first argument to the master index.  Output goes to
# "${master_index}.tmp", not to the master index itself.
#
# Arguments:
#   $1 - text to append as a single line
function append_to_index() {
  # printf with a quoted argument keeps the text intact: an unquoted echo
  # would split it on IFS and expand glob characters
  printf '%s\n' "$1" >> "${master_index}.tmp"
}
# write the header of the master index file.  Starts the file afresh and
# leaves a <ul> open for the entries that follow.
function write_index_header() {
  write_to_index "<html><head><title>Master Index</title></head><body>"
  append_to_index "<h3>Master Index</h3>"
  # the entries are <li> elements, so they need an enclosing list
  append_to_index "<ul>"
}
# append one entry to the master index: a link followed by the disk usage
# of the mirrored directory.
#
# Arguments:
#   $1 - href of the link
#   $2 - text of the link
#   $3 - directory whose size (as reported by du) is shown next to the link
function write_index_file() {
  local href=$1
  local text=$2
  local dir=$3
  local size

  # the last line of 'du -c' is the grand total; its first field is the size
  size=$(du -chsx -- "${dir}" | tail -n 1 | cut -f 1)

  append_to_index "<li>"
  append_to_index "<a href=\"${href}\">${text}</a>"
  append_to_index "<i>(${size})</i>"
  append_to_index "</li>"
}
# write links to all index files that were found in the mirrored files
#
# If an index file is given explicitly, a single link to it is written.
# Otherwise the directory is searched with an increasing depth limit
# (3 up to 9) and the search stops at the first limit that yields any
# index.htm / index.htm? file; every file found at that limit gets a link.
#
# All hrefs are written relative to ${base_dir}.
#
# Arguments:
#   $1 - directory holding the mirrored files
#   $2 - name of the documentation set; '/' is shown as '-' in the link text
#   $3 - (optional) path of the index file, relative to $1
function write_index_files() {
  local dir=$1
  local name=$2
  local index_file=$3
  local link_text=${name//\//-}
  local maxdepth=3
  local found=0
  local index

  if [[ -n "${index_file}" ]]; then
    # strip the base directory so this href is relative, like the
    # discovered ones below
    write_index_file "${dir#"${base_dir}/"}/${index_file}" "${link_text}" "${dir}"
    return
  fi

  while (( found == 0 && maxdepth < 10 )); do
    # NUL-delimited so paths are never split or glob-expanded
    while IFS= read -r -d '' index; do
      write_index_file "${index#"${base_dir}/"}" "${link_text}" "${dir}"
      found=1
    done < <(find "${dir}" -maxdepth "${maxdepth}" \
      \( -name 'index.htm' -o -name 'index.htm?' \) -print0)
    (( maxdepth += 1 ))
  done
}
# close out all the open HTML tags, after adding the generation time and
# the total disk usage of ${base_dir}
function write_index_footer() {
  local index_size

  # the last line of 'du -c' is the grand total; its first field is the size
  index_size=$(du -chsx -- "${base_dir}" | tail -n 1 | cut -f 1)

  # close the list first: the footer text is not a list item
  append_to_index "</ul>"
  append_to_index "<br/><i>generated at: $(date) using <a href=\"https://gist.github.com/timjstewart/7457111\">mirror</a></i>"
  append_to_index "<i>(Index Size: ${index_size})</i>"
  append_to_index "</body></html>"
}
# Always rebuilds the index, even if nothing got downloaded.  Is this
# a feature?  Not sure.
write_index_header

# Each line of ~/.mirror has the form <dir>;<url>[;<index-file>], so fields
# are split on ';'.  IFS stays set to ';' for the rest of the script.
IFS=';'

# Comment lines are dropped and the entries are processed in sorted order.
# The loop is the last stage of a pipeline, so it runs in a subshell.
sort ~/.mirror | grep -ve "^#" | while read -r dir url index_file
do
  # skip blank lines
  if [ "${dir}" == "" ]
  then
    continue
  fi

  dest_dir="${base_dir}/${dir}"

  # only download documentation whose directory does not exist yet
  if [ ! -d "${dest_dir}" ]
  then
    # downloads land in the current directory, so never start one unless
    # we really are inside the destination directory
    if mkdir -p "${dest_dir}" && pushd "${dest_dir}"
    then
      file_name=$(basename "${url}")
      file_ext="${file_name##*.}"

      case "${file_ext}" in
        zip | gz | tgz | txz )
          download_and_extract "${url}" "${file_name}" "${file_ext}"
          ;;
        html | htm | * )
          download_recursively "${url}"
          ;;
      esac

      popd
    else
      echo "cannot create or enter ${dest_dir}; skipping ${dir}." >&2
      continue
    fi
  fi

  write_index_files "${dest_dir}" "${dir}" "${index_file}"
done

write_index_footer

# the index is built in a temporary file and moved into place once complete
mv -- "${master_index}.tmp" "${master_index}"
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#
# Configuration file for mirrored documentation
#
# NOTE: Make sure to leave a blank line at the end of the file
#
# Format: <doc-dir>;<doc-url>[;<index-file>]
#
#   doc-dir    - the sub-directory (under ~/docs) where this documentation
#                should be placed
#
#   doc-url    - the html or archive URL to download
#
#   index-file - (optional) if mirror is having a hard time figuring out which
#                file to include in the Master Index, specify the relative path
#                to the index file.
#
# Akka
Akka/2.2.3;http://doc.akka.io/api/akka/2.2.3/
# Scala (multiple versions)
Scala/2.11-M5;http://scala-lang.org/files/archive/scala-docs-2.11.0-M5.txz
Scala/2.10.3;http://scala-lang.org/files/archive/scala-docs-2.10.3.txz
# Scala Testing
ScalaCheck/1.11.0;http://scalacheck.org/files/scalacheck_2.10-1.11.0-javadoc.tar.gz
ScalaTest/2.0;http://doc.scalatest.org/2.0/index.html
# Reactive Java
RxJava/0.14.10;http://netflix.github.io/RxJava/javadoc/index.html
# Play Framework
Play/2.0;http://www.playframework.com/documentation/2.0/api/scala/index.html
# Emacs
Elisp;https://www.gnu.org/software/emacs/manual/elisp.html_node.tar.gz
Emacs;https://www.gnu.org/software/emacs/manual/emacs.html_node.tar.gz
# Shell Utilities
gawk;https://www.gnu.org/software/gawk/manual/gawk.html_node.tar.gz
git;https://www.kernel.org/pub/software/scm/git/docs/
# Java API
Java/1.7;http://docs.oracle.com/javase/7/docs/;javase/7/docs/api/index.html
# Google Libraries
guice/4.0beta;https://google-guice.googlecode.com/git/latest-javadoc/packages.html
# JodaTime
JodaTime/2.4;http://www.joda.org/joda-time/apidocs/index.html
# EasyMock
EasyMock/3.1;http://www.easymock.org/api/easymock/3.1/index.html
# JUnit
JUnit;http://junit.sourceforge.net/javadoc/
# Cassandra
Cassandra Java Driver;http://www.datastax.com/drivers/java/2.0/apidocs/
# Bash Manual
Bash;https://www.gnu.org/software/bash/manual/bash.html_node.tar.gz
# Pandas
Pandas;http://pandas.pydata.org/pandas-docs/stable/
# IPython
IPython;http://ipython.org/ipython-doc/stable/index.html
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment