Skip to content

Instantly share code, notes, and snippets.

@timjstewart
Last active December 28, 2015 06:29
Show Gist options
  • Save timjstewart/7457111 to your computer and use it in GitHub Desktop.
Save timjstewart/7457111 to your computer and use it in GitHub Desktop.
Mirror files locally. Now handles gzipped files. Now alphabetizes the Master Index. Is a bit better at finding index.html files in the mirrored content.
#!/bin/bash
# Root directory that receives every mirrored documentation tree.
base_dir=~/docs
# Location of the generated master index page, at the root of base_dir.
master_index="${base_dir}/index.html"
# Download a single compressed archive and extract it into the current
# directory, deleting the archive once it has been extracted.
#
# Arguments:
#   $1 - URL of the archive to fetch
#   $2 - name of the downloaded archive file in the current directory
#   $3 - archive extension selecting the extractor: txz, gz, tgz or zip
# Outputs:
#   tool output on stdout; diagnostics on stderr
# Returns:
#   0 on success; non-zero when the download or the extraction fails, or
#   when the extension is not supported
function download_and_extract() {
  local url=$1
  local file_name=$2
  local file_ext=$3

  # There is nothing to extract when the download itself failed.
  wget -nv -np -nH "${url}" || return

  case "${file_ext}" in
    txz)
      tar -xJvf "${file_name}" || return
      ;;
    gz | tgz)
      # .tgz is a gzip-compressed tarball just like .tar.gz.
      tar -xzvf "${file_name}" || return
      ;;
    zip)
      unzip "${file_name}" || return
      ;;
    *)
      echo "don't know how to extract ${file_ext} files." >&2
      return 1
      ;;
  esac

  # Only reached after a successful extraction, so a broken archive is
  # kept around for inspection.
  rm -- "${file_name}"
}
# Use wget's recursive download feature to fetch all content referred to
# by a URL, without ascending to the URL's parent directory (-np) and
# without creating a host-name directory (-nH).
#
# Arguments:
#   $1 - URL to start the recursive download from
# Returns:
#   wget's exit status
function download_recursively() {
  local url=$1
  # Quoted so that ';', '?' or '*' inside the URL reach wget untouched
  # instead of being split on IFS or expanded as a glob.
  wget -nv -np -nH -r "${url}"
}
# Write the first argument, followed by a newline, to the temporary
# master index file, truncating whatever the file held before.
#
# Globals:
#   master_index (read) - the text goes to "${master_index}.tmp"
# Arguments:
#   $1 - text to write
function write_to_index() {
  # printf with a quoted argument keeps the text byte-for-byte: no word
  # splitting, no glob expansion, no collapsing of repeated spaces.
  printf '%s\n' "$1" > "${master_index}.tmp"
}
# Append the first argument, followed by a newline, to the temporary
# master index file.
#
# Globals:
#   master_index (read) - the text goes to "${master_index}.tmp"
# Arguments:
#   $1 - text to append
function append_to_index() {
  # printf with a quoted argument keeps the text byte-for-byte: no word
  # splitting, no glob expansion, no collapsing of repeated spaces.
  printf '%s\n' "$1" >> "${master_index}.tmp"
}
# Write the header of the master index file: the opening HTML, the page
# title and the opening tag of the list that holds one entry per
# mirrored document set.
#
# Starts the index from scratch via write_to_index, so any previous
# temporary index content is discarded.
function write_index_header() {
  write_to_index "<html><head><title>Master Index</title></head><body>"
  append_to_index "<h3>Master Index</h3>"
  # Entries are written as <li> elements and the footer emits </ul>, so
  # the list has to be opened here.
  append_to_index "<ul>"
}
# Append one list entry to the master index: a link followed by the disk
# usage of the mirrored directory.
#
# Arguments:
#   $1 - href of the link
#   $2 - visible link text
#   $3 - directory whose total size is shown next to the link
function write_index_file() {
  local href=$1
  local text=$2
  local dir=$3
  local size

  # du -c prints a grand-total line last; its first tab-separated field
  # is the human-readable size. Options come before the operand so this
  # does not depend on GNU argument permutation, and the directory is
  # quoted so names containing spaces stay a single argument.
  size=$(du -chsx -- "${dir}" | tail -n 1 | cut -f 1)

  append_to_index "<li>"
  append_to_index "<a href=\"${href}\">${text}</a>"
  append_to_index "<i>(${size})</i>"
  append_to_index "</li>"
}
# Write master-index links for one mirrored document set.
#
# When an explicit index file is given, a single link to it is written.
# Otherwise the directory is searched for index.html / index.htm files
# with an iterative-deepening search: the search depth starts at 3 and
# grows by one until at least one index file is found or the depth
# reaches 10. Every index file found at the successful depth gets a link.
#
# All hrefs are written relative to base_dir, which is where the master
# index itself lives.
#
# Globals:
#   base_dir (read) - prefix stripped from paths to form relative hrefs
# Arguments:
#   $1 - directory holding the mirrored files
#   $2 - display name; every '/' is replaced by '-' for the link text
#   $3 - (optional) index file path relative to $1
function write_index_files() {
  local dir=$1
  local name=$2
  local index_file=$3
  local link_text=${name//\//-}
  local maxdepth=3
  local found=0
  local index

  if [[ -n "${index_file}" ]]; then
    write_index_file "${dir#"${base_dir}/"}/${index_file}" "${link_text}" "${dir}"
    return
  fi

  while (( found == 0 && maxdepth < 10 )); do
    # NUL-delimited find output read through a redirect: file names with
    # spaces or newlines survive, IFS is left untouched, and the loop runs
    # in the current shell so the update of 'found' is kept.
    while IFS= read -r -d '' index; do
      write_index_file "${index#"${base_dir}/"}" "${link_text}" "${dir}"
      found=1
    done < <(find "${dir}" -maxdepth "${maxdepth}" \
      \( -name 'index.html' -o -name 'index.htm' \) -print0)
    (( maxdepth += 1 ))
  done
}
# Close out the master index: end the entry list, add the generation
# timestamp and the total size of the mirror, then close the remaining
# HTML tags.
#
# Globals:
#   base_dir (read) - directory whose total disk usage is reported
function write_index_footer() {
  local generated_at
  local total_size

  generated_at=$(date)
  # du -c prints a grand-total line last; its first tab-separated field
  # is the human-readable size. Options precede the quoted operand so
  # directories with spaces work and GNU argument permutation is not
  # required.
  total_size=$(du -chsx -- "${base_dir}" | tail -n 1 | cut -f 1)

  # The list is closed first so the footer text is not emitted as a
  # direct child of <ul>.
  append_to_index "</ul>"
  append_to_index "<br/><i>generated at: ${generated_at} using <a href=\"https://gist.github.com/timjstewart/7457111\">mirror</a></i>"
  append_to_index "<i>(Index Size: ${total_size})</i>"
  append_to_index "</body></html>"
}
# Main driver: mirror every entry listed in ~/.mirror that has not been
# downloaded yet, then regenerate the master index.
#
# Always rebuilds the index, even if nothing got downloaded. Is this
# a feature? Not sure.

# One "<doc-dir>;<doc-url>[;<index-file>]" entry per line; lines starting
# with '#' are comments.
config_file=~/.mirror

# Bail out before touching the index so that a missing configuration does
# not replace an existing index with an empty one.
if [[ ! -r "${config_file}" ]]; then
  echo "cannot read ${config_file}" >&2
  exit 1
fi

write_index_header

# Entries are ';'-separated. IFS is left set to ';' for the rest of the
# run because the helper functions are called with it in effect.
IFS=';'
sort -- "${config_file}" | grep -v -e '^#' | while read -r dir url index_file
do
  # Skip blank lines.
  if [[ -z "${dir}" ]]; then
    continue
  fi

  dest_dir="${base_dir}/${dir}"

  # An existing directory means this entry was mirrored by an earlier run.
  if [[ ! -d "${dest_dir}" ]]; then
    if ! mkdir -p -- "${dest_dir}"; then
      echo "cannot create ${dest_dir}; skipping ${dir}" >&2
      continue
    fi
    # Never download when the directory change failed: the files would
    # land in whatever the current directory happens to be.
    if ! pushd "${dest_dir}" > /dev/null; then
      echo "cannot enter ${dest_dir}; skipping ${dir}" >&2
      continue
    fi

    file_name=$(basename -- "${url}")
    file_ext="${file_name##*.}"
    status=0
    case "${file_ext}" in
      zip | gz | tgz | txz)
        download_and_extract "${url}" "${file_name}" "${file_ext}" || status=$?
        ;;
      *)
        # Anything that is not a known archive type is mirrored as a web
        # page tree.
        download_recursively "${url}" || status=$?
        ;;
    esac

    popd > /dev/null || break

    if (( status != 0 )); then
      echo "download of ${url} reported an error (status ${status})" >&2
      # Intentional best effort: rmdir only succeeds when nothing at all
      # was downloaded, which lets the next run retry this entry instead
      # of treating the empty directory as an existing mirror.
      rmdir -- "${dest_dir}" 2> /dev/null || true
      [[ -d "${dest_dir}" ]] || continue
    fi
  fi

  write_index_files "${dest_dir}" "${dir}" "${index_file}"
done

write_index_footer

# Publish the finished index in a single step.
mv -- "${master_index}.tmp" "${master_index}"
#
# Configuration file for mirrored documentation
#
# NOTE: Make sure to leave a blank line at the end of the file
#
# Format: <doc-dir>;<doc-url>[;<index-file>]
#
# doc-dir - the sub-directory (under ~/docs) where this documentation
# should be placed
#
# doc-url - the html or archive URL to download
#
# index-file - (optional) if mirror is having a hard time figuring out which
# file to include in the Master Index, specify the relative path
# to the index file.
#
# Akka
Akka/2.2.3;http://doc.akka.io/api/akka/2.2.3/
# Scala (multiple versions)
Scala/2.11-M5;http://scala-lang.org/files/archive/scala-docs-2.11.0-M5.txz
Scala/2.10.3;http://scala-lang.org/files/archive/scala-docs-2.10.3.txz
# Scala Testing
ScalaCheck/1.11.0;http://scalacheck.org/files/scalacheck_2.10-1.11.0-javadoc.tar.gz
ScalaTest/2.0;http://doc.scalatest.org/2.0/index.html
# Reactive Java
RxJava/0.14.10;http://netflix.github.io/RxJava/javadoc/index.html
# Play Framework
Play/2.0;http://www.playframework.com/documentation/2.0/api/scala/index.html
# Emacs
Elisp;https://www.gnu.org/software/emacs/manual/elisp.html_node.tar.gz
Emacs;https://www.gnu.org/software/emacs/manual/emacs.html_node.tar.gz
# Shell Utilities
gawk;https://www.gnu.org/software/gawk/manual/gawk.html_node.tar.gz
git;https://www.kernel.org/pub/software/scm/git/docs/
# Java API
Java/1.7;http://docs.oracle.com/javase/7/docs/;javase/7/docs/api/index.html
# Google Libraries
guice/4.0beta;https://google-guice.googlecode.com/git/latest-javadoc/packages.html
# JodaTime
JodaTime/2.4;http://www.joda.org/joda-time/apidocs/index.html
# EasyMock
EasyMock/3.1;http://www.easymock.org/api/easymock/3.1/index.html
# JUnit
JUnit;http://junit.sourceforge.net/javadoc/
# Cassandra
Cassandra Java Driver;http://www.datastax.com/drivers/java/2.0/apidocs/
# Bash Manual
Bash;https://www.gnu.org/software/bash/manual/bash.html_node.tar.gz
# Pandas
Pandas;http://pandas.pydata.org/pandas-docs/stable/
# IPython
IPython;http://ipython.org/ipython-doc/stable/index.html
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment