Skip to content

Instantly share code, notes, and snippets.

@vromero
Created July 28, 2014 05:48
Show Gist options
  • Save vromero/dd6cfdc2a226a8589ef0 to your computer and use it in GitHub Desktop.
Save vromero/dd6cfdc2a226a8589ef0 to your computer and use it in GitHub Desktop.
Convert MuleSoft documentation to asciidoc
#!/bin/bash
#===================================================================================
#
# FILE: mule-docs-to-asciidoc.sh
#
# USAGE: mule-docs-to-asciidoc.sh
#
# DESCRIPTION: Download the MuleSoft documentation pages linked from the
# documentation home page and convert each page to asciidoc,
# fetching the images each page references along the way.
#
# OPTIONS: ---
# REQUIREMENTS: asciidoctor xidel pandoc
# BUGS: ---
# NOTES: ---
# AUTHOR: Victor Romero (vrc), [email protected]
# COMPANY: MuleSoft Inc.
# VERSION: 1.0-SNAPSHOT
# CREATED: 28.08.2014
# REVISION: 28.08.2014
#===================================================================================
# Directory that receives every generated file.
TARGET_DIR="output"
# NOTE(review): not referenced by any code visible in this script.
LINKS_FILE="${TARGET_DIR}/links"
# Only links containing this prefix are treated as documentation pages.
LINKS_FILTER_URL="http://www.mulesoft.org/documentation/display/current/"
# Page whose links are harvested: the documentation home page.
LINKS_URL="${LINKS_FILTER_URL}Home"
#=== FUNCTION ================================================================
# NAME: getLinks
# DESCRIPTION: Extract the href of every <a> element of a page and print the
# ones that contain a given URL fragment.
# PARAMETER 1: URL to download and extract links
# PARAMETER 2: URL portion that should be present in accepted links
# (matched as a literal substring, not as a regular expression)
# RETURNS: grep's exit status (0 when at least one link was accepted)
#=============================================================================
function getLinks() {
  # -F: the filter is a URL fragment, so characters such as "." and "?" must
  #     match themselves instead of acting as regular-expression operators.
  # --: a filter starting with "-" must not be parsed as a grep option.
  xidel -q "$1" --extract "//a/@href" | grep -F -- "$2"
}
#=== FUNCTION ================================================================
# NAME: downloadImages
# DESCRIPTION: Download every image (img/@src) referenced by a page into
# <target directory>/<page name>, where <page name> is the last
# path component of parameter 1 without a trailing ".html".
# Prints one "#" per image as a progress indicator.
# PARAMETER 1: URL to download and extract image urls
# PARAMETER 2: Target directory
#=============================================================================
function downloadImages() {
  local image_dir image_src image_path image_name

  # Create and download into the SAME directory, derived from parameter 1
  # only (no reliance on a global variable set by the caller).
  image_dir="$2/$(basename -- "$1" .html)"
  mkdir -p -- "$image_dir"

  # Read one src per line so that values are not word-split or globbed.
  while read -r image_src; do
    [[ -n "$image_src" ]] || continue
    # Drop the query string ("?version=1&...") before taking the file name.
    image_path=${image_src%%\?*}
    image_name=$(basename -- "$image_path")
    printf '#'
    # Best effort: a failed download is ignored and all curl output is
    # discarded (stdout first, then stderr onto it).
    curl -s -o "$image_dir/$image_name" "http://www.mulesoft.org${image_src}" > /dev/null 2>&1
  done < <(xidel -q "$1" --extract "//img/@src")
  printf ' \n'
}
#=== FUNCTION ================================================================
# NAME: downloadAndExtractXpath
# DESCRIPTION: Download a page, extract the portion selected by an XPath
# expression as HTML and store it in
# <target directory>/<last path component of the URL>.html
# PARAMETER 1: URL to download and extract portion
# PARAMETER 2: XPath to be used for extraction
# PARAMETER 3: Target directory (created when missing)
# RETURNS: non-zero if the target directory cannot be created, otherwise
# xidel's exit status
#=============================================================================
function downloadAndExtractXpath() {
  # The redirection below fails when the directory does not exist yet.
  mkdir -p -- "$3" || return
  # The XPath must stay quoted: it usually contains "[...]" and spaces, which
  # an unquoted expansion would glob and word-split.
  # The file name is derived from parameter 1, not from a caller's global.
  xidel -q --html "$1" --xpath "$2" > "$3/$(basename -- "$1").html"
}
#=== FUNCTION ================================================================
# NAME: htmlToAsciidoc
# DESCRIPTION: Converts file from HTML to asciidoc using pandoc
# PARAMETER 1: source file
# PARAMETER 2: destination file
# RETURNS: pandoc's exit status
#=============================================================================
function htmlToAsciidoc() {
  # Quoted so that paths containing whitespace reach pandoc as one argument.
  pandoc --from=html --to=asciidoc "$1" -o "$2"
}
#=== FUNCTION ================================================================
# NAME: rebaseImageTagUrls
# DESCRIPTION: Rebase URLs of image tags in asciidoc, in place.
# image:/<any path>/<file>[?<query>][image]
# becomes
# image:<new base>/<file>[image]
# PARAMETER 1: input file
# PARAMETER 2: new base
# RETURNS: non-zero if sed fails
#=============================================================================
function rebaseImageTagUrls() {
  local new_base

  # Escape what is special in a sed replacement using "," as delimiter.
  new_base=$(printf '%s' "$2" | sed 's/[\\&,]/\\&/g')

  # -i.bak followed by removing the backup is accepted by both GNU and BSD
  # sed, unlike "-i ''" (BSD only).
  # [^[?]*/ and [^[]* cannot run past the "[" that closes a tag, so several
  # image tags on one line are rewritten independently. A bare "?" is a
  # literal in basic regular expressions.
  sed -i.bak \
    -e 's,image:/[^[?]*/\([^/?[]*\)[^[]*\[image\],image:'"$new_base"'/\1[image],g' \
    -- "$1" \
    && rm -f -- "$1.bak"
}
#=== FUNCTION ================================================================
# NAME: rebaseLinkTagUrls
# DESCRIPTION: Rebase URLs of link tags in asciidoc, in place.
# link:/<any path>/<page>[
# becomes
# link:<new base>/<page>.asciidoc[
# PARAMETER 1: input file
# PARAMETER 2: new base
# RETURNS: non-zero if sed fails
#=============================================================================
function rebaseLinkTagUrls() {
  local new_base

  # Escape what is special in a sed replacement using "," as delimiter.
  new_base=$(printf '%s' "$2" | sed 's/[\\&,]/\\&/g')

  # -i.bak followed by removing the backup is accepted by both GNU and BSD
  # sed, unlike "-i ''" (BSD only).
  # [^[]*/ cannot run past the "[" that opens the link text, so several
  # links on one line are rewritten independently.
  sed -i.bak \
    -e 's,link:/[^[]*/\([^/[]*\)\[,link:'"$new_base"'/\1.asciidoc[,g' \
    -- "$1" \
    && rm -f -- "$1.bak"
}
#=== FUNCTION ================================================================
# NAME: convertPageToAsciidoc
# DESCRIPTION: Downloads one documentation page, converts it to asciidoc,
# downloads its images, rebases image and link URLs and renders
# the asciidoc back to HTML. Files are written to TARGET_DIR and
# named after the last path component of parameter 1.
# PARAMETER 1: URL of the page to convert; either an absolute http(s) URL or
# a path that is appended to http://www.mulesoft.org/
#=============================================================================
function convertPageToAsciidoc() {
  local page page_url html_file asciidoc_file

  page=$(basename -- "$1")
  html_file="$TARGET_DIR/$page.html"
  asciidoc_file="$TARGET_DIR/$page.asciidoc"

  # An absolute URL is used as is; prefixing the host again would yield
  # "http://www.mulesoft.org/http://...".
  case "$1" in
    http://* | https://*) page_url=$1 ;;
    *) page_url="http://www.mulesoft.org/$1" ;;
  esac

  # Download and extract path of an HTML file
  downloadAndExtractXpath "$page_url" '//div[@id="content-editable-container"]' "$TARGET_DIR"
  # Convert HTML to ASCIIDOC
  printf 'Processing %s ' "$page"
  htmlToAsciidoc "$html_file" "$asciidoc_file"
  # Download images from HTML
  downloadImages "$html_file" "$TARGET_DIR/images"
  # Correct URLs: images live in images/<page>, while the other converted
  # pages are siblings of this one in TARGET_DIR, hence "." for links.
  rebaseImageTagUrls "$asciidoc_file" "images/$page"
  rebaseLinkTagUrls "$asciidoc_file" "."
  ## Convert back asciidoc to HTML overwriting the original
  # Redirections apply left to right: stdout is sent to the current stderr,
  # then asciidoctor's own stderr is discarded.
  asciidoctor "$asciidoc_file" 1>&2 2> /dev/null
}
#-------------------------------------------------------------------------------
# Main
# Convert every documentation page linked from LINKS_URL whose link contains
# LINKS_FILTER_URL.
#-------------------------------------------------------------------------------
# Converted pages are written below TARGET_DIR; make sure it exists.
mkdir -p -- "$TARGET_DIR"
# The loop variable keeps the name LINK because functions in this file may
# read it as a global.
# -r: backslashes in a link are kept as they are.
getLinks "$LINKS_URL" "$LINKS_FILTER_URL" | while read -r LINK; do
  # stdin is detached so that no command run for one page can consume the
  # remaining links waiting in the pipe.
  convertPageToAsciidoc "$LINK" < /dev/null
done
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment