Created
July 28, 2014 05:48
-
-
Save vromero/dd6cfdc2a226a8589ef0 to your computer and use it in GitHub Desktop.
Convert MuleSoft documentation to asciidoc
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash
#=================================================================================== | |
# | |
# FILE: mule-docs-to-asciidoc.sh | |
# | |
# USAGE: mule-docs-to-asciidoc.sh | |
# | |
# DESCRIPTION: Download the MuleSoft documentation pages linked from the
# documentation home page, convert each page to AsciiDoc and
# fetch the images the pages reference.
# | |
# OPTIONS: ---
# REQUIREMENTS: asciidoctor xidel pandoc | |
# BUGS: --- | |
# NOTES: --- | |
# AUTHOR: Victor Romero (vrc), [email protected] | |
# COMPANY: MuleSoft Inc. | |
# VERSION: 1.0-SNAPSHOT | |
# CREATED: 28.07.2014
# REVISION: 28.07.2014
#=================================================================================== | |
# File path under the output directory (not referenced by the functions below).
LINKS_FILE=output/links
# Only links containing this URL portion are converted.
LINKS_FILTER_URL=http://www.mulesoft.org/documentation/display/current/
# Page whose links are crawled.
LINKS_URL=http://www.mulesoft.org/documentation/display/current/Home
# Directory that receives the generated .html/.asciidoc files and images.
TARGET_DIR=output
#=== FUNCTION ================================================================
# NAME: getLinks
# DESCRIPTION: Extract documentation links. Prints, one per line, every
#              <a href> value of the page that contains the given URL portion.
# PARAMETER 1: URL to download and extract links
# PARAMETER 2: URL portion that should be present in accepted links
#              (matched as a literal string, not as a regular expression)
# RETURNS: non-zero when no link matches
#=============================================================================
function getLinks() {
  # -F: the filter is a URL, so "." and "?" must not act as regex operators.
  # --: keeps a filter starting with "-" from being parsed as a grep option.
  xidel -q "$1" --extract "//a/@href" | grep -F -- "$2"
}
#=== FUNCTION ================================================================
# NAME: downloadImages
# DESCRIPTION: Download every image referenced by an HTML page. Prints one
#              "#" per image as a progress indicator, then " " and a newline.
# PARAMETER 1: URL or local HTML file to extract image URLs from; its
#              basename without ".html" names the per-page image directory
# PARAMETER 2: Target directory (images go to <target>/<page name>/)
# RETURNS: non-zero if the image directory cannot be created
#=============================================================================
function downloadImages() {
  local page_name image_dir image_src image_name image_url

  # "output/Home.html" -> "Home": the same name the asciidoc image tags are
  # rebased to, so the directory created here is the one curl writes into.
  page_name=$(basename -- "$1")
  page_name=${page_name%.html}
  image_dir=$2/$page_name
  mkdir -p -- "$image_dir" || return

  while IFS= read -r image_src; do
    [ -n "$image_src" ] || continue
    # Drop the query string ("pic.png?version=1" -> "pic.png").
    image_name=$(basename -- "${image_src%%\?*}")
    # Only site-relative sources need the host prepended.
    case $image_src in
      http://* | https://*) image_url=$image_src ;;
      *) image_url=http://www.mulesoft.org$image_src ;;
    esac
    printf '#'
    # Best effort: a failed download must not abort the remaining images.
    curl -s -o "$image_dir/$image_name" "$image_url" > /dev/null 2>&1
  done < <(xidel -q "$1" --extract "//img/@src")
  printf ' \n'
}
#=== FUNCTION ================================================================
# NAME: downloadAndExtractXpath
# DESCRIPTION: Download HTML from a URL and write the portion selected by an
#              XPath expression to <target directory>/<last URL segment>.html
# PARAMETER 1: URL to download and extract portion
# PARAMETER 2: XPath to be used for extraction
# PARAMETER 3: Target directory (must already exist)
#=============================================================================
function downloadAndExtractXpath() {
  local page_name
  # The output name comes from the URL argument, not from a caller's global.
  page_name=$(basename -- "$1")
  # The XPath is quoted: it contains "[...]" and may contain spaces, which
  # would otherwise be glob-expanded or split into several words.
  xidel -q --html "$1" --xpath "$2" > "$3/${page_name}.html"
}
#=== FUNCTION ================================================================
# NAME: htmlToAsciidoc
# DESCRIPTION: Converts file from HTML to asciidoc using pandoc
# PARAMETER 1: source file
# PARAMETER 2: destination file
# RETURNS: exit status of pandoc
#=============================================================================
function htmlToAsciidoc() {
  # Quoted so that file names containing spaces stay a single argument.
  pandoc --from=html --to=asciidoc "$1" -o "$2"
}
#=== FUNCTION ================================================================
# NAME: rebaseImageTagUrls
# DESCRIPTION: Rebase URLs of image tags in asciidoc. Every
#              "image:/some/path/name.png?query[image]" tag becomes
#              "image:<new base>/name.png[image]"; the query string is
#              optional and is dropped. The file is rewritten in place.
# PARAMETER 1: input file
# PARAMETER 2: new base
# RETURNS: non-zero if the file could not be rewritten
#=============================================================================
function rebaseImageTagUrls() {
  local new_base script tmp_file status

  # Escape what is special in a sed replacement ("\", "&") plus the "|"
  # delimiter, so any base is inserted literally.
  new_base=$(printf '%s' "$2" | sed 's/[\\&|]/\\&/g')

  # [^?[]* keeps each match inside one tag (no greedy run across the line);
  # [?] is a literal question mark in both GNU and BSD sed.
  script='s|image:/[^?[]*/\([^/?[]*\)\([?][^[]*\)\{0,1\}\[image\]|image:'"$new_base"'/\1[image]|g'

  # "sed -i" takes its suffix differently on GNU and BSD, so edit via a
  # temporary file instead.
  tmp_file=$(mktemp) || return
  sed -e "$script" -- "$1" > "$tmp_file" && cat -- "$tmp_file" > "$1"
  status=$?
  rm -f -- "$tmp_file"
  return "$status"
}
#=== FUNCTION ================================================================
# NAME: rebaseLinkTagUrls
# DESCRIPTION: Rebase URLs of link tags in asciidoc. Every
#              "link:/some/path/Page[" becomes
#              "link:<new base>/Page.asciidoc[". The file is rewritten in
#              place.
# PARAMETER 1: input file
# PARAMETER 2: new base
# RETURNS: non-zero if the file could not be rewritten
#=============================================================================
function rebaseLinkTagUrls() {
  local new_base script tmp_file status

  # Escape what is special in a sed replacement ("\", "&") plus the "|"
  # delimiter, so any base is inserted literally.
  new_base=$(printf '%s' "$2" | sed 's/[\\&|]/\\&/g')

  # [^[]* stops at the tag's own "[", so several links on one line are
  # rewritten one by one instead of being merged by a greedy ".*".
  script='s|link:/[^[]*/\([^/[]*\)\[|link:'"$new_base"'/\1.asciidoc[|g'

  # "sed -i" takes its suffix differently on GNU and BSD, so edit via a
  # temporary file instead.
  tmp_file=$(mktemp) || return
  sed -e "$script" -- "$1" > "$tmp_file" && cat -- "$tmp_file" > "$1"
  status=$?
  rm -f -- "$tmp_file"
  return "$status"
}
#=== FUNCTION ================================================================
# NAME: convertPageToAsciidoc
# DESCRIPTION: Downloads one documentation page, converts it from HTML to
#              asciidoc, downloads its images and rebases image/link URLs.
#              Output goes to $TARGET_DIR/<page name>.{html,asciidoc}.
# PARAMETER 1: URL of the page to convert; either absolute (http/https) or
#              a path relative to http://www.mulesoft.org/
# RETURNS: non-zero if $TARGET_DIR cannot be created, otherwise the exit
#          status of asciidoctor
#=============================================================================
function convertPageToAsciidoc() {
  local page_name page_url html_file asciidoc_file

  page_name=$(basename -- "$1")
  html_file=$TARGET_DIR/${page_name}.html
  asciidoc_file=$TARGET_DIR/${page_name}.asciidoc

  # Absolute links already carry the host; prefixing it again would yield
  # "http://www.mulesoft.org/http://www.mulesoft.org/...".
  case $1 in
    http://* | https://*) page_url=$1 ;;
    *) page_url=http://www.mulesoft.org/${1#/} ;;
  esac

  # The download below redirects into this directory, so it must exist.
  mkdir -p -- "$TARGET_DIR" || return

  # Download and extract path of an HTML file
  downloadAndExtractXpath "$page_url" '//div[@id="content-editable-container"]' "$TARGET_DIR"
  # Convert HTML to ASCIIDOC
  printf 'Processing %s ' "$page_name"
  htmlToAsciidoc "$html_file" "$asciidoc_file"
  # Download images from HTML
  downloadImages "$html_file" "$TARGET_DIR/images"
  # Correct URLs: images live in images/<page name>/, whereas the converted
  # pages all sit next to each other in $TARGET_DIR, hence the "." base.
  rebaseImageTagUrls "$asciidoc_file" "images/$page_name"
  rebaseLinkTagUrls "$asciidoc_file" "."
  # Convert back asciidoc to HTML overwriting the original. asciidoctor's
  # stdout is sent to this script's stderr and its own stderr is discarded.
  asciidoctor "$asciidoc_file" 1>&2 2> /dev/null
}
#-------------------------------------------------------------------------------
# Main: convert every documentation page linked from $LINKS_URL
#-------------------------------------------------------------------------------
# IFS= and -r keep each link exactly as printed (no trimming, no backslash
# processing). The loop variable stays named LINK because other functions in
# this file may read $LINK as a global.
getLinks "$LINKS_URL" "$LINKS_FILTER_URL" | while IFS= read -r LINK; do
  # stdin is detached so no command run for a page can consume the
  # remaining links of the pipe.
  convertPageToAsciidoc "$LINK" < /dev/null
done
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment