vromero · July 28, 2014 05:48
diff --git a/gistfile1.txt b/gistfile1.txt
 #!/bin/bash 
 #===================================================================================
 #
 # FILE: mule-docs-to-asciidoc.sh
 #
 # USAGE: mule-docs-to-asciidoc.sh
 #
 # DESCRIPTION: Download the MuleSoft documentation pages linked from the
 # documentation home page, convert each one from HTML to AsciiDoc, fetch
 # its images and rebase image/link URLs onto the local copies.
 #
 # OPTIONS: none
 # REQUIREMENTS: asciidoctor xidel pandoc curl
 # BUGS: ---
 # NOTES: ---
 # AUTHOR: Victor Romero (vrc), [email protected]
 # COMPANY: MuleSoft Inc.
 # VERSION: 1.0-SNAPSHOT
 # CREATED: 28.08.2014
 # REVISION: 28.08.2014
 #===================================================================================


 # Directory that receives every generated file.
 TARGET_DIR="output"
 # Path for a list of links (not referenced in the visible part of the script).
 LINKS_FILE="${TARGET_DIR}/links"
 # Only links containing this URL prefix are accepted by the main loop.
 LINKS_FILTER_URL="http://www.mulesoft.org/documentation/display/current/"
 # Page whose anchors are scanned for documentation links.
 LINKS_URL="${LINKS_FILTER_URL}Home"


 #=== FUNCTION ================================================================
 # NAME: getLinks
 # DESCRIPTION: Print the href of every anchor found at a URL, keeping only
 #              the ones that contain a given literal string
 # PARAMETER 1: URL to download and extract links
 # PARAMETER 2: URL portion that should be present in accepted links
 # RETURNS: exit status of the filter (non-zero when no link was accepted)
 #=============================================================================
 function getLinks() {
   # -F: the filter is a literal URL fragment, so "." or "?" in it must not act
   # as regex operators; "--" keeps a filter starting with "-" from being
   # parsed as an option. Quoting keeps both arguments as single words.
   xidel -q "$1" --extract "//a/@href" | grep -F -- "$2"
 }

 #=== FUNCTION ================================================================
 # NAME: downloadImages
 # DESCRIPTION: Download every image referenced by an HTML file into
 #              <target directory>/<file name without ".html">/
 # PARAMETER 1: HTML file (or URL) to scan for <img src="..."> values
 # PARAMETER 2: Target directory
 # OUTPUT: one "#" per image, then " " and a newline, on stdout
 #=============================================================================
 function downloadImages() {
   local html_file=$1 images_root=$2
   local page_dir image image_name

   # Derive the per-page directory from the first argument (not from a global)
   # and drop the ".html" suffix, so the directory that is created is the same
   # one the images are written to.
   page_dir=$images_root/$(basename -- "$html_file" .html)
   mkdir -p -- "$page_dir" || return

   # Read line by line so an image path is never split on whitespace.
   while IFS= read -r image; do
     [ -n "$image" ] || continue
     # Strip the query string ("?version=...") before taking the file name.
     image_name=$(basename -- "${image%%\?*}")
     [ -n "$image_name" ] || continue
     echo -n "#"
     # stdin is detached so curl cannot consume the remaining image list.
     curl -s -o "$page_dir/$image_name" "http://www.mulesoft.org$image" </dev/null >/dev/null 2>&1
   done < <(xidel -q "$html_file" --extract "//img/@src")
   echo " "
 }

 #=== FUNCTION ================================================================
 # NAME: downloadAndExtractXpath
 # DESCRIPTION: Download a page and save the part selected by an XPath as
 #              <target directory>/<last path component of the URL>.html
 # PARAMETER 1: URL to download and extract portion
 # PARAMETER 2: XPath to be used for extraction
 # PARAMETER 3: Target directory
 #=============================================================================
 function downloadAndExtractXpath() {
   # The output name comes from the URL argument rather than a global, and the
   # XPath is quoted: unquoted, its [...] predicate would be open to globbing
   # and word splitting.
   xidel -q --html "$1" --xpath "$2" > "$3/$(basename -- "$1").html"
 }

 #=== FUNCTION ================================================================
 # NAME: htmlToAsciidoc
 # DESCRIPTION: Converts file from HTML to asciidoc using pandoc
 # PARAMETER 1: source file
 # PARAMETER 2: destination file
 # RETURNS: exit status of pandoc
 #=============================================================================
 function htmlToAsciidoc() {
   # Quoted so that paths containing spaces or glob characters stay one word.
   pandoc --from=html --to=asciidoc "$1" -o "$2"
 }

 #=== FUNCTION ================================================================
 # NAME: rebaseImageTagUrls
 # DESCRIPTION: Rebase URLs of image tags in asciidoc, in place:
 #              image:/any/path/name.png?query[image] -> image:<base>/name.png[image]
 # PARAMETER 1: input file
 # PARAMETER 2: new base
 # RETURNS: 0 on success, non-zero if the file could not be rewritten
 #=============================================================================
 function rebaseImageTagUrls() {
   local file=$1 base tmp script status=0

   # Escape what is special on the replacement side of s,,, : "\", "&" and ",".
   base=$(printf '%s' "$2" | sed 's/[\&,]/\\&/g') || return
   tmp=$(mktemp) || return

   # Bracket expressions keep each match inside a single image tag (several
   # tags may share a line) and [?] is a literal question mark in every sed.
   # The query string is optional so images without one are rebased too.
   script='s,image:/[^[?]*/([^/?[]*)([?][^[]*)?\[image\],image:'"$base"'/\1[image],g'

   # "sed -i" takes different arguments on BSD and GNU sed, so edit through a
   # temporary file; writing back with cat keeps the original file's inode.
   if sed -E "$script" "$file" > "$tmp"; then
     cat "$tmp" > "$file" || status=$?
   else
     status=$?
   fi
   rm -f -- "$tmp"
   return "$status"
 }

 #=== FUNCTION ================================================================
 # NAME: rebaseLinkTagUrls
 # DESCRIPTION: Rebase URLs of link tags in asciidoc, in place:
 #              link:/any/path/Page[text] -> link:<base>/Page.asciidoc[text]
 # PARAMETER 1: input file
 # PARAMETER 2: new base
 # RETURNS: 0 on success, non-zero if the file could not be rewritten
 #=============================================================================
 function rebaseLinkTagUrls() {
   local file=$1 base tmp script status=0

   # Escape what is special on the replacement side of s,,, : "\", "&" and ",".
   base=$(printf '%s' "$2" | sed 's/[\&,]/\\&/g') || return
   tmp=$(mktemp) || return

   # [^[]* stops at the "[" that opens the link text, so every link on a line
   # is rewritten on its own instead of one greedy match swallowing them all.
   script='s,link:/[^[]*/([^/[]*)\[,link:'"$base"'/\1.asciidoc[,g'

   # "sed -i" takes different arguments on BSD and GNU sed, so edit through a
   # temporary file; writing back with cat keeps the original file's inode.
   if sed -E "$script" "$file" > "$tmp"; then
     cat "$tmp" > "$file" || status=$?
   else
     status=$?
   fi
   rm -f -- "$tmp"
   return "$status"
 }


 #=== FUNCTION ================================================================
 # NAME: convertPageToAsciidoc
 # DESCRIPTION: Download one documentation page, convert it from HTML to
 #              asciidoc, fetch its images and rebase its image/link URLs
 # PARAMETER 1: URL of the page to convert (absolute, or a path relative to
 #              http://www.mulesoft.org/)
 # GLOBALS: TARGET_DIR (read) - directory that receives the generated files
 #=============================================================================
 function convertPageToAsciidoc() {
   local page_url page_name
   page_name=$(basename -- "$1")

   # The links accepted by the main loop contain the absolute filter URL, so
   # only prefix the site root when the argument is not already absolute.
   case "$1" in
     http://*|https://*) page_url=$1 ;;
     *) page_url="http://www.mulesoft.org/$1" ;;
   esac

   # Download and extract path of an HTML file
   downloadAndExtractXpath "$page_url" '//div[@id="content-editable-container"]' "$TARGET_DIR"

   # Convert HTML to ASCIIDOC
   echo -n "Processing $page_name "
   htmlToAsciidoc "$TARGET_DIR/$page_name.html" "$TARGET_DIR/$page_name.asciidoc"

   # Download images from HTML
   downloadImages "$TARGET_DIR/$page_name.html" "$TARGET_DIR/images"

   # Correct URLs
   rebaseImageTagUrls "$TARGET_DIR/$page_name.asciidoc" "images/$page_name"
   # NOTE(review): links are rebased onto the per-page image directory, as in
   # the original, while the .asciidoc files are written to TARGET_DIR itself.
   # Confirm this base is intended.
   rebaseLinkTagUrls "$TARGET_DIR/$page_name.asciidoc" "images/$page_name"

   ## Convert back asciidoc to HTML overwriting the original.
   ## Redirections kept as written: asciidoctor's stdout goes to this script's
   ## stderr and its own stderr is discarded.
   asciidoctor "$TARGET_DIR/$page_name.asciidoc" 1>&2 2>/dev/null
 }

 #-------------------------------------------------------------------------------
 # Main
 #-------------------------------------------------------------------------------
 # Every generated file is written below TARGET_DIR, so it must exist first.
 mkdir -p -- "$TARGET_DIR" || exit 1

 # The loop variable keeps the name LINK because helper functions may
 # reference it as a global.
 getLinks "$LINKS_URL" "$LINKS_FILTER_URL" | while IFS= read -r LINK; do
   [ -n "$LINK" ] || continue
   # stdin is detached so nothing run for a page can swallow the remaining links.
   convertPageToAsciidoc "$LINK" </dev/null
 done
	#!/bin/bash
	#===================================================================================
	#
	# FILE: mule-docs-to-asciidoc.sh
	#
	# USAGE: mule-docs-to-asciidoc.sh
	#
	# DESCRIPTION: Download the MuleSoft documentation pages linked from the
	# documentation home page, convert each one from HTML to AsciiDoc, fetch
	# its images and rebase image/link URLs onto the local copies.
	#
	# OPTIONS: none
	# REQUIREMENTS: asciidoctor xidel pandoc curl
	# BUGS: ---
	# NOTES: ---
	# AUTHOR: Victor Romero (vrc), [email protected]
	# COMPANY: MuleSoft Inc.
	# VERSION: 1.0-SNAPSHOT
	# CREATED: 28.08.2014
	# REVISION: 28.08.2014
	#===================================================================================


	# Directory that receives every generated file.
	TARGET_DIR="output"
	# Path for a list of links (not referenced in the visible part of the script).
	LINKS_FILE="${TARGET_DIR}/links"
	# Only links containing this URL prefix are accepted by the main loop.
	LINKS_FILTER_URL="http://www.mulesoft.org/documentation/display/current/"
	# Page whose anchors are scanned for documentation links.
	LINKS_URL="${LINKS_FILTER_URL}Home"


	#=== FUNCTION ================================================================
	# NAME: getLinks
	# DESCRIPTION: Print the href of every anchor found at a URL, keeping only
	#              the ones that contain a given literal string
	# PARAMETER 1: URL to download and extract links
	# PARAMETER 2: URL portion that should be present in accepted links
	# RETURNS: exit status of the filter (non-zero when no link was accepted)
	#=============================================================================
	function getLinks() {
	  # A real pipe is required here: an escaped "\|" would hand "|", "grep" and
	  # the filter to xidel as plain arguments.
	  # -F: the filter is a literal URL fragment, so "." or "?" in it must not act
	  # as regex operators; "--" keeps a filter starting with "-" from being
	  # parsed as an option. Quoting keeps both arguments as single words.
	  xidel -q "$1" --extract "//a/@href" | grep -F -- "$2"
	}

	#=== FUNCTION ================================================================
	# NAME: downloadImages
	# DESCRIPTION: Download every image referenced by an HTML file into
	#              <target directory>/<file name without ".html">/
	# PARAMETER 1: HTML file (or URL) to scan for <img src="..."> values
	# PARAMETER 2: Target directory
	# OUTPUT: one "#" per image, then " " and a newline, on stdout
	#=============================================================================
	function downloadImages() {
	  local html_file=$1 images_root=$2
	  local page_dir image image_name

	  # Derive the per-page directory from the first argument (not from a global)
	  # and drop the ".html" suffix, so the directory that is created is the same
	  # one the images are written to.
	  page_dir=$images_root/$(basename -- "$html_file" .html)
	  mkdir -p -- "$page_dir" || return

	  # Read line by line so an image path is never split on whitespace.
	  while IFS= read -r image; do
	    [ -n "$image" ] || continue
	    # Strip the query string ("?version=...") before taking the file name.
	    image_name=$(basename -- "${image%%\?*}")
	    [ -n "$image_name" ] || continue
	    echo -n "#"
	    # stdin is detached so curl cannot consume the remaining image list.
	    curl -s -o "$page_dir/$image_name" "http://www.mulesoft.org$image" </dev/null >/dev/null 2>&1
	  done < <(xidel -q "$html_file" --extract "//img/@src")
	  echo " "
	}

	#=== FUNCTION ================================================================
	# NAME: downloadAndExtractXpath
	# DESCRIPTION: Download a page and save the part selected by an XPath as
	#              <target directory>/<last path component of the URL>.html
	# PARAMETER 1: URL to download and extract portion
	# PARAMETER 2: XPath to be used for extraction
	# PARAMETER 3: Target directory
	#=============================================================================
	function downloadAndExtractXpath() {
	  # The output name comes from the URL argument rather than a global, and the
	  # XPath is quoted: unquoted, its [...] predicate would be open to globbing
	  # and word splitting.
	  xidel -q --html "$1" --xpath "$2" > "$3/$(basename -- "$1").html"
	}

	#=== FUNCTION ================================================================
	# NAME: htmlToAsciidoc
	# DESCRIPTION: Converts file from HTML to asciidoc using pandoc
	# PARAMETER 1: source file
	# PARAMETER 2: destination file
	# RETURNS: exit status of pandoc
	#=============================================================================
	function htmlToAsciidoc() {
	  # Quoted so that paths containing spaces or glob characters stay one word.
	  pandoc --from=html --to=asciidoc "$1" -o "$2"
	}

	#=== FUNCTION ================================================================
	# NAME: rebaseImageTagUrls
	# DESCRIPTION: Rebase URLs of image tags in asciidoc, in place:
	#              image:/any/path/name.png?query[image] -> image:<base>/name.png[image]
	# PARAMETER 1: input file
	# PARAMETER 2: new base
	# RETURNS: 0 on success, non-zero if the file could not be rewritten
	#=============================================================================
	function rebaseImageTagUrls() {
	  local file=$1 base tmp script status=0

	  # Escape what is special on the replacement side of s,,, : "\", "&" and ",".
	  base=$(printf '%s' "$2" | sed 's/[\&,]/\\&/g') || return
	  tmp=$(mktemp) || return

	  # Bracket expressions keep each match inside a single image tag (several
	  # tags may share a line) and [?] is a literal question mark in every sed.
	  # The query string is optional so images without one are rebased too.
	  script='s,image:/[^[?]*/([^/?[]*)([?][^[]*)?\[image\],image:'"$base"'/\1[image],g'

	  # "sed -i" takes different arguments on BSD and GNU sed, so edit through a
	  # temporary file; writing back with cat keeps the original file's inode.
	  if sed -E "$script" "$file" > "$tmp"; then
	    cat "$tmp" > "$file" || status=$?
	  else
	    status=$?
	  fi
	  rm -f -- "$tmp"
	  return "$status"
	}

	#=== FUNCTION ================================================================
	# NAME: rebaseLinkTagUrls
	# DESCRIPTION: Rebase URLs of link tags in asciidoc, in place:
	#              link:/any/path/Page[text] -> link:<base>/Page.asciidoc[text]
	# PARAMETER 1: input file
	# PARAMETER 2: new base
	# RETURNS: 0 on success, non-zero if the file could not be rewritten
	#=============================================================================
	function rebaseLinkTagUrls() {
	  local file=$1 base tmp script status=0

	  # Escape what is special on the replacement side of s,,, : "\", "&" and ",".
	  base=$(printf '%s' "$2" | sed 's/[\&,]/\\&/g') || return
	  tmp=$(mktemp) || return

	  # [^[]* stops at the "[" that opens the link text, so every link on a line
	  # is rewritten on its own instead of one greedy match swallowing them all.
	  script='s,link:/[^[]*/([^/[]*)\[,link:'"$base"'/\1.asciidoc[,g'

	  # "sed -i" takes different arguments on BSD and GNU sed, so edit through a
	  # temporary file; writing back with cat keeps the original file's inode.
	  if sed -E "$script" "$file" > "$tmp"; then
	    cat "$tmp" > "$file" || status=$?
	  else
	    status=$?
	  fi
	  rm -f -- "$tmp"
	  return "$status"
	}


	#=== FUNCTION ================================================================
	# NAME: convertPageToAsciidoc
	# DESCRIPTION: Download one documentation page, convert it from HTML to
	#              asciidoc, fetch its images and rebase its image/link URLs
	# PARAMETER 1: URL of the page to convert (absolute, or a path relative to
	#              http://www.mulesoft.org/)
	# GLOBALS: TARGET_DIR (read) - directory that receives the generated files
	#=============================================================================
	function convertPageToAsciidoc() {
	  local page_url page_name
	  page_name=$(basename -- "$1")

	  # The links accepted by the main loop contain the absolute filter URL, so
	  # only prefix the site root when the argument is not already absolute.
	  case "$1" in
	    http://*|https://*) page_url=$1 ;;
	    *) page_url="http://www.mulesoft.org/$1" ;;
	  esac

	  # Download and extract path of an HTML file
	  downloadAndExtractXpath "$page_url" '//div[@id="content-editable-container"]' "$TARGET_DIR"

	  # Convert HTML to ASCIIDOC
	  echo -n "Processing $page_name "
	  htmlToAsciidoc "$TARGET_DIR/$page_name.html" "$TARGET_DIR/$page_name.asciidoc"

	  # Download images from HTML
	  downloadImages "$TARGET_DIR/$page_name.html" "$TARGET_DIR/images"

	  # Correct URLs
	  rebaseImageTagUrls "$TARGET_DIR/$page_name.asciidoc" "images/$page_name"
	  # NOTE(review): links are rebased onto the per-page image directory, as in
	  # the original, while the .asciidoc files are written to TARGET_DIR itself.
	  # Confirm this base is intended.
	  rebaseLinkTagUrls "$TARGET_DIR/$page_name.asciidoc" "images/$page_name"

	  ## Convert back asciidoc to HTML overwriting the original.
	  ## Redirections kept as written: asciidoctor's stdout goes to this script's
	  ## stderr and its own stderr is discarded.
	  asciidoctor "$TARGET_DIR/$page_name.asciidoc" 1>&2 2>/dev/null
	}

	#-------------------------------------------------------------------------------
	# Main
	#-------------------------------------------------------------------------------
	# Every generated file is written below TARGET_DIR, so it must exist first.
	mkdir -p -- "$TARGET_DIR" || exit 1

	# A real pipe feeds the loop; an escaped "\|" here is a syntax error.
	# The loop variable keeps the name LINK because helper functions may
	# reference it as a global.
	getLinks "$LINKS_URL" "$LINKS_FILTER_URL" | while IFS= read -r LINK; do
	  [ -n "$LINK" ] || continue
	  # stdin is detached so nothing run for a page can swallow the remaining links.
	  convertPageToAsciidoc "$LINK" </dev/null
	done