m040601 · October 15, 2010 06:26
diff --git a/yet%20another%20third%20file b/yet%20another%20third%20file
 ##############3 
 # open and scrape
 ######################################

 #scan_page.rb = Retrieves the HTML page of interest from the server,
 #        navigates to the links within the main page and constructs a
 #        ConTeXt document
                            
 #!/usr/bin/ruby                   
                
 require 'rubygems'
 require 'open-uri'        # the open-uri library                                                                                   
 require 'hpricot'         # the hpricot library                                                                                        
 require 'scrape_page'     # user-defined function to filter html  into ConTeXt                                                
                                                                   
 # scans the home page and lists     
 # all the directories and subdirectories

 # Fetch the report's home page and parse it with Hpricot.
 # NOTE(review): this relies on open-uri letting Kernel#open read URLs;
 # recent Ruby versions expect URI.open instead -- confirm the target Ruby.
 doc=Hpricot(open("http://ipa.dd.re.ss/AnnRep07"))

 #####################################
 # set up the doc
 #####################################
 mainfil="annrep.tex"   # main ConTeXt output document
 # Mode "w" creates the file or truncates an existing one, which replaces the
 # former `rm` shell-out followed by an append-mode open (and no longer prints
 # an error when the file does not exist yet).
 fil=File.new(mainfil,"w")

 # Add some opening directives and include style files

 fil.write "\\input context_styles \n"  # this file contains the styling options for my Context document
 fil.write "\\starttext \n"
 # Table of contents page: heading, then the contents list in two columns
 fil.write "\\leftaligned{\\BigFontOne Contents} \n"
 fil.write "\\vfill \n"
 fil.write "{ \\switchtobodyfont[10pt] "
 fil.write "\\startcolumns[n=2,balance=no,rule=off,option=background,frame=off,background=color,backgroundcolor=blue:1] \n"
 fil.write "\\placecontent \n"
 fil.write "\\stopcolumns \n"
 fil.write "}"



 #####################
 # navigate
 #################
 # The following Hpricot code follows the chapter and section links to
 # retrieve their contents.
 chapters= (doc/"p/a.existingWikiWord")

 # we need to navigate one more level into the web page
 # let us discover the links for that
 chapters.each do |ch|
   chap_link = ch.attributes['href']
   # The link text with all whitespace removed doubles as the directory and
   # file name of the chapter; the untouched text becomes the chapter title.
   chap_name = ch.inner_html.gsub(/\s+/,"")
   chap_name_org = ch.inner_html

   # We create chapter directories. The multi-argument form of system does
   # not go through a shell, so text scraped from the page cannot inject
   # shell commands.
   system("mkdir", "-p", chap_name)
   fil.write "\\input #{chap_name}  \n"

   # "w" truncates an old chapter file (no `rm` shell-out needed) and the
   # block form closes the file when the chapter is finished.
   File.open("#{chap_name}.tex","w") do |cFil|
     cFil.write "\\chapter{ #{chap_name_org} } \n"
     # We navigate to sections now
     doc2=Hpricot(open(chap_link))
     sections= (doc2/"p/a.existingWikiWord")
     sections.each do |sc|
       sec_link = sc.attributes['href']
       sec_name = sc.inner_html.gsub(/\s+/,"")
       sec_base = "#{chap_name}/#{sec_name}"

       # After navigating to a section, hand its link plus freshly truncated
       # .tex and .html output files to scrape_the_page (from scrape_page.rb)
       # for filtering; both files are closed once it returns.
       File.open("#{sec_base}.tex","w") do |sFil|
         File.open("#{sec_base}.html","w") do |shFil|
           scrape_the_page(sec_link,sFil,shFil)
         end
       end
       cFil.write "\\input #{chap_name}/#{sec_name} \n"
     end
   end
 end
 fil.write "\\stoptext \n"
 # The main document is complete; flush and release its handle.
 fil.close


 ### convert
 ##  http://wiki.contextgarden.net/HTML_to_ConTeXt
 ########################3










 ##########33
 # the rest
 ######################################



 # Now we transfer the syntactically altered html to a string Object
 # and manipulate that object further


 # Serialise the document to an HTML string and keep working on that string.
 # NOTE(review): this assignment arrived garbled (an e-mail obfuscator
 # swallowed it); reconstructed as reading @doc.to_html -- confirm.
 newdoc=@doc.to_html

 # Ordered [pattern, replacement] pairs. Each pair is applied to the whole
 # string in turn, so later pairs see the output of earlier ones.
 [
   [/^\s+/,""],                      # remove empty space in the beginning
   # remove all elements we don't need.
   [/^<div.*/,""],
   [/^<\/div.*/,""],
   [/^<form.*/,""],
   [/^<\/form.*/,""],
   [/<p>/,"\n"],
   [/<\/p>/,"\n"],
   [/<u>/,""],                       # plain "u": an escaped \u is rejected as an invalid Unicode escape by Ruby 1.9+
   [/<\/u>/,""],
   [/<ul>/,"\\startitemize[1]"],
   [/<\/ul>/,"\\stopitemize"],
   [/<ol>/,"\\startitemize[n]"],
   [/<\/ol>/,"\\stopitemize"],
   [/<li>/,"\\item "],
   [/<\/li>/,"\n"],
   ["_","\\_"],
   [/<table>/,"\\bTABLE \n"],
   [/<\/table>/,"\\eTABLE \n"],
   [/<tr>/,"\\bTR "],
   [/<\/tr>/,"\\eTR "],
   [/<td>/,"\\bTD "],
   [/<\/td>/,"\\eTD "],
   [/<th>/,"\\bTH "],
   [/<\/th>/,"\\eTH "],
   [/<center>/,""],
   [/<\/center>/,""],
   [/<em>/,"{\\em "],
   [/<\/em>/,"}"],
   ["^",""],
   ["%","\\%"],
   ["&amp;","&"],                    # decode the entity first (the pattern had been flattened to a no-op "&" -> "&") ...
   ["&",'\\\&'],                     # ... then escape every ampersand
   ["$",'\\$'],
   [/<tbody>/,"\\bTABLEbody \n"],
   [/<\/tbody>/,"\\eTABLEbody \n"]
 ].each { |pattern,replacement| newdoc.gsub!(pattern,replacement) }

 # Context does not mind "_" in figures and does not recognize \_,
 # so i have to catch these and replace \_ with _

 # First catch
 filter=/\/AnnRep07\/Figures\/(\w+\/)*(\w+\\_)*/
 # Second catch
 filter2=/\/AnnRep07\/Figures\/(\w+\/)*\w+[-.]\w+\\_\w+/
 # Third catch; remove \_ inside []
 filter3=/\[\w+\\_\w+\]/

 # gsub! leaves the string alone when nothing matches, so no separate
 # "does it match" guard is needed.
 [filter,filter2].each do |figure_path|
   newdoc.gsub!(figure_path) { |fString| fString.gsub("\\_","_") }
 end
 newdoc.gsub!(filter3) do |fString|
   puts fString                      # print each bracketed name that gets unescaped
   fString.gsub("\\_","_")
 end

 # remove the comment tag, which we used to embed context commands
 newdoc.gsub!("<!--","")
 newdoc.gsub!("-->","")
 # add full path to the images
 newdoc.gsub!("/AnnRep07/Figures/","~/AnnRep07/Figures/")

 # drop attribute-less self-closing tags such as <br />
 newdoc.gsub!(/<\w+\s*\/>/,"")

 #puts newdoc
 # Write the converted text to the caller-supplied output object.
 oFile.write newdoc

 end
 aoeuao
 aoeuaoeanoeut anoetuhonehu

 aoenutahoe
 so this should end up somewhere in the third file
	##############3
	# open and scrape
	######################################

	#scan_page.rb = Retrieves the HTML page of interest from the server,
	# navigates to the links within the main page and constructs a
	# ConTeXt document

	#!/usr/bin/ruby

	require 'rubygems'
	require 'open-uri' # the open-uri library
	require 'hpricot' # the hpricot library
	require 'scrape_page' # user-defined function to filter html into ConTeXt

	# scans the home page and lists
	# all the directories and subdirectories

	# Fetch the report's home page and parse it with Hpricot.
	# NOTE(review): this relies on open-uri letting Kernel#open read URLs;
	# recent Ruby versions expect URI.open instead -- confirm the target Ruby.
	doc=Hpricot(open("http://ipa.dd.re.ss/AnnRep07"))

	#####################################
	# set up the doc
	#####################################
	mainfil="annrep.tex" # main ConTeXt output document
	# Mode "w" creates the file or truncates an existing one, which replaces the
	# former `rm` shell-out followed by an append-mode open (and no longer prints
	# an error when the file does not exist yet).
	fil=File.new(mainfil,"w")

	# Add some opening directives and include style files

	fil.write "\\input context_styles \n" # this file contains the styling options for my Context document
	fil.write "\\starttext \n"
	# Table of contents page: heading, then the contents list in two columns
	fil.write "\\leftaligned{\\BigFontOne Contents} \n"
	fil.write "\\vfill \n"
	fil.write "{ \\switchtobodyfont[10pt] "
	fil.write "\\startcolumns[n=2,balance=no,rule=off,option=background,frame=off,background=color,backgroundcolor=blue:1] \n"
	fil.write "\\placecontent \n"
	fil.write "\\stopcolumns \n"
	fil.write "}"



	#####################
	# navigate
	#################
	# The following Hpricot code follows the chapter and section links to
	# retrieve their contents.
	chapters= (doc/"p/a.existingWikiWord")

	# we need to navigate one more level into the web page
	# let us discover the links for that
	# (the block parameters had been markdown-escaped as \|ch\| and \|sc\|,
	# which is not valid Ruby)
	chapters.each do |ch|
	  chap_link = ch.attributes['href']
	  # The link text with all whitespace removed doubles as the directory and
	  # file name of the chapter; the untouched text becomes the chapter title.
	  chap_name = ch.inner_html.gsub(/\s+/,"")
	  chap_name_org = ch.inner_html

	  # We create chapter directories. The multi-argument form of system does
	  # not go through a shell, so text scraped from the page cannot inject
	  # shell commands.
	  system("mkdir", "-p", chap_name)
	  fil.write "\\input #{chap_name} \n"

	  # "w" truncates an old chapter file (no `rm` shell-out needed) and the
	  # block form closes the file when the chapter is finished.
	  File.open("#{chap_name}.tex","w") do |cFil|
	    cFil.write "\\chapter{ #{chap_name_org} } \n"
	    # We navigate to sections now
	    doc2=Hpricot(open(chap_link))
	    sections= (doc2/"p/a.existingWikiWord")
	    sections.each do |sc|
	      sec_link = sc.attributes['href']
	      sec_name = sc.inner_html.gsub(/\s+/,"")
	      sec_base = "#{chap_name}/#{sec_name}"

	      # After navigating to a section, hand its link plus freshly truncated
	      # .tex and .html output files to scrape_the_page (from scrape_page.rb)
	      # for filtering; both files are closed once it returns.
	      File.open("#{sec_base}.tex","w") do |sFil|
	        File.open("#{sec_base}.html","w") do |shFil|
	          scrape_the_page(sec_link,sFil,shFil)
	        end
	      end
	      cFil.write "\\input #{chap_name}/#{sec_name} \n"
	    end
	  end
	end
	fil.write "\\stoptext \n"
	# The main document is complete; flush and release its handle.
	fil.close


	### convert
	## http://wiki.contextgarden.net/HTML_to_ConTeXt
	########################3










	##########33
	# the rest
	######################################



	# Now we transfer the syntactically altered html to a string Object
	# and manipulate that object further


	# Serialise the document to an HTML string and keep working on that string.
	# NOTE(review): this assignment arrived garbled (an e-mail obfuscator
	# swallowed it); reconstructed as reading @doc.to_html -- confirm.
	newdoc=@doc.to_html

	# Ordered [pattern, replacement] pairs. Each pair is applied to the whole
	# string in turn, so later pairs see the output of earlier ones.
	[
	  [/^\s+/,""],                      # remove empty space in the beginning
	  # remove all elements we don't need.
	  [/^<div.*/,""],
	  [/^<\/div.*/,""],
	  [/^<form.*/,""],
	  [/^<\/form.*/,""],
	  [/<p>/,"\n"],
	  [/<\/p>/,"\n"],
	  [/<u>/,""],                       # plain "u": an escaped \u is rejected as an invalid Unicode escape by Ruby 1.9+
	  [/<\/u>/,""],
	  [/<ul>/,"\\startitemize[1]"],
	  [/<\/ul>/,"\\stopitemize"],
	  [/<ol>/,"\\startitemize[n]"],
	  [/<\/ol>/,"\\stopitemize"],
	  [/<li>/,"\\item "],
	  [/<\/li>/,"\n"],
	  ["_","\\_"],
	  [/<table>/,"\\bTABLE \n"],
	  [/<\/table>/,"\\eTABLE \n"],
	  [/<tr>/,"\\bTR "],
	  [/<\/tr>/,"\\eTR "],
	  [/<td>/,"\\bTD "],
	  [/<\/td>/,"\\eTD "],
	  [/<th>/,"\\bTH "],
	  [/<\/th>/,"\\eTH "],
	  [/<center>/,""],
	  [/<\/center>/,""],
	  [/<em>/,"{\\em "],
	  [/<\/em>/,"}"],
	  ["^",""],
	  ["%","\\%"],
	  ["&amp;","&"],                    # decode the entity first (the pattern had been flattened to a no-op "&" -> "&") ...
	  ["&",'\\\&'],                     # ... then escape every ampersand
	  ["$",'\\$'],
	  [/<tbody>/,"\\bTABLEbody \n"],
	  [/<\/tbody>/,"\\eTABLEbody \n"]
	].each { |pattern,replacement| newdoc.gsub!(pattern,replacement) }

	# Context does not mind "_" in figures and does not recognize \_,
	# so i have to catch these and replace \_ with _

	# First catch (the "*" quantifiers had been eaten as markdown emphasis;
	# restored so the pattern matches the other copy of this script)
	filter=/\/AnnRep07\/Figures\/(\w+\/)*(\w+\\_)*/
	# Second catch
	filter2=/\/AnnRep07\/Figures\/(\w+\/)*\w+[-.]\w+\\_\w+/
	# Third catch; remove \_ inside []
	filter3=/\[\w+\\_\w+\]/

	# gsub! leaves the string alone when nothing matches, so no separate
	# "does it match" guard is needed.
	[filter,filter2].each do |figure_path|
	  newdoc.gsub!(figure_path) { |fString| fString.gsub("\\_","_") }
	end
	newdoc.gsub!(filter3) do |fString|
	  puts fString                      # print each bracketed name that gets unescaped
	  fString.gsub("\\_","_")
	end

	# remove the comment tag, which we used to embed context commands
	newdoc.gsub!("<!--","")
	newdoc.gsub!("-->","")
	# add full path to the images
	newdoc.gsub!("/AnnRep07/Figures/","~/AnnRep07/Figures/")

	# drop attribute-less self-closing tags such as <br />
	newdoc.gsub!(/<\w+\s*\/>/,"")

	#puts newdoc
	# Write the converted text to the caller-supplied output object.
	oFile.write newdoc

	end
	aoeuao
	aoeuaoeanoeut anoetuhonehu

	aoenutahoe
	so this should end up somewhere in the third file