Found on the ConTeXt wiki: scrape (Hpricot, open-uri, etc.) and convert (own Ruby functions).
#!/usr/bin/ruby
######################################
# open and scrape
######################################
# scan_page.rb = Retrieves the html page of interest from the server,
#                navigates to links within the main page and constructs a
#                ConTeXt document
require 'rubygems'
require 'open-uri'     # the open-uri library
require 'hpricot'      # the hpricot library
require 'scrape_page'  # user-defined function to filter html into ConTeXt
# scans the home page and lists
# all the directories and subdirectories
doc = Hpricot(open("http://ipa.dd.re.ss/AnnRep07"))
#####################################
# set up the doc
#####################################
mainfil = "annrep.tex"   # open a file to output the ConTeXt document
`rm #{mainfil}`
fil = File.new(mainfil,"a")
# Add some opening directives and include style files
fil.write "\\input context_styles \n"   # this file contains the styling options for my ConTeXt document
fil.write "\\starttext \n"
fil.write "\\leftaligned{\\BigFontOne Contents} \n"
fil.write "\\vfill \n"
fil.write "{ \\switchtobodyfont[10pt] "
fil.write "\\startcolumns[n=2,balance=no,rule=off,option=background,frame=off,background=color,backgroundcolor=blue:1] \n"
fil.write "\\placecontent \n"
fil.write "\\stopcolumns \n"
fil.write "}"
#####################
# navigate
#####################
# I have used the following Hpricot code to follow the chapter and section
# links and retrieve their contents.
chapters = (doc/"p/a.existingWikiWord")
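# For reference, the selector above assumes wiki markup roughly of this
# shape (hypothetical example; the actual page may differ):
#   <p><a class="existingWikiWord" href="/AnnRep07/SomeChapter">Some Chapter</a></p>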
# we need to navigate one more level into the web page
# let us discover the links for that
chapters.each do |ch|
  chap_link = ch.attributes['href']
  # using inner_html we can create subdirectories
  chap_name = ch.inner_html.gsub(/\s*/,"")
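  # Note: the gsub above strips *all* whitespace from the link text, so a
  # chapter titled "Some Chapter" (hypothetical) yields the directory name
  # "SomeChapter", which is safe to use in paths and \input statements.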
  chap_name_org = ch.inner_html
  # We create chapter directories
  system("mkdir -p #{chap_name}")
  fil.write "\\input #{chap_name} \n"
  chapFil = "#{chap_name}.tex"
  `rm #{chapFil}`
  cFil = File.new(chapFil,"a")
  cFil.write "\\chapter{ #{chap_name_org} } \n"
  # We navigate to sections now
  doc2 = Hpricot(open(chap_link))
  sections = (doc2/"p/a.existingWikiWord")
  sections.each do |sc|
    sec_link = sc.attributes['href']
    sec_name = sc.inner_html.gsub(/\s*/,"")
    secFil = "#{chap_name}/#{sec_name}.tex"
    `rm #{secFil}`
    sFil = File.new(secFil,"a")
    sechFil = "#{chap_name}/#{sec_name}.html"
    `rm #{sechFil}`
    shFil = File.new(sechFil,"a")
    # After navigating to sections (h1 elements in the HTML), retrieve their
    # contents and send them to the function scrape_the_page (from
    # scrape_page.rb) for filtering.
    # scrape_the_page(sec_link,"#{chap_name}/#{sec_name}")
    scrape_the_page(sec_link,sFil,shFil)
    cFil.write "\\input #{chap_name}/#{sec_name} \n"
  end
end
fil.write "\\stoptext \n"
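# Usage (assumed; not spelled out in the original gist):
#   ruby scan_page.rb   # writes annrep.tex plus one .tex file per chapter/section
#   texexec annrep      # MkII; or `context annrep` on MkIV, to typeset the result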
######################################
# convert
# http://wiki.contextgarden.net/HTML_to_ConTeXt
######################################
# the rest (of scrape_page.rb)
######################################
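# The opening of scrape_the_page is not part of this fragment. A minimal
# sketch, assuming the function fetches the link, writes a raw html copy,
# and keeps the parsed page in @doc (all names here are assumptions inferred
# from the calls in scan_page.rb above):
#
#   def scrape_the_page(link, oFile, hFile)
#     @doc = Hpricot(open(link))
#     hFile.write @doc.inner_html   # keep a raw html copy alongside the .tex
#     ...                           # heading fixes etc., then "the rest" below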
# Now we transfer the syntactically altered html to a string object
# and manipulate that object further
newdoc = @doc.inner_html
# remove empty space in the beginning
newdoc.gsub!(/^\s+/,"")
# remove all elements we don't need
newdoc.gsub!(/^<div.*/,"")
newdoc.gsub!(/^<\/div.*/,"")
newdoc.gsub!(/^<form.*/,"")
newdoc.gsub!(/^<\/form.*/,"")
newdoc.gsub!(/<p>/,"\n")
newdoc.gsub!(/<\/p>/,"\n")
newdoc.gsub!(/<u>/,"")
newdoc.gsub!(/<\/u>/,"")
newdoc.gsub!(/<ul>/,"\\startitemize[1]")
newdoc.gsub!(/<\/ul>/,"\\stopitemize")
newdoc.gsub!(/<ol>/,"\\startitemize[n]")
newdoc.gsub!(/<\/ol>/,"\\stopitemize")
newdoc.gsub!(/<li>/,"\\item ")
newdoc.gsub!(/<\/li>/,"\n")
newdoc.gsub!("_","\\_")
newdoc.gsub!(/<table>/,"\\bTABLE \n")
newdoc.gsub!(/<\/table>/,"\\eTABLE \n")
newdoc.gsub!(/<tr>/,"\\bTR ")
newdoc.gsub!(/<\/tr>/,"\\eTR ")
newdoc.gsub!(/<td>/,"\\bTD ")
newdoc.gsub!(/<\/td>/,"\\eTD ")
newdoc.gsub!(/<th>/,"\\bTH ")
newdoc.gsub!(/<\/th>/,"\\eTH ")
newdoc.gsub!(/<center>/,"")
newdoc.gsub!(/<\/center>/,"")
newdoc.gsub!(/<em>/,"{\\em ")
newdoc.gsub!(/<\/em>/,"}")
newdoc.gsub!("^","")
newdoc.gsub!("%","\\%")
newdoc.gsub!("&amp;","&")
newdoc.gsub!("&",'\\\&')
newdoc.gsub!("$",'\\$')
newdoc.gsub!(/<tbody>/,"\\bTABLEbody \n")
newdoc.gsub!(/<\/tbody>/,"\\eTABLEbody \n")
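# To illustrate the substitutions above on a hypothetical input,
#   <ul><li>alpha</li><li>beta</li></ul>
# comes out as
#   \startitemize[1]\item alpha
#   \item beta
#   \stopitemize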
# ConTeXt does not mind "_" in figure file names and does not recognize \_,
# so I have to catch these and replace \_ with _
# First catch
filter = /\/AnnRep07\/Figures\/(\w+\/)*(\w+\\_)*/
if newdoc[filter]
  newdoc.gsub!(filter) { |fString|
    fString.gsub("\\_","_")
  }
end
# Second catch
filter2 = /\/AnnRep07\/Figures\/(\w+\/)*\w+[-.]\w+\\_\w+/
if newdoc[filter2]
  newdoc.gsub!(filter2) { |fString|
    fString.gsub("\\_","_")
  }
end
# Third catch; remove \_ inside []
filter3 = /\[\w+\\_\w+\]/
if newdoc[filter3]
  newdoc.gsub!(filter3) { |fString|
    puts fString
    fString.gsub("\\_","_")
  }
end
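# Example of what these catches repair (hypothetical path): after the blanket
# "_" -> "\_" substitution earlier, an image path such as
#   /AnnRep07/Figures/sub/plot\_1.png
# is restored by the first catch to
#   /AnnRep07/Figures/sub/plot_1.png
# so the figure file can actually be found on disk; the third catch does the
# same for bracketed options such as [some\_option].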
# remove the comment tags, which we used to embed ConTeXt commands
newdoc.gsub!("<!--","")
newdoc.gsub!("-->","")
# add the full path to the images
newdoc.gsub!("/AnnRep07/Figures/","~/AnnRep07/Figures/")
# drop any remaining self-closing tags such as <br />
newdoc.gsub!(/<\w+\s*\/>/,"")
#puts newdoc
# open file for output
#outfil="#{oFile}.tex"
#`rm #{outfil}`
#fil=File.new(outfil,"a")
#puts "Writing #{oFile}"
oFile.write newdoc
end