Last active
December 7, 2022 17:51
-
-
Save rgchris/8172326 to your computer and use it in GitHub Desktop.
An exercise in scraping Blogger HTML content with Rebol.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
REBOL [
    ; Script header: metadata only, no executable code.
    Title: "Scraper"
    Date: 1-Dec-2013
]
textize: func [
    "Convert an HTML string into a line-oriented plain-text markup string."
    html [string!] "HTML source; parsed with LOAD-HTML/DOM before walking"
    /local rule last-list value new-table in-pre
][
    ; From here on HTML holds the parsed document (whatever LOAD-HTML/DOM
    ; returns), and later the collected output string.
    html: load-html/dom html

    ; COPY is required: a literal string in a function body is one series shared
    ; by every call, and it is mutated below by APPEND/REMOVE. Without the copy,
    ; an unbalanced <ul>/<ol> (or an error part-way through) would leave stale
    ; list markers behind for the next call.
    last-list: copy ""
    in-pre: false

    html: join "" collect [
        ; RULE is the body run for each node; it is re-entered through FOREACH
        ; for child nodes, so NODE always refers to the node being visited.
        rule: [
            ; --- 1. Opening markup, emitted before the node's content ---
            switch node/name [
                <li> [keep rejoin ["^/" last-list " "]]
                ; LAST-LIST accumulates one marker per nesting level.
                <ul> [append last-list "*"]
                <ol> [append last-list "#"]
                <h1> [keep "^/=== "]
                <h2> [keep "^/=== "]
                <h3> [keep "^/--- "]
                <h4> [keep "^/... "]
                <pre> <code> [keep "^/^/^-" in-pre: true]
                ; <div> <span>
                <b> <strong> [keep "=b "]
                <i> <em> [keep "=i "]
                <u> [keep "=u "]
                <blockquote> [] ; [keep "^/\blockquote^/^/"]
                ; NEW-TABLE suppresses the row separator before the first <tr>.
                <table> [keep "^/\table^/^/" new-table: true]
                <tr> [either new-table [new-table: false][keep "^/^/=row^/^/"]]
            ]

            ; --- 2. Node content; the first matching condition wins ---
            case [
                ; Text: any node named %.txt or carrying a string value.
                any [
                    node/name = %.txt
                    string? node/value
                ][
                    keep either in-pre [
                        ; keep preformatted lines tab-indented
                        replace/all node/value "^/" "^/^-"
                    ][
                        ; escape the markup's own "=" introducer
                        replace/all node/value "=" "\="
                    ]
                ]

                node/name = <head> []

                node/name = <br> [keep either in-pre ["^/^-"]["^/"]]

                node/name = <a> [
                    keep "["
                    case [
                        block? node/value [foreach node node/children :rule]
                        string? node/value [keep node/value]
                    ]
                    keep rejoin ["](" node/get #href ")"]
                ]

                node/name = <img> [
                    keep join "=" remold new-line/all [
                        'image
                        any [as url! node/get #src as file! node/get #src]
                        any [node/get #alt "Image"]
                    ] false
                ]

                node/name = <form> [keep "^/^/=donate^/^/"]

                node/name = <iframe> [
                    keep join "^/^/" node/get #src
                    either node/get #width [
                        keep join " " node/get #width
                        if node/get #height [
                            keep join "x" node/get #height
                        ]
                    ][
                        if node/get #height [
                            keep join " -1x" node/get #height
                        ]
                    ]
                ]

                ; Progress bar: a div of class "pbar-o" whose inner div carries
                ; the percentage in its style attribute as "width: NN%".
                all [
                    node/name = <div>
                    find ["pbar-o"] node/get #class
                ][
                    value: node/get <div>
                    value: all [value value/get #style]
                    ; Emit only when a width was actually extracted; a missing
                    ; inner div, missing style, or missing "width:...%" is
                    ; skipped instead of raising an error.
                    if all [
                        string? value
                        parse value [thru "width:" copy value to "%" to end]
                        value
                    ][
                        keep rejoin ["=[progress " trim value "]"]
                    ]
                ]

                ; Everything else: optionally wrap the children in a tag chosen
                ; from the element's exact style attribute text.
                true [
                    ; don't need the /clone function, so reuse to temporarily store style
                    if node/clone: switch node/get #style [
                        "color: red;"
                        "color: rgb(204, 0, 0); "
                        "color: #990000;"
                        "color: #cc0000;"
                        {color: #cc0000; font-family: 'Trebuchet MS', Trebuchet, sans-serif; line-height: 18px;} [<red>]
                        ; "color: #a32f2f;"
                        ; "color: #a32f2f; text-decoration: none;"
                        ; "color: #a32f2f;" [<darkred>]
                        "font-family: inherit; font-size: large;"
                        "font-size: large; line-height: 18px;"
                        "font-size: large; "
                        "font-size: large;"
                        {font-family: 'Trebuchet MS', Trebuchet, sans-serif; font-size: large;}
                        ; large text becomes a heading prefix; NONE means no wrapping tag
                        {font-family: 'Trebuchet MS', sans-serif; font-size: large;} [keep "^/--- " none]
                        "font-size: x-large;" [keep "^/=== " none]
                        "color: #6aa84f;"
                        {color: #38761d; font-family: Trebuchet MS, sans-serif;}
                        "color: #38761d;" ; [<darkgreen>]
                        {color: #6aa84f; font-family: Trebuchet MS, sans-serif; font-weight: normal;}
                        {color: #6aa84f; font-family: 'Trebuchet MS', sans-serif;}
                        {color: #6aa84f; font-family: Trebuchet MS, sans-serif;} ; [<green>]
                        {color: #6aa84f; font-family: Courier New, Courier, monospace;}
                        {color: #38761d; font-family: 'Courier New', Courier, monospace;}
                        {color: #38761d; font-family: Courier New, Courier, monospace;} ; [<monogreen>]
                        {font-family: 'Courier New', Courier, monospace; white-space: pre;}
                        "font-family: 'Courier New', Courier, monospace;"
                        "font-family: monospace;"
                        "font-family: Courier New, Courier, monospace;" [<code>]
                        ; "width: 145px;" [<145px>]
                        ; "font-family: inherit;" [<inherit>]
                        ; "text-align: center;"
                        ; "margin-left: auto; margin-right: auto;"
                        ; {margin-left: auto; margin-right: auto; text-align: center;}
                        ; "clear: both; text-align: center;" [<center>]
                        ; "margin-left:40px"
                        ; "margin-left: 1em; margin-right: 1em;" [<indent>]
                        ; "text-align: left;" [<left>]
                        ; "white-space: pre;"
                        ; "white-space:pre"
                        ; "white-space: pre; " [<ipre>]
                        ; {line-height: 1.4; list-style-image: initial; list-style-position: initial; list-style-type: disc; margin-bottom: 0.5em; margin-left: 0px; margin-right: 0px; margin-top: 0.5em; padding-bottom: 0px; padding-left: 2.5em; padding-right: 2.5em; padding-top: 0px;} [<disc>]
                        ; {border-bottom-style: none; border-color: initial; border-left-style: none; border-right-style: none; border-top-color: initial; border-top-style: none; border-width: initial; margin-bottom: 0.25em; margin-left: 0px; margin-right: 0px; margin-top: 0px; padding-bottom: 0.25em; padding-left: 0px; padding-right: 0px; padding-top: 0.25em; text-indent: 0px;} [<no-border>]
                        ; "color: #eeeeee;"
                        ; {color: #eeeeee; font-family: 'Trebuchet MS', Trebuchet, sans-serif;} [<grey>]
                        ; "line-height: 18px;" [<18px>]
                        ; "border:1px solid #333333;border-bottom-style:none"
                        ; {border-bottom-style: none; border: 1px solid #333333;} [<bordered>]
                        ; "font-family: 'Trebuchet MS', sans-serif;"
                        ; "font-family: Trebuchet MS, sans-serif;"
                        ; {font-family: 'Trebuchet MS', Trebuchet, sans-serif; line-height: 18px;}
                        ; {font-family: 'Trebuchet MS', Trebuchet, sans-serif;} [<chet>]
                        ; "font-size: small;" [<small>]
                    ][
                        keep node/clone
                    ]
                    ; probe node/name
                    if block? node/value [ ; recursive walk through this node's kids.
                        foreach node node/children :rule
                    ]
                    ; close the wrapper: <red> becomes </red>
                    if node/clone [keep back insert copy node/clone "/"]
                ]
            ]

            ; --- 3. Closing markup, emitted after the node's content ---
            switch/default node/name [
                <div> <li> <h1> <h2> <h3> <h4> <td> <th> [keep "^/^/"]
                ; leaving a list: drop this level's marker
                <ul> <ol> [remove back tail last-list]
                <pre> <code> [keep "^/^/" in-pre: false]
                <blockquote> [] ; [keep "^/^//blockquote^/^/"]
                <table> [keep "^//table^/^/"]
                ; <div> <span>
                <b> <strong> [keep "=b."]
                <i> <em> [keep "=i."]
                <u> [keep "=u."]
                <head> ; [probe node/flatten]
                <form> <input>
                <title> <body> <iframe> <div> <img> <tbody> <tr> <br> <a> <span> %.txt []
            ][
                ; Unsupported Tags?
                probe node/name
            ]
        ]

        foreach node html/children :rule
    ]

    ; Post-process the collected text with ordered literal substitutions.
    ; Order matters: later pairs match the output of earlier ones, and the
    ; repeated blank-line pairs collapse long runs of newlines a bit more on
    ; each pass. The FOREACH yields the result of the last REPLACE (the output
    ; string), which is then trimmed and returned.
    trim/head/tail foreach [old new][
        "<red>^/</red>" "^/"
        "^/--- ^/" ""
        "^/* ^/" ""
        "![]" "![Image]"
        "=b ^/=b." "^/^/"
        "=i ^/=i." "^/^/"
        " =b." "=b. "
        "<code></code>" ""
        "<code> </code>" " "
        "<code>" "=r "
        "</code>" "=r."
        " =r." "=r. "
        "=r ^/=r." "^/^/"
        #{C2A0} " "
        "^/: " "^/"
        "=b <red>R</red>=b." "<red>R</red>"
        "=b <red>R</red>e=b." "<red>R</red>e"
        "<red>=b R=b.</red>" "<red>R</red>"
        "<red>R</red>" "=[R]"
        "^/###" "^/#>>"
        "^/##" "^/#>"
        "^/^-^/" "^/^/"
        "^/^- ^/" "^/^/"
        "^/^/^/^/" "^/^/"
        "^/^/^/" "^/^/"
        "^/^/^/" "^/^/"
        "^/^/^/" "^/^/"
    ][
        replace/all html old new
    ]
]
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment