-
-
Save ivanionut/56af4165d90965c6c6db to your computer and use it in GitHub Desktop.
Sample ColdFusion script to parse a webpage and extract table data using jsoup.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<!---
    Fetch the page that holds the table to scrape. The visitor's own User-Agent
    string is forwarded with the useragent attribute; the username attribute is
    reserved for HTTP authentication credentials and must not carry it.
--->
<cfhttp url="http://target_website_with-table.com/" useragent="#CGI.HTTP_USER_AGENT#"></cfhttp>
<cfscript>
    /* Parse the fetched markup into a jsoup Document */
    jsoup = CreateObject("java", "org.jsoup.Jsoup");
    HTMLDocument = jsoup.parse(CFHTTP.fileContent);

    /* Identify a specific table containing the data to scrape.
       "##" is CFML's escape for a literal "#", so the selector sent to jsoup is "#tableByID". */
    TheTable = HTMLDocument.select("##tableByID");

    /* Alternate DOM select methods if table doesn't have a unique ID
    TheTable = HTMLDocument.select("table.tableClass").first();
    TheTable = HTMLDocument.select("table").first(); */

    /* Auto sanitize HTML. Removes most attributes http://jsoup.org/apidocs/org/jsoup/safety/Whitelist.html
       NOTE: Jsoup.clean() returns a String, so its result has to be parsed again before the
       Element/Elements calls further down can be used on it. Newer jsoup releases name this
       class org.jsoup.safety.Safelist.
    Whitelist = CreateObject("java", "org.jsoup.safety.Whitelist");
    TheTable = jsoup.clean(TheTable.toString(), Whitelist.relaxed());
    */

    /* Manually remove all class and other misc attributes from table (optional) */
    TheTable.removeAttr("class").addClass("cellpadding3").removeAttr("width").removeAttr("cellspacing");

    /* remove misc attributes from TH/TD (optional) */
    TheTable.select("td,th").removeAttr("colspan").removeAttr("rowspan").removeAttr("align").removeAttr("class");

    /* Add CSS class to table */
    TheTable.addClass("hTable");

    /* for tables with only TD, convert first column to TH using tagName() (optional)
       http://jsoup.org/apidocs/org/jsoup/nodes/Element.html
       The selector already matches the first cell of every row, so a single call
       renames them all; no per-row loop is needed. */
    TheTable.select("tr td:eq(0),tr th:eq(0)").tagName("th");

    /* Add thead, header row and cells (optional) */
    TheTable.prepend("<thead><tr><th>Col1</th><th>Col2</th></tr></thead>");

    /* remove last row (optional).
       last() yields null when no row matched (for example when the table itself
       was not found), so check before calling remove(). */
    lastRow = TheTable.select("tr").last();
    if ( !isNull(lastRow) ) {
        lastRow.remove();
    }

    /* Emit the cleaned-up table markup */
    writeOutput(TheTable.toString());
</cfscript>
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment