Created
April 25, 2014 19:29
-
-
Save lstroud/11300388 to your computer and use it in GitHub Desktop.
This script will convert an opml feed list between file formats (XML, JSON, and CSV), scrub the list for feeds that are still active, filter the list by expression, and de-duplicate the feeds in the list. I built it because I needed to reorganize my feeds. Moving the feeds to a csv file, where the tag column is the folder(s), made it very quick …
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/* | |
Copyright 2014 Les Stroud | |
Licensed under the Apache License, Version 2.0 (the "License"); | |
you may not use this file except in compliance with the License. | |
You may obtain a copy of the License at | |
http://www.apache.org/licenses/LICENSE-2.0 | |
Unless required by applicable law or agreed to in writing, software | |
distributed under the License is distributed on an "AS IS" BASIS, | |
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
See the License for the specific language governing permissions and | |
limitations under the License. | |
*/ | |
@Grab(group='org.codehaus.gpars', module='gpars', version='1.1.0') | |
@Grab(group='org.codehaus.groovy.modules.http-builder', module='http-builder', version='0.7' ) | |
@Grab(group='net.sourceforge.nekohtml', module='nekohtml', version='1.9.20') | |
@Grab(group='net.sf.opencsv', module='opencsv', version='2.3') | |
import groovyx.net.http.HTTPBuilder | |
import groovyx.net.http.ContentType | |
import groovyx.net.http.Method | |
import groovyx.net.http.Status | |
import groovyx.gpars.GParsPool | |
import org.cyberneko.html.parsers.SAXParser | |
import groovy.xml.XmlUtil | |
import org.xml.sax.SAXException | |
import groovy.xml.MarkupBuilder | |
import groovy.json.JsonBuilder | |
import groovy.json.JsonSlurper | |
import au.com.bytecode.opencsv.CSVReader | |
import au.com.bytecode.opencsv.CSVWriter | |
import groovy.json.JsonOutput | |
// --- script-level state (Groovy script bindings, shared with the closures below) ---
rootFolder = null;
currentFolder = null;
// Tallies updated by the isActive closure while scrubbing feeds.
count = [
    active_count:0,
    inactive_count:0,
    total_count:0
]
// Total number of feeds to check; printProgBar's percent is derived from it.
// NOTE(review): never assigned anywhere in this file -- it stays 0, so the
// progress computation in isActive divides by zero unless seeded first.
feed_count = 0;
// One human-readable summary line per checked feed (name, status, latest pub date).
active_summary = [];
// Command-line definition.
// FIX: the usage string advertised -o for the output *format* and -O for the
// output *file*, but the actual switches are -t/--outformat and -o/--outfile.
def cli = new CliBuilder(usage: 'groovy scrub_opml.groovy [-h] [-ads] [-f <expression closure>] [-t <outputformat>] [-o <outputfile_path>] -i <inputformat> <inputfile_path>')
cli.with {
    h longOpt: 'help', 'Show usage information'
    a longOpt: 'active', 'Filter for active feeds'
    i longOpt: 'informat', args:1, argName:'in-format', 'Input file format. Valid values (XML, JSON, CSV)'
    t longOpt: 'outformat', args:1, argName:'out-format','Output file format. Valid values (XML, JSON, CSV)'
    o longOpt: 'outfile', args:1, argName:'out-path', 'Output file path.'
    d longOpt: 'dedup', 'Deduplicate the feeds.'
    s longOpt: 'stats', 'Print stats.'
    f longOpt: 'filter', args:1, argName:'filter-exp', 'Filter Feeds by closure (closure must be quoted and evaluatable) [ -f "{feed -> return feed.name.startsWith(\'CSS\')}"'
}
// --- option parsing: populate the script-level configuration bindings ---
filter_active = false
dedup = false
informat = null
outformat = null
input_file = null
output_file = null
print_stats = false
filter = null
if(!args){
    cli.usage()
    System.exit(1)
}
def options = cli.parse(args)
// FIX: CliBuilder.parse returns null on invalid arguments; the original
// then NPE'd on options.h instead of printing usage.
if(options == null){
    cli.usage()
    System.exit(1)
}
if (options.h) {
    cli.usage()
    System.exit(0)
}
if(options.a)
    filter_active = true
if(options.d)
    dedup = true
if(options.s)
    print_stats = true
if(options.f){
    // Evaluate the quoted closure supplied on the command line.
    // NOTE: Eval.me executes arbitrary Groovy -- fine for a local CLI tool,
    // but never pass it untrusted input.
    filter = Eval.me(options.f)
}
if(options.i){
    informat = OPMLModel.FORMAT.valueOf(options.i)
} else {
    informat = OPMLModel.FORMAT.XML
}
if(options.t){
    outformat = OPMLModel.FORMAT.valueOf(options.t)
} else {
    // Default the output format to the input format.
    outformat = informat
}
if(options.o){
    try{
        output_file = new File(options.o)
    } catch (t){
        t.printStackTrace()
        cli.usage()
        System.exit(1)
    }
}
def extraArguments = options.arguments()
if(extraArguments){
    try{
        input_file = new File(extraArguments[0])
    } catch (t){
        t.printStackTrace()
        cli.usage()
        System.exit(1)
    }
} else {
    // FIX: the positional input file is required; fail fast with usage
    // instead of NPE-ing later when deserialize reads a null file.
    System.err.println "Missing required <inputfile_path> argument."
    cli.usage()
    System.exit(1)
}
/**
 * Pipeline entry point: deserialize the input, optionally scrub for active
 * feeds, apply the user filter, de-duplicate, then serialize to the output
 * file (or stdout). Prints stats after each stage when requested.
 */
def main(){
    def opmlModel = OPMLModel.deserialize(informat, input_file)
    // FIX: seed feed_count so the progress computation in isActive has a
    // non-zero divisor (it was never assigned in the original script).
    feed_count = opmlModel.stats.feed_count
    if(print_stats)
        println JsonOutput.prettyPrint(JsonOutput.toJson(opmlModel.stats))
    if(filter_active){
        opmlModel = opmlModel.filterBy(isActive)
        println "After Active Filter"
        println JsonOutput.prettyPrint(JsonOutput.toJson(opmlModel.stats))
    }
    if(filter){
        opmlModel = opmlModel.filterBy(filter)
        println "After Filter ${filter}"
        println JsonOutput.prettyPrint(JsonOutput.toJson(opmlModel.stats))
    }
    if(dedup){
        opmlModel = opmlModel.deDup()
        println "After DeDuplication"
        println JsonOutput.prettyPrint(JsonOutput.toJson(opmlModel.stats))
    }
    def output = opmlModel.serialize(outformat)
    if(output_file){
        output_file.text = output
    } else {
        println output
    }
}
// Feeds with no post in the last 365 days are considered inactive.
year_ago = new Date() - 365;

/**
 * Predicate closure: fetch a feed's URL and decide whether it is still
 * active, i.e. has at least one <pubDate> newer than a year ago. Also
 * updates the shared `count` tallies, appends to `active_summary`, and
 * redraws the progress bar. Network/parse failures are classified by
 * exception type below.
 */
isActive = {feed ->
    def content = ""
    def active = false
    def tryagain = false
    def error_msg = null
    def _pubdates = []
    try {
        // First attempt: GET with RSS-ish Accept headers. 404/500/403 are
        // unrecoverable; any other failure status earns exactly one retry.
        def resp = new HTTPBuilder().request(feed.url, Method.GET, ContentType.TEXT) { req ->
            headers.Accept = 'application/rss+xml, application/rdf+xml, application/xml, text/xml'
            response.success = { r, reader ->
                content = reader.text
                return r
            }
            def unrecoverable = { r ->
                throw new RuntimeException(r.statusLine as String)
            }
            response.'404' = unrecoverable
            response.'500' = unrecoverable
            response.'403' = unrecoverable
            response.failure = { r ->
                tryagain = true;
                return r
            }
        }
        if(tryagain){
            tryagain = false
            // Second (and last) attempt: any failure now is fatal for this feed.
            resp = new HTTPBuilder().request(feed.url, Method.GET, ContentType.TEXT) { req ->
                headers.Accept = 'application/rss+xml, application/rdf+xml, application/xml, text/xml'
                response.success = { r, reader ->
                    content = reader.text
                    return r
                }
                response.failure = { r -> throw new RuntimeException(r.statusLine as String)}
            }
        }
        if(Status.SUCCESS.matches(resp.status)){
            // Real-world feeds were too inconsistent to XML-parse reliably,
            // so scrape the pubDate elements with a regex instead.
            def m = content =~ /pubDate>(.*)<\/pubDate/
            m.each{match ->
                def _pubdate
                try{
                    _pubdate = new Date(match[1])
                    _pubdates << _pubdate
                } catch(t){;;} // unparseable date -> skip this entry
                if(_pubdate && _pubdate > year_ago)
                    active = true
            }
        }
    } catch (IOException ioe){
        error_msg = "ERROR checking: ${feed.name} ${feed.url}"
        def sw = new StringWriter()
        def pw = new PrintWriter(sw)
        ioe.printStackTrace(pw)
        error_msg += "\n" + sw.toString()
        active = false
    } catch (SAXException se){
        // A parse error means we fetched *something*; assume the feed is alive.
        error_msg = "ERROR checking: ${feed.url} - [Parse Error - assuming active]"
        active = true
    } catch (t){
        error_msg = "ERROR checking: ${feed.name} ${feed.url} - [${t.message}]"
        active = false
    }
    count.total_count++
    if(active)
        count.active_count++
    else
        count.inactive_count++
    // FIX: guard against feed_count == 0 (it is only non-zero when main()
    // seeded it), which previously raised ArithmeticException: division by zero.
    if(feed_count)
        printProgBar((count.total_count/feed_count)*100 as int)
    active_summary << "${feed.name} [${(active)?'Active':'Inactive'}] - Latest pub date: ${_pubdates.max() as String}"
    return active
}
/**
 * In-memory model of an OPML subscription list: a root Folder whose children
 * are Feed and/or Folder nodes. Dispatches (de)serialization to the
 * per-format converter classes and provides de-duplication, filtering and
 * summary stats.
 */
class OPMLModel {
    /** Supported interchange formats. */
    static enum FORMAT {
        XML, JSON, CSV
    }
    // Root container; never serialized itself -- only its children are.
    def root;
    OPMLModel(){
        root = new Folder(name:'Root')
    }
    /** Parse `file` in the given format into a fresh model (delegates to a converter). */
    static OPMLModel deserialize(FORMAT format, File file){
        switch(format){
            case FORMAT.XML:
                return XMLConverter.deserialize(file);
            case FORMAT.JSON:
                return JSONConverter.deserialize(file);
            case FORMAT.CSV:
                return CSVConverter.deserialize(file);
        }
    }
    /**
     * Build a new model in which each feed URL appears exactly once. The
     * surviving copy is re-homed under one folder per distinct parent name
     * recorded during the scan; feeds only ever seen at the root stay at the
     * root. Precedence rules live in deDupFeeds.
     */
    def deDup(){
        def feed_map = [:]
        deDupFeeds(this.root.children, null, feed_map)
        def folder_hash = [:]
        OPMLModel model = new OPMLModel()
        feed_map.each{ k,v ->
            if(v.folders){
                // Re-attach the surviving feed under every folder name it was seen in.
                v.folders.each { t ->
                    def _t = t.trim()
                    if(!folder_hash[_t])
                        folder_hash[_t] = new Folder(name:_t)
                    folder_hash[_t].children << v.feed
                }
            } else {
                model.root.children << v.feed
            }
        }
        folder_hash.each{ k,v ->
            model.root.children << v
        }
        return model
    }
    // Depth-first scan recording, per feed URL, the first Feed instance seen
    // plus the parent-folder names it appeared under. Later duplicates are
    // dropped; a later copy's folder is only recorded when none is known yet.
    private deDupFeeds(nodes, parent, feed_map){
        nodes.each{ node ->
            if(node instanceof Feed){
                if(!feed_map[node.url]){
                    def _folders = []
                    if(parent)
                        _folders = [parent.name]
                    feed_map[node.url] = [feed: node, folders: _folders]
                }
                else{
                    if(feed_map[node.url].folders){
                        // Duplicate of a feed already assigned to folder(s): drop this copy.
                        ;;
                    } else {
                        if(parent){
                            feed_map[node.url].folders << parent.name
                        }
                    }
                }
            } else if(node instanceof Folder) {
                deDupFeeds(node.children, node, feed_map)
            }
        }
    }
    /** Totals: feed count, folder count, and how many feeds live inside a folder. */
    def getStats(){
        def stats = [feed_count: 0, folder_count: 0, feeds_in_folders: 0]
        getOPMLStats(this.root.children, null, stats)
        return stats
    }
    // Recursive accumulator behind getStats().
    private getOPMLStats(nodes, parent, stats){
        nodes.each{ node ->
            if(node instanceof Feed){
                stats.feed_count++;
                if(parent)
                    stats.feeds_in_folders++;
            } else if(node instanceof Folder) {
                stats.folder_count++;
                getOPMLStats(node.children, node, stats)
            }
        }
    }
    /**
     * New model containing only feeds accepted by `predicate`; folders left
     * empty after filtering are pruned (see Folder.filterBy / Feed.filterBy).
     */
    def OPMLModel filterBy(predicate){
        def model = new OPMLModel();
        model.root = this.root.filterBy(predicate)
        return model
    }
    /** Render this model in the requested format (delegates to a converter). */
    def String serialize(FORMAT format){
        switch(format){
            case FORMAT.XML:
                return XMLConverter.serialize(this);
            case FORMAT.JSON:
                return JSONConverter.serialize(this);
            case FORMAT.CSV:
                return CSVConverter.serialize(this);
        }
    }
}
/** Converts between OPMLModel and OPML 1.0 XML (nested <outline> elements). */
class XMLConverter {
    /**
     * Parse an OPML file: an <outline> with an xmlUrl attribute is a feed,
     * one without it is a folder of nested outlines.
     */
    static OPMLModel deserialize(File file){
        def model = new OPMLModel();
        def records = new XmlSlurper().parseText(file.text)
        model.root.children = buildFolder(records.body.outline)
        return model
    }
    /**
     * Serialize the model as OPML 1.0. Feeds carry both `url` and `xmlUrl`
     * attributes for broader reader compatibility.
     */
    static String serialize(OPMLModel model){
        def writer = new StringWriter()
        def xml = new MarkupBuilder(writer)
        xml.opml(version:'1.0'){
            head{
                title 'RSS Feeds'
            }
            body{
                for(f in model.root.children){
                    if(f instanceof Feed){
                        outline(text: f.name, title:f.title, url:f.url, xmlUrl:f.url)
                    } else if(f instanceof Folder) {
                        serializeFolder(xml, f)
                    }
                }
            }
        }
        return writer.toString()
    }
    // Emit one folder as a container <outline>; nested folders recurse.
    // The bare outline(...) calls resolve against the MarkupBuilder delegate.
    private static serializeFolder(builder, folder){
        builder.outline(text: folder.name){
            for(f in folder.children){
                if(f instanceof Feed){
                    outline(text: f.name, title:f.title, url:f.url, xmlUrl:f.url)
                } else if(f instanceof Folder) {
                    serializeFolder(builder, f)
                }
            }
        }
    }
    // Recursively convert GPath <outline> nodes into Feed/Folder instances.
    private static buildFolder(elements){
        def feed_list = [];
        elements.each{ element ->
            if(!element.'@xmlUrl'.text()){ // no xmlUrl attribute => this is a folder
                def folder = new Folder(name:element.'@text'.text())
                folder.children = buildFolder(element.outline);
                feed_list << folder
            } else {
                feed_list << new Feed(name:element.'@text'.text(), title:element.'@title'.text(), url:element.'@xmlUrl'.text() )
            }
        }
        return feed_list
    }
}
/** Converts between OPMLModel and the JSON shape produced by serialize(). */
class JSONConverter {
    /** Parse a JSON array of feed/folder objects into a model. */
    static OPMLModel deserialize(File file){
        def model = new OPMLModel();
        def json = new JsonSlurper().parseText(file.text)
        model.root.children = buildFolder(json)
        return model
    }
    /** Pretty-printed JSON of the root's children (Feed/Folder properties). */
    static String serialize(OPMLModel model){
        return JsonOutput.prettyPrint(JsonOutput.toJson(model.root.children))
    }
    // Recursively convert parsed maps into Feed/Folder instances. An entry
    // without a `url` key is treated as a folder.
    private static buildFolder(elements){
        def feed_list = [];
        elements.each{ element ->
            if(!element.url){ // is a folder
                def folder = new Folder(name:element.name)
                folder.children = buildFolder(element.children);
                feed_list << folder
            } else {
                // BUG FIX: serialize() writes the Feed's `name` property, but
                // this used to read `element.text`, which does not exist in the
                // emitted JSON -- every round-tripped feed lost its name. Read
                // `name` first, falling back to `text` for any legacy files.
                feed_list << new Feed(name:(element.name ?: element.text), title:element.title, url:element.url )
            }
        }
        return feed_list
    }
}
/**
 * Converts between OPMLModel and a CSV with header: Name, Title, Url, Tags.
 * The Tags column holds comma-separated folder names; a feed with multiple
 * tags is placed into one folder per tag.
 */
class CSVConverter {
    /** Parse the CSV (skipping the header row) into a model. */
    static OPMLModel deserialize(File file){
        StringReader sr = new StringReader(file.text)
        // ',' separator, '"' quote char, skip 1 header line.
        CSVReader reader = new CSVReader(sr, ',' as char, '"' as char, 1)
        def folder_hash = [:]
        OPMLModel model = new OPMLModel()
        reader.readAll().each{ row ->
            // ROBUSTNESS FIX: tolerate blank/short rows instead of throwing
            // ArrayIndexOutOfBoundsException; the Tags column is optional.
            if(row.length < 3)
                return
            def _name = row[0]
            def _title = row[1]
            def _url = row[2]
            def _tags = (row.length > 3) ? row[3] : null
            if(_tags){
                _tags.split(',').each { t ->
                    def _t = t.trim()
                    if(!folder_hash[_t])
                        folder_hash[_t] = new Folder(name:_t)
                    folder_hash[_t].children << new Feed(name:_name, title:_title, url:_url)
                }
            } else {
                model.root.children << new Feed(name:_name, title:_title, url:_url)
            }
        }
        folder_hash.each{ k,v ->
            model.root.children << v
        }
        return model
    }
    /** Render the model as CSV; a feed's immediate parent folder becomes its Tags value. */
    static String serialize(OPMLModel model){
        def model_list = []
        String[] header = ["Name", "Title", "Url", "Tags"]
        model_list << header
        flatten(model.root.children, null, model_list)
        StringWriter sw = new StringWriter()
        CSVWriter writer = new CSVWriter(sw, ',' as char, '"' as char)
        model_list.each{ String[] row ->
            writer.writeNext(row)
        }
        return sw.toString()
    }
    // Depth-first flatten: each Feed becomes [name, title, url, parent-folder-name].
    private static List flatten(nodes, parent, node_list){
        nodes.each{ node ->
            if(node instanceof Feed){
                def node_tags = ""
                if(parent)
                    node_tags = parent.name
                String[] flatnode = [node.name, node.title, node.url, node_tags]
                node_list << flatnode
            } else if(node instanceof Folder) {
                flatten(node.children, node, node_list)
            }
        }
    }
}
/** A named grouping of Feed and/or nested Folder nodes. */
class Folder {
    String name
    def children = []

    /**
     * Return a copy of this folder containing only the descendants accepted
     * by `predicate`, or null when nothing survives (empty folders prune).
     */
    def filterBy(predicate){
        def survivors = children.collect { child -> child.filterBy(predicate) }
                                .findAll { it }
        if(!survivors)
            return null
        def pruned = new Folder(name: name)
        pruned.children = survivors
        return pruned
    }

    /** Debug-friendly summary of this folder. */
    def String toString(){
        return "Folder: ${name} Child Count: ${children.size()}"
    }
}
/** A single RSS/Atom subscription entry. */
class Feed {
    String name
    String title
    String url

    /** Return this feed when `predicate` accepts it, otherwise null. */
    def filterBy(predicate){
        return predicate(this) ? this : null
    }

    /** Debug-friendly summary of this feed. */
    def String toString(){
        return "Feed Name: ${name} Title: ${title} Url:${url}"
    }
}
/**
 * Redraw a 50-character progress bar in place (via carriage return) on
 * stdout, followed by the percent and the running active|inactive tallies
 * from the shared `count` map.
 */
def void printProgBar(int percent){
    StringBuilder bar = new StringBuilder("[");
    // FIX: Groovy's `/` on ints yields a BigDecimal, so for odd percents the
    // original `i == (percent/2)` never matched and the '>' head vanished.
    // intdiv keeps the comparison integral.
    int head = percent.intdiv(2)
    for(int i = 0; i < 50; i++){
        if( i < head ){
            bar.append("=");
        }else if( i == head ){
            bar.append(">");
        }else{
            bar.append(" ");
        }
    }
    bar.append("] " + percent + "% ");
    bar.append(" ${count.active_count}|${count.inactive_count} ")
    System.out.print("\r" + bar.toString());
}
// Entry point: run the pipeline after all script-level configuration above.
main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
👏