Skip to content

Instantly share code, notes, and snippets.

@jmini
Created December 11, 2014 16:58
Show Gist options
  • Select an option

  • Save jmini/42d54404bdd1f6b286fe to your computer and use it in GitHub Desktop.

Select an option

Save jmini/42d54404bdd1f6b286fe to your computer and use it in GitHub Desktop.
Normalize HTML files (remove comments, change the date meta tag) in order to diff them.
import java.io.File;
import java.io.FileFilter;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.nodes.Node;
import org.jsoup.select.Elements;
import com.google.common.base.Charsets;
import com.google.common.io.Files;
/**
 * Normalizes HTML files so that two generations of the same output can be diffed:
 * the content of every {@code <meta name="date">} tag is replaced by a fixed
 * placeholder and all HTML comments are removed.
 *
 * <p>Files are read from {@code <root>/out/html} and the normalized copies are
 * written, under the same file name, to {@code <root>/out/html_clean}.
 */
public class NormalizeHTML {
    /** Default root folder, used when no root is given on the command line. */
    private static final String ROOT = ""; //TODO set this.

    /**
     * Normalizes every file of the input folder whose name ends with {@code html}.
     *
     * @param args optional; when present, {@code args[0]} is used as the root folder
     *             instead of {@link #ROOT}
     * @throws IOException if the input folder cannot be listed, or if a file cannot be
     *                     read or written
     */
    public static void main(String... args) throws IOException {
        String root = (args != null && args.length > 0) ? args[0] : ROOT;
        File inFolder = new File(root + "/out/html");
        File outFolder = new File(root + "/out/html_clean");

        File[] listed = inFolder.listFiles(new FileFilter() {
            @Override
            public boolean accept(File f) {
                return f.isFile() && f.getName().endsWith("html");
            }
        });
        // listFiles() returns null (it does not throw) when the path is not a directory
        // or an I/O error occurs; fail with a message that names the folder instead of
        // letting Arrays.asList(null) raise a bare NullPointerException.
        if (listed == null) {
            throw new IOException("Cannot list input folder (missing, not a directory or unreadable): "
                    + inFolder.getAbsolutePath());
        }
        List<File> inFiles = new ArrayList<File>(Arrays.asList(listed));

        for (File inFile : inFiles) {
            File outFile = new File(outFolder, inFile.getName());
            String html = Files.toString(inFile, Charsets.ISO_8859_1);
            Document doc = Jsoup.parse(html);
            // Serialize with an ASCII output charset; the text written below is therefore
            // also representable in ISO-8859-1.
            doc.outputSettings().charset("ASCII");
            removeMetaDate(doc);
            removeComments(doc);
            Files.createParentDirs(outFile);
            Files.write(doc.toString(), outFile, Charsets.ISO_8859_1);
        }
    }

    /**
     * Overwrites the {@code content} attribute of every {@code <meta>} element whose
     * {@code name} attribute is exactly {@code date} with a fixed placeholder, so the
     * generation timestamp no longer shows up in a diff.
     *
     * @param doc the document to modify in place
     */
    private static void removeMetaDate(Document doc) {
        Elements elements = doc.getElementsByTag("meta");
        for (Element element : elements) {
            String attr = element.attr("name");
            if ("date".equals(attr)) {
                element.attr("content", "XXXXXXXXXXXXXXXXXXX");
            }
        }
    }

    // see: http://stackoverflow.com/questions/7541843/how-to-search-for-comments-using-jsoup
    /**
     * Recursively removes every comment node found below the given node.
     *
     * @param node the node whose descendants are cleaned in place
     */
    private static void removeComments(Node node) {
        for (int i = 0; i < node.childNodes().size();) {
            Node child = node.childNode(i);
            if (child.nodeName().equals("#comment")) {
                // Removing the child shifts the following siblings down by one, so the
                // index must not advance here.
                child.remove();
            } else {
                removeComments(child);
                i++;
            }
        }
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment