Created
December 11, 2014 16:58
-
-
Save jmini/42d54404bdd1f6b286fe to your computer and use it in GitHub Desktop.
Normalize HTML files (remove comments, change the date meta tag) in order to diff them.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import java.io.File; | |
| import java.io.FileFilter; | |
| import java.io.IOException; | |
| import java.util.ArrayList; | |
| import java.util.Arrays; | |
| import java.util.List; | |
| import org.jsoup.Jsoup; | |
| import org.jsoup.nodes.Document; | |
| import org.jsoup.nodes.Element; | |
| import org.jsoup.nodes.Node; | |
| import org.jsoup.select.Elements; | |
| import com.google.common.base.Charsets; | |
| import com.google.common.io.Files; | |
| public class NormalizeHTML { | |
| private static final String ROOT = ""; //TODO set this. | |
| public static void main(String... args) throws IOException { | |
| File inFolder = new File(ROOT + "/out/html"); | |
| File outFolder = new File(ROOT + "/out/html_clean"); | |
| List<File> inFiles = new ArrayList<File>(); | |
| inFiles.addAll(Arrays.asList(inFolder.listFiles(new FileFilter() { | |
| @Override | |
| public boolean accept(File f) { | |
| return f.isFile() && f.getName().endsWith("html"); | |
| } | |
| }))); | |
| for (File inFile : inFiles) { | |
| File outFile = new File(outFolder, inFile.getName()); | |
| String html = Files.toString(inFile, Charsets.ISO_8859_1); | |
| Document doc = Jsoup.parse(html); | |
| doc.outputSettings().charset("ASCII"); | |
| removeMetaDate(doc); | |
| removeComments(doc); | |
| Files.createParentDirs(outFile); | |
| Files.write(doc.toString(), outFile, Charsets.ISO_8859_1); | |
| } | |
| } | |
| private static void removeMetaDate(Document doc) { | |
| Elements elements = doc.getElementsByTag("meta"); | |
| for (Element element : elements) { | |
| String attr = element.attr("name"); | |
| if ("date".equals(attr)) { | |
| element.attr("content", "XXXXXXXXXXXXXXXXXXX"); | |
| } | |
| } | |
| } | |
| // see: http://stackoverflow.com/questions/7541843/how-to-search-for-comments-using-jsoup | |
| private static void removeComments(Node node) { | |
| for (int i = 0; i < node.childNodes().size();) { | |
| Node child = node.childNode(i); | |
| if (child.nodeName().equals("#comment")) child.remove(); | |
| else { | |
| removeComments(child); | |
| i++; | |
| } | |
| } | |
| } | |
| } |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment