Created
June 14, 2016 17:00
-
-
Save jjfiv/707b45a6c92da44637332b18c1a36e2c to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import org.w3c.dom.Document; | |
| import org.w3c.dom.NamedNodeMap; | |
| import org.w3c.dom.Node; | |
| import org.w3c.dom.NodeList; | |
| import org.xml.sax.SAXException; | |
| import javax.xml.parsers.DocumentBuilderFactory; | |
| import javax.xml.parsers.ParserConfigurationException; | |
| import java.io.File; | |
| import java.io.IOException; | |
| import java.util.ArrayList; | |
| import java.util.HashMap; | |
| import java.util.List; | |
| import java.util.Map; | |
/**
 * Extracts page-number candidates from a book XML file.
 *
 * <p>Every {@code OBJECT} tag in the file is treated as one page. Within a page, every
 * {@code WORD} (inside a {@code LINE}) whose text contains at least one digit becomes a
 * {@link PageNumberCandidate} carrying two positional features, {@code x-fraction} and
 * {@code y-fraction}.
 */
public class ExtractFeaturesBook {
  /** Suffix every page's {@code usemap} attribute is required to end with. */
  private static final String PAGE_SUFFIX = ".djvu";

  /** Input used by {@link #main} when no path is given on the command line. */
  private static final String DEFAULT_BOOK = "data/carribean_small/cu31924020438929_djvu.xml";

  /** Per-page metadata read from the attributes of a page's {@code OBJECT} tag. */
  public static class PageInfo {
    /** Book identifier: the part of {@code usemap} before its last underscore. */
    String book;
    /** Integer parsed from the part of {@code usemap} after its last underscore. */
    int imageNumber;
    /** Value of the {@code width} attribute. */
    int width;
    /** Value of the {@code height} attribute. */
    int height;
  }

  /** A word that might be a page number, with the page it was found on and its features. */
  public static class PageNumberCandidate {
    /** Text content of the word. */
    public final String text;
    /** Page the word was found on. */
    public final PageInfo page;
    /** Feature name to feature value. */
    public final HashMap<String, Double> features;

    public PageNumberCandidate(String text, PageInfo page) {
      this.text = text;
      this.page = page;
      this.features = new HashMap<>();
    }

    /** Stores {@code value} under {@code name}, replacing any previous value. */
    public void setFeature(String name, double value) {
      features.put(name, value);
    }

    /** @return the book identifier of the page this candidate came from */
    public String book() {
      return page.book;
    }

    /** @return the image number of the page this candidate came from */
    public int imageNumber() {
      return page.imageNumber;
    }
  }

  /** Code that deals with XML library... */
  public static class XMLUtil {
    public static DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();

    /**
     * Copies the attributes of a node into a name-to-value map.
     *
     * @param xmlNode node to read attributes from
     * @return a new map of attribute name to attribute text; empty when the node has no
     *     attribute map (DOM returns {@code null} for anything that is not an element)
     */
    public static Map<String, String> getAttributes(Node xmlNode) {
      Map<String, String> output = new HashMap<>();
      NamedNodeMap attributes = xmlNode.getAttributes();
      if (attributes == null) {
        // Documents, text nodes, comments, ... carry no attributes.
        return output;
      }
      for (int i = 0; i < attributes.getLength(); i++) {
        Node item = attributes.item(i);
        assert item.getNodeType() == Node.ATTRIBUTE_NODE : "Attribute map should have attribute nodes!";
        output.put(item.getNodeName(), item.getTextContent());
      }
      return output;
    }

    /**
     * Walk through the tree collecting all tags that match.
     *
     * <p>Only descendants of {@code start} are tested, never {@code start} itself. A node
     * that matches is collected and not searched any further, so a matching tag nested
     * inside another matching tag is not reported.
     *
     * @param start root of the subtree to search
     * @param tag node name to look for
     * @return matching nodes in document order
     */
    public static List<Node> findTagsByName(Node start, String tag) {
      ArrayList<Node> matching = new ArrayList<>();
      findTagsByNameRecursive(start, tag, matching);
      return matching;
    }

    /** Recursive step through the tree collecting all tags that match. */
    private static void findTagsByNameRecursive(Node start, String tag, List<Node> found) {
      NodeList children = start.getChildNodes();
      for (int i = 0; i < children.getLength(); i++) {
        Node item = children.item(i);
        if (tag.equals(item.getNodeName())) {
          found.add(item);
        } else {
          // Only non-matching children are descended into.
          findTagsByNameRecursive(item, tag, found);
        }
      }
    }
  }

  /**
   * Open a file and extract all candidates, {@link #extractFromPage}.
   *
   * <p>Parse failures, I/O failures and unparsable page numbers or dimensions are printed
   * to stderr and end the extraction early; pages completed before the failure are still
   * returned.
   *
   * @param book path of the XML file to read
   * @return one list of candidates per {@code OBJECT} tag, in document order
   * @throws RuntimeException if a page's {@code usemap} attribute is missing, does not end
   *     in {@code .djvu}, or has no underscore before that suffix
   */
  public static List<List<PageNumberCandidate>> extract(String book) {
    ArrayList<List<PageNumberCandidate>> candidatesPerPage = new ArrayList<>();
    try {
      Document bookXML = XMLUtil.factory.newDocumentBuilder().parse(new File(book));
      for (Node page : XMLUtil.findTagsByName(bookXML, "OBJECT")) {
        PageInfo info = readPageInfo(XMLUtil.getAttributes(page));
        List<Node> lines = XMLUtil.findTagsByName(page, "LINE");
        ArrayList<PageNumberCandidate> candidates = new ArrayList<>();
        extractFromPage(info, lines, candidates);
        candidatesPerPage.add(candidates);
      }
    } catch (SAXException | IOException | NumberFormatException | ParserConfigurationException e) {
      e.printStackTrace();
    }
    return candidatesPerPage;
  }

  /** Builds a {@link PageInfo} from the attributes of a page's {@code OBJECT} tag. */
  private static PageInfo readPageInfo(Map<String, String> attributes) {
    String usemap = attributes.get("usemap");
    // Checked explicitly rather than with assert: assertions are off by default, and an
    // unchecked value would either NPE or have the wrong characters stripped below.
    if (usemap == null || !usemap.endsWith(PAGE_SUFFIX)) {
      throw new RuntimeException("Bad page information?: " + usemap + " " + attributes);
    }
    usemap = usemap.substring(0, usemap.length() - PAGE_SUFFIX.length());
    int underscore = usemap.lastIndexOf('_');
    if (underscore < 0) {
      throw new RuntimeException("Bad page information?: " + usemap + " " + attributes);
    }

    PageInfo info = new PageInfo();
    info.book = usemap.substring(0, underscore);
    info.imageNumber = Integer.parseInt(usemap.substring(underscore + 1));
    info.width = Integer.parseInt(attributes.get("width"));
    info.height = Integer.parseInt(attributes.get("height"));
    return info;
  }

  /** Returns true if any character of {@code word} is a digit. */
  public static boolean isMaybeNumber(String word) {
    for (int i = 0; i < word.length(); i++) {
      if (Character.isDigit(word.charAt(i))) {
        return true;
      }
    }
    return false;
  }

  /**
   * Appends one candidate to {@code candidates} for every digit-containing {@code WORD}
   * found in {@code lines}.
   *
   * <p>Each candidate gets {@code x-fraction} and {@code y-fraction}: the first and second
   * integers of the word's {@code coords} attribute divided by the page width and height.
   * Words that are not candidates are never inspected for coordinates, and a candidate
   * whose coordinates cannot be read is skipped with a warning on stderr instead of
   * aborting the whole book.
   */
  private static void extractFromPage(PageInfo page, List<Node> lines, ArrayList<PageNumberCandidate> candidates) {
    for (Node line : lines) {
      for (Node word : XMLUtil.findTagsByName(line, "WORD")) {
        String text = word.getTextContent();
        if (!isMaybeNumber(text)) {
          continue;
        }
        String coordsText = XMLUtil.getAttributes(word).get("coords");
        int[] xy = parseLeadingCoordinates(coordsText);
        if (xy == null) {
          System.err.println("Skipping candidate '" + text + "' with unusable coords: " + coordsText);
          continue;
        }
        PageNumberCandidate instance = new PageNumberCandidate(text, page);
        instance.setFeature("x-fraction", xy[0] / (double) page.width);
        instance.setFeature("y-fraction", xy[1] / (double) page.height);
        candidates.add(instance);
      }
    }
  }

  /**
   * Parses the first two comma-separated integers of a {@code coords} value.
   *
   * @return a two-element array, or {@code null} when the value is missing, has fewer than
   *     two fields, or either of the first two fields is not an integer
   */
  private static int[] parseLeadingCoordinates(String coordsText) {
    if (coordsText == null) {
      return null;
    }
    String[] coords = coordsText.split(",");
    if (coords.length < 2) {
      return null;
    }
    try {
      return new int[] {Integer.parseInt(coords[0]), Integer.parseInt(coords[1])};
    } catch (NumberFormatException e) {
      return null;
    }
  }

  /**
   * Extracts candidates from one book and prints how many were found.
   *
   * @param args optional; {@code args[0]} is the XML file to read, otherwise the bundled
   *     sample path is used
   */
  public static void main(String[] args) {
    // Feature ids; not read anywhere yet.
    Map<String, Integer> featureNumbers = new HashMap<>();
    featureNumbers.put("x-fraction", 1);
    featureNumbers.put("sequence", 2);
    featureNumbers.put("y-fraction", 3);

    String bookPath = (args != null && args.length > 0) ? args[0] : DEFAULT_BOOK;
    List<List<PageNumberCandidate>> extracted = extract(bookPath);
    // TODO: calculate sequence here and set it on all candidates.
    int total = 0;
    for (List<PageNumberCandidate> pageNumberCandidates : extracted) {
      total += pageNumberCandidates.size();
    }
    System.out.println("Extracted "+total+" candidates on "+extracted.size()+" pages.");
  }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment