Build a Lucene Index from a U.S. Code XHTML ZIP file.
/**
 * @author Michael J Bommarito II
 * @date Apr 9, 2011
 * @license MIT, (C) Michael J Bommarito II 2011
 */
package org.mjb;

// Java standard library imports
import java.io.*;
import java.util.*;
import java.util.regex.*;
import java.util.zip.*;

// Lucene imports
import org.apache.lucene.*;
import org.apache.lucene.analysis.*;
import org.apache.lucene.analysis.standard.*;
import org.apache.lucene.document.*;
import org.apache.lucene.index.*;
import org.apache.lucene.store.*;
import org.apache.lucene.util.Version;

import org.htmlparser.*;
import org.htmlparser.visitors.*;
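
/**
 * This class wraps a Lucene IndexWriter and turns the <document> fragments
 * embedded in the LRC XHTML files into indexed Lucene documents.
 */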
class CodeIndex {
    // Lucene Index objects
    private IndexWriter indexWriter;
    // private IndexWriterConfig indexWriterConfig;

    // Pattern matching regular expression objects. Note: these patterns are
    // compiled in the constructor, but the extract* methods below currently
    // use indexOf instead.
    private Pattern patternDocumentID, patternUSCKey, patternCurrentThrough,
            patternItemPath;
    /**
     * Constructor that initializes the Lucene index and regular expression
     * objects.
     *
     * @param indexPath
     * @throws IOException
     */
    public CodeIndex(String indexPath) throws IOException {
        // Create the index directory
        Directory indexDir = FSDirectory.open(new File(indexPath));

        // This syntax works for 3.1; however, Mahout won't play nice, so we
        // need to use an older version.
        // indexWriterConfig = new IndexWriterConfig(Version.LUCENE_31, new
        // StandardAnalyzer(Version.LUCENE_31));
        // indexWriter = new IndexWriter(indexDir, indexWriterConfig);

        // Construct the Lucene index.
        indexWriter = new IndexWriter(indexDir, new StandardAnalyzer(
                Version.LUCENE_29), true, IndexWriter.MaxFieldLength.UNLIMITED);

        // Compile the regular expressions
        patternDocumentID = Pattern.compile("documentid:([^\\s]+)");
        patternUSCKey = Pattern.compile("usckey:([^\\s]+)");
        patternCurrentThrough = Pattern.compile("currentthrough:([0-9]+)");
        patternItemPath = Pattern.compile("itempath:(.+) -->");
    }
    /**
     * This method shuts down the Lucene index.
     *
     * @throws IOException
     */
    public void close() throws IOException {
        // Shut down the indexer, but wait for merges
        indexWriter.close();
    }

    /**
     * This method parses an HTML document and adds it to the index.
     *
     * @param htmlBuffer
     */
    public void parseHTML(final String htmlBuffer) {
        // Keep track of our position in the buffer.
        int fragmentStart = 0, fragmentEnd = 0;
        String documentString = "";

        // Find the first document marker; >= 0 so a document starting at the
        // very beginning of the buffer is not skipped.
        fragmentStart = htmlBuffer.indexOf("<!-- documentid");
        while (fragmentStart >= 0) {
            // Find the next document or end of file.
            fragmentEnd = htmlBuffer.indexOf("<!-- documentid",
                    fragmentStart + 1);

            // Store the substring.
            if (fragmentEnd > 0) {
                documentString = htmlBuffer.substring(fragmentStart,
                        fragmentEnd);
            } else {
                documentString = htmlBuffer.substring(fragmentStart);
            }

            // Now parse the document.
            parseDocument(documentString);

            // Set up the next search.
            fragmentStart = fragmentEnd;
        }
    }
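
    // The extract* helpers below pull metadata values out of the HTML comments
    // that introduce each <document> fragment. The keys handled are documentid:,
    // usckey:, currentthrough: (numeric), itempath:, and expcite:; the exact
    // comment layout is inferred from the markers searched for in this file.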
    private String extractDocumentID(final String documentBuffer) {
        /**
         * Extract the documentid metadata from the <document>.
         */
        // Get the string positions
        int fragmentStart = documentBuffer.indexOf("documentid:")
                + "documentid:".length();
        int fragmentEnd = documentBuffer.indexOf(" ", fragmentStart);

        // Return the substring
        if (fragmentEnd > 0) {
            return documentBuffer.substring(fragmentStart, fragmentEnd);
        } else {
            return documentBuffer.substring(fragmentStart);
        }
    }

    private String extractUSCKey(final String documentBuffer) {
        /**
         * Extract the usckey metadata from the <document>.
         */
        // Get the string positions
        int fragmentStart = documentBuffer.indexOf("usckey:")
                + "usckey:".length();
        int fragmentEnd = documentBuffer.indexOf(" ", fragmentStart);

        // Return the substring
        if (fragmentEnd > 0) {
            return documentBuffer.substring(fragmentStart, fragmentEnd);
        } else {
            return documentBuffer.substring(fragmentStart);
        }
    }

    private String extractCurrentThrough(final String documentBuffer) {
        /**
         * Extract the currentthrough metadata from the <document>.
         */
        // Get the string positions
        int fragmentStart = documentBuffer.indexOf("currentthrough:")
                + "currentthrough:".length();
        int fragmentEnd = documentBuffer.indexOf(" ", fragmentStart);

        // Return the substring
        if (fragmentEnd > 0) {
            return documentBuffer.substring(fragmentStart, fragmentEnd);
        } else {
            return documentBuffer.substring(fragmentStart);
        }
    }

    private String extractItemPath(final String documentBuffer) {
        /**
         * Extract the itempath metadata from the <document>.
         */
        // Get the string positions.
        int fragmentStart = documentBuffer.indexOf("itempath:")
                + "itempath:".length();
        int fragmentEnd = documentBuffer.indexOf(" -->", fragmentStart);

        // Return the substring
        if (fragmentEnd > 0) {
            return documentBuffer.substring(fragmentStart, fragmentEnd);
        } else {
            return documentBuffer.substring(fragmentStart);
        }
    }

    private String extractExpCite(final String documentBuffer) {
        /**
         * Extract the expcite metadata from the <document>.
         */
        // Get the string positions.
        int fragmentStart = documentBuffer.indexOf("expcite:")
                + "expcite:".length();
        int fragmentEnd = documentBuffer.indexOf(" -->", fragmentStart);

        // Return the substring
        if (fragmentEnd > 0) {
            return documentBuffer.substring(fragmentStart, fragmentEnd);
        } else {
            return documentBuffer.substring(fragmentStart);
        }
    }
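
    // parseDocument() splits a single <document> fragment on its "<!-- field-start"
    // and "<!-- field-end" comment markers; only the "head" and "statute" fields
    // are extracted and handed to the indexer.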
    public void parseDocument(final String documentBuffer) {
        /**
         * This method parses an individual <document> from the larger XHTML
         * LRC document.
         */
        // Get the document-level metadata.
        String documentID = extractDocumentID(documentBuffer);
        String uscKey = extractUSCKey(documentBuffer);
        String currentThrough = extractCurrentThrough(documentBuffer);
        String itemPath = extractItemPath(documentBuffer);
        // Note: expCite is extracted but not currently passed to indexDocument().
        String expCite = extractExpCite(documentBuffer);

        int fieldStart = 0, fieldEnd = 0;

        // Now split the document into fields.
        String fieldString = "", textHead = "", textStatute = "";
        fieldStart = documentBuffer.indexOf("<!-- field-start");
        while (fieldStart > 0) {
            fieldEnd = documentBuffer.indexOf("<!-- field-end", fieldStart + 1);

            // Store the substring.
            if (fieldEnd > 0) {
                fieldString = documentBuffer.substring(fieldStart, fieldEnd);
            } else {
                fieldString = documentBuffer.substring(fieldStart);
            }

            // Parse the fields depending on type.
            if (fieldString.contains("field-start:head")) {
                textHead = extractFieldText(fieldString);
            } else if (fieldString.contains("field-start:statute")) {
                textStatute = extractFieldText(fieldString);
            }

            // Find the next field.
            if (fieldEnd > 0) {
                fieldStart = documentBuffer.indexOf("<!-- field-start",
                        fieldEnd);
            } else {
                fieldStart = -1;
            }
        }

        try {
            indexDocument(documentID, uscKey, currentThrough, itemPath,
                    textHead, textStatute);
        } catch (Exception E) {
            E.printStackTrace();
        }
    }
    private String extractFieldText(final String fieldBuffer) {
        /**
         * Parse the field buffer and return the text.
         */
        // Create the parser and visitor.
        Parser htmlParser = Parser.createParser(fieldBuffer, "UTF-8");
        TextExtractingVisitor textVisitor = new TextExtractingVisitor();

        try {
            // Now try to parse the string and store the text.
            htmlParser.visitAllNodesWith(textVisitor);
            return textVisitor.getExtractedText().trim();
        } catch (Exception E) {
            return "";
        }
    }
    private void indexDocument(String documentID, String uscKey,
            String currentThrough, String itemPath, String head, String text)
            throws IOException {
        /**
         * This method actually writes the document into the index.
         */
        if ((text.length() == 0) || (documentID.length() == 0)) {
            return;
        }

        // Create document.
        Document doc = new Document();
        doc.add(new Field("documentid", documentID, Field.Store.YES,
                Field.Index.NOT_ANALYZED));
        doc.add(new Field("usckey", uscKey, Field.Store.YES,
                Field.Index.NOT_ANALYZED));
        doc.add(new Field("currentthrough", currentThrough, Field.Store.YES,
                Field.Index.NOT_ANALYZED));
        doc.add(new Field("itempath", itemPath, Field.Store.YES,
                Field.Index.ANALYZED));
        doc.add(new Field("head", head, Field.Store.YES, Field.Index.ANALYZED,
                Field.TermVector.YES));
        doc.add(new Field("text", text, Field.Store.NO, Field.Index.ANALYZED,
                Field.TermVector.YES));

        // Write into index.
        indexWriter.addDocument(doc);
    }
}

/**
 * This class is the driver that processes ZIP files to construct a CodeIndex.
 */
public class buildCodeIndex {
    // CodeIndex object
    private static CodeIndex codeIndex;
    public static void main(final String[] args) {
        // Check that a proper command line is passed and fail if not.
        if (args.length == 0) {
            System.err.println("Usage: buildCodeIndex <path to ZIP files>");
            System.exit(-1);
        }

        // Now check that the argument is a directory that exists.
        File directory = new File(args[0]);
        if (!directory.exists() || !directory.isDirectory()) {
            System.err.println("Usage: buildCodeIndex <path to ZIP files>");
            System.err
                    .println("The specified path to ZIP files does not exist.");
            System.exit(-1);
        }

        try {
            // Create the CodeIndex object.
            codeIndex = new CodeIndex("index/");

            // Start the indexing by processing the directory.
            processDirectory(directory);

            // Shut down the index.
            codeIndex.close();
        } catch (Exception E) {
            E.printStackTrace();
            System.exit(-1);
        }
    }
    /**
     * This method processes the ZIP files in a given directory. These ZIP
     * files should be obtained from this URL: http://uscode.house.gov/xhtml/
     *
     * @param directory
     *            : path to the directory that contains the U.S. Code ZIP files.
     */
    private static void processDirectory(final File directory) {
        // Build the sorted list of files
        File[] fileList = directory.listFiles();
        java.util.Arrays.sort(fileList);

        // Iterate over all files
        for (File f : fileList) {
            // Check if this is a ZIP file and process it if so.
            if (f.getName().toLowerCase().endsWith(".zip")) {
                System.out.println("Processing " + f.getAbsolutePath());
                processZIP(f.getAbsolutePath());
            }
        }
    }

    /**
     * This method processes a specific ZIP file that should contain the XHTML
     * files from the Law Revision Counsel at the U.S. House of Representatives.
     *
     * @param fileName
     *            : File name of the ZIP to be processed.
     */
    private static void processZIP(final String fileName) {
        // Try to open the ZIP file and return if we can't.
        final ZipFile zipFile;
        try {
            zipFile = new ZipFile(fileName);
        } catch (Exception E) {
            E.printStackTrace();
            return;
        }

        // Now iterate over all entries in the ZIP file and parse the HTML ones.
        for (final Enumeration<? extends ZipEntry> entryList = zipFile
                .entries(); entryList.hasMoreElements();) {
            // Get the current entry
            ZipEntry entry = entryList.nextElement();

            // Now read the entry and process it if non-null.
            String entryBuffer = readZIPEntry(zipFile, entry);
            if (entryBuffer != null) {
                codeIndex.parseHTML(entryBuffer);
            }
        }
    }

    /**
     * This method reads a ZIP entry and returns a String representation of the
     * buffer.
     *
     * @param zipFile
     *            : ZipFile containing the ZipEntry to be read.
     * @param zipEntry
     *            : ZipEntry to be read.
     */
    private static String readZIPEntry(final ZipFile zipFile,
            final ZipEntry zipEntry) {
        InputStream inputStream;
        InputStreamReader inputStreamReader;
        StringWriter stringBuffer = new StringWriter();

        // Read the entry data from the ZIP
        try {
            // Create the InputStream objects; decode as UTF-8 to match the
            // encoding used when parsing the HTML fields above.
            inputStream = zipFile.getInputStream(zipEntry);
            inputStreamReader = new InputStreamReader(inputStream, "UTF-8");

            // Read buffer-sized chunks into the StringWriter, writing only the
            // characters actually read on each pass.
            char[] buffer = new char[16384];
            int charsRead;
            while ((charsRead = inputStreamReader.read(buffer, 0,
                    buffer.length)) != -1) {
                stringBuffer.write(buffer, 0, charsRead);
            }

            // Close the InputStream objects.
            inputStreamReader.close();
            inputStream.close();

            // Return the string buffer.
            return stringBuffer.toString();
        } catch (Exception E) {
            // Handle the exception.
            E.printStackTrace();
            return null;
        }
    }
}
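
For reference, here is a minimal sketch, not part of the original gist, of how the resulting index can be queried with the same Lucene 2.9-era API. The class name searchCodeIndex, the default query string, and the hit count of 10 are illustrative assumptions; the field names come from indexDocument() above.

/**
 * Minimal query sketch for the index built by buildCodeIndex (illustrative).
 */
package org.mjb;

import java.io.File;

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;

public class searchCodeIndex {
    public static void main(final String[] args) throws Exception {
        // Open the "index/" directory written by buildCodeIndex in read-only mode.
        IndexSearcher searcher = new IndexSearcher(
                FSDirectory.open(new File("index/")), true);

        // Parse a query against the analyzed "text" field.
        QueryParser parser = new QueryParser(Version.LUCENE_29, "text",
                new StandardAnalyzer(Version.LUCENE_29));
        Query query = parser.parse(args.length > 0 ? args[0]
                : "interstate commerce");

        // Retrieve the top ten hits and print their stored metadata fields.
        TopDocs hits = searcher.search(query, 10);
        for (ScoreDoc scoreDoc : hits.scoreDocs) {
            Document doc = searcher.doc(scoreDoc.doc);
            System.out.println(doc.get("documentid") + "\t"
                    + doc.get("itempath") + "\t" + doc.get("head"));
        }

        searcher.close();
    }
}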