Skip to content

Instantly share code, notes, and snippets.

@mjbommar
Created April 10, 2011 21:17
Show Gist options
  • Save mjbommar/912731 to your computer and use it in GitHub Desktop.
Build a Lucene Index from a U.S. Code XHTML ZIP file.
/**
* @author Michael J Bommarito II
* @date Apr 9, 2011
* @license MIT, (C) Michael J Bommarito II 2011
*/
package org.mjb;
// Java standard library imports
import java.io.*;
import java.util.*;
import java.util.regex.*;
import java.util.zip.*;
// Lucene imports
import org.apache.lucene.*;
import org.apache.lucene.analysis.*;
import org.apache.lucene.analysis.standard.*;
import org.apache.lucene.document.*;
import org.apache.lucene.index.*;
import org.apache.lucene.store.*;
import org.apache.lucene.util.Version;
import org.htmlparser.*;
import org.htmlparser.visitors.*;
class CodeIndex {
    // Lucene index writer; opened in the constructor, released by close().
    private IndexWriter indexWriter;

    /**
     * Constructor that initializes the Lucene index.
     *
     * @param indexPath path of the directory in which the index is created
     * @throws IOException if the index directory cannot be opened or written
     */
    public CodeIndex(String indexPath) throws IOException {
        // Create the index directory.
        Directory indexDir = FSDirectory.open(new File(indexPath));
        // The IndexWriterConfig syntax works for Lucene 3.1; however, Mahout
        // won't play nice with it, so we deliberately keep the deprecated
        // 2.9-style constructor.  "true" recreates the index from scratch.
        indexWriter = new IndexWriter(indexDir, new StandardAnalyzer(
                Version.LUCENE_29), true, IndexWriter.MaxFieldLength.UNLIMITED);
        // NOTE(review): the original also compiled four Pattern fields here
        // (documentid/usckey/currentthrough/itempath) that were never used;
        // extraction is done with indexOf, so the dead fields were removed.
    }

    /**
     * This method shuts down the Lucene index.
     *
     * @throws IOException if the final commit/merge fails
     */
    public void close() throws IOException {
        // Shut down the indexer, but wait for merges.
        indexWriter.close();
    }

    /**
     * Split an XHTML buffer into individual document fragments (delimited by
     * "&lt;!-- documentid" comments) and parse/index each one.
     *
     * @param htmlBuffer full XHTML file contents
     */
    public void parseHTML(final String htmlBuffer) {
        int fragmentStart = htmlBuffer.indexOf("<!-- documentid");
        // FIX: compare against -1 so a document starting at offset 0 is not
        // silently skipped (the original tested fragmentStart > 0).
        while (fragmentStart != -1) {
            // Find the next document marker or the end of the buffer.
            int fragmentEnd = htmlBuffer.indexOf("<!-- documentid",
                    fragmentStart + 1);
            String documentString = (fragmentEnd == -1)
                    ? htmlBuffer.substring(fragmentStart)
                    : htmlBuffer.substring(fragmentStart, fragmentEnd);
            // Now parse the document fragment.
            parseDocument(documentString);
            // Set up the next search.
            fragmentStart = fragmentEnd;
        }
    }

    /**
     * Extract a "key:value" metadata value from a document buffer.  This
     * consolidates the five near-identical extract* helpers.
     *
     * @param buffer     text to scan
     * @param key        metadata key, including its trailing colon
     * @param terminator string that ends the value (e.g. " " or " -->")
     * @return the extracted value, or "" when the key is not present
     */
    private String extractMetadata(final String buffer, final String key,
            final String terminator) {
        int keyStart = buffer.indexOf(key);
        if (keyStart == -1) {
            // FIX: a missing key previously yielded indexOf() == -1 plus the
            // key length, producing a bogus substring; return empty instead.
            return "";
        }
        int valueStart = keyStart + key.length();
        int valueEnd = buffer.indexOf(terminator, valueStart);
        return (valueEnd == -1) ? buffer.substring(valueStart)
                : buffer.substring(valueStart, valueEnd);
    }

    /** Extract the documentid metadata from the document fragment. */
    private String extractDocumentID(final String documentBuffer) {
        return extractMetadata(documentBuffer, "documentid:", " ");
    }

    /** Extract the usckey metadata from the document fragment. */
    private String extractUSCKey(final String documentBuffer) {
        return extractMetadata(documentBuffer, "usckey:", " ");
    }

    /** Extract the currentthrough metadata from the document fragment. */
    private String extractCurrentThrough(final String documentBuffer) {
        return extractMetadata(documentBuffer, "currentthrough:", " ");
    }

    /** Extract the itempath metadata from the document fragment. */
    private String extractItemPath(final String documentBuffer) {
        return extractMetadata(documentBuffer, "itempath:", " -->");
    }

    /** Extract the expcite metadata from the document fragment. */
    private String extractExpCite(final String documentBuffer) {
        return extractMetadata(documentBuffer, "expcite:", " -->");
    }

    /**
     * Parse an individual document fragment from the larger XHTML LRC file:
     * pull out the metadata, collect the head/statute field text, and hand
     * the result to the indexer.
     *
     * @param documentBuffer one document fragment
     */
    public void parseDocument(final String documentBuffer) {
        // Get the document-level metadata.
        String documentID = extractDocumentID(documentBuffer);
        String uscKey = extractUSCKey(documentBuffer);
        String currentThrough = extractCurrentThrough(documentBuffer);
        String itemPath = extractItemPath(documentBuffer);
        // NOTE(review): expcite is extracted but never indexed; kept for
        // parity with the original behavior.
        String expCite = extractExpCite(documentBuffer);
        // Now split the document into fields and collect the text by type.
        String textHead = "", textStatute = "";
        int fieldStart = documentBuffer.indexOf("<!-- field-start");
        // FIX: compare against -1 rather than > 0 (see parseHTML).
        while (fieldStart != -1) {
            int fieldEnd = documentBuffer.indexOf("<!-- field-end",
                    fieldStart + 1);
            String fieldString = (fieldEnd == -1)
                    ? documentBuffer.substring(fieldStart)
                    : documentBuffer.substring(fieldStart, fieldEnd);
            // Parse the fields depending on type.
            if (fieldString.contains("field-start:head")) {
                textHead = extractFieldText(fieldString);
            } else if (fieldString.contains("field-start:statute")) {
                textStatute = extractFieldText(fieldString);
            }
            // Find the next field, or stop at the end of the buffer.
            fieldStart = (fieldEnd == -1) ? -1
                    : documentBuffer.indexOf("<!-- field-start", fieldEnd);
        }
        try {
            indexDocument(documentID, uscKey, currentThrough, itemPath,
                    textHead, textStatute);
        } catch (Exception E) {
            E.printStackTrace();
        }
    }

    /**
     * Parse the field buffer with the HTML parser and return its plain text.
     *
     * @param fieldBuffer one field's worth of XHTML
     * @return the extracted, trimmed text, or "" if parsing fails
     */
    private String extractFieldText(final String fieldBuffer) {
        // Create the parser and visitor.
        Parser htmlParser = Parser.createParser(fieldBuffer, "UTF-8");
        TextExtractingVisitor textVisitor = new TextExtractingVisitor();
        try {
            // Now try to parse the string and store the text.
            htmlParser.visitAllNodesWith(textVisitor);
            return textVisitor.getExtractedText().trim();
        } catch (Exception E) {
            // Best-effort: an unparseable field is treated as empty text.
            return "";
        }
    }

    /**
     * Write one document into the Lucene index.  Documents with an empty id
     * or empty statute text are skipped.
     *
     * @param documentID     unique LRC document id (stored, not analyzed)
     * @param uscKey         U.S. Code key (stored, not analyzed)
     * @param currentThrough currency date (stored, not analyzed)
     * @param itemPath       hierarchical item path (stored, analyzed)
     * @param head           heading text (stored, analyzed, term vectors)
     * @param text           statute text (analyzed only, term vectors)
     * @throws IOException if the writer fails to add the document
     */
    private void indexDocument(String documentID, String uscKey,
            String currentThrough, String itemPath, String head, String text)
            throws IOException {
        // Skip empty documents.
        if ((text.length() == 0) || (documentID.length() == 0)) {
            return;
        }
        // Create document.
        Document doc = new Document();
        doc.add(new Field("documentid", documentID, Field.Store.YES,
                Field.Index.NOT_ANALYZED));
        doc.add(new Field("usckey", uscKey, Field.Store.YES,
                Field.Index.NOT_ANALYZED));
        doc.add(new Field("currentthrough", currentThrough, Field.Store.YES,
                Field.Index.NOT_ANALYZED));
        doc.add(new Field("itempath", itemPath, Field.Store.YES,
                Field.Index.ANALYZED));
        doc.add(new Field("head", head, Field.Store.YES, Field.Index.ANALYZED,
                Field.TermVector.YES));
        doc.add(new Field("text", text, Field.Store.NO, Field.Index.ANALYZED,
                Field.TermVector.YES));
        // Write into index.
        indexWriter.addDocument(doc);
    }
}
/**
 * This class is the driver that processes ZIP files to construct a CodeIndex.
 */
public class buildCodeIndex {
    // Shared CodeIndex object populated while processing ZIP entries.
    private static CodeIndex codeIndex;

    /**
     * Entry point: validates the ZIP directory argument, builds the index
     * under "index/", and shuts the index down.
     *
     * @param args args[0] is the path to the directory of U.S. Code ZIPs
     */
    public static void main(final String[] args) {
        // Check that a proper command line is passed and fail if not.
        if (args.length == 0) {
            System.err.println("Usage: buildCodeIndex <path to ZIP files>");
            System.exit(-1);
        }
        // Now check that the argument is a directory that exists.
        File directory = new File(args[0]);
        // FIX: the original condition (!exists && isDirectory) can never be
        // true, so invalid paths slipped through; reject a path that is
        // missing OR not a directory.
        if (!directory.exists() || !directory.isDirectory()) {
            System.err.println("Usage: buildCodeIndex <path to ZIP files>");
            System.err
                    .println("The specified path to ZIP files does not exist.");
            System.exit(-1);
        }
        try {
            // Create the CodeIndex object.
            codeIndex = new CodeIndex("index/");
            // Start the indexing by processing the directory.
            processDirectory(directory);
            // Shut down the index.
            codeIndex.close();
        } catch (Exception E) {
            E.printStackTrace();
            System.exit(-1);
        }
    }

    /**
     * This method processes the ZIP files in a given directory. These ZIP
     * files should be obtained from this URL: http://uscode.house.gov/xhtml/
     *
     * @param directory directory that contains the U.S. Code ZIP files
     */
    private static void processDirectory(final File directory) {
        // Build the sorted list of files.
        File[] fileList = directory.listFiles();
        // FIX: listFiles() returns null on an I/O error; guard against it.
        if (fileList == null) {
            System.err.println("Unable to list files in "
                    + directory.getAbsolutePath());
            return;
        }
        java.util.Arrays.sort(fileList);
        // Iterate over all files, processing only the ZIPs.
        for (File f : fileList) {
            if (f.getName().toLowerCase().endsWith(".zip")) {
                System.out.println("Processing " + f.getAbsolutePath());
                processZIP(f.getAbsolutePath());
            }
        }
    }

    /**
     * This method processes a specific ZIP file that should contain the XHTML
     * files from the Law Revision Counsel at the U.S. House of
     * Representatives.
     *
     * @param fileName file name of the ZIP to be processed
     */
    private static void processZIP(final String fileName) {
        // Try to open the ZIP file and return if we can't.
        final ZipFile zipFile;
        try {
            zipFile = new ZipFile(fileName);
        } catch (Exception E) {
            E.printStackTrace();
            return;
        }
        try {
            // Iterate over all entries in the ZIP and parse the HTML ones.
            for (final Enumeration<? extends ZipEntry> entryList = zipFile
                    .entries(); entryList.hasMoreElements();) {
                // Get the current entry.
                ZipEntry entry = entryList.nextElement();
                // Now read the entry and process it if non-null.
                String entryBuffer = readZIPEntry(zipFile, entry);
                if (entryBuffer != null) {
                    codeIndex.parseHTML(entryBuffer);
                }
            }
        } finally {
            // FIX: release the ZIP file handle; the original leaked one
            // descriptor per archive processed.
            try {
                zipFile.close();
            } catch (IOException E) {
                E.printStackTrace();
            }
        }
    }

    /**
     * This method reads a ZIP entry and returns a String representation of
     * the buffer.
     *
     * @param zipFile  ZipFile containing the ZipEntry to be read
     * @param zipEntry ZipEntry to be read
     * @return the entry contents as a String, or null on failure
     */
    private static String readZIPEntry(final ZipFile zipFile,
            final ZipEntry zipEntry) {
        InputStream inputStream = null;
        InputStreamReader inputStreamReader = null;
        StringWriter stringBuffer = new StringWriter();
        try {
            // Create the InputStream objects.
            inputStream = zipFile.getInputStream(zipEntry);
            // FIX: decode explicitly as UTF-8 — the charset the downstream
            // HTML parser assumes — instead of the platform default.
            inputStreamReader = new InputStreamReader(inputStream, "UTF-8");
            // Read buffer-sized chunks into the StringWriter.
            char[] buffer = new char[16384];
            int charsRead;
            // FIX: honor the count actually read; the original appended the
            // full 16384-char buffer every pass, so short reads duplicated
            // stale characters into the output.
            while ((charsRead = inputStreamReader.read(buffer, 0,
                    buffer.length)) != -1) {
                stringBuffer.write(buffer, 0, charsRead);
            }
            // Return the string buffer.
            return stringBuffer.toString();
        } catch (Exception E) {
            // Handle the exception.
            E.printStackTrace();
            return null;
        } finally {
            // FIX: close the streams even when reading fails (the original
            // leaked them on any exception).
            try {
                if (inputStreamReader != null) {
                    inputStreamReader.close();
                } else if (inputStream != null) {
                    inputStream.close();
                }
            } catch (IOException ignored) {
                // Best-effort close; nothing further to do.
            }
        }
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment