Build a Lucene Index from a U.S. Code XHTML ZIP file.
/**
 * @author Michael J Bommarito II
 * @date Apr 9, 2011
 * @license MIT, (C) Michael J Bommarito II 2011
 */
package org.mjb;

// Java standard library imports
import java.io.*;
import java.util.*;
import java.util.regex.*;
import java.util.zip.*;

// Lucene imports
import org.apache.lucene.*;
import org.apache.lucene.analysis.*;
import org.apache.lucene.analysis.standard.*;
import org.apache.lucene.document.*;
import org.apache.lucene.index.*;
import org.apache.lucene.store.*;
import org.apache.lucene.util.Version;

import org.htmlparser.*;
import org.htmlparser.visitors.*;
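
/**
 * This class wraps a Lucene IndexWriter and turns the <document> fragments
 * embedded in the LRC XHTML files into indexed Lucene documents.
 */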
class CodeIndex {
    // Lucene Index objects
    private IndexWriter indexWriter;
    // private IndexWriterConfig indexWriterConfig;

    // Pattern matching regular expression objects. Note: these patterns are
    // compiled in the constructor, but the extract* methods below currently
    // use indexOf instead.
    private Pattern patternDocumentID, patternUSCKey, patternCurrentThrough,
            patternItemPath;
    /**
     * Constructor that initializes the Lucene index and regular expression
     * objects.
     *
     * @param indexPath
     * @throws IOException
     */
    public CodeIndex(String indexPath) throws IOException {
        // Create the index directory
        Directory indexDir = FSDirectory.open(new File(indexPath));

        // This syntax works for 3.1; however, Mahout won't play nice, so we
        // need to use an older version.
        // indexWriterConfig = new IndexWriterConfig(Version.LUCENE_31, new
        // StandardAnalyzer(Version.LUCENE_31));
        // indexWriter = new IndexWriter(indexDir, indexWriterConfig);

        // Construct the Lucene index.
        indexWriter = new IndexWriter(indexDir, new StandardAnalyzer(
                Version.LUCENE_29), true, IndexWriter.MaxFieldLength.UNLIMITED);

        // Compile the regular expressions
        patternDocumentID = Pattern.compile("documentid:([^\\s]+)");
        patternUSCKey = Pattern.compile("usckey:([^\\s]+)");
        patternCurrentThrough = Pattern.compile("currentthrough:([0-9]+)");
        patternItemPath = Pattern.compile("itempath:(.+) -->");
    }
    /**
     * This method shuts down the Lucene index.
     *
     * @throws IOException
     */
    public void close() throws IOException {
        // Shut down the indexer, but wait for merges
        indexWriter.close();
    }

    /**
     * This method parses an HTML document and adds it to the index.
     *
     * @param htmlBuffer
     */
    public void parseHTML(final String htmlBuffer) {
        // Keep track of our position in the buffer.
        int fragmentStart = 0, fragmentEnd = 0;
        String documentString = "";

        // Find the first document marker; >= 0 so a document starting at the
        // very beginning of the buffer is not skipped.
        fragmentStart = htmlBuffer.indexOf("<!-- documentid");
        while (fragmentStart >= 0) {
            // Find the next document or end of file.
            fragmentEnd = htmlBuffer.indexOf("<!-- documentid",
                    fragmentStart + 1);

            // Store the substring.
            if (fragmentEnd > 0) {
                documentString = htmlBuffer.substring(fragmentStart,
                        fragmentEnd);
            } else {
                documentString = htmlBuffer.substring(fragmentStart);
            }

            // Now parse the document.
            parseDocument(documentString);

            // Set up the next search.
            fragmentStart = fragmentEnd;
        }
    }
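
    // The extract* helpers below pull metadata values out of the HTML comments
    // that introduce each <document> fragment. The keys handled are documentid:,
    // usckey:, currentthrough: (numeric), itempath:, and expcite:; the exact
    // comment layout is inferred from the markers searched for in this file.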
    private String extractDocumentID(final String documentBuffer) {
        /**
         * Extract the documentid metadata from the <document>.
         */
        // Get the string positions
        int fragmentStart = documentBuffer.indexOf("documentid:")
                + "documentid:".length();
        int fragmentEnd = documentBuffer.indexOf(" ", fragmentStart);

        // Return the substring
        if (fragmentEnd > 0) {
            return documentBuffer.substring(fragmentStart, fragmentEnd);
        } else {
            return documentBuffer.substring(fragmentStart);
        }
    }

    private String extractUSCKey(final String documentBuffer) {
        /**
         * Extract the usckey metadata from the <document>.
         */
        // Get the string positions
        int fragmentStart = documentBuffer.indexOf("usckey:")
                + "usckey:".length();
        int fragmentEnd = documentBuffer.indexOf(" ", fragmentStart);

        // Return the substring
        if (fragmentEnd > 0) {
            return documentBuffer.substring(fragmentStart, fragmentEnd);
        } else {
            return documentBuffer.substring(fragmentStart);
        }
    }

    private String extractCurrentThrough(final String documentBuffer) {
        /**
         * Extract the currentthrough metadata from the <document>.
         */
        // Get the string positions
        int fragmentStart = documentBuffer.indexOf("currentthrough:")
                + "currentthrough:".length();
        int fragmentEnd = documentBuffer.indexOf(" ", fragmentStart);

        // Return the substring
        if (fragmentEnd > 0) {
            return documentBuffer.substring(fragmentStart, fragmentEnd);
        } else {
            return documentBuffer.substring(fragmentStart);
        }
    }

    private String extractItemPath(final String documentBuffer) {
        /**
         * Extract the itempath metadata from the <document>.
         */
        // Get the string positions.
        int fragmentStart = documentBuffer.indexOf("itempath:")
                + "itempath:".length();
        int fragmentEnd = documentBuffer.indexOf(" -->", fragmentStart);

        // Return the substring
        if (fragmentEnd > 0) {
            return documentBuffer.substring(fragmentStart, fragmentEnd);
        } else {
            return documentBuffer.substring(fragmentStart);
        }
    }

    private String extractExpCite(final String documentBuffer) {
        /**
         * Extract the expcite metadata from the <document>.
         */
        // Get the string positions.
        int fragmentStart = documentBuffer.indexOf("expcite:")
                + "expcite:".length();
        int fragmentEnd = documentBuffer.indexOf(" -->", fragmentStart);

        // Return the substring
        if (fragmentEnd > 0) {
            return documentBuffer.substring(fragmentStart, fragmentEnd);
        } else {
            return documentBuffer.substring(fragmentStart);
        }
    }
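
    // parseDocument() splits a single <document> fragment on its "<!-- field-start"
    // and "<!-- field-end" comment markers; only the "head" and "statute" fields
    // are extracted and handed to the indexer.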
    public void parseDocument(final String documentBuffer) {
        /**
         * This method parses an individual <document> from the larger XHTML
         * LRC document.
         */
        // Get the document-level metadata.
        String documentID = extractDocumentID(documentBuffer);
        String uscKey = extractUSCKey(documentBuffer);
        String currentThrough = extractCurrentThrough(documentBuffer);
        String itemPath = extractItemPath(documentBuffer);
        // Note: expCite is extracted but not currently passed to indexDocument().
        String expCite = extractExpCite(documentBuffer);

        int fieldStart = 0, fieldEnd = 0;

        // Now split the document into fields.
        String fieldString = "", textHead = "", textStatute = "";
        fieldStart = documentBuffer.indexOf("<!-- field-start");
        while (fieldStart > 0) {
            fieldEnd = documentBuffer.indexOf("<!-- field-end", fieldStart + 1);

            // Store the substring.
            if (fieldEnd > 0) {
                fieldString = documentBuffer.substring(fieldStart, fieldEnd);
            } else {
                fieldString = documentBuffer.substring(fieldStart);
            }

            // Parse the fields depending on type.
            if (fieldString.contains("field-start:head")) {
                textHead = extractFieldText(fieldString);
            } else if (fieldString.contains("field-start:statute")) {
                textStatute = extractFieldText(fieldString);
            }

            // Find the next field.
            if (fieldEnd > 0) {
                fieldStart = documentBuffer.indexOf("<!-- field-start",
                        fieldEnd);
            } else {
                fieldStart = -1;
            }
        }

        try {
            indexDocument(documentID, uscKey, currentThrough, itemPath,
                    textHead, textStatute);
        } catch (Exception E) {
            E.printStackTrace();
        }
    }
    private String extractFieldText(final String fieldBuffer) {
        /**
         * Parse the field buffer and return the text.
         */
        // Create the parser and visitor.
        Parser htmlParser = Parser.createParser(fieldBuffer, "UTF-8");
        TextExtractingVisitor textVisitor = new TextExtractingVisitor();

        try {
            // Now try to parse the string and store the text.
            htmlParser.visitAllNodesWith(textVisitor);
            return textVisitor.getExtractedText().trim();
        } catch (Exception E) {
            return "";
        }
    }
    private void indexDocument(String documentID, String uscKey,
            String currentThrough, String itemPath, String head, String text)
            throws IOException {
        /**
         * This method actually writes the document into the index.
         */
        if ((text.length() == 0) || (documentID.length() == 0)) {
            return;
        }

        // Create document.
        Document doc = new Document();
        doc.add(new Field("documentid", documentID, Field.Store.YES,
                Field.Index.NOT_ANALYZED));
        doc.add(new Field("usckey", uscKey, Field.Store.YES,
                Field.Index.NOT_ANALYZED));
        doc.add(new Field("currentthrough", currentThrough, Field.Store.YES,
                Field.Index.NOT_ANALYZED));
        doc.add(new Field("itempath", itemPath, Field.Store.YES,
                Field.Index.ANALYZED));
        doc.add(new Field("head", head, Field.Store.YES, Field.Index.ANALYZED,
                Field.TermVector.YES));
        doc.add(new Field("text", text, Field.Store.NO, Field.Index.ANALYZED,
                Field.TermVector.YES));

        // Write into index.
        indexWriter.addDocument(doc);
    }
}

/**
 * This class is the driver that processes ZIP files to construct a CodeIndex.
 */
public class buildCodeIndex {
    // CodeIndex object
    private static CodeIndex codeIndex;
    public static void main(final String[] args) {
        // Check that a proper command line is passed and fail if not.
        if (args.length == 0) {
            System.err.println("Usage: buildCodeIndex <path to ZIP files>");
            System.exit(-1);
        }

        // Now check that the argument is a directory that exists.
        File directory = new File(args[0]);
        if (!directory.exists() || !directory.isDirectory()) {
            System.err.println("Usage: buildCodeIndex <path to ZIP files>");
            System.err
                    .println("The specified path to ZIP files does not exist.");
            System.exit(-1);
        }

        try {
            // Create the CodeIndex object.
            codeIndex = new CodeIndex("index/");

            // Start the indexing by processing the directory.
            processDirectory(directory);

            // Shut down the index.
            codeIndex.close();
        } catch (Exception E) {
            E.printStackTrace();
            System.exit(-1);
        }
    }
    /**
     * This method processes the ZIP files in a given directory. These ZIP
     * files should be obtained from this URL: http://uscode.house.gov/xhtml/
     *
     * @param directory
     *            : path to the directory that contains the U.S. Code ZIP files.
     */
    private static void processDirectory(final File directory) {
        // Build the sorted list of files
        File[] fileList = directory.listFiles();
        java.util.Arrays.sort(fileList);

        // Iterate over all files
        for (File f : fileList) {
            // Check if this is a ZIP file and process it if so.
            if (f.getName().toLowerCase().endsWith(".zip")) {
                System.out.println("Processing " + f.getAbsolutePath());
                processZIP(f.getAbsolutePath());
            }
        }
    }

    /**
     * This method processes a specific ZIP file that should contain the XHTML
     * files from the Law Revision Counsel at the U.S. House of Representatives.
     *
     * @param fileName
     *            : File name of the ZIP to be processed.
     */
    private static void processZIP(final String fileName) {
        // Try to open the ZIP file and return if we can't.
        final ZipFile zipFile;
        try {
            zipFile = new ZipFile(fileName);
        } catch (Exception E) {
            E.printStackTrace();
            return;
        }

        // Now iterate over all entries in the ZIP file and parse the HTML ones.
        for (final Enumeration<? extends ZipEntry> entryList = zipFile
                .entries(); entryList.hasMoreElements();) {
            // Get the current entry
            ZipEntry entry = entryList.nextElement();

            // Now read the entry and process it if non-null.
            String entryBuffer = readZIPEntry(zipFile, entry);
            if (entryBuffer != null) {
                codeIndex.parseHTML(entryBuffer);
            }
        }
    }

    /**
     * This method reads a ZIP entry and returns a String representation of the
     * buffer.
     *
     * @param zipFile
     *            : ZipFile containing the ZipEntry to be read.
     * @param zipEntry
     *            : ZipEntry to be read.
     */
    private static String readZIPEntry(final ZipFile zipFile,
            final ZipEntry zipEntry) {
        InputStream inputStream;
        InputStreamReader inputStreamReader;
        StringWriter stringBuffer = new StringWriter();

        // Read the entry data from the ZIP
        try {
            // Create the InputStream objects; decode as UTF-8 to match the
            // encoding used when parsing the HTML fields above.
            inputStream = zipFile.getInputStream(zipEntry);
            inputStreamReader = new InputStreamReader(inputStream, "UTF-8");

            // Read buffer-sized chunks into the StringWriter, writing only the
            // characters actually read on each pass.
            char[] buffer = new char[16384];
            int charsRead;
            while ((charsRead = inputStreamReader.read(buffer, 0,
                    buffer.length)) != -1) {
                stringBuffer.write(buffer, 0, charsRead);
            }

            // Close the InputStream objects.
            inputStreamReader.close();
            inputStream.close();

            // Return the string buffer.
            return stringBuffer.toString();
        } catch (Exception E) {
            // Handle the exception.
            E.printStackTrace();
            return null;
        }
    }
}
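
For reference, here is a minimal sketch, not part of the original gist, of how the resulting index can be queried with the same Lucene 2.9-era API. The class name searchCodeIndex, the default query string, and the hit count of 10 are illustrative assumptions; the field names come from indexDocument() above.

/**
 * Minimal query sketch for the index built by buildCodeIndex (illustrative).
 */
package org.mjb;

import java.io.File;

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;

public class searchCodeIndex {
    public static void main(final String[] args) throws Exception {
        // Open the "index/" directory written by buildCodeIndex in read-only mode.
        IndexSearcher searcher = new IndexSearcher(
                FSDirectory.open(new File("index/")), true);

        // Parse a query against the analyzed "text" field.
        QueryParser parser = new QueryParser(Version.LUCENE_29, "text",
                new StandardAnalyzer(Version.LUCENE_29));
        Query query = parser.parse(args.length > 0 ? args[0]
                : "interstate commerce");

        // Retrieve the top ten hits and print their stored metadata fields.
        TopDocs hits = searcher.search(query, 10);
        for (ScoreDoc scoreDoc : hits.scoreDocs) {
            Document doc = searcher.doc(scoreDoc.doc);
            System.out.println(doc.get("documentid") + "\t"
                    + doc.get("itempath") + "\t" + doc.get("head"));
        }

        searcher.close();
    }
}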