Last active
August 29, 2015 14:02
-
-
Save tenten0213/dda64e3bec39a069f775 to your computer and use it in GitHub Desktop.
GoSenのビルド
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?xml version="1.0"?>
<!--
    Usage:

    To use a web proxy to download the dictionary data, do the following:

        $ ant -Dproxy.host=proxy.hoehoe.jp -Dproxy.port=8080
-->
<project name="ipadic" default="compile" basedir=".">

    <property name="ipadic.home" value="http://sourceforge.jp/projects/ipadic/downloads/24435/ipadic-2.7.0.tar.gz"/>
    <property name="ipadic.version" value="2.7.0"/>
    <property name="ipadic.archive" value="ipadic-${ipadic.version}.tar.gz"/>
    <property name="ipadic.dir" value="ipadic-${ipadic.version}"/>

    <!-- Checks the current build status, setting:
         "ipadic.archive.present" if the ipadic archive is already present,
         "dics.unpacked"          if the dictionary is already unpacked,
         "dics.preprocessed"      if the dictionary is already preprocessed,
         "dics.complete"          if the dictionary is already compiled -->
    <target name="check-build-status">
        <available file="${ipadic.archive}" property="ipadic.archive.present"/>
        <condition property="dics.unpacked">
            <and>
                <available file="ipadic-${ipadic.version}/Noun.dic"/>
            </and>
        </condition>
        <condition property="dics.preprocessed">
            <and>
                <available file="dic.csv"/>
                <available file="connect.csv"/>
            </and>
        </condition>
        <condition property="dics.complete">
            <and>
                <available file="da.sen"/>
                <available file="matrix.sen"/>
                <available file="posInfo.sen"/>
                <available file="token.sen"/>
            </and>
        </condition>
    </target>

    <!-- Download target disabled; fetch the archive manually from ${ipadic.home}.
    <target name="download" depends="prepare-proxy,check-build-status" unless="ipadic.archive.present">
        <get src="${ipadic.home}/${ipadic.archive}" dest="${ipadic.archive}" />
    </target>
    -->

    <!-- Unpacks the ipadic dictionary -->
    <target name="unpack" depends="check-build-status" unless="dics.unpacked">
        <gunzip src="${ipadic.archive}"/>
        <untar src="${ipadic.dir}.tar" dest="." />
        <delete file="${ipadic.dir}.tar"/>
    </target>

    <!-- Deletes the ipadic dictionary and compiled files -->
    <target name="clean">
        <delete>
            <fileset dir="." includes="*.sen"/>
            <fileset dir="." includes="*.csv"/>
        </delete>
        <delete dir="ipadic-${ipadic.version}"/>
        <delete file="${ipadic.archive}" />
    </target>

    <!-- Preprocesses the ipadic dictionary for compilation -->
    <target name="preprocess" depends="unpack" unless="dics.preprocessed">
        <java classname="net.java.sen.tools.DictionaryPreprocessor" fork="true">
            <classpath>
                <pathelement location="."/>
                <pathelement location="../../bin"/>
                <pathelement location="../../jisx0213-1.0.jar"/>
                <pathelement path="${java.class.path}"/>
            </classpath>
            <arg line="X-EUC-JISX0213" />
            <arg line="ipadic-${ipadic.version}" />
            <arg line="." />
        </java>
    </target>

    <!-- Default task - compiles the ipadic dictionary -->
    <target name="compile" depends="preprocess" unless="dics.complete">
        <java classname="net.java.sen.tools.DictionaryCompiler" fork="true">
            <classpath>
                <pathelement location="."/>
                <pathelement location="../../bin"/>
                <pathelement path="${java.class.path}"/>
            </classpath>
        </java>
    </target>

    <!-- Downloads and compiles the ipadic dictionary from scratch -->
    <target name="all" depends="clean,unpack,compile"/>

</project>
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/* | |
* Copyright (C) 2001-2007 | |
* Taku Kudoh <[email protected]> | |
* Takashi Okamoto <[email protected]> | |
* Matt Francis <[email protected]> | |
* | |
* This library is free software; you can redistribute it and/or modify it under | |
* the terms of the GNU Lesser General Public License as published by the Free | |
* Software Foundation; either version 2.1 of the License, or any later version. | |
* | |
* This library is distributed in the hope that it will be useful, but WITHOUT | |
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS | |
* FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more | |
* details. | |
* | |
* You should have received a copy of the GNU Lesser General Public License | |
* along with this library; if not, write to the Free Software Foundation, Inc., | |
* 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA | |
* | |
*/ | |
package net.java.sen.compiler; | |
import java.util.HashMap; | |
import java.util.HashSet; | |
import java.util.Iterator; | |
import java.util.LinkedHashSet; | |
import java.util.Map; | |
import java.util.Set; | |
import java.util.Vector; | |
/**
 * Builds one axis of the Connection Cost matrix from supplied part-of-speech /
 * conjugation data.
 *
 * NOTE(review): the matching rules implemented here are terse and largely
 * undocumented upstream; the comments below describe what the code visibly
 * does, not the linguistic intent behind it.
 */
class CostMatrixBuilder {

    // Unique connection-cost rules in insertion order (cleared by build())
    private LinkedHashSet<String> ruleSet = new LinkedHashSet<String>();

    // Each rule from ruleSet, split on ',' into its individual fields
    private Vector<String[]> ruleList = new Vector<String[]>();

    // For each rule index, the precomputed indices of the rules it matches
    // (see getIdList with parent == false)
    private Vector<Vector<Integer>> idList = new Vector<Vector<Integer>>();

    // Cache used by getDicId: maps a rule minus its last field to (id + 1);
    // the value is offset by one so that 0 can mean "not cached"
    private Map<String, Integer> dicIndex = new HashMap<String, Integer>();

    // Maps each complete rule string to its index within ruleList
    private Map<String, Integer> ruleIndex = new HashMap<String, Integer>();

    // The set of the rules' last fields, for fields not equal to "*"
    private Set<String> lexicalized = new HashSet<String>();

    /**
     * Finds the indices of all stored rules matching the given split rule.
     *
     * A stored rule matches in a given column when the two fields are equal,
     * or when a wildcard ("*") appears in the query field (parent == false)
     * or in the stored rule's field (parent == true).
     *
     * @param fields The part-of-speech / conjugation fields to match
     * @param parent Selects which side's "*" acts as a wildcard
     * @return The indices of the matching rules, in ascending order
     */
    private Vector<Integer> getIdList(String fields[], boolean parent) {

        // Start with every rule index as a candidate, then filter column by column
        Vector<Integer> candidates = new Vector<Integer>(this.ruleList.size());
        candidates.setSize(this.ruleList.size());
        for (int index = 0; index < candidates.size(); index++) {
            candidates.set(index, index);
        }

        for (int column = 0; column < fields.length; column++) {
            int kept = 0;
            for (int position = 0; position < candidates.size(); position++) {
                int candidate = candidates.get(position);
                String ruleField = this.ruleList.get(candidate)[column];
                boolean matched;
                if (parent) {
                    matched = (ruleField.charAt(0) == '*') || ruleField.equals(fields[column]);
                } else {
                    matched = (fields[column].charAt(0) == '*') || ruleField.equals(fields[column]);
                }
                if (matched) {
                    // Compact the surviving candidates to the front of the vector
                    candidates.set(kept++, candidate);
                }
            }
            candidates.setSize(kept);
        }

        return candidates;
    }

    /**
     * Calculates an ID for a split rule: the index of the matching stored rule
     * with the greatest number of non-wildcard fields, or 0 if nothing matches.
     *
     * @param fields The split rule
     * @return The calculated ID
     */
    private int getDicIdNoCache(String fields[]) {

        Vector<Integer> matches = getIdList(fields, true);

        // Prefer the most specific match - the one with the most non-"*" fields.
        // Ties keep the earliest match
        int specificity[] = new int[matches.size()];
        int best = 0;
        for (int i = 0; i < matches.size(); i++) {
            for (String field : this.ruleList.get(matches.get(i))) {
                if (field.charAt(0) != '*') {
                    specificity[i]++;
                }
            }
            if (specificity[best] < specificity[i]) {
                best = i;
            }
        }

        return (matches.size() > 0) ? matches.get(best) : 0;
    }

    /**
     * Adds a Connection Cost CSV value to the builder
     *
     * @param rule The rule to add
     */
    public void add(String rule) {
        this.ruleSet.add(rule);
    }

    /**
     * Builds the matrix axis based on the data passed to {@link #add(String)}.
     * It is an error to call {@link #add(String)} after calling
     * {@link #build()}.
     */
    public void build() {

        this.ruleList.setSize(this.ruleSet.size());

        int index = 0;
        for (String rule : this.ruleSet) {
            String fields[] = rule.split(",");
            this.ruleIndex.put(rule, index);
            this.ruleList.set(index, fields);

            // Remember non-wildcard final fields; getDicId treats such rules specially
            String lastField = fields[fields.length - 1];
            if (lastField.charAt(0) != '*') {
                this.lexicalized.add(lastField);
            }
            index++;
        }

        // The raw rule strings are no longer needed once split
        this.ruleSet.clear();

        // Precompute, for each rule, the IDs of all rules it matches
        this.idList.setSize(this.ruleList.size());
        for (int i = 0; i < this.ruleList.size(); i++) {
            this.idList.set(i, getIdList(this.ruleList.get(i), false));
        }
    }

    /**
     * Returns the size of the built matrix axis
     *
     * @return The number of rules on this axis
     */
    public int size() {
        return this.ruleList.size();
    }

    /**
     * Returns the ID for a rule: the index of the most specific matching rule.
     * Results for rules whose last field is not in the lexicalized set are
     * cached, keyed on the rule minus its last field.
     *
     * @param rule The rule
     * @return The ID of the most specific matching rule
     */
    public int getDicId(String rule) {

        String fields[] = rule.split(",");
        String lastField = fields[fields.length - 1];

        // Lexicalized rules are always resolved in full, bypassing the cache
        if (this.lexicalized.contains(lastField)) {
            return getDicIdNoCache(fields);
        }

        // Drop the final field to form the cache key
        String partOfSpeech = rule.substring(0, rule.lastIndexOf(","));

        Integer cached = this.dicIndex.get(partOfSpeech);
        if ((cached != null) && (cached != 0)) {
            // Stored offset by one so that 0 can mean "empty"
            return cached - 1;
        }

        int id = getDicIdNoCache(fields);
        this.dicIndex.put(partOfSpeech, id + 1);
        return id;
    }

    /**
     * Converts a rule to the precomputed vector of matching rule IDs
     *
     * @param rule The rule (must have been passed to {@link #add(String)}
     *             before {@link #build()})
     * @return A vector of IDs for the component parts
     */
    public Vector<Integer> getRuleIdList(String rule) {
        return this.idList.get(this.ruleIndex.get(rule));
    }
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/* | |
* Copyright (C) 2002-2007 | |
* Taku Kudoh <[email protected]> | |
* Takashi Okamoto <[email protected]> | |
* Matt Francis <[email protected]> | |
* | |
* This library is free software; you can redistribute it and/or modify it under | |
* the terms of the GNU Lesser General Public License as published by the Free | |
* Software Foundation; either version 2.1 of the License, or any later version. | |
* | |
* This library is distributed in the hope that it will be useful, but WITHOUT | |
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS | |
* FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more | |
* details. | |
* | |
* You should have received a copy of the GNU Lesser General Public License | |
* along with this library; if not, write to the Free Software Foundation, Inc., | |
* 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA | |
* | |
*/ | |
package net.java.sen.compiler; | |
import java.io.BufferedOutputStream; | |
import java.io.DataOutputStream; | |
import java.io.FileInputStream; | |
import java.io.FileOutputStream; | |
import java.io.IOException; | |
import java.io.RandomAccessFile; | |
import java.nio.MappedByteBuffer; | |
import java.nio.ShortBuffer; | |
import java.nio.channels.FileChannel; | |
import java.util.ArrayList; | |
import java.util.Arrays; | |
import java.util.Iterator; | |
import java.util.List; | |
import java.util.Vector; | |
import net.java.sen.dictionary.CToken; | |
import net.java.sen.trie.TrieBuilder; | |
import net.java.sen.util.CSVData; | |
import net.java.sen.util.CSVParser; | |
/** | |
* Compiles CSV source data into the data files used for analysis | |
*/ | |
public class DictionaryBuilder { | |
/** | |
* Input dictionary CSV filename | |
*/ | |
private static final String DICTIONARY_CSV_FILENAME = "dictionary.csv"; | |
/** | |
* Input connection CSV filename | |
*/ | |
private static final String CONNECTION_CSV_FILENAME = "connection.csv"; | |
/** | |
* Compiled connection cost data filename | |
*/ | |
private static final String CONNECTION_COST_DATA_FILENAME = "connectionCost.sen"; | |
/** | |
* Compiled part of speech data filename | |
*/ | |
private static final String PART_OF_SPEECH_DATA_FILENAME = "partOfSpeech.sen"; | |
/** | |
* Compiled token data filename | |
*/ | |
private static final String TOKEN_DATA_FILENAME = "token.sen"; | |
/** | |
* Compiled trie data filename | |
*/ | |
private static final String TRIE_DATA_FILENAME = "trie.sen"; | |
/** | |
* Default connection cost | |
*/ | |
private static final short DEFAULT_CONNECTION_COST = 10000; | |
/** | |
* Start of part-of-speech data within the dictionary CSV | |
*/ | |
private static final int PART_OF_SPEECH_START = 2; | |
/** | |
* Size of part-of-speech data within the dictionary CSV | |
*/ | |
private static final int PART_OF_SPEECH_SIZE = 7; | |
/** | |
* Beginning-of-string token part-of-speech | |
*/ | |
private static final String BOS_PART_OF_SPEECH = "文頭,*,*,*,*,*,*"; | |
/** | |
* End-of-string token part-of-speech | |
*/ | |
private static final String EOS_PART_OF_SPEECH = "文末,*,*,*,*,*,*"; | |
/** | |
* Unknown token part-of-speech | |
*/ | |
private static final String UNKNOWN_PART_OF_SPEECH = "名詞,サ変接続,*,*,*,*,*"; | |
/** | |
* Precursor data for the Trie file | |
*/ | |
private static class TrieData { | |
/** | |
* Trie keys | |
*/ | |
public String keys[]; | |
/** | |
* Trie values | |
*/ | |
public int values[]; | |
/** | |
* The actual number of entries in the keys/values arrays | |
*/ | |
public int size; | |
} | |
/** | |
* Increases the size of an array of <code>short</code>s | |
* | |
* @param current The existing array | |
* @return The resized array | |
*/ | |
private static short[] resize(short current[]) { | |
short tmp[] = new short[(int) (current.length * 1.5)]; | |
System.arraycopy(current, 0, tmp, 0, current.length); | |
return tmp; | |
} | |
/** | |
* Splits a compound reading or pronunciation field into a list | |
* | |
* Compound fields are of the form: | |
* | |
* "{head1/head2[/head3 ...]}tail" | |
* | |
* The returned list will consist of: | |
* | |
* "head1tail", | |
* "head2tail", | |
* "head3tail", | |
* ... | |
* | |
* @param compoundField The field to split | |
* @return The split list | |
*/ | |
private List<String> splitCompoundField(String compoundField) { | |
List<String> splitFieldList; | |
if ((compoundField.length() == 0) || (compoundField.charAt(0) != '{')) { | |
// No alternatives | |
splitFieldList = new ArrayList<String>(1); | |
splitFieldList.add(compoundField); | |
} else { | |
// 1 or more alternatives. No existing entry in Ipadic has more than 4 | |
splitFieldList = new ArrayList<String>(4); | |
String[] parts = compoundField.split("[{}]"); | |
String tail = (parts.length == 3) ? parts[2] : ""; | |
String[] heads = parts.length > 0 ? parts[1].split("/") : new String[0] ; | |
for (int i = 0; i < heads.length; i++) { | |
splitFieldList.add(heads[i] + tail); | |
} | |
} | |
return splitFieldList; | |
} | |
/** | |
* Creates the part-of-speech data file | |
* | |
* @param dictionaryCSVFilenames The filenames of the dictionary CSV data file and any additional dictionaries | |
* @param partOfSpeechDataFilename The filename for the part-of-speech data file | |
* @param matrixBuilders The three <code>CostMatrixBuilder</code>s | |
* @param partOfSpeechStart The starting index of the part-of-speech data within a CSV line | |
* @param partOfSpeechSize The number of part-of-speech values within a CSV line | |
* @param charset The charset of the CSV data | |
* @param bosPartOfSpeech The beginning-of-string part-of-speech code | |
* @param eosPartOfSpeech The end-of-string part-of-speech code | |
* @param unknownPartOfSpeech The beginning-of-string part-of-speech code | |
* @param dictionaryList Populated by this method with the String/CToken tuples that will be used to create the Token file | |
* @param standardCTokens Populated by this method with the three standard CTokens ("bos", "eos" and "unknown") | |
* | |
* @throws IOException | |
*/ | |
private void createPartOfSpeechDataFile(List<String> dictionaryCSVFilenames, String partOfSpeechDataFilename, | |
CostMatrixBuilder[] matrixBuilders, int partOfSpeechStart, int partOfSpeechSize, String charset, | |
String bosPartOfSpeech, String eosPartOfSpeech, String unknownPartOfSpeech, VirtualTupleList dictionaryList, CToken[] standardCTokens) throws IOException | |
{ | |
String[] csvValues = null; | |
CSVData key_b = new CSVData(); | |
CSVData pos_b = new CSVData(); | |
DataOutputStream outputStream = new DataOutputStream(new BufferedOutputStream(new FileOutputStream(partOfSpeechDataFilename))); | |
for (String dictionaryCSVFilename : dictionaryCSVFilenames) { | |
CSVParser parser = new CSVParser(new FileInputStream(dictionaryCSVFilename), charset); | |
while ((csvValues = parser.nextTokens()) != null) { | |
if (csvValues.length < (partOfSpeechSize + partOfSpeechStart)) { | |
throw new RuntimeException("format error:" + parser.currentLine()); | |
} | |
key_b.clear(); | |
pos_b.clear(); | |
for (int i = partOfSpeechStart; i < (partOfSpeechStart + partOfSpeechSize); i++) { | |
key_b.append(csvValues[i]); | |
pos_b.append(csvValues[i]); | |
} | |
for (int i = partOfSpeechStart + partOfSpeechSize; i < csvValues.length; i++) { | |
pos_b.append(csvValues[i]); | |
} | |
CToken ctoken = new CToken(); | |
ctoken.rcAttr2 = (short) matrixBuilders[0].getDicId(key_b.toString()); | |
ctoken.rcAttr1 = (short) matrixBuilders[1].getDicId(key_b.toString()); | |
ctoken.lcAttr = (short) matrixBuilders[2].getDicId(key_b.toString()); | |
ctoken.partOfSpeechIndex = outputStream.size() >> 1; | |
ctoken.length = (short) csvValues[0].length(); | |
try { | |
ctoken.cost = (short) Integer.parseInt(csvValues[1]); | |
} catch (NumberFormatException ex) { | |
ctoken.cost = (short) 0; | |
} | |
dictionaryList.add(csvValues[0], ctoken); | |
// Write to part of speech data file | |
StringBuilder partOfSpeechBuilder = new StringBuilder(); | |
for (int i = partOfSpeechStart; i < (partOfSpeechStart + 4); i++) { | |
if (!csvValues[i].equals("*")) { | |
partOfSpeechBuilder.append(csvValues[i]); | |
partOfSpeechBuilder.append("-"); | |
} | |
} | |
String partOfSpeech = partOfSpeechBuilder.substring(0, partOfSpeechBuilder.length() - 1); | |
String conjugationalType = csvValues[partOfSpeechStart + 4]; | |
String conjugationalForm = csvValues[partOfSpeechStart + 5]; | |
String basicForm = csvValues[partOfSpeechStart + 6]; | |
List<String> readings = splitCompoundField(csvValues[partOfSpeechStart + 7]); | |
List<String> pronunciations = splitCompoundField(csvValues[partOfSpeechStart + 8]); | |
outputStream.writeChar(partOfSpeech.length()); | |
outputStream.writeChars(partOfSpeech); | |
outputStream.writeChar(conjugationalType.length()); | |
outputStream.writeChars(conjugationalType); | |
outputStream.writeChar(conjugationalForm.length()); | |
outputStream.writeChars(conjugationalForm); | |
outputStream.writeChar(basicForm.length()); | |
outputStream.writeChars(basicForm); | |
outputStream.writeChar(readings.size()); | |
for (String reading : readings) { | |
outputStream.writeChar(reading.length()); | |
outputStream.writeChars(reading); | |
} | |
for (String pronunciation : pronunciations) { | |
outputStream.writeChar(pronunciation.length()); | |
outputStream.writeChars(pronunciation); | |
} | |
} | |
} | |
outputStream.close(); | |
dictionaryList.sort(); | |
CToken bosCToken = new CToken(); | |
bosCToken.rcAttr2 = (short) matrixBuilders[0].getDicId(bosPartOfSpeech); | |
bosCToken.rcAttr1 = (short) matrixBuilders[1].getDicId(bosPartOfSpeech); | |
bosCToken.lcAttr = (short) matrixBuilders[2].getDicId(bosPartOfSpeech); | |
standardCTokens[0] = bosCToken; | |
CToken eosCToken = new CToken(); | |
eosCToken.rcAttr2 = (short) matrixBuilders[0].getDicId(eosPartOfSpeech); | |
eosCToken.rcAttr1 = (short) matrixBuilders[1].getDicId(eosPartOfSpeech); | |
eosCToken.lcAttr = (short) matrixBuilders[2].getDicId(eosPartOfSpeech); | |
standardCTokens[1] = eosCToken; | |
CToken unknownCToken = new CToken(); | |
unknownCToken.rcAttr2 = (short) matrixBuilders[0].getDicId(unknownPartOfSpeech); | |
unknownCToken.rcAttr1 = (short) matrixBuilders[1].getDicId(unknownPartOfSpeech); | |
unknownCToken.lcAttr = (short) matrixBuilders[2].getDicId(unknownPartOfSpeech); | |
unknownCToken.partOfSpeechIndex = -1; | |
standardCTokens[2] = unknownCToken; | |
} | |
/** | |
* Creates the connection cost matrix file | |
* | |
* @param connectionCSVFilename The filename of the connection CSV data | |
* @param connectionCostDataFilename The filename for the connection cost matrix | |
* @param defaultCost The default connection cost | |
* @param charset The charset of the connection CSV data | |
* @return An array of three <code>CostMatrixBuilder</code>s | |
* @throws IOException | |
*/ | |
private CostMatrixBuilder[] createConnectionCostFile(String connectionCSVFilename, String connectionCostDataFilename, short defaultCost, String charset) throws IOException { | |
CostMatrixBuilder[] matrixBuilders = new CostMatrixBuilder[3]; | |
matrixBuilders[0] = new CostMatrixBuilder(); | |
matrixBuilders[1] = new CostMatrixBuilder(); | |
matrixBuilders[2] = new CostMatrixBuilder(); | |
Vector<String> rule1 = new Vector<String>(); | |
Vector<String> rule2 = new Vector<String>(); | |
Vector<String> rule3 = new Vector<String>(); | |
// The approximate length of the file, plus a bit. If we're wrong it'll be expanded during processing | |
short[] scores = new short[30000]; | |
// Read connection cost CSV data | |
CSVParser parser = new CSVParser(new FileInputStream(connectionCSVFilename), charset); | |
String t[]; | |
int line = 0; | |
while ((t = parser.nextTokens()) != null) { | |
if (t.length < 4) { | |
throw new IOException("Connection cost CSV format error"); | |
} | |
matrixBuilders[0].add(t[0]); | |
rule1.add(t[0]); | |
matrixBuilders[1].add(t[1]); | |
rule2.add(t[1]); | |
matrixBuilders[2].add(t[2]); | |
rule3.add(t[2]); | |
if (line == scores.length) { | |
scores = resize(scores); | |
} | |
scores[line++] = (short) Integer.parseInt(t[3]); | |
} | |
// Compile CostMatrixBuilders | |
matrixBuilders[0].build(); | |
matrixBuilders[1].build(); | |
matrixBuilders[2].build(); | |
int size1 = matrixBuilders[0].size(); | |
int size2 = matrixBuilders[1].size(); | |
int size3 = matrixBuilders[2].size(); | |
int ruleSize = rule1.size(); | |
// Write connection cost data | |
MappedByteBuffer buffer = null; | |
ShortBuffer shortBuffer = null; | |
int matrixSizeBytes = (size1 * size2 * size3 * 2); | |
int headerSizeBytes = (3 * 2); | |
RandomAccessFile file = new RandomAccessFile(connectionCostDataFilename, "rw"); | |
file.setLength(0); | |
file.writeShort(size1); | |
file.writeShort(size2); | |
file.writeShort(size3); | |
file.setLength(headerSizeBytes + matrixSizeBytes); | |
FileChannel indexChannel = file.getChannel(); | |
buffer = indexChannel.map(FileChannel.MapMode.READ_WRITE, headerSizeBytes, matrixSizeBytes); | |
shortBuffer = buffer.asShortBuffer(); | |
indexChannel.close(); | |
for (int i = 0; i < (size1 * size2 * size3); i++) { | |
shortBuffer.put(i, defaultCost); | |
} | |
for (int i = 0; i < ruleSize; i++) { | |
Vector<Integer> r1 = matrixBuilders[0].getRuleIdList(rule1.get(i)); | |
Vector<Integer> r2 = matrixBuilders[1].getRuleIdList(rule2.get(i)); | |
Vector<Integer> r3 = matrixBuilders[2].getRuleIdList(rule3.get(i)); | |
for (Iterator<Integer> i1 = r1.iterator(); i1.hasNext();) { | |
int ii1 = i1.next(); | |
for (Iterator<Integer> i2 = r2.iterator(); i2.hasNext();) { | |
int ii2 = i2.next(); | |
for (Iterator<Integer> i3 = r3.iterator(); i3.hasNext();) { | |
int ii3 = i3.next(); | |
int position = size3 * (size2 * ii1 + ii2) + ii3; | |
shortBuffer.put(position, scores[i]); | |
} | |
} | |
} | |
} | |
buffer.force(); | |
return matrixBuilders; | |
} | |
/** | |
* Create the token data file | |
* | |
* @param tokenDataFilename The filename for the token data file | |
* @param standardCTokens The beginning-of-string, end-of-string, and unknown-morpheme CTokens | |
* @param tupleList The (String,CToken) tuples | |
* | |
* @return The Trie precursor data | |
* @throws IOException | |
*/ | |
private TrieData createTokenFile(String tokenDataFilename, CToken[] standardCTokens, VirtualTupleList tupleList) | |
throws IOException | |
{ | |
TrieData trieData = new TrieData(); | |
trieData.values = new int[tupleList.size()]; | |
trieData.keys = new String[tupleList.size()]; | |
trieData.size = 0; | |
int spos = 0; | |
int bsize = 0; | |
String prev = ""; | |
DataOutputStream out = new DataOutputStream(new BufferedOutputStream(new FileOutputStream(tokenDataFilename))); | |
// Write beginning-of-string, end-of-string, unknown-morpheme tokens | |
CToken.write(out, standardCTokens[0]); | |
CToken.write(out, standardCTokens[1]); | |
CToken.write(out, standardCTokens[2]); | |
// Write token data | |
for (int i = 0; i < trieData.keys.length; i++) { | |
StringCTokenTuple tuple = tupleList.get(i); | |
String k = tuple.key; | |
if (!prev.equals(k) && i != 0) { | |
trieData.keys[trieData.size] = tupleList.get(spos).key; | |
trieData.values[trieData.size] = bsize + (spos << 8); | |
trieData.size++; | |
bsize = 1; | |
spos = i; | |
} else { | |
bsize++; | |
} | |
prev = tuple.key; | |
CToken.write(out, tuple.value); | |
} | |
out.flush(); | |
out.close(); | |
trieData.keys[trieData.size] = tupleList.get(spos).key; | |
trieData.values[trieData.size] = bsize + (spos << 8); | |
trieData.size++; | |
return trieData; | |
} | |
/** | |
* Create Trie file | |
* | |
* @param trieDataFilename The filename for the Trie file | |
* @param trieData The Trie precursor data | |
* @throws IOException | |
*/ | |
private void createTrieFile(String trieDataFilename, TrieData trieData) throws IOException { | |
TrieBuilder builder = new TrieBuilder(trieData.keys, trieData.values, trieData.size); | |
builder.build(trieDataFilename); | |
} | |
/** | |
* Compiles CSV source data into the data files used for analysis | |
* | |
* @param customDictionaryCSVFilenames The filenames of custom dictionaries, or <code>null</code> | |
* @throws IOException | |
*/ | |
public DictionaryBuilder(String[] customDictionaryCSVFilenames) throws IOException { | |
List<String> dictionaryCSVFilenames = new ArrayList<String>(); | |
dictionaryCSVFilenames.add(DICTIONARY_CSV_FILENAME); | |
dictionaryCSVFilenames.addAll(Arrays.asList(customDictionaryCSVFilenames)); | |
String charset = "UTF-8"; | |
// Create connection cost file (matrix.sen) | |
CostMatrixBuilder[] matrixBuilders = createConnectionCostFile( | |
CONNECTION_CSV_FILENAME, | |
CONNECTION_COST_DATA_FILENAME, | |
DEFAULT_CONNECTION_COST, | |
charset | |
); | |
// Create part-of-speech data file (posInfo.sen) | |
VirtualTupleList dictionaryList = new VirtualTupleList(); | |
CToken[] standardCTokens = new CToken[3]; | |
createPartOfSpeechDataFile( | |
dictionaryCSVFilenames, | |
PART_OF_SPEECH_DATA_FILENAME, | |
matrixBuilders, | |
PART_OF_SPEECH_START, | |
PART_OF_SPEECH_SIZE, | |
charset, | |
BOS_PART_OF_SPEECH, | |
EOS_PART_OF_SPEECH, | |
UNKNOWN_PART_OF_SPEECH, | |
dictionaryList, | |
standardCTokens | |
); | |
// Free temporary object for GC | |
matrixBuilders = null; | |
// Create Token file (token.sen) | |
TrieData trieData = createTokenFile( | |
TOKEN_DATA_FILENAME, | |
standardCTokens, | |
dictionaryList | |
); | |
// Free temporary object for GC | |
dictionaryList = null; | |
// Create Trie file (da.sen) | |
createTrieFile(TRIE_DATA_FILENAME, trieData); | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment