Last active
August 29, 2015 14:02
-
-
Save tenten0213/dda64e3bec39a069f775 to your computer and use it in GitHub Desktop.
GoSenのビルド
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?xml version="1.0"?>
<!--
    Usage:

    To use a web proxy to download the dictionary data, do the following:

        $ ant -Dproxy.host=proxy.hoehoe.jp -Dproxy.port=8080
-->
<project name="ipadic" default="compile" basedir=".">

    <property name="ipadic.home" value="http://sourceforge.jp/projects/ipadic/downloads/24435/ipadic-2.7.0.tar.gz"/>
    <property name="ipadic.version" value="2.7.0"/>
    <property name="ipadic.archive" value="ipadic-${ipadic.version}.tar.gz"/>
    <property name="ipadic.dir" value="ipadic-${ipadic.version}"/>

    <!-- Checks the current build status, setting:
         "ipadic.archive.present" if the ipadic archive is already present,
         "dics.unpacked"          if the dictionary is already unpacked,
         "dics.preprocessed"      if the dictionary is already preprocessed,
         "dics.complete"          if the dictionary is already compiled -->
    <target name="check-build-status">
        <available file="${ipadic.archive}" property="ipadic.archive.present"/>
        <condition property="dics.unpacked">
            <and>
                <available file="ipadic-${ipadic.version}/Noun.dic"/>
            </and>
        </condition>
        <condition property="dics.preprocessed">
            <and>
                <available file="dic.csv"/>
                <available file="connect.csv"/>
            </and>
        </condition>
        <condition property="dics.complete">
            <and>
                <available file="da.sen"/>
                <available file="matrix.sen"/>
                <available file="posInfo.sen"/>
                <available file="token.sen"/>
            </and>
        </condition>
    </target>

    <!-- Download target disabled; fetch the archive manually from ${ipadic.home}.
    <target name="download" depends="prepare-proxy,check-build-status" unless="ipadic.archive.present">
        <get src="${ipadic.home}/${ipadic.archive}" dest="${ipadic.archive}" />
    </target>
    -->

    <!-- Unpacks the ipadic dictionary -->
    <target name="unpack" depends="check-build-status" unless="dics.unpacked">
        <gunzip src="${ipadic.archive}"/>
        <untar src="${ipadic.dir}.tar" dest="." />
        <delete file="${ipadic.dir}.tar"/>
    </target>

    <!-- Deletes the ipadic dictionary and compiled files -->
    <target name="clean">
        <delete>
            <fileset dir="." includes="*.sen"/>
            <fileset dir="." includes="*.csv"/>
        </delete>
        <delete dir="ipadic-${ipadic.version}"/>
        <delete file="${ipadic.archive}" />
    </target>

    <!-- Preprocesses the ipadic dictionary for compilation -->
    <target name="preprocess" depends="unpack" unless="dics.preprocessed">
        <java classname="net.java.sen.tools.DictionaryPreprocessor" fork="true">
            <classpath>
                <pathelement location="."/>
                <pathelement location="../../bin"/>
                <pathelement location="../../jisx0213-1.0.jar"/>
                <pathelement path="${java.class.path}"/>
            </classpath>
            <arg line="X-EUC-JISX0213" />
            <arg line="ipadic-${ipadic.version}" />
            <arg line="." />
        </java>
    </target>

    <!-- Default task - compiles the ipadic dictionary -->
    <target name="compile" depends="preprocess" unless="dics.complete">
        <java classname="net.java.sen.tools.DictionaryCompiler" fork="true">
            <classpath>
                <pathelement location="."/>
                <pathelement location="../../bin"/>
                <pathelement path="${java.class.path}"/>
            </classpath>
        </java>
    </target>

    <!-- Downloads and compiles the ipadic dictionary from scratch -->
    <target name="all" depends="clean,unpack,compile"/>

</project>
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/* | |
* Copyright (C) 2001-2007 | |
* Taku Kudoh <[email protected]> | |
* Takashi Okamoto <[email protected]> | |
* Matt Francis <[email protected]> | |
* | |
* This library is free software; you can redistribute it and/or modify it under | |
* the terms of the GNU Lesser General Public License as published by the Free | |
* Software Foundation; either version 2.1 of the License, or any later version. | |
* | |
* This library is distributed in the hope that it will be useful, but WITHOUT | |
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS | |
* FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more | |
* details. | |
* | |
* You should have received a copy of the GNU Lesser General Public License | |
* along with this library; if not, write to the Free Software Foundation, Inc., | |
* 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA | |
* | |
*/ | |
package net.java.sen.compiler; | |
import java.util.HashMap; | |
import java.util.HashSet; | |
import java.util.Iterator; | |
import java.util.LinkedHashSet; | |
import java.util.Map; | |
import java.util.Set; | |
import java.util.Vector; | |
/**
 * Builds one axis of the Connection Cost matrix from supplied part-of-speech /
 * conjugation data.
 *
 * NOTE(review): the matching rules implemented here are terse and largely
 * undocumented upstream; the comments below describe what the code visibly
 * does, not the linguistic intent behind it.
 */
class CostMatrixBuilder {

    // Unique connection-cost rules in insertion order (cleared by build())
    private LinkedHashSet<String> ruleSet = new LinkedHashSet<String>();

    // Each rule from ruleSet, split on ',' into its individual fields
    private Vector<String[]> ruleList = new Vector<String[]>();

    // For each rule index, the precomputed indices of the rules it matches
    // (see getIdList with parent == false)
    private Vector<Vector<Integer>> idList = new Vector<Vector<Integer>>();

    // Cache used by getDicId: maps a rule minus its last field to (id + 1);
    // the value is offset by one so that 0 can mean "not cached"
    private Map<String, Integer> dicIndex = new HashMap<String, Integer>();

    // Maps each complete rule string to its index within ruleList
    private Map<String, Integer> ruleIndex = new HashMap<String, Integer>();

    // The set of the rules' last fields, for fields not equal to "*"
    private Set<String> lexicalized = new HashSet<String>();

    /**
     * Finds the indices of all stored rules matching the given split rule.
     *
     * A stored rule matches in a given column when the two fields are equal,
     * or when a wildcard ("*") appears in the query field (parent == false)
     * or in the stored rule's field (parent == true).
     *
     * @param fields The part-of-speech / conjugation fields to match
     * @param parent Selects which side's "*" acts as a wildcard
     * @return The indices of the matching rules, in ascending order
     */
    private Vector<Integer> getIdList(String fields[], boolean parent) {

        // Start with every rule index as a candidate, then filter column by column
        Vector<Integer> candidates = new Vector<Integer>(this.ruleList.size());
        candidates.setSize(this.ruleList.size());
        for (int index = 0; index < candidates.size(); index++) {
            candidates.set(index, index);
        }

        for (int column = 0; column < fields.length; column++) {
            int kept = 0;
            for (int position = 0; position < candidates.size(); position++) {
                int candidate = candidates.get(position);
                String ruleField = this.ruleList.get(candidate)[column];
                boolean matched;
                if (parent) {
                    matched = (ruleField.charAt(0) == '*') || ruleField.equals(fields[column]);
                } else {
                    matched = (fields[column].charAt(0) == '*') || ruleField.equals(fields[column]);
                }
                if (matched) {
                    // Compact the surviving candidates to the front of the vector
                    candidates.set(kept++, candidate);
                }
            }
            candidates.setSize(kept);
        }

        return candidates;
    }

    /**
     * Calculates an ID for a split rule: the index of the matching stored rule
     * with the greatest number of non-wildcard fields, or 0 if nothing matches.
     *
     * @param fields The split rule
     * @return The calculated ID
     */
    private int getDicIdNoCache(String fields[]) {

        Vector<Integer> matches = getIdList(fields, true);

        // Prefer the most specific match - the one with the most non-"*" fields.
        // Ties keep the earliest match
        int specificity[] = new int[matches.size()];
        int best = 0;
        for (int i = 0; i < matches.size(); i++) {
            for (String field : this.ruleList.get(matches.get(i))) {
                if (field.charAt(0) != '*') {
                    specificity[i]++;
                }
            }
            if (specificity[best] < specificity[i]) {
                best = i;
            }
        }

        return (matches.size() > 0) ? matches.get(best) : 0;
    }

    /**
     * Adds a Connection Cost CSV value to the builder
     *
     * @param rule The rule to add
     */
    public void add(String rule) {
        this.ruleSet.add(rule);
    }

    /**
     * Builds the matrix axis based on the data passed to {@link #add(String)}.
     * It is an error to call {@link #add(String)} after calling
     * {@link #build()}.
     */
    public void build() {

        this.ruleList.setSize(this.ruleSet.size());

        int index = 0;
        for (String rule : this.ruleSet) {
            String fields[] = rule.split(",");
            this.ruleIndex.put(rule, index);
            this.ruleList.set(index, fields);

            // Remember non-wildcard final fields; getDicId treats such rules specially
            String lastField = fields[fields.length - 1];
            if (lastField.charAt(0) != '*') {
                this.lexicalized.add(lastField);
            }
            index++;
        }

        // The raw rule strings are no longer needed once split
        this.ruleSet.clear();

        // Precompute, for each rule, the IDs of all rules it matches
        this.idList.setSize(this.ruleList.size());
        for (int i = 0; i < this.ruleList.size(); i++) {
            this.idList.set(i, getIdList(this.ruleList.get(i), false));
        }
    }

    /**
     * Returns the size of the built matrix axis
     *
     * @return The number of rules on this axis
     */
    public int size() {
        return this.ruleList.size();
    }

    /**
     * Returns the ID for a rule: the index of the most specific matching rule.
     * Results for rules whose last field is not in the lexicalized set are
     * cached, keyed on the rule minus its last field.
     *
     * @param rule The rule
     * @return The ID of the most specific matching rule
     */
    public int getDicId(String rule) {

        String fields[] = rule.split(",");
        String lastField = fields[fields.length - 1];

        // Lexicalized rules are always resolved in full, bypassing the cache
        if (this.lexicalized.contains(lastField)) {
            return getDicIdNoCache(fields);
        }

        // Drop the final field to form the cache key
        String partOfSpeech = rule.substring(0, rule.lastIndexOf(","));

        Integer cached = this.dicIndex.get(partOfSpeech);
        if ((cached != null) && (cached != 0)) {
            // Stored offset by one so that 0 can mean "empty"
            return cached - 1;
        }

        int id = getDicIdNoCache(fields);
        this.dicIndex.put(partOfSpeech, id + 1);
        return id;
    }

    /**
     * Converts a rule to the precomputed vector of matching rule IDs
     *
     * @param rule The rule (must have been passed to {@link #add(String)}
     *             before {@link #build()})
     * @return A vector of IDs for the component parts
     */
    public Vector<Integer> getRuleIdList(String rule) {
        return this.idList.get(this.ruleIndex.get(rule));
    }
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/* | |
* Copyright (C) 2002-2007 | |
* Taku Kudoh <[email protected]> | |
* Takashi Okamoto <[email protected]> | |
* Matt Francis <[email protected]> | |
* | |
* This library is free software; you can redistribute it and/or modify it under | |
* the terms of the GNU Lesser General Public License as published by the Free | |
* Software Foundation; either version 2.1 of the License, or any later version. | |
* | |
* This library is distributed in the hope that it will be useful, but WITHOUT | |
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS | |
* FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more | |
* details. | |
* | |
* You should have received a copy of the GNU Lesser General Public License | |
* along with this library; if not, write to the Free Software Foundation, Inc., | |
* 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA | |
* | |
*/ | |
package net.java.sen.compiler; | |
import java.io.BufferedOutputStream; | |
import java.io.DataOutputStream; | |
import java.io.FileInputStream; | |
import java.io.FileOutputStream; | |
import java.io.IOException; | |
import java.io.RandomAccessFile; | |
import java.nio.MappedByteBuffer; | |
import java.nio.ShortBuffer; | |
import java.nio.channels.FileChannel; | |
import java.util.ArrayList; | |
import java.util.Arrays; | |
import java.util.Iterator; | |
import java.util.List; | |
import java.util.Vector; | |
import net.java.sen.dictionary.CToken; | |
import net.java.sen.trie.TrieBuilder; | |
import net.java.sen.util.CSVData; | |
import net.java.sen.util.CSVParser; | |
/** | |
* Compiles CSV source data into the data files used for analysis | |
*/ | |
public class DictionaryBuilder { | |
/** | |
* Input dictionary CSV filename | |
*/ | |
private static final String DICTIONARY_CSV_FILENAME = "dictionary.csv"; | |
/** | |
* Input connection CSV filename | |
*/ | |
private static final String CONNECTION_CSV_FILENAME = "connection.csv"; | |
/** | |
* Compiled connection cost data filename | |
*/ | |
private static final String CONNECTION_COST_DATA_FILENAME = "connectionCost.sen"; | |
/** | |
* Compiled part of speech data filename | |
*/ | |
private static final String PART_OF_SPEECH_DATA_FILENAME = "partOfSpeech.sen"; | |
/** | |
* Compiled token data filename | |
*/ | |
private static final String TOKEN_DATA_FILENAME = "token.sen"; | |
/** | |
* Compiled trie data filename | |
*/ | |
private static final String TRIE_DATA_FILENAME = "trie.sen"; | |
/** | |
* Default connection cost | |
*/ | |
private static final short DEFAULT_CONNECTION_COST = 10000; | |
/** | |
* Start of part-of-speech data within the dictionary CSV | |
*/ | |
private static final int PART_OF_SPEECH_START = 2; | |
/** | |
* Size of part-of-speech data within the dictionary CSV | |
*/ | |
private static final int PART_OF_SPEECH_SIZE = 7; | |
/** | |
* Beginning-of-string token part-of-speech | |
*/ | |
private static final String BOS_PART_OF_SPEECH = "文頭,*,*,*,*,*,*"; | |
/** | |
* End-of-string token part-of-speech | |
*/ | |
private static final String EOS_PART_OF_SPEECH = "文末,*,*,*,*,*,*"; | |
/** | |
* Unknown token part-of-speech | |
*/ | |
private static final String UNKNOWN_PART_OF_SPEECH = "名詞,サ変接続,*,*,*,*,*"; | |
/** | |
* Precursor data for the Trie file | |
*/ | |
private static class TrieData { | |
/** | |
* Trie keys | |
*/ | |
public String keys[]; | |
/** | |
* Trie values | |
*/ | |
public int values[]; | |
/** | |
* The actual number of entries in the keys/values arrays | |
*/ | |
public int size; | |
} | |
/** | |
* Increases the size of an array of <code>short</code>s | |
* | |
* @param current The existing array | |
* @return The resized array | |
*/ | |
private static short[] resize(short current[]) { | |
short tmp[] = new short[(int) (current.length * 1.5)]; | |
System.arraycopy(current, 0, tmp, 0, current.length); | |
return tmp; | |
} | |
/** | |
* Splits a compound reading or pronunciation field into a list | |
* | |
* Compound fields are of the form: | |
* | |
* "{head1/head2[/head3 ...]}tail" | |
* | |
* The returned list will consist of: | |
* | |
* "head1tail", | |
* "head2tail", | |
* "head3tail", | |
* ... | |
* | |
* @param compoundField The field to split | |
* @return The split list | |
*/ | |
private List<String> splitCompoundField(String compoundField) { | |
List<String> splitFieldList; | |
if ((compoundField.length() == 0) || (compoundField.charAt(0) != '{')) { | |
// No alternatives | |
splitFieldList = new ArrayList<String>(1); | |
splitFieldList.add(compoundField); | |
} else { | |
// 1 or more alternatives. No existing entry in Ipadic has more than 4 | |
splitFieldList = new ArrayList<String>(4); | |
String[] parts = compoundField.split("[{}]"); | |
String tail = (parts.length == 3) ? parts[2] : ""; | |
String[] heads = parts.length > 0 ? parts[1].split("/") : new String[0] ; | |
for (int i = 0; i < heads.length; i++) { | |
splitFieldList.add(heads[i] + tail); | |
} | |
} | |
return splitFieldList; | |
} | |
/** | |
* Creates the part-of-speech data file | |
* | |
* @param dictionaryCSVFilenames The filenames of the dictionary CSV data file and any additional dictionaries | |
* @param partOfSpeechDataFilename The filename for the part-of-speech data file | |
* @param matrixBuilders The three <code>CostMatrixBuilder</code>s | |
* @param partOfSpeechStart The starting index of the part-of-speech data within a CSV line | |
* @param partOfSpeechSize The number of part-of-speech values within a CSV line | |
* @param charset The charset of the CSV data | |
* @param bosPartOfSpeech The beginning-of-string part-of-speech code | |
* @param eosPartOfSpeech The end-of-string part-of-speech code | |
* @param unknownPartOfSpeech The beginning-of-string part-of-speech code | |
* @param dictionaryList Populated by this method with the String/CToken tuples that will be used to create the Token file | |
* @param standardCTokens Populated by this method with the three standard CTokens ("bos", "eos" and "unknown") | |
* | |
* @throws IOException | |
*/ | |
private void createPartOfSpeechDataFile(List<String> dictionaryCSVFilenames, String partOfSpeechDataFilename, | |
CostMatrixBuilder[] matrixBuilders, int partOfSpeechStart, int partOfSpeechSize, String charset, | |
String bosPartOfSpeech, String eosPartOfSpeech, String unknownPartOfSpeech, VirtualTupleList dictionaryList, CToken[] standardCTokens) throws IOException | |
{ | |
String[] csvValues = null; | |
CSVData key_b = new CSVData(); | |
CSVData pos_b = new CSVData(); | |
DataOutputStream outputStream = new DataOutputStream(new BufferedOutputStream(new FileOutputStream(partOfSpeechDataFilename))); | |
for (String dictionaryCSVFilename : dictionaryCSVFilenames) { | |
CSVParser parser = new CSVParser(new FileInputStream(dictionaryCSVFilename), charset); | |
while ((csvValues = parser.nextTokens()) != null) { | |
if (csvValues.length < (partOfSpeechSize + partOfSpeechStart)) { | |
throw new RuntimeException("format error:" + parser.currentLine()); | |
} | |
key_b.clear(); | |
pos_b.clear(); | |
for (int i = partOfSpeechStart; i < (partOfSpeechStart + partOfSpeechSize); i++) { | |
key_b.append(csvValues[i]); | |
pos_b.append(csvValues[i]); | |
} | |
for (int i = partOfSpeechStart + partOfSpeechSize; i < csvValues.length; i++) { | |
pos_b.append(csvValues[i]); | |
} | |
CToken ctoken = new CToken(); | |
ctoken.rcAttr2 = (short) matrixBuilders[0].getDicId(key_b.toString()); | |
ctoken.rcAttr1 = (short) matrixBuilders[1].getDicId(key_b.toString()); | |
ctoken.lcAttr = (short) matrixBuilders[2].getDicId(key_b.toString()); | |
ctoken.partOfSpeechIndex = outputStream.size() >> 1; | |
ctoken.length = (short) csvValues[0].length(); | |
try { | |
ctoken.cost = (short) Integer.parseInt(csvValues[1]); | |
} catch (NumberFormatException ex) { | |
ctoken.cost = (short) 0; | |
} | |
dictionaryList.add(csvValues[0], ctoken); | |
// Write to part of speech data file | |
StringBuilder partOfSpeechBuilder = new StringBuilder(); | |
for (int i = partOfSpeechStart; i < (partOfSpeechStart + 4); i++) { | |
if (!csvValues[i].equals("*")) { | |
partOfSpeechBuilder.append(csvValues[i]); | |
partOfSpeechBuilder.append("-"); | |
} | |
} | |
String partOfSpeech = partOfSpeechBuilder.substring(0, partOfSpeechBuilder.length() - 1); | |
String conjugationalType = csvValues[partOfSpeechStart + 4]; | |
String conjugationalForm = csvValues[partOfSpeechStart + 5]; | |
String basicForm = csvValues[partOfSpeechStart + 6]; | |
List<String> readings = splitCompoundField(csvValues[partOfSpeechStart + 7]); | |
List<String> pronunciations = splitCompoundField(csvValues[partOfSpeechStart + 8]); | |
outputStream.writeChar(partOfSpeech.length()); | |
outputStream.writeChars(partOfSpeech); | |
outputStream.writeChar(conjugationalType.length()); | |
outputStream.writeChars(conjugationalType); | |
outputStream.writeChar(conjugationalForm.length()); | |
outputStream.writeChars(conjugationalForm); | |
outputStream.writeChar(basicForm.length()); | |
outputStream.writeChars(basicForm); | |
outputStream.writeChar(readings.size()); | |
for (String reading : readings) { | |
outputStream.writeChar(reading.length()); | |
outputStream.writeChars(reading); | |
} | |
for (String pronunciation : pronunciations) { | |
outputStream.writeChar(pronunciation.length()); | |
outputStream.writeChars(pronunciation); | |
} | |
} | |
} | |
outputStream.close(); | |
dictionaryList.sort(); | |
CToken bosCToken = new CToken(); | |
bosCToken.rcAttr2 = (short) matrixBuilders[0].getDicId(bosPartOfSpeech); | |
bosCToken.rcAttr1 = (short) matrixBuilders[1].getDicId(bosPartOfSpeech); | |
bosCToken.lcAttr = (short) matrixBuilders[2].getDicId(bosPartOfSpeech); | |
standardCTokens[0] = bosCToken; | |
CToken eosCToken = new CToken(); | |
eosCToken.rcAttr2 = (short) matrixBuilders[0].getDicId(eosPartOfSpeech); | |
eosCToken.rcAttr1 = (short) matrixBuilders[1].getDicId(eosPartOfSpeech); | |
eosCToken.lcAttr = (short) matrixBuilders[2].getDicId(eosPartOfSpeech); | |
standardCTokens[1] = eosCToken; | |
CToken unknownCToken = new CToken(); | |
unknownCToken.rcAttr2 = (short) matrixBuilders[0].getDicId(unknownPartOfSpeech); | |
unknownCToken.rcAttr1 = (short) matrixBuilders[1].getDicId(unknownPartOfSpeech); | |
unknownCToken.lcAttr = (short) matrixBuilders[2].getDicId(unknownPartOfSpeech); | |
unknownCToken.partOfSpeechIndex = -1; | |
standardCTokens[2] = unknownCToken; | |
} | |
/** | |
* Creates the connection cost matrix file | |
* | |
* @param connectionCSVFilename The filename of the connection CSV data | |
* @param connectionCostDataFilename The filename for the connection cost matrix | |
* @param defaultCost The default connection cost | |
* @param charset The charset of the connection CSV data | |
* @return An array of three <code>CostMatrixBuilder</code>s | |
* @throws IOException | |
*/ | |
private CostMatrixBuilder[] createConnectionCostFile(String connectionCSVFilename, String connectionCostDataFilename, short defaultCost, String charset) throws IOException { | |
CostMatrixBuilder[] matrixBuilders = new CostMatrixBuilder[3]; | |
matrixBuilders[0] = new CostMatrixBuilder(); | |
matrixBuilders[1] = new CostMatrixBuilder(); | |
matrixBuilders[2] = new CostMatrixBuilder(); | |
Vector<String> rule1 = new Vector<String>(); | |
Vector<String> rule2 = new Vector<String>(); | |
Vector<String> rule3 = new Vector<String>(); | |
// The approximate length of the file, plus a bit. If we're wrong it'll be expanded during processing | |
short[] scores = new short[30000]; | |
// Read connection cost CSV data | |
CSVParser parser = new CSVParser(new FileInputStream(connectionCSVFilename), charset); | |
String t[]; | |
int line = 0; | |
while ((t = parser.nextTokens()) != null) { | |
if (t.length < 4) { | |
throw new IOException("Connection cost CSV format error"); | |
} | |
matrixBuilders[0].add(t[0]); | |
rule1.add(t[0]); | |
matrixBuilders[1].add(t[1]); | |
rule2.add(t[1]); | |
matrixBuilders[2].add(t[2]); | |
rule3.add(t[2]); | |
if (line == scores.length) { | |
scores = resize(scores); | |
} | |
scores[line++] = (short) Integer.parseInt(t[3]); | |
} | |
// Compile CostMatrixBuilders | |
matrixBuilders[0].build(); | |
matrixBuilders[1].build(); | |
matrixBuilders[2].build(); | |
int size1 = matrixBuilders[0].size(); | |
int size2 = matrixBuilders[1].size(); | |
int size3 = matrixBuilders[2].size(); | |
int ruleSize = rule1.size(); | |
// Write connection cost data | |
MappedByteBuffer buffer = null; | |
ShortBuffer shortBuffer = null; | |
int matrixSizeBytes = (size1 * size2 * size3 * 2); | |
int headerSizeBytes = (3 * 2); | |
RandomAccessFile file = new RandomAccessFile(connectionCostDataFilename, "rw"); | |
file.setLength(0); | |
file.writeShort(size1); | |
file.writeShort(size2); | |
file.writeShort(size3); | |
file.setLength(headerSizeBytes + matrixSizeBytes); | |
FileChannel indexChannel = file.getChannel(); | |
buffer = indexChannel.map(FileChannel.MapMode.READ_WRITE, headerSizeBytes, matrixSizeBytes); | |
shortBuffer = buffer.asShortBuffer(); | |
indexChannel.close(); | |
for (int i = 0; i < (size1 * size2 * size3); i++) { | |
shortBuffer.put(i, defaultCost); | |
} | |
for (int i = 0; i < ruleSize; i++) { | |
Vector<Integer> r1 = matrixBuilders[0].getRuleIdList(rule1.get(i)); | |
Vector<Integer> r2 = matrixBuilders[1].getRuleIdList(rule2.get(i)); | |
Vector<Integer> r3 = matrixBuilders[2].getRuleIdList(rule3.get(i)); | |
for (Iterator<Integer> i1 = r1.iterator(); i1.hasNext();) { | |
int ii1 = i1.next(); | |
for (Iterator<Integer> i2 = r2.iterator(); i2.hasNext();) { | |
int ii2 = i2.next(); | |
for (Iterator<Integer> i3 = r3.iterator(); i3.hasNext();) { | |
int ii3 = i3.next(); | |
int position = size3 * (size2 * ii1 + ii2) + ii3; | |
shortBuffer.put(position, scores[i]); | |
} | |
} | |
} | |
} | |
buffer.force(); | |
return matrixBuilders; | |
} | |
/** | |
* Create the token data file | |
* | |
* @param tokenDataFilename The filename for the token data file | |
* @param standardCTokens The beginning-of-string, end-of-string, and unknown-morpheme CTokens | |
* @param tupleList The (String,CToken) tuples | |
* | |
* @return The Trie precursor data | |
* @throws IOException | |
*/ | |
private TrieData createTokenFile(String tokenDataFilename, CToken[] standardCTokens, VirtualTupleList tupleList) | |
throws IOException | |
{ | |
TrieData trieData = new TrieData(); | |
trieData.values = new int[tupleList.size()]; | |
trieData.keys = new String[tupleList.size()]; | |
trieData.size = 0; | |
int spos = 0; | |
int bsize = 0; | |
String prev = ""; | |
DataOutputStream out = new DataOutputStream(new BufferedOutputStream(new FileOutputStream(tokenDataFilename))); | |
// Write beginning-of-string, end-of-string, unknown-morpheme tokens | |
CToken.write(out, standardCTokens[0]); | |
CToken.write(out, standardCTokens[1]); | |
CToken.write(out, standardCTokens[2]); | |
// Write token data | |
for (int i = 0; i < trieData.keys.length; i++) { | |
StringCTokenTuple tuple = tupleList.get(i); | |
String k = tuple.key; | |
if (!prev.equals(k) && i != 0) { | |
trieData.keys[trieData.size] = tupleList.get(spos).key; | |
trieData.values[trieData.size] = bsize + (spos << 8); | |
trieData.size++; | |
bsize = 1; | |
spos = i; | |
} else { | |
bsize++; | |
} | |
prev = tuple.key; | |
CToken.write(out, tuple.value); | |
} | |
out.flush(); | |
out.close(); | |
trieData.keys[trieData.size] = tupleList.get(spos).key; | |
trieData.values[trieData.size] = bsize + (spos << 8); | |
trieData.size++; | |
return trieData; | |
} | |
/** | |
* Create Trie file | |
* | |
* @param trieDataFilename The filename for the Trie file | |
* @param trieData The Trie precursor data | |
* @throws IOException | |
*/ | |
private void createTrieFile(String trieDataFilename, TrieData trieData) throws IOException { | |
TrieBuilder builder = new TrieBuilder(trieData.keys, trieData.values, trieData.size); | |
builder.build(trieDataFilename); | |
} | |
/** | |
* Compiles CSV source data into the data files used for analysis | |
* | |
* @param customDictionaryCSVFilenames The filenames of custom dictionaries, or <code>null</code> | |
* @throws IOException | |
*/ | |
public DictionaryBuilder(String[] customDictionaryCSVFilenames) throws IOException { | |
List<String> dictionaryCSVFilenames = new ArrayList<String>(); | |
dictionaryCSVFilenames.add(DICTIONARY_CSV_FILENAME); | |
dictionaryCSVFilenames.addAll(Arrays.asList(customDictionaryCSVFilenames)); | |
String charset = "UTF-8"; | |
// Create connection cost file (matrix.sen) | |
CostMatrixBuilder[] matrixBuilders = createConnectionCostFile( | |
CONNECTION_CSV_FILENAME, | |
CONNECTION_COST_DATA_FILENAME, | |
DEFAULT_CONNECTION_COST, | |
charset | |
); | |
// Create part-of-speech data file (posInfo.sen) | |
VirtualTupleList dictionaryList = new VirtualTupleList(); | |
CToken[] standardCTokens = new CToken[3]; | |
createPartOfSpeechDataFile( | |
dictionaryCSVFilenames, | |
PART_OF_SPEECH_DATA_FILENAME, | |
matrixBuilders, | |
PART_OF_SPEECH_START, | |
PART_OF_SPEECH_SIZE, | |
charset, | |
BOS_PART_OF_SPEECH, | |
EOS_PART_OF_SPEECH, | |
UNKNOWN_PART_OF_SPEECH, | |
dictionaryList, | |
standardCTokens | |
); | |
// Free temporary object for GC | |
matrixBuilders = null; | |
// Create Token file (token.sen) | |
TrieData trieData = createTokenFile( | |
TOKEN_DATA_FILENAME, | |
standardCTokens, | |
dictionaryList | |
); | |
// Free temporary object for GC | |
dictionaryList = null; | |
// Create Trie file (da.sen) | |
createTrieFile(TRIE_DATA_FILENAME, trieData); | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment