berkeley lm: build a Kneser-Ney ARPA language model from text and query it
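The first file, BuildLM.java, estimates a Kneser-Ney-smoothed n-gram language model from tokenized text files with berkeleylm and writes it out in ARPA format: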
import java.io.File;
import java.util.ArrayList;
import java.util.List;

import edu.berkeley.nlp.lm.ConfigOptions;
import edu.berkeley.nlp.lm.StringWordIndexer;
import edu.berkeley.nlp.lm.io.ArpaLmReader;
import edu.berkeley.nlp.lm.io.LmReaders;
import edu.berkeley.nlp.lm.util.Logger;

public class BuildLM {

    private static void usage() {
        System.err.println("Usage: <lmOrder> <ARPA lm output file> <textfiles>*");
        System.exit(1);
    }

    public void makelml(String[] argv) {
        if (argv.length < 2) {
            usage();
        }
        final int lmOrder = Integer.parseInt(argv[0]);
        final String outputFile = argv[1];
        final List<String> inputFiles = new ArrayList<>();
        for (int i = 2; i < argv.length; ++i) {
            inputFiles.add(argv[i]);
        }
        // No input files given: fall back to "-", the conventional stdin placeholder.
        if (inputFiles.isEmpty()) inputFiles.add("-");

        Logger.setGlobalLogger(new Logger.SystemLogger(System.out, System.err));
        Logger.startTrack("Reading text files " + inputFiles + " and writing to file " + outputFile);

        // Index words as strings, reserving the standard <s>, </s> and <unk> symbols.
        final StringWordIndexer wordIndexer = new StringWordIndexer();
        wordIndexer.setStartSymbol(ArpaLmReader.START_SYMBOL);
        wordIndexer.setEndSymbol(ArpaLmReader.END_SYMBOL);
        wordIndexer.setUnkSymbol(ArpaLmReader.UNK_SYMBOL);

        // Estimate a Kneser-Ney-smoothed n-gram model and write it in ARPA format.
        LmReaders.createKneserNeyLmFromTextFiles(inputFiles, wordIndexer, lmOrder, new File(outputFile), new ConfigOptions());
        Logger.endTrack();
    }

    public static void main(String[] args) {
        BuildLM d = new BuildLM();
        // An earlier run built local_data/cloth/segdata/cloth.arpa from
        // wxf_seg_cloth_top_part_noauthor.txt; that is the model LoadLM reads below.
        String inputfile = "local_data/cloth/segdata/segSentences.txt";
        String outputfile = "local_data/cloth/segdata/sentence.arpa";
        String[] s = {"3", outputfile, inputfile};
        d.makelml(s);
    }
}
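As written, main() builds a trigram model ("3") from the hardcoded corpus segSentences.txt and writes sentence.arpa next to it. To train on other data, pass a different argument array to makelml; the input is expected to be tokenized text, typically one sentence per line (the local_data paths above are the gist author's own files and will not exist elsewhere).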
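The second file, LoadLM.java, loads the ARPA model back into memory, segments a Chinese sentence (Segmentor is a HanLP-based helper that is not included in the gist), scores the resulting token sequence, and prints a thresholded distribution over likely next words: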
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.util.List;
import java.util.Map;

import edu.berkeley.nlp.lm.ArrayEncodedProbBackoffLm;
import edu.berkeley.nlp.lm.ConfigOptions;
import edu.berkeley.nlp.lm.NgramLanguageModel;
import edu.berkeley.nlp.lm.StringWordIndexer;
import edu.berkeley.nlp.lm.collections.Counter;
import edu.berkeley.nlp.lm.io.LmReaders;

public class LoadLM {

    // Read an ARPA file back into an array-encoded prob/backoff model.
    public static ArrayEncodedProbBackoffLm<String> getLm(boolean compress, String file) {
        final File lmFile = new File(file);
        final ConfigOptions configOptions = new ConfigOptions();
        // Unknown words score log-prob 0, so OOV tokens do not penalize a sentence.
        configOptions.unknownWordLogProb = 0.0f;
        return LmReaders.readArrayEncodedLmFromArpa(lmFile.getPath(), compress, new StringWordIndexer(), configOptions,
                Integer.MAX_VALUE);
    }

    public static void main(String[] args) throws IOException {
        ArrayEncodedProbBackoffLm<String> model = LoadLM.getLm(false, "local_data/cloth/segdata/cloth.arpa");
        String sentence = "棉质的手感,前面有一条压线"; // "cotton feel, a line of stitching on the front"
        // Segmentor is a HanLP-based word segmenter (see the sketch below).
        Segmentor.loadCustomDictionary(new FileInputStream("server_data/customDictionary.txt"), false);
        List<String> terms = Segmentor.seg(sentence);

        // Log-probability of the segmented token sequence under the model.
        float score = model.getLogProb(terms);
        System.out.println(score);

        // Distribution over candidate next words given the sentence so far.
        Counter<String> c = NgramLanguageModel.StaticMethods.getDistributionOverNextWords(model, terms);
        c.pruneKeysBelowThreshold(0.001);
        for (Map.Entry<String, Double> entry : c.getEntriesSortedByDecreasingCount()) {
            System.out.println(entry.getKey() + "\t" + entry.getValue());
        }
    }
}
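LoadLM references a Segmentor class whose source is not part of this gist. Below is a minimal sketch of what it plausibly looks like, assuming it is a thin wrapper around HanLP's default segmenter and custom dictionary; the real class may differ, for example in how it interprets the boolean flag, which is ignored here.

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.List;

import com.hankcs.hanlp.HanLP;
import com.hankcs.hanlp.dictionary.CustomDictionary;
import com.hankcs.hanlp.seg.common.Term;

// Hypothetical stand-in for the missing Segmentor class, assuming it wraps HanLP.
public class Segmentor {

    // Load one custom-dictionary word per line into HanLP's CustomDictionary.
    // The boolean flag's meaning in the original class is unknown, so it is unused here.
    public static void loadCustomDictionary(InputStream in, boolean overwrite) throws IOException {
        try (BufferedReader reader = new BufferedReader(new InputStreamReader(in, StandardCharsets.UTF_8))) {
            String line;
            while ((line = reader.readLine()) != null) {
                line = line.trim();
                if (!line.isEmpty()) {
                    CustomDictionary.add(line);
                }
            }
        }
    }

    // Segment a sentence with HanLP and return the surface forms of the tokens.
    public static List<String> seg(String sentence) {
        List<String> words = new ArrayList<>();
        for (Term term : HanLP.segment(sentence)) {
            words.add(term.word);
        }
        return words;
    }
}

Note that CustomDictionary.add inserts each word with a default part-of-speech tag and frequency; if the dictionary file carries per-line frequencies or tags, the real loader would need to parse and pass them along.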