Skip to content

Instantly share code, notes, and snippets.

@vivizhyy
Created September 19, 2012 12:04
Show Gist options
  • Save vivizhyy/3749300 to your computer and use it in GitHub Desktop.
Save vivizhyy/3749300 to your computer and use it in GitHub Desktop.
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.sql.Timestamp;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.List;
import ICTCLAS.kevin.zhang.ICTCLAS2011;
/**
 * Command-line driver that runs every line of {@code content.dat} through the
 * ICTCLAS segmenter and writes the resulting word/tag pairs to a file named
 * {@code result} inside a freshly created, timestamped directory.
 */
public class Tagger {
    /** ICTCLAS handle used for every native call made by this class. */
    private static final ICTCLAS2011 ictcla = new ICTCLAS2011();

    /**
     * Parses segmenter output of the form {@code "word/tag word/tag ..."} into
     * word/tag pairs.
     *
     * <p>Each space-separated token is split at its <em>last</em> slash, so a
     * word that itself contains a slash (for example a literal {@code "/"}
     * emitted as {@code "//w"}) is kept instead of being dropped. Tokens with
     * no slash, or with nothing after the last slash, are ignored.
     *
     * @param text segmenter output; tokens are separated by single spaces
     * @return the word/tag pairs in the order they appear in {@code text}
     */
    private KeyValuePairs<String, String> tag(String text) {
        KeyValuePairs<String, String> tags = new KeyValuePairs<>();
        for (String token : text.split(" ")) {
            int slash = token.lastIndexOf('/');
            if (slash < 0 || slash == token.length() - 1) {
                // Not a "word/tag" token (empty piece between spaces, or missing tag).
                continue;
            }
            tags.add(token.substring(0, slash), token.substring(slash + 1));
        }
        return tags;
    }

    /**
     * Reads {@code content.dat} line by line, removing anything that looks like
     * a markup tag ({@code <...>}) from each line.
     *
     * <p>If reading fails, the stack trace is printed and the lines read so far
     * are returned.
     *
     * @return the cleaned lines, in file order; never {@code null}
     */
    private static List<String> getContents() {
        List<String> contents = new ArrayList<>();
        // try-with-resources closes the reader even when readLine() throws.
        try (BufferedReader reader = new BufferedReader(new FileReader("content.dat"))) {
            String line;
            while ((line = reader.readLine()) != null) {
                contents.add(line.replaceAll("<[^>]+>", ""));
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
        return contents;
    }

    /**
     * Appends one line per word/tag pair to the given file.
     *
     * <p>The file is opened in append mode because {@link #main} calls this
     * method once per input line with the same path; truncating on every call
     * would leave only the last line's tags in the file.
     *
     * @param filePath file to append to (created if it does not exist)
     * @param tags     pairs to write, one {@code toString()} rendering per line
     */
    private void writeResult(String filePath, KeyValuePairs<String, String> tags) {
        try (BufferedWriter bufferedWriter = new BufferedWriter(new FileWriter(filePath, true))) {
            for (KeyValuePairs.Entry<String, String> tagPair : tags.entries()) {
                bufferedWriter.append(tagPair.toString() + "\n");
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Registers every line of {@code userdic.txt} as a user word with ICTCLAS
     * and then asks ICTCLAS to save the user dictionary.
     *
     * <p>If reading fails, the stack trace is printed and the dictionary is not
     * saved.
     */
    private void addWords() {
        try (BufferedReader reader = new BufferedReader(new FileReader("userdic.txt"))) {
            String line;
            while ((line = reader.readLine()) != null) {
                // NOTE(review): uses the platform default charset, whereas main()
                // uses GB2312 for all other native calls - confirm which one the
                // dictionary file and ICTCLAS expect.
                ictcla.ICTCLAS_AddUserWord(line.getBytes());
            }
            ictcla.ICTCLAS_SaveTheUsrDic();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Initialises ICTCLAS, loads the user dictionary, then segments every input
     * line and appends its tags to {@code seg-result-<timestamp>/result}.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        Tagger testTagger = new Tagger();
        List<String> contents = getContents();
        try {
            String argu = "/home/yyzhang/ICTCLAS2012/";
            System.out.println("ICTCLAS_Init");
            if (ICTCLAS2011.ICTCLAS_Init(argu.getBytes("GB2312"),0) == false)
            {
                System.out.println("Init Fail!");
                return;
            }
            testTagger.addWords();
            // http://www.icl.pku.edu.cn/icl_res/segtag98/catetkset.html
            ictcla.ICTCLAS_SetPOSmap(2);
            String resultPath = "seg-result-" + new SimpleDateFormat("yyyy-MM-dd-HH-mm-ss").format(new Timestamp(System.currentTimeMillis()));
            if (!new File(resultPath).mkdir()) {
                // Without the output directory nothing can be written; say so
                // instead of exiting silently.
                System.err.println("Could not create result directory: " + resultPath);
                return;
            }
            String resultFile = resultPath + "/result";
            for (String content : contents) {
                byte[] nativeBytes = ictcla.ICTCLAS_ParagraphProcess(content.getBytes("GB2312"), 3);
                String segmented = new String(nativeBytes, 0, nativeBytes.length, "GB2312");
                testTagger.writeResult(resultFile, testTagger.tag(segmented));
            }
        } catch (UnsupportedEncodingException e) {
            e.printStackTrace();
        }
    }
}
package com.wumii.model.service.search;
import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.cn.smart.SmartChineseAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.util.Version;
/**
 * Small manual test that tokenizes {@code content.dat} with Lucene's
 * {@link SmartChineseAnalyzer} and prints each term on its own line.
 */
public class TestSmartCn {
    /** Analyzer used for segmentation. */
    private static final Analyzer ANALYZER = new SmartChineseAnalyzer(Version.LUCENE_35);
    /** Input file to tokenize. */
    private static final String FILE_PATH = "content.dat";

    public static void main(String[] args) {
        new TestSmartCn().wordSegmentation();
    }

    /**
     * Tokenizes the input file and prints every term to standard output.
     *
     * <p>Follows the TokenStream consumer workflow: obtain the attribute once,
     * {@code reset()}, iterate with {@code incrementToken()}, then
     * {@code end()} and {@code close()}. The reader and the stream are released
     * even if tokenization fails. An {@link IOException} is reported via its
     * stack trace.
     */
    public void wordSegmentation() {
        try (BufferedReader reader = new BufferedReader(new FileReader(FILE_PATH))) {
            TokenStream ts = ANALYZER.tokenStream("", reader);
            try {
                // The attribute instance is reused for every token, so look it up once.
                CharTermAttribute term = ts.getAttribute(CharTermAttribute.class);
                ts.reset();
                while (ts.incrementToken()) {
                    System.out.println(term);
                }
                ts.end();
            } finally {
                ts.close();
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment