Created
September 19, 2012 12:04
-
-
Save vivizhyy/3749300 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import java.io.BufferedReader; | |
import java.io.BufferedWriter; | |
import java.io.File; | |
import java.io.FileReader; | |
import java.io.FileWriter; | |
import java.io.IOException; | |
import java.io.UnsupportedEncodingException; | |
import java.sql.Timestamp; | |
import java.text.SimpleDateFormat; | |
import java.util.ArrayList; | |
import java.util.List; | |
import ICTCLAS.kevin.zhang.ICTCLAS2011; | |
public class Tagger { | |
private static final ICTCLAS2011 ictcla = new ICTCLAS2011(); | |
private KeyValuePairs<String, String> tag(String text) { | |
KeyValuePairs<String, String> tags = new KeyValuePairs<>(); | |
String[] sents = text.split(" "); | |
for (String sent : sents) { | |
String[] wordPunc = sent.split("/"); | |
if (wordPunc.length == 2) { | |
tags.add(wordPunc[0], wordPunc[1]); | |
} | |
} | |
return tags; | |
} | |
private static List<String> getContents() { | |
List<String> contents = new ArrayList<>(); | |
try { | |
BufferedReader reader = new BufferedReader(new FileReader("content.dat")); | |
String line; | |
while ((line = reader.readLine()) != null) { | |
line = line.replaceAll("<[^>]+>", ""); | |
contents.add(line); | |
} | |
reader.close(); | |
} catch (IOException e) { | |
e.printStackTrace(); | |
} | |
return contents; | |
} | |
private void writeResult(String filePath, KeyValuePairs<String, String> tags) { | |
try { | |
BufferedWriter bufferedWriter = new BufferedWriter(new FileWriter(filePath)); | |
List<KeyValuePairs.Entry<String, String>> tagPairs = tags.entries(); | |
for (KeyValuePairs.Entry<String, String> tagPair : tagPairs) { | |
bufferedWriter.append(tagPair.toString() + "\n"); | |
} | |
bufferedWriter.flush(); | |
bufferedWriter.close(); | |
} catch (IOException e) { | |
e.printStackTrace(); | |
} | |
} | |
private void addWords() { | |
try { | |
BufferedReader reader = new BufferedReader(new FileReader("userdic.txt")); | |
String line; | |
while ((line = reader.readLine()) != null) { | |
ictcla.ICTCLAS_AddUserWord(line.getBytes()); | |
} | |
reader.close(); | |
ictcla.ICTCLAS_SaveTheUsrDic(); | |
} catch (IOException e) { | |
e.printStackTrace(); | |
} | |
} | |
public static void main(String[] args) { | |
Tagger testTagger = new Tagger(); | |
List<String> contents = getContents(); | |
try { | |
String argu = "/home/yyzhang/ICTCLAS2012/"; | |
System.out.println("ICTCLAS_Init"); | |
if (ICTCLAS2011.ICTCLAS_Init(argu.getBytes("GB2312"),0) == false) | |
{ | |
System.out.println("Init Fail!"); | |
return; | |
} | |
testTagger.addWords(); | |
// http://www.icl.pku.edu.cn/icl_res/segtag98/catetkset.html | |
ictcla.ICTCLAS_SetPOSmap(2); | |
String resultPath = "seg-result-" + new SimpleDateFormat("yyyy-MM-dd-HH-mm-ss").format(new Timestamp(System.currentTimeMillis())); | |
if (new File(resultPath).mkdir()) { | |
for (String content : contents) { | |
byte[] nativeBytes = ictcla.ICTCLAS_ParagraphProcess(content.getBytes("GB2312"), 3); | |
testTagger.writeResult(resultPath + "/result", | |
testTagger.tag(new String(nativeBytes, 0, nativeBytes.length, "GB2312"))); | |
} | |
} | |
} catch (UnsupportedEncodingException e) { | |
e.printStackTrace(); | |
} | |
} | |
} |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package com.wumii.model.service.search; | |
import java.io.BufferedReader; | |
import java.io.FileReader; | |
import java.io.IOException; | |
import org.apache.lucene.analysis.Analyzer; | |
import org.apache.lucene.analysis.TokenStream; | |
import org.apache.lucene.analysis.cn.smart.SmartChineseAnalyzer; | |
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; | |
import org.apache.lucene.util.Version; | |
public class TestSmartCn { | |
private static Analyzer ANALYZER = new SmartChineseAnalyzer(Version.LUCENE_35); | |
private static String FILE_PATH = "content.dat"; | |
public static void main(String[] args) { | |
new TestSmartCn().wordSegmentation(); | |
} | |
public void wordSegmentation() { | |
try { | |
BufferedReader reader = new BufferedReader(new FileReader(FILE_PATH)); | |
TokenStream ts = ANALYZER.tokenStream("", reader); | |
while (ts.incrementToken()) { | |
System.out.println(ts.getAttribute(CharTermAttribute.class)); | |
} | |
} catch (IOException e) { | |
e.printStackTrace(); | |
} | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment