vivizhyy · September 19, 2012 12:04
diff --git a/Tagger.java b/Tagger.java
 import java.io.BufferedReader;
 import java.io.BufferedWriter;
 import java.io.File;
 import java.io.FileReader;
 import java.io.FileWriter;
 import java.io.IOException;
 import java.io.UnsupportedEncodingException;
 import java.sql.Timestamp;
 import java.text.SimpleDateFormat;
 import java.util.ArrayList;
 import java.util.List;

 import ICTCLAS.kevin.zhang.ICTCLAS2011;

 /**
  * Command-line driver that feeds each line of {@code content.dat} to the
  * ICTCLAS segmenter and stores the resulting word/tag pairs in a freshly
  * created, timestamped {@code seg-result-*} directory.
  */
 public class Tagger {
    
    private static final ICTCLAS2011 ictcla = new ICTCLAS2011();
    
    /**
     * Splits segmenter output into (word, tag) pairs.
     *
     * <p>The text is split on single spaces and every token on {@code '/'}.
     * Only tokens that split into exactly two parts are kept; anything else
     * (empty tokens, tokens without a slash, tokens with several slashes)
     * is skipped.
     *
     * @param text segmenter output, tokens separated by spaces
     * @return the collected pairs, in the order they appear in {@code text}
     */
    private KeyValuePairs<String, String> tag(String text) {
        KeyValuePairs<String, String> tags = new KeyValuePairs<>();
        String[] sents = text.split(" ");
        for (String sent : sents) {
            String[] wordPunc = sent.split("/");
            if (wordPunc.length == 2) {
                tags.add(wordPunc[0], wordPunc[1]);
            }
        }
        return tags;
    }
    
    /**
     * Reads {@code content.dat} line by line and strips everything that looks
     * like a markup tag ({@code <...>}) from each line.
     *
     * @return the cleaned lines; on an I/O error the stack trace is printed
     *         and the lines read so far are returned
     */
    private static List<String> getContents() {
        List<String> contents = new ArrayList<>();
        // try-with-resources closes the reader even when readLine() throws.
        try (BufferedReader reader = new BufferedReader(new FileReader("content.dat"))) {
            String line;
            while ((line = reader.readLine()) != null) {
                contents.add(line.replaceAll("<[^>]+>", ""));
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
        
        return contents;
    }
    
    /**
     * Appends one line per tag pair to the given file.
     *
     * <p>The file is opened in append mode: {@link #main} calls this method
     * once per content line with the same path, and truncating on every call
     * would leave only the tags of the last line in the file.
     *
     * @param filePath file to append to; created if it does not exist
     * @param tags     pairs to write, one {@code toString()} per line
     */
    private void writeResult(String filePath, KeyValuePairs<String, String> tags) {
        try (BufferedWriter bufferedWriter = new BufferedWriter(new FileWriter(filePath, true))) {
            List<KeyValuePairs.Entry<String, String>> tagPairs = tags.entries();
            for (KeyValuePairs.Entry<String, String> tagPair : tagPairs) {
                bufferedWriter.append(tagPair.toString() + "\n");
            }
            // close() flushes the buffer; no explicit flush() is required.
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
    
    /**
     * Registers every line of {@code userdic.txt} as a user word and then
     * asks the segmenter to save its user dictionary. If reading the file
     * fails, the stack trace is printed and the dictionary is not saved.
     */
    private void addWords() {
        try (BufferedReader reader = new BufferedReader(new FileReader("userdic.txt"))) {
            String line;
            while ((line = reader.readLine()) != null) {
                // Encode as GB2312, the same encoding this class uses for every
                // other byte[] handed to the native library, instead of the
                // platform default charset.
                ictcla.ICTCLAS_AddUserWord(line.getBytes("GB2312"));
            }
        } catch (IOException e) {
            e.printStackTrace();
            return;
        }
        ictcla.ICTCLAS_SaveTheUsrDic();
    }
    
    /**
     * Initialises ICTCLAS, loads the user dictionary, segments every content
     * line and appends the tag pairs to {@code <resultDir>/result}.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        Tagger testTagger = new Tagger();
        List<String> contents = getContents();
        try {
            String argu = "/home/yyzhang/ICTCLAS2012/";
            System.out.println("ICTCLAS_Init");
            if (!ICTCLAS2011.ICTCLAS_Init(argu.getBytes("GB2312"), 0)) {
                System.out.println("Init Fail!");
                return;
            }
            testTagger.addWords();
            // http://www.icl.pku.edu.cn/icl_res/segtag98/catetkset.html
            ictcla.ICTCLAS_SetPOSmap(2);
            String resultPath = "seg-result-" + new SimpleDateFormat("yyyy-MM-dd-HH-mm-ss").format(new Timestamp(System.currentTimeMillis()));
            if (!new File(resultPath).mkdir()) {
                // Previously a failed mkdir silently produced no output at all.
                System.err.println("Could not create result directory: " + resultPath);
                return;
            }
            String resultFile = resultPath + "/result";
            for (String content : contents) {
                byte[] nativeBytes = ictcla.ICTCLAS_ParagraphProcess(content.getBytes("GB2312"), 3);
                testTagger.writeResult(resultFile,
                        testTagger.tag(new String(nativeBytes, 0, nativeBytes.length, "GB2312")));
            }
        } catch (UnsupportedEncodingException e) {
            e.printStackTrace();
        }
    }
    
 }
diff --git a/TestSmartCn.java b/TestSmartCn.java
 package com.wumii.model.service.search;

 import java.io.BufferedReader;
 import java.io.FileReader;
 import java.io.IOException;

 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.cn.smart.SmartChineseAnalyzer;
 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 import org.apache.lucene.util.Version;

 /**
  * Small driver that tokenizes {@code content.dat} with
  * {@link SmartChineseAnalyzer} and prints one token per line.
  */
 public class TestSmartCn {

    private static final Analyzer ANALYZER = new SmartChineseAnalyzer(Version.LUCENE_35);
    private static final String FILE_PATH = "content.dat";
    
    public static void main(String[] args) {
        new TestSmartCn().wordSegmentation();
    }
    
    /**
     * Streams the file through the analyzer and prints every term to
     * standard output. I/O errors are reported via their stack trace.
     */
    public void wordSegmentation() {
        BufferedReader reader = null;
        try {
            reader = new BufferedReader(new FileReader(FILE_PATH));
            TokenStream ts = ANALYZER.tokenStream("", reader);
            try {
                // Look the attribute up once; the same instance is updated in
                // place by every incrementToken() call.
                CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
                // TokenStream consumer workflow: reset -> incrementToken* -> end -> close.
                ts.reset();
                while (ts.incrementToken()) {
                    System.out.println(term);
                }
                ts.end();
            } finally {
                ts.close();
            }
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            // Covers the case where the reader was opened but the stream was
            // never created; closing an already closed reader is harmless.
            if (reader != null) {
                try {
                    reader.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
        }
    }
 }
	import java.io.BufferedReader;
	import java.io.BufferedWriter;
	import java.io.File;
	import java.io.FileReader;
	import java.io.FileWriter;
	import java.io.IOException;
	import java.io.UnsupportedEncodingException;
	import java.sql.Timestamp;
	import java.text.SimpleDateFormat;
	import java.util.ArrayList;
	import java.util.List;

	import ICTCLAS.kevin.zhang.ICTCLAS2011;

	/**
	 * Command-line driver that feeds each line of {@code content.dat} to the
	 * ICTCLAS segmenter and stores the resulting word/tag pairs in a freshly
	 * created, timestamped {@code seg-result-*} directory.
	 */
	public class Tagger {

	    private static final ICTCLAS2011 ictcla = new ICTCLAS2011();

	    /**
	     * Splits segmenter output into (word, tag) pairs.
	     *
	     * <p>The text is split on single spaces and every token on {@code '/'}.
	     * Only tokens that split into exactly two parts are kept; anything else
	     * (empty tokens, tokens without a slash, tokens with several slashes)
	     * is skipped.
	     *
	     * @param text segmenter output, tokens separated by spaces
	     * @return the collected pairs, in the order they appear in {@code text}
	     */
	    private KeyValuePairs<String, String> tag(String text) {
	        KeyValuePairs<String, String> tags = new KeyValuePairs<>();
	        String[] sents = text.split(" ");
	        for (String sent : sents) {
	            String[] wordPunc = sent.split("/");
	            if (wordPunc.length == 2) {
	                tags.add(wordPunc[0], wordPunc[1]);
	            }
	        }
	        return tags;
	    }

	    /**
	     * Reads {@code content.dat} line by line and strips everything that looks
	     * like a markup tag ({@code <...>}) from each line.
	     *
	     * @return the cleaned lines; on an I/O error the stack trace is printed
	     *         and the lines read so far are returned
	     */
	    private static List<String> getContents() {
	        List<String> contents = new ArrayList<>();
	        // try-with-resources closes the reader even when readLine() throws.
	        try (BufferedReader reader = new BufferedReader(new FileReader("content.dat"))) {
	            String line;
	            while ((line = reader.readLine()) != null) {
	                contents.add(line.replaceAll("<[^>]+>", ""));
	            }
	        } catch (IOException e) {
	            e.printStackTrace();
	        }

	        return contents;
	    }

	    /**
	     * Appends one line per tag pair to the given file.
	     *
	     * <p>The file is opened in append mode: {@link #main} calls this method
	     * once per content line with the same path, and truncating on every call
	     * would leave only the tags of the last line in the file.
	     *
	     * @param filePath file to append to; created if it does not exist
	     * @param tags     pairs to write, one {@code toString()} per line
	     */
	    private void writeResult(String filePath, KeyValuePairs<String, String> tags) {
	        try (BufferedWriter bufferedWriter = new BufferedWriter(new FileWriter(filePath, true))) {
	            List<KeyValuePairs.Entry<String, String>> tagPairs = tags.entries();
	            for (KeyValuePairs.Entry<String, String> tagPair : tagPairs) {
	                bufferedWriter.append(tagPair.toString() + "\n");
	            }
	            // close() flushes the buffer; no explicit flush() is required.
	        } catch (IOException e) {
	            e.printStackTrace();
	        }
	    }

	    /**
	     * Registers every line of {@code userdic.txt} as a user word and then
	     * asks the segmenter to save its user dictionary. If reading the file
	     * fails, the stack trace is printed and the dictionary is not saved.
	     */
	    private void addWords() {
	        try (BufferedReader reader = new BufferedReader(new FileReader("userdic.txt"))) {
	            String line;
	            while ((line = reader.readLine()) != null) {
	                // Encode as GB2312, the same encoding this class uses for every
	                // other byte[] handed to the native library, instead of the
	                // platform default charset.
	                ictcla.ICTCLAS_AddUserWord(line.getBytes("GB2312"));
	            }
	        } catch (IOException e) {
	            e.printStackTrace();
	            return;
	        }
	        ictcla.ICTCLAS_SaveTheUsrDic();
	    }

	    /**
	     * Initialises ICTCLAS, loads the user dictionary, segments every content
	     * line and appends the tag pairs to {@code <resultDir>/result}.
	     *
	     * @param args unused
	     */
	    public static void main(String[] args) {
	        Tagger testTagger = new Tagger();
	        List<String> contents = getContents();
	        try {
	            String argu = "/home/yyzhang/ICTCLAS2012/";
	            System.out.println("ICTCLAS_Init");
	            if (!ICTCLAS2011.ICTCLAS_Init(argu.getBytes("GB2312"), 0)) {
	                System.out.println("Init Fail!");
	                return;
	            }
	            testTagger.addWords();
	            // http://www.icl.pku.edu.cn/icl_res/segtag98/catetkset.html
	            ictcla.ICTCLAS_SetPOSmap(2);
	            String resultPath = "seg-result-" + new SimpleDateFormat("yyyy-MM-dd-HH-mm-ss").format(new Timestamp(System.currentTimeMillis()));
	            if (!new File(resultPath).mkdir()) {
	                // Previously a failed mkdir silently produced no output at all.
	                System.err.println("Could not create result directory: " + resultPath);
	                return;
	            }
	            String resultFile = resultPath + "/result";
	            for (String content : contents) {
	                byte[] nativeBytes = ictcla.ICTCLAS_ParagraphProcess(content.getBytes("GB2312"), 3);
	                testTagger.writeResult(resultFile,
	                        testTagger.tag(new String(nativeBytes, 0, nativeBytes.length, "GB2312")));
	            }
	        } catch (UnsupportedEncodingException e) {
	            e.printStackTrace();
	        }
	    }

	}
	package com.wumii.model.service.search;

	import java.io.BufferedReader;
	import java.io.FileReader;
	import java.io.IOException;

	import org.apache.lucene.analysis.Analyzer;
	import org.apache.lucene.analysis.TokenStream;
	import org.apache.lucene.analysis.cn.smart.SmartChineseAnalyzer;
	import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
	import org.apache.lucene.util.Version;

	/**
	 * Small driver that tokenizes {@code content.dat} with
	 * {@link SmartChineseAnalyzer} and prints one token per line.
	 */
	public class TestSmartCn {

	    private static final Analyzer ANALYZER = new SmartChineseAnalyzer(Version.LUCENE_35);
	    private static final String FILE_PATH = "content.dat";

	    public static void main(String[] args) {
	        new TestSmartCn().wordSegmentation();
	    }

	    /**
	     * Streams the file through the analyzer and prints every term to
	     * standard output. I/O errors are reported via their stack trace.
	     */
	    public void wordSegmentation() {
	        BufferedReader reader = null;
	        try {
	            reader = new BufferedReader(new FileReader(FILE_PATH));
	            TokenStream ts = ANALYZER.tokenStream("", reader);
	            try {
	                // Look the attribute up once; the same instance is updated in
	                // place by every incrementToken() call.
	                CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
	                // TokenStream consumer workflow: reset -> incrementToken* -> end -> close.
	                ts.reset();
	                while (ts.incrementToken()) {
	                    System.out.println(term);
	                }
	                ts.end();
	            } finally {
	                ts.close();
	            }
	        } catch (IOException e) {
	            e.printStackTrace();
	        } finally {
	            // Covers the case where the reader was opened but the stream was
	            // never created; closing an already closed reader is harmless.
	            if (reader != null) {
	                try {
	                    reader.close();
	                } catch (IOException e) {
	                    e.printStackTrace();
	                }
	            }
	        }
	    }
	}