Created
April 2, 2013 04:48
-
-
Save ansjsun/5290012 to your computer and use it in GitHub Desktop.
TagWord关键字标红
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import java.lang.reflect.Array; | |
import java.util.Arrays; | |
import java.util.HashSet; | |
import java.util.Iterator; | |
import java.util.Set; | |
/** | |
* 文本坐标记用的 tagBegin是开始标记 tagEnd 是结束标记 用了二分法查找来确定单词 content 是传入的正文 正文可以传多次 | |
* 标记词语也可以传多次 | |
* | |
* @author lkjlzf | |
* | |
*/ | |
public class TagWord { | |
private String tagBegin; | |
private String tagEnd; | |
Branch frontbegin = null; | |
Set<String> keyWords = new HashSet<String>(); | |
public TagWord(String begin, String end) { | |
this.tagBegin = begin; | |
this.tagEnd = end; | |
} | |
public TagWord addKeyWords(String[] keyWord) { | |
if (keyWord.length > 0) { | |
for (int i = 0; i < keyWord.length; i++) { | |
this.keyWords.add(keyWord[i].trim()); | |
} | |
} | |
return this; | |
} | |
// 是否发现词 | |
boolean findWord = false; | |
public String getTagContent(String content) { | |
if (content == null || content.trim().length() == 0 | |
|| keyWords.size() == 0) { | |
return content; | |
} | |
this.frontbegin = new MakeLibrary().getStringTree(this.keyWords); | |
if (frontbegin == null) { | |
return content; | |
} | |
char[] chars = content.toCharArray(); | |
// 正文 | |
StringBuilder sb = new StringBuilder(); | |
WoodInterface head = this.frontbegin; | |
int start = 0; | |
int end = 1; | |
int index = 0; | |
boolean isBack = false; | |
int length = chars.length; | |
// 此处是正向最大匹配 | |
for (int i = 0; i < length; i++) { | |
index++; | |
head = head.get(chars[i]); | |
if (head == null) { | |
if (isBack) { | |
sb.append(tagBegin).append(chars, start, end) | |
.append(tagEnd); | |
start = start + end; | |
i = start - 1; | |
isBack = false; | |
} else { | |
sb.append(chars, start, end); | |
i = start; | |
start++; | |
} | |
head = this.frontbegin; | |
index = 0; | |
end = 1; | |
continue; | |
} | |
switch (head.getStatus()) { | |
case 1: | |
break; | |
case 2: | |
end = index; | |
isBack = true; | |
break; | |
case 3: | |
sb.append(tagBegin).append(chars, start, index).append(tagEnd); | |
start = start + index; | |
index = 0; | |
end = 1; | |
isBack = false; | |
head = this.frontbegin; | |
break; | |
} | |
} | |
if(head!=this.frontbegin){ | |
sb.append(chars, start, index) ; | |
} | |
return sb.toString(); | |
} | |
public static void main(String[] args) { | |
String[] keyWords = { "1239" }; | |
long start = System.currentTimeMillis(); | |
for (int i = 0; i < 1; i++) { | |
System.out.println(new TagWord("<begin>", "<end>").addKeyWords( | |
keyWords).getTagContent("QB 1239-1991")); | |
} | |
} | |
} | |
class MakeLibrary { | |
public MakeLibrary() { | |
} | |
// 是否有下一个 | |
private static boolean hasNext = true; | |
// 是否是一个词 | |
private static boolean isWords = true; | |
Iterator<String> it = null; | |
public Branch getStringTree(Set<String> keyWords) { | |
it = keyWords.iterator(); | |
Branch head = new Branch('h', 0, 0); | |
Branch branch = head; | |
while (it.hasNext()) { | |
char[] chars = it.next().toCharArray(); | |
for (int i = 0; i < chars.length; i++) { | |
if (chars.length == (i + 1)) { | |
isWords = true; | |
hasNext = false; | |
} else { | |
isWords = false; | |
hasNext = true; | |
} | |
int status = 1; | |
if (isWords && hasNext) { | |
status = 2; | |
} | |
if (!isWords && hasNext) { | |
status = 1; | |
} | |
if (isWords && !hasNext) { | |
status = 3; | |
} | |
branch.add(new Branch(chars[i], status, 0)); | |
branch = (Branch) branch.get(chars[i]); | |
} | |
branch = head; | |
} | |
return head; | |
} | |
} | |
interface WoodInterface { | |
public WoodInterface add(WoodInterface branch); | |
public WoodInterface get(char c); | |
public boolean contains(char c); | |
public int compareTo(char c); | |
public boolean equals(char c); | |
public byte getStatus(); | |
public char getC(); | |
public void setStatus(int status); | |
public byte getNature(); | |
public void setNature(byte nature); | |
} | |
class Branch implements WoodInterface { | |
/** | |
* status 此字的状态1,继续 2,是个词语但是还可以继续 ,3确定 nature 词语性质 0.未知 . 1是姓 . 2 是职位名称 3 | |
* 是数量级的词 . 4 是数字词语 5 是标点 | |
*/ | |
WoodInterface[] branches = new WoodInterface[0]; | |
private char c; | |
// 状态 | |
private byte status = 1; | |
// 索引 | |
private short index = -1; | |
// 词性 | |
private byte nature = 0; | |
// 单独查找出来的对象 | |
WoodInterface branch = null; | |
public WoodInterface add(WoodInterface branch) { | |
if ((this.branch = this.get(branch.getC())) != null) { | |
switch (branch.getStatus()) { | |
case 1: | |
if (this.branch.getStatus() == 2) { | |
this.branch.setStatus(2); | |
} | |
if (this.branch.getStatus() == 3) { | |
this.branch.setStatus(2); | |
} | |
break; | |
case 2: | |
this.branch.setStatus(2); | |
case 3: | |
if (this.branch.getStatus() == 2) { | |
this.branch.setStatus(2); | |
} | |
if (this.branch.getStatus() == 1) { | |
this.branch.setStatus(2); | |
} | |
} | |
this.branch.setNature(branch.getNature()); | |
return this.branch; | |
} | |
index++; | |
if ((index + 1) > branches.length) { | |
branches = Arrays.copyOf(branches, index + 1); | |
} | |
branches[index] = branch; | |
AnsjArrays.sort(branches); | |
return branch; | |
} | |
public Branch(char c, int status, int nature) { | |
this.c = c; | |
this.status = (byte) status; | |
this.nature = (byte) nature; | |
} | |
int i = 0; | |
public WoodInterface get(char c) { | |
int i = AnsjArrays.binarySearch(branches, c); | |
if (i > -1) { | |
return branches[i]; | |
} | |
return null; | |
} | |
public boolean contains(char c) { | |
if (AnsjArrays.binarySearch(branches, c) > -1) { | |
return true; | |
} else { | |
return false; | |
} | |
} | |
public int compareTo(char c) { | |
if (this.c > c) { | |
return 1; | |
} else if (this.c < c) { | |
return -1; | |
} else | |
return 0; | |
} | |
public boolean equals(char c) { | |
if (this.c == c) { | |
return true; | |
} else { | |
return false; | |
} | |
} | |
@Override | |
public int hashCode() { | |
// TODO Auto-generated method stub | |
return c; | |
} | |
public byte getStatus() { | |
return status; | |
} | |
public void setStatus(int status) { | |
this.status = (byte) status; | |
} | |
public char getC() { | |
return this.c; | |
} | |
public byte getNature() { | |
return nature; | |
} | |
public void setNature(byte nature) { | |
this.nature = nature; | |
} | |
} | |
class AnsjArrays { | |
private static final int INSERTIONSORT_THRESHOLD = 7; | |
/** | |
* 二分法查找.摘抄了jdk的东西..只不过把他的自动装箱功能给去掉了 | |
* | |
* @param branches | |
* @param c | |
* @return | |
*/ | |
public static int binarySearch(WoodInterface[] branches, char c) { | |
int high = branches.length - 1; | |
if (branches.length < 1) { | |
return high; | |
} | |
int low = 0; | |
while (low <= high) { | |
int mid = (low + high) >>> 1; | |
int cmp = branches[mid].compareTo(c); | |
if (cmp < 0) | |
low = mid + 1; | |
else if (cmp > 0) | |
high = mid - 1; | |
else | |
return mid; // key found | |
} | |
return -1; // key not found. | |
} | |
public static void sort(WoodInterface[] a) { | |
WoodInterface[] aux = (WoodInterface[]) a.clone(); | |
mergeSort(aux, a, 0, a.length, 0); | |
} | |
public static void sort(WoodInterface[] a, int fromIndex, int toIndex) { | |
rangeCheck(a.length, fromIndex, toIndex); | |
WoodInterface[] aux = copyOfRange(a, fromIndex, toIndex); | |
mergeSort(aux, a, fromIndex, toIndex, -fromIndex); | |
} | |
private static void rangeCheck(int arrayLen, int fromIndex, int toIndex) { | |
if (fromIndex > toIndex) | |
throw new IllegalArgumentException("fromIndex(" + fromIndex | |
+ ") > toIndex(" + toIndex + ")"); | |
if (fromIndex < 0) | |
throw new ArrayIndexOutOfBoundsException(fromIndex); | |
if (toIndex > arrayLen) | |
throw new ArrayIndexOutOfBoundsException(toIndex); | |
} | |
private static void mergeSort(WoodInterface[] src, WoodInterface[] dest, | |
int low, int high, int off) { | |
int length = high - low; | |
// Insertion sort on smallest arrays | |
if (length < INSERTIONSORT_THRESHOLD) { | |
for (int i = low; i < high; i++) | |
for (int j = i; j > low | |
&& (dest[j - 1]).compareTo(dest[j].getC()) > 0; j--) | |
swap(dest, j, j - 1); | |
return; | |
} | |
// Recursively sort halves of dest into src | |
int destLow = low; | |
int destHigh = high; | |
low += off; | |
high += off; | |
int mid = (low + high) >>> 1; | |
mergeSort(dest, src, low, mid, -off); | |
mergeSort(dest, src, mid, high, -off); | |
// If list is already sorted, just copy from src to dest. This | |
// is an | |
// optimization that results in faster sorts for nearly ordered | |
// lists. | |
if (src[mid - 1].compareTo(src[mid].getC()) <= 0) { | |
System.arraycopy(src, low, dest, destLow, length); | |
return; | |
} | |
// Merge sorted halves (now in src) into dest | |
for (int i = destLow, p = low, q = mid; i < destHigh; i++) { | |
if (q >= high || p < mid && src[p].compareTo(src[q].getC()) <= 0) | |
dest[i] = src[p++]; | |
else | |
dest[i] = src[q++]; | |
} | |
} | |
/** | |
* Swaps x[a] with x[b]. | |
*/ | |
private static void swap(WoodInterface[] x, int a, int b) { | |
WoodInterface t = x[a]; | |
x[a] = x[b]; | |
x[b] = t; | |
} | |
public static <T> T[] copyOfRange(T[] original, int from, int to) { | |
return copyOfRange(original, from, to, (Class<T[]>) original.getClass()); | |
} | |
public static <T, U> T[] copyOfRange(U[] original, int from, int to, | |
Class<? extends T[]> newType) { | |
int newLength = to - from; | |
if (newLength < 0) | |
throw new IllegalArgumentException(from + " > " + to); | |
T[] copy = ((Object) newType == (Object) Object[].class) ? (T[]) new Object[newLength] | |
: (T[]) Array | |
.newInstance(newType.getComponentType(), newLength); | |
System.arraycopy(original, from, copy, 0, Math.min(original.length | |
- from, newLength)); | |
return copy; | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment