Last active
October 19, 2020 02:31
-
-
Save haoflynet/428ca120ea4669c03e3ce989997fef5b to your computer and use it in GitHub Desktop.
Java DFA sensitive words filter Java实现的敏感词过滤(支持停顿词/重复词/全半角/字符串替换,改进https://www.jb51.net/article/128990.htm)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package com.haofly.net.sensitiveword; | |
/**
 * Converts characters between half-width (DBC, plain ASCII) and full-width (SBC) forms.
 *
 * <p>Only the space character and the visible ASCII range '!'..'~' (together with their
 * full-width counterparts) are converted; every other character is passed through unchanged.
 */
public class BCConvert {
    /** First visible ASCII character, '!' (decimal 33). */
    static final char DBC_CHAR_START = 33;

    /** Last visible ASCII character, '~' (decimal 126). */
    static final char DBC_CHAR_END = 126;

    /** Full-width counterpart of '!' (decimal 65281). */
    static final char SBC_CHAR_START = 65281;

    /** Full-width counterpart of '~' (decimal 65374). */
    static final char SBC_CHAR_END = 65374;

    /** Offset between a visible ASCII character (space excluded) and its full-width form. */
    static final int CONVERT_STEP = 65248;

    /** Full-width space (decimal 12288); it does not follow {@link #CONVERT_STEP}. */
    static final char SBC_SPACE = 12288;

    /** Half-width space (decimal 32). */
    static final char DBC_SPACE = ' ';

    /**
     * Converts every half-width character of a string to its full-width form.
     *
     * @param src text to convert, may be {@code null}
     * @return the converted text, or {@code null} when {@code src} is {@code null}
     */
    public static String bj2qj(String src) {
        if (src == null) {
            return src;
        }
        StringBuilder buf = new StringBuilder(src.length());
        for (int i = 0; i < src.length(); i++) {
            // Delegate so that the String and char overloads can never disagree.
            buf.append((char) bj2qj(src.charAt(i)));
        }
        return buf.toString();
    }

    /**
     * Converts one half-width character to its full-width form.
     *
     * @param src character to convert
     * @return the full-width code unit for a space or a character in '!'..'~';
     *         otherwise {@code src} unchanged
     */
    public static int bj2qj(char src) {
        if (src == DBC_SPACE) {
            // The space is the one character that is not CONVERT_STEP away from its
            // full-width form. (Previously the result was written to the parameter and lost.)
            return SBC_SPACE;
        }
        if (src >= DBC_CHAR_START && src <= DBC_CHAR_END) {
            return src + CONVERT_STEP;
        }
        return src;
    }

    /**
     * Converts every full-width character of a string to its half-width form.
     *
     * @param src text to convert, may be {@code null}
     * @return the converted text, or {@code null} when {@code src} is {@code null}
     */
    public static String qj2bj(String src) {
        if (src == null) {
            return src;
        }
        StringBuilder buf = new StringBuilder(src.length());
        for (int i = 0; i < src.length(); i++) {
            buf.append((char) qj2bj(src.charAt(i)));
        }
        return buf.toString();
    }

    /**
     * Converts one full-width character to its half-width form.
     *
     * @param src character to convert
     * @return the ASCII code unit for a full-width space or a character in the full-width
     *         '!'..'~' range; otherwise {@code src} unchanged
     */
    public static int qj2bj(char src) {
        if (src >= SBC_CHAR_START && src <= SBC_CHAR_END) {
            return src - CONVERT_STEP;
        }
        if (src == SBC_SPACE) {
            return DBC_SPACE;
        }
        return src;
    }
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package com.haofly.net.sensitiveword; | |
/**
 * Fixed-size bit set over the values 0..65535, one bit per value, stored in a
 * {@code long[]} of 1024 words.
 *
 * <p>Query and removal methods treat values outside that range (including negative
 * values) as "not contained". Insertion methods reject them with an
 * {@link ArrayIndexOutOfBoundsException}.
 */
public class FilterSet {
    private final long[] elements;

    /** Creates an empty set able to hold every value in 0..65535. */
    public FilterSet() {
        elements = new long[1 + (65535 >>> 6)];
    }

    /** Returns whether {@code no} maps to a word of the backing array. */
    private boolean inRange(final int no) {
        // The unsigned shift turns negative values into large indexes, so one
        // comparison rejects both negative and too-large values.
        return (no >>> 6) < elements.length;
    }

    /** Returns the word index for {@code no}, or throws if the value cannot be stored. */
    private int wordIndexForInsert(final int no) {
        if (!inRange(no)) {
            throw new ArrayIndexOutOfBoundsException("value out of range [0, 65535]: " + no);
        }
        return no >>> 6;
    }

    /** Returns the single-bit mask selecting {@code no} inside its word. */
    private static long bit(final int no) {
        return 1L << (no & 63);
    }

    /**
     * Adds one value.
     *
     * @param no value to add, 0..65535
     * @throws ArrayIndexOutOfBoundsException if {@code no} is outside 0..65535
     */
    public void add(final int no) {
        elements[wordIndexForInsert(no)] |= bit(no);
    }

    /**
     * Adds several values, in order.
     *
     * @param no values to add, each 0..65535
     * @throws ArrayIndexOutOfBoundsException at the first value outside 0..65535;
     *         values before it have already been added
     */
    public void add(final int... no) {
        for (int currNo : no) {
            elements[wordIndexForInsert(currNo)] |= bit(currNo);
        }
    }

    /**
     * Removes one value; does nothing when the value is absent or out of range.
     *
     * @param no value to remove
     */
    public void remove(final int no) {
        if (inRange(no)) {
            elements[no >>> 6] &= ~bit(no);
        }
    }

    /**
     * Adds one value and reports whether the set changed.
     *
     * @param no value to add, 0..65535
     * @return {@code true} if the value was added, {@code false} if it was already present
     * @throws ArrayIndexOutOfBoundsException if {@code no} is outside 0..65535
     */
    public boolean addAndNotify(final int no) {
        int eWordNum = wordIndexForInsert(no);
        long oldElements = elements[eWordNum];
        elements[eWordNum] |= bit(no);
        return elements[eWordNum] != oldElements;
    }

    /**
     * Removes one value and reports whether the set changed.
     *
     * @param no value to remove
     * @return {@code true} if the value was removed, {@code false} if it was not present
     *         (which includes every out-of-range value)
     */
    public boolean removeAndNotify(final int no) {
        if (!inRange(no)) {
            return false;
        }
        int eWordNum = no >>> 6;
        long oldElements = elements[eWordNum];
        elements[eWordNum] &= ~bit(no);
        return elements[eWordNum] != oldElements;
    }

    /**
     * Tests membership.
     *
     * @param no value to look up
     * @return {@code true} if the value is present; {@code false} otherwise, including
     *         for values outside 0..65535
     */
    public boolean contains(final int no) {
        return inRange(no) && (elements[no >>> 6] & bit(no)) != 0;
    }

    /**
     * Tests whether every given value is present.
     *
     * @param no values to look up
     * @return {@code true} if all values are present (trivially so for no values)
     */
    public boolean containsAll(final int... no) {
        for (int currNo : no) {
            if (!contains(currNo)) {
                return false;
            }
        }
        return true;
    }

    /**
     * Alternative to {@link #containsAll(int...)} that first builds a bit mask of the
     * requested values and then compares it word by word. Building the mask costs about
     * as much as calling {@code contains} in a loop, so this variant offers no advantage.
     *
     * @param no values to look up
     * @return {@code true} if all values are present
     */
    public boolean containsAll_ueslessWay(final int... no) {
        long[] requested = new long[this.elements.length];
        for (int currNo : no) {
            if (!inRange(currNo)) {
                // A value that cannot be stored can never be contained.
                return false;
            }
            requested[currNo >>> 6] |= bit(currNo);
        }
        for (int i = 0; i < requested.length; i++) {
            if ((requested[i] & ~this.elements[i]) != 0) {
                return false;
            }
        }
        return true;
    }

    /**
     * Counts the stored values. The size is not cached; every call scans all words.
     *
     * @return number of values currently in the set
     */
    public int size() {
        int size = 0;
        for (long element : elements) {
            size += Long.bitCount(element);
        }
        return size;
    }

    /** Prints the length of the backing array of a new set. */
    public static void main(String[] args) {
        FilterSet oi = new FilterSet();
        System.out.println(oi.elements.length);
    }
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package com.haofly.net.sensitiveword; | |
import java.io.Serializable; | |
import java.util.Set; | |
/**
 * Result holder for one sensitive-word filtering pass: the text before and after
 * filtering, a flag telling whether anything sensitive was found, and the words found.
 */
public class SensitiveContentDto implements Serializable {
    private static final long serialVersionUID = 3487911794259456000L;

    /** Text as it was before filtering. */
    private String oldContent;

    /** Text after filtering. */
    private String newContent;

    /** Whether the content contains sensitive words. */
    private Boolean isContains;

    /** Sensitive words that occur in the content. */
    private Set<String> sensitiveWords;

    /** @return the text before filtering */
    public String getOldContent() {
        return this.oldContent;
    }

    /** @param oldContent the text before filtering */
    public void setOldContent(String oldContent) {
        this.oldContent = oldContent;
    }

    /** @return the text after filtering */
    public String getNewContent() {
        return this.newContent;
    }

    /** @param newContent the text after filtering */
    public void setNewContent(String newContent) {
        this.newContent = newContent;
    }

    /** @return the sensitive words that occur in the content */
    public Set<String> getSensitiveWords() {
        return this.sensitiveWords;
    }

    /** @param sensitiveWords the sensitive words that occur in the content */
    public void setSensitiveWords(Set<String> sensitiveWords) {
        this.sensitiveWords = sensitiveWords;
    }

    /** @return whether the content contains sensitive words; {@code null} until set */
    public Boolean getContains() {
        return this.isContains;
    }

    /** @param contains whether the content contains sensitive words */
    public void setContains(Boolean contains) {
        this.isContains = contains;
    }

    /** Renders all four fields in {@code SensitiveContentDto{name=value, ...}} form. */
    @Override
    public String toString() {
        StringBuilder text = new StringBuilder("SensitiveContentDto{");
        text.append("oldContent='").append(oldContent).append('\'');
        text.append(", newContent='").append(newContent).append('\'');
        text.append(", isContains=").append(isContains);
        text.append(", sensitiveWords=").append(sensitiveWords);
        text.append('}');
        return text.toString();
    }
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package com.haofly.net.sensitiveword; | |
import java.io.*; | |
import java.util.*; | |
public class SensitiveWordUtil { | |
/** | |
* 存储首字 | |
*/ | |
private FilterSet set = new FilterSet(); | |
/** | |
* 存储节点 | |
*/ | |
private Map<Integer, WordNode> nodes = new HashMap<>(1024, 1); | |
/** | |
* 存储停顿词 | |
*/ | |
private Set<Integer> stopWordSet = new HashSet<>(); | |
/** | |
* 敏感词过滤替换 | |
*/ | |
private char[] replaceChar; | |
public SensitiveWordUtil() { | |
} | |
public SensitiveWordUtil(List<String> words) { | |
addSensitiveWord(words); | |
} | |
public SensitiveWordUtil(List<String> words, List<String> stopWords, String replaceWord) { | |
addSensitiveWord(words); | |
addStopWord(stopWords); | |
this.replaceChar = replaceWord.toCharArray(); | |
} | |
/** | |
* 从文件增加敏感词 | |
*/ | |
private List<String> readWordFromFile(String path) { | |
List<String> words; | |
BufferedReader br = null; | |
try { | |
File file = new File(path); | |
BufferedReader reader; | |
reader = new BufferedReader(new FileReader(file)); | |
String tempString; | |
int line = 1; | |
words = new ArrayList<String>(1200); | |
while ((tempString = reader.readLine()) != null) { | |
if ("".equals(tempString.trim())) { | |
continue; | |
} | |
words.add(tempString.trim()); | |
line++; | |
} | |
reader.close(); | |
} catch (Exception e) { | |
throw new RuntimeException(e); | |
} finally { | |
try { | |
if (br != null) { | |
br.close(); | |
} | |
} catch (IOException e) { | |
} | |
} | |
return words; | |
} | |
/** | |
* 添加停顿词 | |
*/ | |
public void addStopWord(List<String> words) { | |
if (words != null && words.size() > 0) { | |
char[] chs; | |
for (String curr : words) { | |
chs = curr.toCharArray(); | |
for (char c : chs) { | |
stopWordSet.add(charConvert(c)); | |
} | |
} | |
} | |
} | |
/** | |
* 添加敏感词/添加DFA节点 | |
*/ | |
public void addSensitiveWord(List<String> words) { | |
if (words != null && words.size() > 0) { | |
char[] chs; | |
int fchar; | |
int lastIndex; | |
WordNode fnode; // 首字母节点 | |
for (String curr : words) { | |
chs = curr.toCharArray(); | |
fchar = charConvert(chs[0]); | |
// 没有首字定义 | |
if (!set.contains(fchar)) { | |
// 首字标志位 可重复add,反正判断了,不重复了 | |
set.add(fchar); | |
fnode = new WordNode(fchar, chs.length == 1); | |
nodes.put(fchar, fnode); | |
} else { | |
fnode = nodes.get(fchar); | |
if (!fnode.isLast() && chs.length == 1) { | |
fnode.setLast(true); | |
} | |
} | |
lastIndex = chs.length - 1; | |
for (int i = 1; i < chs.length; i++) { | |
fnode = fnode.addIfNoExist(charConvert(chs[i]), i == lastIndex); | |
} | |
} | |
} | |
} | |
/** | |
* 过滤判断 将敏感词转化为成屏蔽词 | |
*/ | |
public SensitiveContentDto replaceSensitiveWord(String src) { | |
char[] chs = src.toCharArray(); | |
int length = chs.length; | |
int current; | |
int k; | |
WordNode node; | |
Set<String> sensitiveWords = new HashSet<>(); | |
for (int i = 0; i < length; i++) { | |
current = charConvert(chs[i]); | |
if (!set.contains(current)) { | |
continue; | |
} | |
node = nodes.get(current); | |
// 其实不会发生,习惯性写上了 | |
if (node == null) { | |
continue; | |
} | |
boolean couldMark = false; | |
int markNum = -1; | |
// 单字匹配(日) | |
if (node.isLast()) { | |
couldMark = true; | |
markNum = 0; | |
sensitiveWords.add(src); | |
} | |
// 继续匹配(日你/日你妹),以长的优先 | |
// 你-3 妹-4 夫-5 | |
k = i; | |
for (; ++k < length; ) { | |
int temp = charConvert(chs[k]); | |
if (stopWordSet.contains(temp)) { | |
continue; | |
} | |
node = node.querySub(temp); | |
if (node == null) { | |
break; | |
} | |
if (node.isLast()) { | |
couldMark = true; | |
markNum = k - i; | |
sensitiveWords.add(new String(Arrays.copyOfRange(chs, i, k + 1))); | |
} | |
} | |
if (couldMark) { | |
chs = this.modify(chs, replaceChar, i, markNum + 1); | |
i = i - (markNum + 1) + replaceChar.length + 1; | |
if (i < 0) { | |
break; | |
} | |
length = chs.length; | |
} | |
} | |
SensitiveContentDto sensitiveContentDto = new SensitiveContentDto(); | |
sensitiveContentDto.setOldContent(src); | |
sensitiveContentDto.setNewContent(Arrays.toString(chs)); | |
sensitiveContentDto.setSensitiveWords(sensitiveWords); | |
sensitiveContentDto.setContains(sensitiveWords.size() > 0); | |
return sensitiveContentDto; | |
} | |
/** | |
* 是否包含敏感词 | |
*/ | |
public boolean isContains(String src) { | |
char[] chs = src.toCharArray(); | |
int length = chs.length; | |
int currc; | |
int k; | |
WordNode node; | |
for (int i = 0; i < length; i++) { | |
currc = charConvert(chs[i]); | |
if (!set.contains(currc)) { | |
continue; | |
} | |
// 日 2 | |
node = nodes.get(currc); | |
// 其实不会发生,习惯性写上了 | |
if (node == null) { | |
continue; | |
} | |
boolean couldMark = false; | |
// 单字匹配(日) | |
if (node.isLast()) { | |
couldMark = true; | |
} | |
// 继续匹配(日你/日你妹),以长的优先 | |
// 你-3 妹-4 夫-5 | |
k = i; | |
for (; ++k < length; ) { | |
int temp = charConvert(chs[k]); | |
if (stopWordSet.contains(temp)) { | |
continue; | |
} | |
node = node.querySub(temp); | |
if (node == null) { | |
break; | |
} | |
if (node.isLast()) { | |
couldMark = true; | |
} | |
} | |
if (couldMark) { | |
return true; | |
} | |
} | |
return false; | |
} | |
/** | |
* 大写转化为小写 全角转化为半角 | |
*/ | |
private int charConvert(char src) { | |
int r = BCConvert.qj2bj(src); | |
return (r >= 'A' && r <= 'Z') ? r + 32 : r; | |
} | |
/** | |
* @param chs 原始数组 | |
* @param replace 替换数组 | |
* @param index 替换位置 | |
* @param num 替换长度 | |
* @return 新的i的值 | |
*/ | |
private char[] modify(char[] chs, char[] replace, int index, int num) { | |
char[] temp = new char[chs.length - num + replace.length]; | |
for (int i = 0; i < temp.length; i++) { | |
if (i < index) { | |
temp[i] = chs[i]; | |
} else if (i == index) { | |
for (int k = 0; k < replace.length; k++) { | |
temp[i + k] = replace[k]; | |
} | |
i = i + replace.length - 1; | |
} else { | |
temp[i] = chs[i - replace.length + num]; | |
} | |
} | |
return temp; | |
} | |
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package com.haofly.net.sensitiveword; | |
import java.util.LinkedList; | |
import java.util.List; | |
/**
 * One node of the sensitive-word character tree: a character value, an
 * end-of-word flag and a lazily created list of child nodes.
 */
public class WordNode {
    /** Character value held by this node. */
    private int value;

    /** Child nodes; stays {@code null} until the first child is added. */
    private List<WordNode> subNodes;

    /** Whether a word ends at this node; {@code false} by default. */
    private boolean isLast;

    /**
     * Creates a node at which no word ends.
     *
     * @param value character value of the node
     */
    public WordNode(int value) {
        this.value = value;
    }

    /**
     * Creates a node.
     *
     * @param value character value of the node
     * @param isLast whether a word ends at this node
     */
    public WordNode(int value, boolean isLast) {
        this.value = value;
        this.isLast = isLast;
    }

    /**
     * Appends a child, creating the child list on first use.
     *
     * @param subNode child to append
     * @return the node that was passed in
     */
    private WordNode addSubNode(final WordNode subNode) {
        List<WordNode> children = subNodes;
        if (children == null) {
            children = new LinkedList<WordNode>();
            subNodes = children;
        }
        children.add(subNode);
        return subNode;
    }

    /**
     * Scans the children for the first one holding the given value.
     *
     * @param wanted value to look for
     * @return the matching child, or {@code null} when there is none
     */
    private WordNode findChild(final int wanted) {
        if (subNodes != null) {
            for (WordNode candidate : subNodes) {
                if (candidate.value == wanted) {
                    return candidate;
                }
            }
        }
        return null;
    }

    /**
     * Returns the child holding {@code value}, creating and appending it when missing.
     * An existing child is promoted to end-of-word when {@code isLast} is {@code true};
     * it is never demoted.
     *
     * @param value character value of the child
     * @param isLast whether a word ends at the child
     * @return the existing or newly created child
     */
    public WordNode addIfNoExist(final int value, final boolean isLast) {
        WordNode existing = findChild(value);
        if (existing == null) {
            return addSubNode(new WordNode(value, isLast));
        }
        if (isLast) {
            existing.isLast = true;
        }
        return existing;
    }

    /**
     * Looks up the child holding the given value.
     *
     * @param value character value to look for
     * @return the matching child, or {@code null} when there is none
     */
    public WordNode querySub(final int value) {
        return findChild(value);
    }

    /** @return whether a word ends at this node */
    public boolean isLast() {
        return this.isLast;
    }

    /** @param isLast whether a word ends at this node */
    public void setLast(boolean isLast) {
        this.isLast = isLast;
    }

    /** @return the node's character value */
    @Override
    public int hashCode() {
        return this.value;
    }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment