Last active
July 9, 2021 12:59
-
-
Save wangzaixiang/3422ad6a443ec34504e3d315d7305900 to your computer and use it in GitHub Desktop.
一个高效的敏感词过滤实现,同时也演示一下 FP 的简洁和强大
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package demo | |
import java.lang | |
import java.util.regex.Pattern | |
import scala.annotation.tailrec | |
import scala.collection.{BitSet, mutable} | |
import scala.io.Source | |
trait SensitiveEngine { | |
def setKeywords(words: List[String]): Unit | |
def process(input: String): String | |
} | |
/** | |
* a fast sensetive word replace util. | |
*/ | |
object FastEngine extends SensitiveEngine { | |
class Node(val ch: Char, val depth: Int) { | |
var isKeyword: Boolean = false | |
val nexts: mutable.Map[Char, Node] = collection.mutable.Map[Char, Node]() | |
var fallthrough: Node = _ // TODO 暂时未实现,可以进一步优化,避免回溯,理论上性能会更高 | |
def addWord(word: String): Unit = { | |
assert(word.nonEmpty) | |
val char = word.charAt(0) | |
val remain = word.substring(1) | |
val next: Node = nexts.get(char) match { | |
case Some(node) => node | |
case None => | |
val it = new Node(char, depth + 1) | |
nexts(char) = it | |
it | |
} | |
if (remain.isEmpty) { | |
next.isKeyword = true | |
} | |
else { | |
next.addWord(remain) | |
} | |
} | |
def getNext(ch: Char): Option[Node] = nexts.get(ch) | |
} | |
private val root = new Node(0, 0) | |
private var rootBitSet: BitSet = _ // the root node have a lot of children, using the BitSet for fast detection, avoid hashmap lookup | |
override def setKeywords(words: List[String]): Unit = { | |
words.foreach(addWord) | |
rootBitSet = buildRootBitSet(root) | |
} | |
def addWord(word: String): Unit = { | |
root.addWord(word) | |
} | |
def process(input: String): String = { | |
process(input, 0, root, new StringBuilder(input.length)) | |
} | |
@tailrec | |
def process(input: String, pos: Int, currentNode: Node, builder: StringBuilder): String = { | |
def emitNodeAsStar(node: Node): Unit = { | |
var i = 0 | |
while (i < node.depth) { | |
builder.append('*') | |
i += 1 | |
} | |
} | |
if (pos >= input.length) { // process the last node | |
if (currentNode.isKeyword) { | |
emitNodeAsStar(currentNode) | |
} | |
else { | |
val begin = pos - currentNode.depth | |
builder.append(input.substring(begin, begin + currentNode.depth)) | |
} | |
builder.toString() | |
} | |
else { | |
val ch = input.charAt(pos) | |
val nextNode: Option[Node] = | |
if (currentNode == root) { | |
if (isRootChar(ch)) currentNode.getNext(ch) | |
else None | |
} | |
else currentNode.getNext(ch) | |
if(nextNode != None) { | |
process(input, pos + 1, nextNode.get, builder) | |
} | |
else { // nextNode == None | |
if (currentNode eq root) { // currentNode == root && nextNode == None | |
builder.append(ch) | |
process(input, pos + 1, root, builder) // process the next char | |
} | |
else { // currentNode != root && nextNode == None | |
if (currentNode.isKeyword) { | |
emitNodeAsStar(currentNode) | |
process(input, pos, root, builder) // continue process at pos | |
} | |
else { // TODO avoid backward, using the fallthrough field, then continue at the pos + 1 not begin + 1 | |
val begin = pos - currentNode.depth | |
builder.append(input.charAt(begin)) | |
process(input, begin + 1, root, builder) | |
} | |
} | |
} | |
} | |
} | |
private def buildRootBitSet(root: Node): BitSet = { | |
val mutable = collection.mutable.BitSet() | |
root.nexts.keySet.foreach { ch: Char => | |
mutable.add(ch.toInt) | |
} | |
mutable.toImmutable | |
} | |
private def isRootChar(ch: Char): Boolean = rootBitSet.contains(ch.toInt) | |
} | |
/** | |
* a simple prototype implementation | |
*/ | |
object PrototypeEngine extends SensitiveEngine { | |
var pattern: Pattern = _ | |
override def setKeywords(words: List[String]): Unit = { | |
val regexp = words.mkString("|") | |
pattern = Pattern.compile(regexp) | |
} | |
override def process(input: String): String = { | |
val matcher = pattern.matcher(input) | |
matcher.replaceAll("***") | |
} | |
} | |
object Test1 { | |
val keywords = | |
""" | |
福音会 | |
中国教徒 | |
统一教 | |
观音法门 | |
清海无上师 | |
盘古 | |
李洪志 | |
志洪李 | |
李宏志 | |
轮功 | |
法轮 | |
轮法功 | |
三去车仑 | |
氵去车仑 | |
发论工 | |
法x功 | |
法o功 | |
法0功 | |
法一轮一功 | |
轮子功 | |
车仑工力 | |
法lun | |
fa轮 | |
法lg | |
flg | |
fl功 | |
falungong | |
大法弟子 | |
大纪元 | |
dajiyuan | |
明慧网 | |
明慧周报 | |
正见网 | |
新唐人 | |
伪火 | |
退党 | |
tuidang | |
退dang | |
超越红墙 | |
自fen | |
真善忍 | |
""" | |
val keywords2 = """爱女人 | |
|爱液 | |
|按摩棒 | |
|拔出来 | |
|爆草 | |
|包二奶 | |
|暴干 | |
|暴奸 | |
|暴乳 | |
|爆乳 | |
|暴淫 | |
|屄 | |
|被操 | |
|被插 | |
|被干 | |
|逼奸 | |
|仓井空 | |
|插暴 | |
|操逼 | |
|操黑 | |
|操烂 | |
|肏你 | |
|肏死 | |
|操死 | |
|操我 | |
|厕奴 | |
|插比 | |
|插b | |
|插逼 | |
|插进 | |
|插你 | |
|插我 | |
|插阴 | |
|潮吹 | |
|潮喷 | |
|成人dv | |
|成人电影 | |
|成人论坛 | |
|成人小说 | |
|成人电 | |
|成人电影 | |
|成人卡通 | |
|成人聊 | |
|成人片""".stripMargin | |
def buildEngine(engine: SensitiveEngine, kws: List[String]): SensitiveEngine = { | |
engine.setKeywords(kws) | |
engine | |
} | |
def main(args: Array[String]): Unit = { | |
val kws = Source.fromString(keywords).getLines().filter(_.nonEmpty).toList | |
val engine = buildEngine(FastEngine, kws) | |
val input = "在中国,法轮功是一种邪教,李洪志是邪教的头目,大法弟大法弟子" | |
val output = engine.process(input) | |
println(output) | |
} | |
def times(label: String, skip: Int, loops: Int)(f: => Unit): Unit = { | |
0.until(skip).foreach(_ => f) | |
val tm0 = System.currentTimeMillis() | |
0.until(loops).foreach(_ => f) | |
val tm1 = System.currentTimeMillis() | |
println(s"$label loops $loops time:${tm1 - tm0}ms, ${(tm1 - tm0) * 1.0 / loops}ms/per") | |
} | |
def main2(args: Array[String]): Unit = { | |
val kws = Source.fromString(keywords + "\n" + keywords2).getLines().filter(_.nonEmpty).toList | |
println(s"keywords:${kws.size}") | |
val engine1 = buildEngine(FastEngine, kws) | |
val engine2 = buildEngine(PrototypeEngine, kws) | |
val input = Source.fromFile("/tmp/app.e96ae46d.js").mkString | |
times("version1", skip = 1000, loops = 1000) { | |
engine1.process(input) | |
} | |
times("version2", skip = 100, loops = 100) { | |
engine2.process(input) | |
} | |
} | |
} |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package demo | |
import java.io.{FileOutputStream, PrintWriter} | |
import java.util.regex.Pattern | |
import scala.annotation.tailrec | |
import scala.collection.immutable.IntMap | |
import scala.collection.{BitSet, mutable} | |
import scala.io.Source | |
trait SensitiveEngine { | |
def setKeywords(words: List[String]): Unit | |
def process(input: String): String | |
} | |
/** | |
* faster sensetive word replace util. | |
*/ | |
object FastEngine extends SensitiveEngine { | |
class Node(val ch: Char, val depth: Int) { | |
var isKeyword: Boolean = false | |
var nexts: mutable.Map[Char, Node] = collection.mutable.Map[Char, Node]() | |
// optimize for small set (10) using array, otherwise using IntMap | |
private var next_chars: Array[Char] = _ | |
private var next_nodes: Array[Node] = _ | |
private var nn: IntMap[Node] = _ | |
var fallthrough: Node = _ // TODO 暂时未实现,可以进一步优化,避免回溯,理论上性能会更高 | |
def addWord(word: String): Unit = { | |
assert(word.nonEmpty) | |
val char = word.charAt(0) | |
val remain = word.substring(1) | |
val next: Node = nexts.get(char) match { | |
case Some(node) => node | |
case None => | |
val it = new Node(char, depth + 1) | |
nexts(char) = it | |
it | |
} | |
if (remain.isEmpty) { | |
next.isKeyword = true | |
} | |
else { | |
next.addWord(remain) | |
} | |
} | |
def getNext(ch: Char): Node = { | |
if(next_chars != null){ | |
var i = 0 | |
while(i < next_chars.length) { | |
if(next_chars(i) == ch) return next_nodes(i) | |
i += 1 | |
} | |
null | |
} | |
else nn.getOrElse(ch, null) | |
} | |
def afterInit(): Unit ={ | |
if(nexts.size < 10) { | |
next_chars = nexts.keys.toArray.sorted | |
next_nodes = next_chars.map(ch => nexts(ch) ) | |
} | |
else { | |
nn = IntMap.from( nexts.map( x=> (x._1.toInt, x._2) ) ) | |
} | |
nexts.values.foreach(_.afterInit()) | |
nexts = null // free memory | |
} | |
} | |
private val root = new Node(0, 0) | |
private var rootNodes: Array[Node] = _ | |
override def setKeywords(words: List[String]): Unit = { | |
words.foreach(addWord) | |
rootNodes = new Array[Node](0x10000) | |
(0 to 0xFFFF).foreach { ch => | |
rootNodes(ch) = root.nexts.getOrElse(ch.toChar, null) | |
} | |
root.afterInit() | |
} | |
def addWord(word: String): Unit = { | |
root.addWord(word) | |
} | |
def process(input: String): String = { | |
emits = 0 | |
process(input, input.length, 0, root, new Array[Char](input.length), 0) | |
} | |
var emits: Int = _ | |
@tailrec | |
def process(input: String, inputLength: Int, pos: Int, currentNode: Node, buffer: Array[Char], buffPos: Int): String = { | |
var _buffPos = buffPos; | |
if (pos >= inputLength) { // process the last node | |
if (currentNode.isKeyword) { | |
var i = 0 | |
while(i < currentNode.depth) { | |
buffer(_buffPos) = '*' | |
_buffPos += 1 | |
i += 1 | |
} | |
emits += 1 | |
} | |
else { | |
val begin = pos - currentNode.depth | |
var i = begin | |
while(i < pos){ | |
buffer(_buffPos) = input.charAt(i) | |
_buffPos += 1 | |
i += 1 | |
} | |
} | |
new String(buffer) | |
} | |
else { | |
val ch = input.charAt(pos) | |
val nextNode: Node = // currentNode.getNext(ch) | |
if (currentNode eq root) getRootNext(ch) | |
else currentNode.getNext(ch) | |
if ((currentNode eq root) && (nextNode eq null)) { // currentNode == root && nextNode == None | |
buffer(_buffPos) = ch; _buffPos += 1 | |
process(input, inputLength, pos+1, root, buffer, _buffPos) | |
} | |
else if(!(currentNode eq root) && (nextNode eq null)) { // currentNode != root && nextNode == None | |
if (currentNode.isKeyword) { | |
var i = 0 | |
while(i < currentNode.depth){ | |
buffer(_buffPos) = '*' | |
_buffPos += 1 | |
i += 1 | |
} | |
emits += 1 | |
process(input, inputLength, pos, root, buffer, _buffPos) | |
} | |
else { // TODO avoid backward, using the fallthrough field, then continue at the pos + 1 not begin + 1 | |
val begin = pos - currentNode.depth | |
buffer(_buffPos) = input.charAt(begin); _buffPos += 1 | |
process(input, inputLength, begin + 1, root, buffer, _buffPos) | |
} | |
} | |
else { // nextNode != null | |
process(input, inputLength, pos+1, nextNode, buffer, _buffPos) | |
} | |
} | |
} | |
private def buildRootBitSet(root: Node): BitSet = { | |
val mutable = collection.mutable.BitSet() | |
root.nexts.keySet.foreach { ch: Char => | |
mutable.add(ch.toInt) | |
} | |
mutable.toImmutable | |
} | |
private def getRootNext(ch: Char): Node = rootNodes(ch) | |
} | |
/** | |
* a simple prototype implementation | |
*/ | |
object PrototypeEngine extends SensitiveEngine { | |
var pattern: Pattern = _ | |
override def setKeywords(words: List[String]): Unit = { | |
val regexp = words.mkString("|") | |
pattern = Pattern.compile(regexp) | |
} | |
override def process(input: String): String = { | |
val matcher = pattern.matcher(input) | |
matcher.replaceAll("***") | |
} | |
} | |
object Test1 { | |
val keywords = | |
""" | |
福音会 | |
中国教徒 | |
统一教 | |
观音法门 | |
清海无上师 | |
盘古 | |
李洪志 | |
志洪李 | |
李宏志 | |
轮功 | |
法轮 | |
轮法功 | |
三去车仑 | |
氵去车仑 | |
发论工 | |
法x功 | |
法o功 | |
法0功 | |
法一轮一功 | |
轮子功 | |
车仑工力 | |
法lun | |
fa轮 | |
法lg | |
flg | |
fl功 | |
falungong | |
大法弟子 | |
大纪元 | |
dajiyuan | |
明慧网 | |
明慧周报 | |
正见网 | |
新唐人 | |
伪火 | |
退党 | |
tuidang | |
退dang | |
超越红墙 | |
自fen | |
真善忍 | |
""" | |
def buildEngine[T <: SensitiveEngine](engine: T, kws: List[String]): T = { | |
engine.setKeywords(kws) | |
engine | |
} | |
def main1(args: Array[String]): Unit = { | |
val kws = Source.fromString(keywords).getLines().filter(_.nonEmpty).toList | |
val engine = buildEngine(FastEngine, kws) | |
val input = "在中国,法轮功是一种邪教,李洪志是邪教的头目,大法弟大法弟子" | |
val output = engine.process(input) | |
println(output) | |
} | |
def times(label: String, skip: Int, loops: Int)(f: => Unit): Unit = { | |
0.until(skip).foreach(_ => f) | |
val tm0 = System.currentTimeMillis() | |
0.until(loops).foreach(_ => f) | |
val tm1 = System.currentTimeMillis() | |
println(s"$label loops $loops time:${tm1 - tm0}ms, ${(tm1 - tm0) * 1.0 / loops}ms/per") | |
} | |
def load(file: String): List[String] = Source.fromResource(file).getLines() | |
.filter(_.nonEmpty).toList | |
def main(args: Array[String]): Unit = { | |
val kws = // Source.fromString(keywords).getLines().filter(_.nonEmpty).toList | |
load("demo/policy.txt") ++ | |
load("demo/sex.txt") ++ | |
load("demo/sen3.txt") | |
println(s"keywords:${kws.size}") | |
val engine1 = buildEngine(FastEngine, kws) | |
val engine2 = buildEngine(PrototypeEngine, kws) | |
// val input = Source.fromFile("/tmp/app.e96ae46d.js").mkString | |
val input = Source.fromFile("/tmp/sen1.txt").mkString | |
val output = engine1.process(input) | |
val file = new PrintWriter(new FileOutputStream("/tmp/sen2.txt")) | |
file.println(output) | |
file.close() | |
println(s"emits = ${engine1.emits}") | |
times("version1", skip = 1000, loops = 10_000) { | |
engine1.process(input) | |
} | |
// times("version2", skip = 100, loops = 100) { | |
// engine2.process(input) | |
// } | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment