Last active
December 12, 2023 23:18
-
-
Save Jire/4aa72bd3554cdccdc369c216a230ee56 to your computer and use it in GitHub Desktop.
Based on PimDeWitte's implementation, this improves performance by over an order of magnitude and eliminates all garbage (allocations).
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import it.unimi.dsi.fastutil.longs.Long2ObjectMap;
import it.unimi.dsi.fastutil.longs.Long2ObjectOpenHashMap;
import net.openhft.hashing.LongHashFunction;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.URL;
import java.nio.charset.StandardCharsets;
/** | |
* Originally created by Pim De Witte. | |
* | |
* Performance drastically improved by over an order of magnitude by Thomas G. P. Nappo (Jire). | |
* Garbage production has been eliminated as well. | |
*/ | |
public class BadWords { | |
static Long2ObjectMap<String[]> words = new Long2ObjectOpenHashMap<>(); | |
static int largestWordLength = 0; | |
public static void flag(String word) { | |
String[] ignore_in_combination_with_words = new String[]{}; | |
if (word.length() > largestWordLength) { | |
largestWordLength = word.length(); | |
} | |
words.put(LongHashFunction.xx().hashChars(word.replaceAll(" ", "")), ignore_in_combination_with_words); | |
} | |
public static void loadConfigs() { | |
try { | |
BufferedReader reader = new BufferedReader(new InputStreamReader(new URL("https://docs.google.com/spreadsheets/d/1hIEi2YG3ydav1E06Bzf2mQbGZ12kh2fe4ISgLg_UBuM/export?format=csv").openConnection().getInputStream())); | |
String line = ""; | |
int counter = 0; | |
while((line = reader.readLine()) != null) { | |
counter++; | |
String[] content = null; | |
try { | |
content = line.split(","); | |
if(content.length == 0) { | |
continue; | |
} | |
String word = content[0]; | |
String[] ignore_in_combination_with_words = new String[]{}; | |
if(content.length > 1) { | |
ignore_in_combination_with_words = content[1].split("_"); | |
} | |
if(word.length() > largestWordLength) { | |
largestWordLength = word.length(); | |
} | |
words.put(LongHashFunction.xx().hashChars(word.replace(" ", "")), ignore_in_combination_with_words); | |
} catch(Exception e) { | |
e.printStackTrace(); | |
} | |
} | |
System.out.println("Loaded " + counter + " words to filter out"); | |
} catch (IOException e) { | |
e.printStackTrace(); | |
} | |
} | |
private static final char[][] convert = { | |
{'o', '0'}, | |
{'i', '1'}, | |
{'l', '1'}, | |
{'t', '+'}, | |
{'e', '3'}, | |
{'i', '!'}, | |
{'l', '!'}, | |
{'s', '$'}, | |
{'a', '&'}, | |
{'a', '@'}, | |
{'c', '('}, | |
{'d', ')'}, | |
{'d', '0'}, | |
{'g', '6'}, | |
{'t', '7'}, | |
{'g', '9'}, | |
{'s', '5'}, | |
{'a', '4'} | |
}; | |
private static final ThreadLocal<StringBuilder> sb = ThreadLocal.withInitial(StringBuilder::new); // make this regular if you don't need thread safety. | |
/** | |
* Iterates over a String input and checks whether a cuss word was found in a list, then checks if the word should be ignored (e.g. bass contains the word *ss). | |
* | |
* @param input | |
* @return | |
*/ | |
public static boolean badWordsFound(String input) { | |
if (input == null) { | |
return false; | |
} | |
StringBuilder sb = BadWords.sb.get(); | |
sb.setLength(0); | |
removeLeetspeak: | |
for (int i = 0; i < input.length(); i++) { | |
char c = input.charAt(i); | |
if (Character.isLetter(c)) { | |
sb.append(Character.toLowerCase(c)); | |
} else { | |
for (char[] conversion : convert) { | |
if (c == conversion[1]) { | |
sb.append(conversion[0]); | |
continue removeLeetspeak; | |
} | |
} | |
} | |
} | |
// iterate over each letter in the word | |
for (int start = 0; start < sb.length(); start++) { | |
// from each letter, keep going to find bad words until either the end of the sentence is reached, or the max word length is reached. | |
for (int offset = 1; offset < (sb.length() + 1 - start) && offset < largestWordLength; offset++) { | |
long hash = LongHashFunction.xx().hashChars(sb, start, offset); | |
if (words.containsKey(hash)) { | |
// for example, if you want to say the word bass, that should be possible. | |
String[] ignoreCheck = words.get(hash); | |
boolean ignore = false; | |
for (int s = 0; s < ignoreCheck.length; s++) { | |
if (indexOf(sb, ignoreCheck[s]) >= 0) { | |
ignore = true; | |
break; | |
} | |
} | |
if (!ignore) { | |
return true; | |
} | |
} | |
} | |
} | |
return false; | |
} | |
private static int indexOf(CharSequence source, CharSequence target) { | |
int sourceCount = source.length(); | |
int targetCount = target.length(); | |
int sourceOffset = 0; | |
int targetOffset = 0; | |
if (0 >= sourceCount) { | |
return (targetCount == 0 ? sourceCount : -1); | |
} | |
if (targetCount == 0) { | |
return 0; | |
} | |
char first = target.charAt(targetOffset); | |
int max = sourceOffset + (sourceCount - targetCount); | |
for (int i = sourceOffset; i <= max; i++) { | |
/* Look for first character. */ | |
if (source.charAt(i) != first) { | |
while (++i <= max && source.charAt(i) != first); | |
} | |
/* Found first character, now look at the rest of v2 */ | |
if (i <= max) { | |
int j = i + 1; | |
int end = j + targetCount - 1; | |
for (int k = targetOffset + 1; j < end && source.charAt(j) | |
== target.charAt(k); j++, k++); | |
if (j == end) { | |
/* Found whole string. */ | |
return i - sourceOffset; | |
} | |
} | |
} | |
return -1; | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
I know you are focused on full memory optimization, but is there any way to return the swear word for reference in an automatic report? I get a ton of false positives in messages, and it's easier to figure out what the false positive is if I have the word available.
ie
if (!ignore) { return wordToCheck; }
and at the end if nothing is found
return null