Last active
January 3, 2019 15:19
-
-
Save EliasRanz/eba5211b3a364b32a18a7f81e78385a6 to your computer and use it in GitHub Desktop.
String Utility that handles common String operations such as checking against character spam, and ascii art for Twitch moderation.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import org.apache.commons.lang3.StringUtils; | |
import java.util.*; | |
import java.util.regex.Matcher; | |
import java.util.regex.Pattern; | |
/**
 * Static string helpers for chat moderation: non-ASCII detection, per-character
 * frequency counting, capital-letter ratio and bigram-based string similarity.
 *
 * <p>All methods work on UTF-16 {@code char} units, not Unicode code points.
 * Except where noted, a {@code null} argument is treated like an empty string.
 */
public class StringUtil {

    /** A message is flagged as spam when one character occurs more than this many times. */
    private static final int SPAM_CHAR_COUNT = 10;

    /** Matches a single character outside the 7-bit ASCII range; compiled once and reused. */
    private static final Pattern NON_ASCII = Pattern.compile("\\P{ASCII}");

    /**
     * Reports whether the text contains at least one non-ASCII character.
     *
     * <p>Despite the name, this returns {@code true} for text that is NOT pure ASCII.
     *
     * @param text the text to inspect; may be {@code null}
     * @return {@code true} if any character is outside the ASCII range,
     *         {@code false} for pure-ASCII, empty or {@code null} text
     */
    public static boolean containsAscii(String text) {
        // A single find() is enough; there is no need to collect every match.
        return text != null && NON_ASCII.matcher(text).find();
    }

    /**
     * Returns the distinct characters of the text in order of first appearance.
     *
     * @param text the text to inspect; may be {@code null}
     * @return a new mutable list of the distinct characters; empty for empty or {@code null} text
     */
    public static List<Character> getUniqueChars(String text) {
        if (text == null) {
            return new ArrayList<>();
        }
        // LinkedHashSet gives constant-time de-duplication and keeps first-seen order.
        Set<Character> seen = new LinkedHashSet<>();
        for (int i = 0; i < text.length(); i++) {
            seen.add(text.charAt(i));
        }
        return new ArrayList<>(seen);
    }

    /**
     * Counts how often each character occurs in the text.
     *
     * @param text the text to inspect; may be {@code null}
     * @return a new mutable map from character to occurrence count;
     *         empty for empty or {@code null} text
     */
    public static Map<Character, Integer> getCharacterCount(String text) {
        Map<Character, Integer> counts = new HashMap<>();
        if (text == null) {
            return counts;
        }
        // Single pass over the text instead of one full scan per distinct character.
        for (int i = 0; i < text.length(); i++) {
            Character current = text.charAt(i);
            Integer previous = counts.get(current);
            counts.put(current, previous == null ? 1 : previous + 1);
        }
        return counts;
    }

    /**
     * Reports whether any single character occurs more than {@code SPAM_CHAR_COUNT} times.
     *
     * @param text the text to inspect; may be {@code null}
     * @return {@code true} if the most frequent character exceeds the threshold;
     *         {@code false} otherwise, including for empty or {@code null} text
     */
    public static boolean isSpamCharCount(String text) {
        // The maximum count exceeds the threshold exactly when some count does,
        // and an empty map simply yields false instead of a null dereference.
        for (int count : getCharacterCount(text).values()) {
            if (count > SPAM_CHAR_COUNT) {
                return true;
            }
        }
        return false;
    }

    /**
     * Computes the fraction of characters that are ASCII capital letters ({@code A}-{@code Z}).
     *
     * @param text the text to inspect; may be {@code null}
     * @return a value in {@code [0, 1]}; {@code 0} for empty or {@code null} text
     */
    public static double checkPercentageOfCaps(String text) {
        if (text == null || text.isEmpty()) {
            // Avoids 0/0, which would otherwise produce NaN.
            return 0;
        }
        int capitals = 0;
        for (int i = 0; i < text.length(); i++) {
            char c = text.charAt(i);
            if (c >= 'A' && c <= 'Z') {
                capitals++;
            }
        }
        return (double) capitals / text.length();
    }

    /**
     * Computes the Dice coefficient of two strings over their character bigrams:
     * twice the number of shared bigrams divided by the total number of bigrams.
     *
     * @param s the first string; may be {@code null}
     * @param t the second string; may be {@code null}
     * @return {@code 1} if the strings are equal; {@code 0} if either is {@code null}
     *         or shorter than two characters (and they are not equal); otherwise a
     *         value in {@code [0, 1]}
     */
    public static double checkSimilarity(String s, String t) {
        if (s == null || t == null) {
            return 0;
        }
        if (s.equals(t)) {
            return 1;
        }
        if (s.length() < 2 || t.length() < 2) {
            return 0;
        }

        int[] sPairs = generateBigram(s);
        int[] tPairs = generateBigram(t);
        Arrays.sort(sPairs);
        Arrays.sort(tPairs);

        // Merge-walk over the two sorted arrays; each shared bigram is counted
        // once per side, hence += 2.
        int matches = 0;
        int i = 0;
        int j = 0;
        while (i < sPairs.length && j < tPairs.length) {
            if (sPairs[i] == tPairs[j]) {
                matches += 2;
                i++;
                j++;
            } else if (sPairs[i] < tPairs[j]) {
                i++;
            } else {
                j++;
            }
        }
        return (double) matches / (sPairs.length + tPairs.length);
    }

    /**
     * Encodes every adjacent character pair of {@code s} as one int:
     * first character in the high 16 bits, second character in the low 16 bits.
     *
     * @param s a string of at least one character
     * @return an array of {@code s.length() - 1} encoded bigrams, in string order
     */
    private static int[] generateBigram(String s) {
        final int[] pairs = new int[s.length() - 1];
        for (int i = 0; i < pairs.length; i++) {
            // Both halves are always set, including for the last pair.
            pairs[i] = (s.charAt(i) << 16) | s.charAt(i + 1);
        }
        return pairs;
    }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
This is a class that I'm working on for my Twitch bot.
Current Methods
containsAscii: takes a string of text and checks whether it contains any non-ASCII characters, which is used as a sign of ASCII art.
getUniqueChars: takes a string of text and gets the unique characters in the given string.
getCharacterCount: takes a string of text and counts how many instances of each character there are.
isSpamCharCount: takes a string of text and checks whether any single character occurs more often than the spam threshold (10).
checkSimilarity: takes two strings and runs the Dice's coefficient algorithm on them, which returns a double between 0 and 1 indicating how closely they match.
checkPercentageOfCaps: takes a string of text and returns the fraction of its characters that are capital letters.