Last active
April 29, 2020 01:58
-
-
Save shinkou/66e537ec0eb964f87752a1a68deaf195 to your computer and use it in GitHub Desktop.
A Java snippet which detects if input parameters have any emoji
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/*
 * The MIT License (MIT)
 *
 * Copyright (c) 2020 Chun-Kwong Wong
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 *
 * Detects, and prints out in groups, any emoji found in the inputs.
 *
 * How to use:
 *
 * $ javac ./EmojiDetector.java
 * $ java [ -Xss<SIZE> ] \
 *        [ -Dmaxlen=<MAX GROUP SIZE> ] \
 *        [ -Dfilemode=<true/false> ] \
 *        EmojiDetector INPUT [ INPUT [ INPUT ... ] ]
 *
 * NOTE:
 *
 * - use the "-Dfilemode" option to treat each INPUT as a filepath and read
 *   its contents.
 *
 * - adjust the "-Xss" and "-Dmaxlen" options to avoid the infamous
 *   StackOverflowError RegExp bug. More details here:
 *   https://bugs.java.com/bugdatabase/view_bug.do?bug_id=6882582
 */
import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.UncheckedIOException;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.stream.Collectors;
/**
 * Finds emoji sequences in strings or in the lines of text files.
 *
 * <p>An instance is configured once with a maximum group size (how many
 * consecutive emoji may be reported together as one match) and a mode flag
 * selecting whether inputs are literal text or file paths.
 */
public class EmojiDetector
{
	/** Usage text printed when the program is started without arguments. */
	final private static String HELPMSG
		= "usage: java [ OPTIONS ] EmojiDetector "
		+ "INPUT [ INPUT [ INPUT ... ] ]\n\n"
		+ "OPTIONS:\n"
		+ " -Dmaxlen=<SIZE> where SIZE is the maximum number of characters of each\n"
		+ " group of capture. (default: 1)\n"
		+ " -Dfilemode=<BOOL> where BOOL can be true/false to indicate whether INPUTs\n"
		+ " are filepaths to read contents from. (default: false)\n"
		+ " -Xss<MEMSIZE> where MEMSIZE is the Java thread stack size.\n";

	// The constants below are character-class BODIES (no surrounding brackets)
	// written with regex-level Unicode escapes; the regex parser combines each
	// escaped surrogate pair into a single supplementary code point.

	/** '#', '*' and the digits 0-9: the bases of keycap sequences. */
	final private static String RE_Digit = "\\u0023\\u002A\\u0030-\\u0039";

	/** Regional indicator symbols U+1F1E6..U+1F1FF (two of them form a flag). */
	final private static String RE_RI = "\\uD83C\\uDDE6-\\uD83C\\uDDFF";

	/** Emoji code points located in the Basic Multilingual Plane. */
	final private static String RE_2Byte = "\\u00A9\\u00AE\\u203C\\u2049\\u2122\\u2139\\u2194-\\u2199\\u21A9-\\u21AA\\u231A-\\u231B\\u2328\\u23CF\\u23E9-\\u23EF\\u23F0-\\u23F3\\u23F8-\\u23FA\\u24C2\\u25AA\\u25AB\\u25B6\\u25C0\\u25FB-\\u25FE\\u2600-\\u2604\\u260E\\u2611\\u2614-\\u2615\\u2618\\u261D\\u2620\\u2622-\\u2623\\u2626\\u262A\\u262E\\u262F\\u2638\\u2639\\u263A\\u2640\\u2642\\u2648-\\u2653\\u265F\\u2660\\u2663\\u2665-\\u2666\\u2668\\u267B\\u267E\\u267F\\u2692-\\u2697\\u2699\\u269B-\\u269C\\u26A0-\\u26A1\\u26A7\\u26AA-\\u26AB\\u26B0-\\u26B1\\u26BD-\\u26BE\\u26C4-\\u26C5\\u26C8\\u26CE\\u26CF\\u26D1\\u26D3\\u26D4\\u26E9\\u26EA\\u26F0-\\u26F5\\u26F7-\\u26F9\\u26FA\\u26FD\\u2702\\u2705\\u2708-\\u270C\\u270D\\u270F\\u2712\\u2714\\u2716\\u271D\\u2721\\u2728\\u2733-\\u2734\\u2744\\u2747\\u274C\\u274E\\u2753-\\u2755\\u2757\\u2763\\u2764\\u2795-\\u2797\\u27A1\\u27B0\\u27BF\\u2934-\\u2935\\u2B05-\\u2B07\\u2B1B-\\u2B1C\\u2B50\\u2B55\\u3030\\u303D\\u3297\\u3299";

	/** Emoji code points outside the BMP, written as surrogate-pair escapes. */
	final private static String RE_3Byte = "\\uD83C\\uDC04\\uD83C\\uDCCF\\uD83C\\uDD70-\\uD83C\\uDD71\\uD83C\\uDD7E-\\uD83C\\uDD7F\\uD83C\\uDD8E\\uD83C\\uDD91-\\uD83C\\uDD9A\\uD83C\\uDDE6-\\uD83C\\uDDFF\\uD83C\\uDE01-\\uD83C\\uDE02\\uD83C\\uDE1A\\uD83C\\uDE2F\\uD83C\\uDE32-\\uD83C\\uDE3A\\uD83C\\uDE50-\\uD83C\\uDE51\\uD83C\\uDF00-\\uD83C\\uDF21\\uD83C\\uDF24-\\uD83C\\uDF2C\\uD83C\\uDF2D-\\uD83C\\uDF2F\\uD83C\\uDF30-\\uD83C\\uDF93\\uD83C\\uDF96-\\uD83C\\uDF97\\uD83C\\uDF99-\\uD83C\\uDF9B\\uD83C\\uDF9E-\\uD83C\\uDF9F\\uD83C\\uDFA0-\\uD83C\\uDFF0\\uD83C\\uDFF3-\\uD83C\\uDFF5\\uD83C\\uDFF7\\uD83C\\uDFF8-\\uD83D\\uDC29\\uD83D\\uDC2A-\\uD83D\\uDCFD\\uD83D\\uDCFF-\\uD83D\\uDD3D\\uD83D\\uDD49-\\uD83D\\uDD4E\\uD83D\\uDD50-\\uD83D\\uDD67\\uD83D\\uDD6F-\\uD83D\\uDD70\\uD83D\\uDD73-\\uD83D\\uDD7A\\uD83D\\uDD87\\uD83D\\uDD8A-\\uD83D\\uDD8D\\uD83D\\uDD90\\uD83D\\uDD95-\\uD83D\\uDD96\\uD83D\\uDDA4\\uD83D\\uDDA5\\uD83D\\uDDA8\\uD83D\\uDDB1-\\uD83D\\uDDB2\\uD83D\\uDDBC\\uD83D\\uDDC2-\\uD83D\\uDDC4\\uD83D\\uDDD1-\\uD83D\\uDDD3\\uD83D\\uDDDC-\\uD83D\\uDDDE\\uD83D\\uDDE1\\uD83D\\uDDE3\\uD83D\\uDDE8\\uD83D\\uDDEF\\uD83D\\uDDF3\\uD83D\\uDDFA-\\uD83D\\uDE4F\\uD83D\\uDE80-\\uD83D\\uDEC5\\uD83D\\uDECB-\\uD83D\\uDED2\\uD83D\\uDED5-\\uD83D\\uDED7\\uD83D\\uDEE0-\\uD83D\\uDEE5\\uD83D\\uDEE9\\uD83D\\uDEEB-\\uD83D\\uDEEC\\uD83D\\uDEF0\\uD83D\\uDEF3\\uD83D\\uDEF4-\\uD83D\\uDEFC\\uD83D\\uDFE0-\\uD83D\\uDFEB\\uD83E\\uDD0C-\\uD83E\\uDD3A\\uD83E\\uDD3C-\\uD83E\\uDD45\\uD83E\\uDD47-\\uD83E\\uDD78\\uD83E\\uDD7A-\\uD83E\\uDDCB\\uD83E\\uDDCD-\\uD83E\\uDDFF\\uD83E\\uDE70-\\uD83E\\uDE74\\uD83E\\uDE78-\\uD83E\\uDE86\\uD83E\\uDE90-\\uD83E\\uDEA8\\uD83E\\uDEB0-\\uD83E\\uDEB6\\uD83E\\uDEC0-\\uD83E\\uDEC2\\uD83E\\uDED0-\\uD83E\\uDED6";

	/** Emoji modifiers U+1F3FB..U+1F3FF. */
	final private static String RE_EMod = "\\uD83C\\uDFFB-\\uD83C\\uDFFF";

	/** Every base emoji code point: the BMP set followed by the supplementary set. */
	final private static String RE_Emoji = RE_2Byte + RE_3Byte;

	/** One or more tag characters U+E0020..U+E007E closed by the cancel tag U+E007F. */
	final private static String RE_TagMod = "[\\uDB40\\uDC20-\\uDB40\\uDC7E]+\\uDB40\\uDC7F";

	/**
	 * Pattern source matching ONE emoji sequence. Alternatives, in order:
	 * a pair of regional indicators; a keycap sequence (digit base, optional
	 * U+FE0F, then U+20E3); or a base emoji with an optional modifier /
	 * variation selector / tag sequence, followed by any number of
	 * U+200D-joined emoji that may each carry the same optional suffix.
	 */
	final private static String RE =
		"[" + RE_RI + "]{2}"
		+ "|[" + RE_Digit + "]\\uFE0F?\\u20E3"
		+ "|[" + RE_Emoji + "]"
		+ "(?:[" + RE_EMod + "]"
		+ "|\\uFE0F\\u20E3?"
		+ "|" + RE_TagMod + ")?"
		+ "(?:\\u200D[" + RE_Emoji + "]"
		+ "(?:[" + RE_EMod + "]"
		+ "|\\uFE0F\\u20E3?"
		+ "|" + RE_TagMod + ")?"
		+ ")*"
		;

	/** Compiled pattern; its repetition bound is derived from {@code maxlen}. */
	final private Pattern re;

	/** When true, inputs are file paths whose lines are scanned. */
	final private boolean filemode;

	/**
	 * Creates a detector.
	 *
	 * @param maxlen maximum number of consecutive emoji sequences reported as
	 *        one match: {@code 1} reports each sequence on its own, a negative
	 *        value removes the limit, any other positive value is the cap
	 * @param filemode {@code true} to treat inputs as file paths,
	 *        {@code false} to scan the input strings themselves
	 * @throws IllegalArgumentException if {@code maxlen} is {@code 0}, which
	 *         cannot be expressed as a repetition range
	 */
	public EmojiDetector(int maxlen, boolean filemode)
	{
		if (0 == maxlen)
		{
			// "{1,0}" is an illegal quantifier; fail with a readable message
			// rather than an opaque pattern-syntax error.
			throw new IllegalArgumentException(
				"maxlen must not be 0 (use a positive cap, or a negative value for no limit)");
		}

		final String source;
		if (1 == maxlen)
			source = RE;
		else if (0 > maxlen)
			source = "(?:" + RE + ")+";
		else
			source = "(?:" + RE + "){1," + maxlen + "}";

		this.re = Pattern.compile(source);
		this.filemode = filemode;
	}

	/**
	 * Scans one input according to the configured mode.
	 *
	 * @param s the text to scan, or the path of the file to scan in file mode
	 * @return the matched emoji groups in order of appearance; empty when
	 *         nothing matched or, in file mode, when the file was unreadable
	 */
	public List<String> processInput(String s)
	{
		return this.filemode ? this.processFile(s) : this.processString(s);
	}

	/**
	 * Collects every match of the compiled pattern in {@code s}.
	 *
	 * @param s text to scan
	 * @return matches in order of appearance (possibly empty, never null)
	 */
	private List<String> processString(String s)
	{
		List<String> out = new ArrayList<>();
		Matcher m = this.re.matcher(s);
		while (m.find())
			out.add(m.group());
		return out;
	}

	/**
	 * Scans a file line by line and concatenates the per-line matches.
	 *
	 * <p>The file is decoded as UTF-8 regardless of the platform default
	 * charset, and the reader is always closed. Failures are reported on
	 * standard error and yield an empty result.
	 *
	 * @param fpath path of the file to read
	 * @return matches from all lines in file order, or an empty list if the
	 *         file could not be opened or read
	 */
	private List<String> processFile(String fpath)
	{
		try (BufferedReader br = new BufferedReader(
			new InputStreamReader(new FileInputStream(fpath), StandardCharsets.UTF_8)))
		{
			// The stream is fully consumed by collect() before the reader closes.
			return br.lines()
				.map(this::processString)
				.flatMap(List::stream)
				.collect(Collectors.toList());
		}
		catch (FileNotFoundException fnfe)
		{
			System.err.println("Could not open file \"" + fpath + "\".");
		}
		catch (IOException | UncheckedIOException ioe)
		{
			// BufferedReader.lines() wraps read failures in UncheckedIOException.
			System.err.println("Could not read file \"" + fpath + "\": " + ioe.getMessage());
		}
		return Collections.emptyList();
	}

	/**
	 * Command-line entry point: prints every emoji group found in each
	 * argument, one per line. Configuration comes from the system properties
	 * {@code maxlen} (default {@code 1}) and {@code filemode} (default
	 * {@code false}).
	 *
	 * @param args inputs to scan; with no arguments the usage text is printed
	 */
	public static void main(String[] args)
	{
		if (0 == args.length)
		{
			System.out.println(HELPMSG);
			return;
		}

		final EmojiDetector detector;
		try
		{
			detector = new EmojiDetector
			(
				Integer.parseInt(System.getProperty("maxlen", "1"))
				, Boolean.parseBoolean(System.getProperty("filemode", "false"))
			);
		}
		catch (IllegalArgumentException iae)
		{
			// Covers NumberFormatException (non-numeric maxlen) and maxlen == 0.
			System.err.println("Invalid -Dmaxlen value: " + iae.getMessage());
			System.err.println(HELPMSG);
			System.exit(1);
			return;
		}

		for (String arg: args)
			for (String emoji: detector.processInput(arg))
				System.out.println(emoji);
	}
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment