Skip to content

Instantly share code, notes, and snippets.

@shinkou
Last active April 29, 2020 01:58
Show Gist options
  • Save shinkou/66e537ec0eb964f87752a1a68deaf195 to your computer and use it in GitHub Desktop.
Save shinkou/66e537ec0eb964f87752a1a68deaf195 to your computer and use it in GitHub Desktop.
A Java snippet which detects if input parameters have any emoji
/*
* The MIT License (MIT)
*
* Copyright (c) 2020 Chun-Kwong Wong
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to
* deal in the Software without restriction, including without limitation the
* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
* sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*
* Detects, and prints out in groups, if inputs have any emoji.
*
* How to use:
*
* $ javac ./EmojiDetector.java
* $ java [-Xss<SIZE>] \
* [ -Dmaxlen=<MAX GROUP SIZE> ] \
* [ -Dfilemode=<true/false> ] \
* EmojiDetector INPUT [ INPUT [ INPUT ... ] ]
*
* NOTE:
*
* - use "-Dfilemode" option to treat INPUT as filepaths and read contents
* from them.
*
* - adjust option "-Xss" and "-Dmaxlen" to avoid the infamous
* StackOverflowError RegExp bug. More details here:
* https://bugs.java.com/bugdatabase/view_bug.do?bug_id=6882582
*/
import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.UncheckedIOException;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.stream.Collectors;
/**
 * Finds emoji sequences in strings, or in the lines of text files, using a
 * single regular expression assembled from the character tables below.
 *
 * <p>Matches can be reported one emoji at a time or as runs of adjacent
 * emoji, depending on the {@code maxlen} value given to the constructor.
 */
public class EmojiDetector
{
    /** Usage text printed by {@link #main(String[])}. */
    private static final String HELPMSG
        = "usage: java [ OPTIONS ] EmojiDetector "
        + "INPUT [ INPUT [ INPUT ... ] ]\n\n"
        + "OPTIONS:\n"
        + " -Dmaxlen=<SIZE> where SIZE is the maximum number of characters of each\n"
        + " group of capture. (default: 1)\n"
        + " -Dfilemode=<BOOL> where BOOL can be true/false to indicate whether INPUTs\n"
        + " are filepaths to read contents from. (default: false)\n"
        + " -Xss<MEMSIZE> where MEMSIZE is the Java thread stack size.\n";

    // All RE_* tables below are character-class CONTENTS (no surrounding
    // brackets); they are wrapped in "[...]" where RE is assembled.

    /** Keycap bases: '#' (U+0023), '*' (U+002A) and the digits U+0030..U+0039. */
    private static final String RE_Digit = "\\u0023\\u002A\\u0030-\\u0039";

    /** Code points U+1F1E6..U+1F1FF, written as surrogate-pair escapes. */
    private static final String RE_RI = "\\uD83C\\uDDE6-\\uD83C\\uDDFF";

    /** Emoji code points that fit in a single UTF-16 code unit. */
    private static final String RE_2Byte = "\\u00A9\\u00AE\\u203C\\u2049\\u2122\\u2139\\u2194-\\u2199\\u21A9-\\u21AA\\u231A-\\u231B\\u2328\\u23CF\\u23E9-\\u23EF\\u23F0-\\u23F3\\u23F8-\\u23FA\\u24C2\\u25AA\\u25AB\\u25B6\\u25C0\\u25FB-\\u25FE\\u2600-\\u2604\\u260E\\u2611\\u2614-\\u2615\\u2618\\u261D\\u2620\\u2622-\\u2623\\u2626\\u262A\\u262E\\u262F\\u2638\\u2639\\u263A\\u2640\\u2642\\u2648-\\u2653\\u265F\\u2660\\u2663\\u2665-\\u2666\\u2668\\u267B\\u267E\\u267F\\u2692-\\u2697\\u2699\\u269B-\\u269C\\u26A0-\\u26A1\\u26A7\\u26AA-\\u26AB\\u26B0-\\u26B1\\u26BD-\\u26BE\\u26C4-\\u26C5\\u26C8\\u26CE\\u26CF\\u26D1\\u26D3\\u26D4\\u26E9\\u26EA\\u26F0-\\u26F5\\u26F7-\\u26F9\\u26FA\\u26FD\\u2702\\u2705\\u2708-\\u270C\\u270D\\u270F\\u2712\\u2714\\u2716\\u271D\\u2721\\u2728\\u2733-\\u2734\\u2744\\u2747\\u274C\\u274E\\u2753-\\u2755\\u2757\\u2763\\u2764\\u2795-\\u2797\\u27A1\\u27B0\\u27BF\\u2934-\\u2935\\u2B05-\\u2B07\\u2B1B-\\u2B1C\\u2B50\\u2B55\\u3030\\u303D\\u3297\\u3299";

    /** Emoji code points outside the BMP, written as surrogate-pair escapes. */
    private static final String RE_3Byte = "\\uD83C\\uDC04\\uD83C\\uDCCF\\uD83C\\uDD70-\\uD83C\\uDD71\\uD83C\\uDD7E-\\uD83C\\uDD7F\\uD83C\\uDD8E\\uD83C\\uDD91-\\uD83C\\uDD9A\\uD83C\\uDDE6-\\uD83C\\uDDFF\\uD83C\\uDE01-\\uD83C\\uDE02\\uD83C\\uDE1A\\uD83C\\uDE2F\\uD83C\\uDE32-\\uD83C\\uDE3A\\uD83C\\uDE50-\\uD83C\\uDE51\\uD83C\\uDF00-\\uD83C\\uDF21\\uD83C\\uDF24-\\uD83C\\uDF2C\\uD83C\\uDF2D-\\uD83C\\uDF2F\\uD83C\\uDF30-\\uD83C\\uDF93\\uD83C\\uDF96-\\uD83C\\uDF97\\uD83C\\uDF99-\\uD83C\\uDF9B\\uD83C\\uDF9E-\\uD83C\\uDF9F\\uD83C\\uDFA0-\\uD83C\\uDFF0\\uD83C\\uDFF3-\\uD83C\\uDFF5\\uD83C\\uDFF7\\uD83C\\uDFF8-\\uD83D\\uDC29\\uD83D\\uDC2A-\\uD83D\\uDCFD\\uD83D\\uDCFF-\\uD83D\\uDD3D\\uD83D\\uDD49-\\uD83D\\uDD4E\\uD83D\\uDD50-\\uD83D\\uDD67\\uD83D\\uDD6F-\\uD83D\\uDD70\\uD83D\\uDD73-\\uD83D\\uDD7A\\uD83D\\uDD87\\uD83D\\uDD8A-\\uD83D\\uDD8D\\uD83D\\uDD90\\uD83D\\uDD95-\\uD83D\\uDD96\\uD83D\\uDDA4\\uD83D\\uDDA5\\uD83D\\uDDA8\\uD83D\\uDDB1-\\uD83D\\uDDB2\\uD83D\\uDDBC\\uD83D\\uDDC2-\\uD83D\\uDDC4\\uD83D\\uDDD1-\\uD83D\\uDDD3\\uD83D\\uDDDC-\\uD83D\\uDDDE\\uD83D\\uDDE1\\uD83D\\uDDE3\\uD83D\\uDDE8\\uD83D\\uDDEF\\uD83D\\uDDF3\\uD83D\\uDDFA-\\uD83D\\uDE4F\\uD83D\\uDE80-\\uD83D\\uDEC5\\uD83D\\uDECB-\\uD83D\\uDED2\\uD83D\\uDED5-\\uD83D\\uDED7\\uD83D\\uDEE0-\\uD83D\\uDEE5\\uD83D\\uDEE9\\uD83D\\uDEEB-\\uD83D\\uDEEC\\uD83D\\uDEF0\\uD83D\\uDEF3\\uD83D\\uDEF4-\\uD83D\\uDEFC\\uD83D\\uDFE0-\\uD83D\\uDFEB\\uD83E\\uDD0C-\\uD83E\\uDD3A\\uD83E\\uDD3C-\\uD83E\\uDD45\\uD83E\\uDD47-\\uD83E\\uDD78\\uD83E\\uDD7A-\\uD83E\\uDDCB\\uD83E\\uDDCD-\\uD83E\\uDDFF\\uD83E\\uDE70-\\uD83E\\uDE74\\uD83E\\uDE78-\\uD83E\\uDE86\\uD83E\\uDE90-\\uD83E\\uDEA8\\uD83E\\uDEB0-\\uD83E\\uDEB6\\uD83E\\uDEC0-\\uD83E\\uDEC2\\uD83E\\uDED0-\\uD83E\\uDED6";

    /** Code points U+1F3FB..U+1F3FF, allowed directly after a base emoji. */
    private static final String RE_EMod = "\\uD83C\\uDFFB-\\uD83C\\uDFFF";

    /** Every base emoji code point: both tables above combined. */
    private static final String RE_Emoji = RE_2Byte + RE_3Byte;

    /**
     * A complete sub-pattern (not class contents): one or more code points
     * in U+E0020..U+E007E followed by the terminator U+E007F.
     */
    private static final String RE_TagMod = "[\\uDB40\\uDC20-\\uDB40\\uDC7E]+\\uDB40\\uDC7F";

    /** Pattern source matching exactly one emoji sequence. */
    private static final String RE =
        // Alternative 1: exactly two code points from RE_RI.
        "[" + RE_RI + "]{2}"
        // Alternative 2: keycap base, optional U+FE0F, then U+20E3.
        + "|[" + RE_Digit + "]\\uFE0F?\\u20E3"
        // Alternative 3: a base emoji ...
        + "|[" + RE_Emoji + "]"
        // ... optionally followed by one modifier, U+FE0F (plus optional
        // U+20E3), or a tag sequence ...
        + "(?:[" + RE_EMod + "]"
        + "|\\uFE0F\\u20E3?"
        + "|" + RE_TagMod + ")?"
        // ... then any number of U+200D-joined emoji, each with the same
        // optional suffix.
        + "(?:\\u200D[" + RE_Emoji + "]"
        + "(?:[" + RE_EMod + "]"
        + "|\\uFE0F\\u20E3?"
        + "|" + RE_TagMod + ")?"
        + ")*"
        ;

    /** Compiled pattern; its repetition bound is derived from {@code maxlen}. */
    private final Pattern re;

    /** When true, inputs are treated as file paths instead of literal text. */
    private final boolean filemode;

    /**
     * Creates a detector.
     *
     * @param maxlen maximum number of adjacent emoji sequences reported as
     *        one match: {@code 1} reports each sequence separately, a
     *        negative value places no upper bound on a run, any other
     *        positive value caps the run at that many sequences
     * @param filemode {@code true} to make {@link #processInput(String)}
     *        read its argument as a file path
     * @throws IllegalArgumentException if {@code maxlen} is {@code 0}
     */
    public EmojiDetector(int maxlen, boolean filemode)
    {
        // Zero would produce the quantifier "{1,0}", which Pattern rejects
        // with an opaque "Illegal repetition range" error; fail clearly here.
        if (0 == maxlen)
        {
            throw new IllegalArgumentException(
                "maxlen must not be 0 (use a positive group size, or a"
                + " negative value for unlimited)");
        }

        if (1 == maxlen)
        {
            this.re = Pattern.compile(RE);
        }
        else if (0 > maxlen)
        {
            this.re = Pattern.compile("(?:" + RE + ")+");
        }
        else
        {
            this.re = Pattern.compile("(?:" + RE + "){1," + maxlen + "}");
        }
        this.filemode = filemode;
    }

    /**
     * Extracts emoji matches from one input.
     *
     * @param s literal text to scan, or a file path when the detector was
     *        created in file mode
     * @return the matches in order of appearance; empty when there are none
     *         or when the file could not be read
     */
    public List<String> processInput(String s)
    {
        return this.filemode ? this.processFile(s) : this.processString(s);
    }

    /**
     * Scans a string and collects every match of the compiled pattern.
     *
     * @param s text to scan
     * @return the matched substrings in order of appearance
     */
    private List<String> processString(String s)
    {
        List<String> out = new ArrayList<>();
        Matcher m = this.re.matcher(s);
        while (m.find())
        {
            out.add(m.group());
        }
        return out;
    }

    /**
     * Scans a file line by line and collects every match.
     *
     * <p>The file is decoded as UTF-8 regardless of the platform default
     * charset, and the reader is always closed. Failures are reported on
     * standard error and yield an empty result.
     *
     * @param fpath path of the file to read
     * @return the matches from all lines, in file order
     */
    private List<String> processFile(String fpath)
    {
        // try-with-resources closes the reader (and the underlying stream)
        // on every exit path; the stream is fully consumed by collect()
        // before the reader is closed.
        try (BufferedReader br = new BufferedReader(
                new InputStreamReader(
                    new FileInputStream(fpath), StandardCharsets.UTF_8)))
        {
            return br.lines()
                .map(this::processString)
                .flatMap(List::stream)
                .collect(Collectors.toList());
        }
        catch (FileNotFoundException fnfe)
        {
            System.err.println("Could not open file \"" + fpath + "\".");
        }
        catch (IOException | UncheckedIOException ioe)
        {
            // lines() wraps read failures in UncheckedIOException; close()
            // may throw IOException.
            System.err.println("Could not read file \"" + fpath + "\": "
                + ioe.getMessage());
        }
        return Collections.emptyList();
    }

    /**
     * Command-line entry point: prints each match found in each argument on
     * its own line. Reads the {@code maxlen} and {@code filemode} system
     * properties; prints the usage text when no arguments are given.
     *
     * @param args inputs to scan (literal text, or file paths in file mode)
     */
    public static void main(String[] args)
    {
        if (0 == args.length)
        {
            System.out.println(HELPMSG);
            return;
        }

        EmojiDetector detector;
        try
        {
            detector = new EmojiDetector(
                Integer.parseInt(System.getProperty("maxlen", "1")),
                Boolean.parseBoolean(System.getProperty("filemode", "false")));
        }
        catch (IllegalArgumentException iae)
        {
            // Covers both a non-numeric value (NumberFormatException) and
            // the rejected value 0.
            System.err.println("Invalid -Dmaxlen value: " + iae.getMessage());
            System.err.println(HELPMSG);
            System.exit(1);
            return;
        }

        for (String arg : args)
        {
            for (String emoji : detector.processInput(arg))
            {
                System.out.println(emoji);
            }
        }
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment