Created
          May 16, 2023 11:14 
        
      - 
      
- 
        Save kssd/9e1d6b98d709f3dce235e1b8121f6fd4 to your computer and use it in GitHub Desktop. 
    JAVA : Check UTF-8 encoded characters in a string
  
        
  
    
      This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
      Learn more about bidirectional Unicode characters
    
  
  
    
  | import java.util.*; | |
| import java.util.stream.*; | |
| import java.nio.charset.StandardCharsets; | |
/**
 * Small demo that reports whether a string contains any character outside 7-bit ASCII,
 * i.e. any character that needs more than one byte when encoded as UTF-8.
 *
 * <p>Naming caveat: the identifiers in this class say "UTF-8" / "non-UTF-8", but every
 * well-formed Java {@code String} can be encoded as UTF-8. What is actually being
 * distinguished is "ASCII only" versus "contains multi-byte (non-ASCII) characters".
 * The public names are kept as they are for backward compatibility.
 */
public class UTF8Check {

    /**
     * Sample strings made up exclusively of ASCII characters;
     * {@link #hasOnlyUtf8Characters(String)} returns {@code true} for each of them.
     */
    public static final String[] utf8Strings = new String[]{
        "UTF8 check",
        "@!#$%^&*()_+}{[]';:/.,?><`~1234567890-=\\|"
    };

    /**
     * Sample strings that each contain at least one non-ASCII character;
     * {@link #hasOnlyUtf8Characters(String)} returns {@code false} for each of them.
     * All of them are perfectly valid UTF-8 text - the "nonUtf8" in the name is a misnomer
     * for "not single-byte in UTF-8".
     */
    public static final String[] nonUtf8Strings = new String[]{
        "Héllo", // contains the character 'é' (U+00E9)
        "नमस्ते", // contains the Devanagari character 'न' (U+0928)
        "Привет", // contains the Cyrillic character 'р' (U+0440)
        "مرحبا", // contains the Arabic character 'ح' (U+062D)
        "こんにちは", // contains the Hiragana character 'に' (U+306B)
        "안녕하세요", // contains the Hangul character '하' (U+D558)
        "שלום", // contains the Hebrew character 'ל' (U+05DC)
        "سلام", // contains the Arabic character 'م' (U+0645)
        "你好", // contains the Chinese character '好' (U+597D)
        "สวัสดี", // contains the Thai character 'ด' (U+0E14)
        "\uD83D\uDE00" // emoji U+1F600: a surrogate pair in UTF-16, four bytes in UTF-8
    };

    /**
     * Tells whether {@code str} consists solely of 7-bit ASCII characters, which are the
     * only characters that occupy a single byte in UTF-8.
     *
     * <p>The string is scanned as UTF-16 code units instead of being encoded first.
     * Encoding with {@code String.getBytes(UTF_8)} silently replaces an unpaired surrogate
     * with {@code '?'}, which would make such malformed input look like plain ASCII; here
     * an unpaired surrogate is reported as a non-ASCII character.
     *
     * @param str the string to inspect; must not be {@code null}
     * @return {@code true} if every character is below U+0080 (also for the empty string),
     *         {@code false} as soon as one character at or above U+0080 is found
     * @throws NullPointerException if {@code str} is {@code null}
     */
    public static boolean hasOnlyUtf8Characters(String str) {
        Objects.requireNonNull(str, "str");
        for (int i = 0; i < str.length(); i++) {
            // Surrogates (0xD800-0xDFFF) are >= 0x80 too, so supplementary characters
            // and lone surrogates are both caught by this single comparison.
            if (str.charAt(i) >= 0x80) {
                return false; // multi-byte (non-ASCII) character detected
            }
        }
        return true;
    }

    /**
     * Prints one line per sample string stating whether it contains non-ASCII characters.
     *
     * @param args ignored
     */
    public static void main(String... args) {
        Stream.concat(
                Arrays.stream(utf8Strings),
                Arrays.stream(nonUtf8Strings))
            .forEach((s) -> System.out.println(String.format("%-50s : has non-utf8 characters - %s", s, !hasOnlyUtf8Characters(s))));
    }
}
  
    Sign up for free
    to join this conversation on GitHub.
    Already have an account?
    Sign in to comment