Created
November 2, 2011 17:53
-
-
Save nikopol/1334349 to your computer and use it in GitHub Desktop.
is_utf8 perl vs inline c
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/perl -w
use strict;
use warnings;
use Inline C;
#use Benchmark qw(:all);

# Usage: pass the path of the file to inspect as the first argument.
# The file is slurped as raw bytes and handed to the Inline C checker
# (__is_utf8), and the verdict is printed on stdout.
# NOTE: the C checker receives a NUL-terminated string, so it only looks
# at the bytes before the first NUL byte of the file.

# Use a definedness/length test rather than truthiness so that a file
# literally named "0" is accepted.
my $file = $ARGV[0];
die("you must specify a file") unless defined $file && length $file;

# Fail loudly instead of silently checking an undefined buffer.
open my $fh, "<", $file or die "cannot open '$file': $!";
binmode $fh;    # the checkers work on raw bytes, so disable any I/O layers

# Slurp the whole file; $/ is localized to this expression only.
my $text = do { local $/; <$fh> };
close $fh;
$text = '' unless defined $text;

##$stat=timethese(100, {
## 'Perl' => sub { _is_utf8($text); },
## 'InlineC' => sub { __is_utf8($text); },
##});
##cmpthese($stat) ;

if (__is_utf8($text)) { print "\nseems to be utf8\n"; }
else                  { print "\ndoesn't seems to be utf8\n"; }
# Heuristically decide whether a byte string is laid out like UTF-8.
#
# Every byte with the high bit set must be a lead byte of the form
# 110xxxxx, 1110xxxx or 11110xxx, followed by 1, 2 or 3 continuation
# bytes of the form 10xxxxxx respectively. Only this byte layout is
# checked; the decoded code point values are not inspected.
#
# Parameters: $text - the string to inspect (undef is treated as empty).
# Returns:    1 if every high-bit sequence is well formed (including the
#             empty string), 0 otherwise.
sub _is_utf8 {
    my $text = shift;

    # Nothing to scan: vacuously fine, and avoids an "uninitialized"
    # warning from unpack on undef.
    return 1 unless defined $text && length $text;

    my @bytes = unpack("C*", $text);
    my $count = scalar @bytes;
    my $pos   = 0;

    while ($pos < $count) {
        my $lead = $bytes[$pos++];
        next unless $lead & 0x80;    # plain 7-bit byte

        my $trailing;
        if    (($lead & 0xe0) == 0xc0) { $trailing = 1; }
        elsif (($lead & 0xf0) == 0xe0) { $trailing = 2; }
        elsif (($lead & 0xf8) == 0xf0) { $trailing = 3; }
        else                           { return 0; }    # stray continuation or invalid lead

        # A sequence cut off by the end of the input is malformed. Checking
        # here keeps us from indexing past the array, which would yield
        # undef and an "uninitialized value" warning under -w.
        return 0 if $pos + $trailing > $count;

        for (1 .. $trailing) {
            return 0 if ($bytes[$pos++] & 0xc0) != 0x80;
        }
    }

    return 1;
}
__END__ | |
__C__ | |
/*
 * Heuristic UTF-8 layout check over a NUL-terminated byte string.
 *
 * Scanning stops at the first NUL byte. Each byte with the high bit set
 * must be a lead byte (110xxxxx, 1110xxxx or 11110xxx) followed by 1, 2
 * or 3 continuation bytes (10xxxxxx) respectively.
 *
 * Returns 1 when every such sequence is well formed, 0 otherwise.
 */
int __is_utf8(char *txt) {
    const unsigned char *cursor = (const unsigned char *)txt;
    int trailing;

    for (; *cursor != 0; cursor++) {
        unsigned char lead = *cursor;

        /* Plain 7-bit byte: nothing to verify. */
        if ((lead & 0x80) != 0x80)
            continue;

        if ((lead & 0xe0) == 0xc0)
            trailing = 1;
        else if ((lead & 0xf0) == 0xe0)
            trailing = 2;
        else if ((lead & 0xf8) == 0xf0)
            trailing = 3;
        else
            return 0;

        /*
         * Step onto each expected continuation byte. Hitting the NUL
         * terminator here fails the mask test, so the scan never moves
         * beyond the end of the string.
         */
        for (; trailing > 0; trailing--) {
            cursor++;
            if ((*cursor & 0xc0) != 0x80)
                return 0;
        }
    }

    return 1;
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment