j2labs · October 25, 2009 17:52
diff --git a/urdu-segmenter.pl b/urdu-segmenter.pl
 #!/usr/bin/perl -w -CD
 binmode STDOUT, ":utf8";
 binmode STDIN, ":utf8";
 no warnings;


 #-----Description------------------------------------------------------
 #
 # Program:urdu-segmenter.pl
 # Written by: Danish Munir
 # Purpose:breaks urdu text into sentences
 #
 # Syntax: urdu-segmenter.pl [filename]
 #or program_that_outputs_urdu_text | urdu-segmenter.pl [-x] -s [docid(optional)]
 # This script takes a utf8 encoded file with Urdu text as input
 # and outputs to STDOUT, the text after segmenting it into sentences.
 #
 # The xml format of the output is as follows
 # <DOC docid = "Filename" lang = "URD">
 # <SEG id = "1">Urdu Sentence 1</SEG>
 # <SEG id = "2">Urdu Sentence 2</SEG>
 # <SEG id = "3">Urdu Sentence 3</SEG>
 # </DOC>
 #
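 # This script breaks urdu sentences at the following delimiters:
 # [dash]Unicode 06D4
 # [question]Unicode 061F
 # [ellipsis]Unicode 2026
 # [bullet]Unicode 2022
 # exclamation mark (!) and period (.)
 # newline characters (each newline is doubled before splitting, so every
 # line break ends a sentence) and any run of two or more whitespace characters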
 #-----------------------------------------------------------------------

 # Help screen: when the first argument is -h, or "help" preceded by one or
 # more dashes (-help, --help, ...), print the usage text to STDOUT and exit
 # before any input is read.  @ARGV is tested first so that a run without
 # arguments never matches against an undefined value.
 if (@ARGV && ($ARGV[0] =~ m/^-h$/ || $ARGV[0] =~ m/^-+help$/)){
    print "\n
 urdu-segmenter.pl
 -----------------
 Syntax: urdu-segmenter.pl [filename]
 or urdu-segmenter.pl -x [filename]
 or program_that_outputs_urdu_text | urdu-segmenter.pl -s [docid(optional)]
 or program_that_outputs_urdu_text | urdu-segmenter.pl -s -x [docid(optional)]
 eg: more sourcefile1.txt | urdu-segmenter.pl -s Title

 XML tags are output if and only if the -x option is used

 This script takes a utf8 encoded file with Urdu text as input and outputs to STDOUT, the text after segmenting it into sentences.

 The xml format of the output is as follows
 <DOC docid = \"Filename\" lang = \"URD\">
 <SEG id=\"1\">Urdu Sentence 1</SEG>
 <SEG id=\"2\">Urdu Sentence 2</SEG>
 </DOC>

 This script breaks urdu sentences based on the following punctuations:

 multiple newline characters
 [dash]Unicode 06D4
 [question]Unicode 061F
 [ellipsis]Unicode 2026
 [bullet]Unicode 2022
 [exclamation]!
 [period].

 \n\n";
    exit;
 }


 #Code Starts here
 # Work out the run mode from the leading options and slurp the whole input.
 #
 # Sets three package globals that the rest of the script reads:
 #   $printxml - 1 when -x was given (XML tags are printed), otherwise 0
 #   $filename - document id: the optional docid argument in -s (STDIN) mode,
 #               otherwise the input file name without its path and extension
 #   $_        - the complete input text
 #
 # -s and -x are accepted in either order, because the header comment
 # documents "[-x] -s" while the help text documents "-s -x".
 {
     my @args = @ARGV;
     my %seen = (s => 0, x => 0);

     # Consume a leading -s and/or -x, each at most once, so that a repeated
     # flag is still treated as the docid / file name like it always was.
     while (@args && $args[0] =~ m/^-([sx])$/ && !$seen{$1}) {
         $seen{$1} = 1;
         shift @args;
     }

     $printxml = $seen{x} ? 1 : 0;

     local $/ = undef; #Slurp mode: read the entire input at once.

     if ($seen{s}) {
         $filename = $args[0]; #Optional docid; stays undefined when omitted.
         $_ = <STDIN>;
     }
     else {
         my $path = $args[0];
         defined $path
             or die "No input file given; run with -h for usage.\n";

         open(my $in, "<:utf8", $path) #Open the file passed, or exit upon error
             or die "Cannot open file $path: $!";

         $filename = $path;
         $filename =~ s/.*\///;      #Strip the directory part ...
         $filename =~ s/\.[^\.]*$//; #... and the extension.

         $_ = <$in>;
         close($in);
     }
 }
 # Normalise the slurped text in $_ before it is split into sentences and,
 # in XML mode, print the opening DOC tag.
 s/\r//sgi;    #Drop carriage returns.
 s/\n/\n\n/sg; #Double every newline so each line break becomes a \n{2,} run.
 if ($printxml) {
    print "<DOC docid = \"$filename\" lang = \"URD\">\n";
 }
 s/\s*\x{2022}\s*/\n\n\n\n\n/g; #Replace bullets with sentence breaks.

 # NOTE(review): the two substitutions anchored with $ / ^ below have no /m
 # flag, so they only apply at the very start/end of the whole text, not to
 # each line - confirm whether per-line matching was intended.
 s/\t* +\t*$/ /g;
 s/[\n\x{000D}][ ]+[\n\x{000d}]/\n\n/sg; #This and the following 4 lines
 s/^[\t\x{0020}]+$/\n\n/g; #attempt to remove lines with
 #s/ +$/\n\n/g; #spaces only.

 #s/([\x{06d4}\x{061f}\n\x{000d}]) *[\n\x{000d}]*/$1/g;

 # The pipe must be escaped: an unescaped | is an alternation of two empty
 # patterns, which matches the empty string and removes nothing.
 s/\|//g; #Remove pipe character from files.


 # Split the normalised text in $_ into alternating fields: even indexes hold
 # sentence text and odd indexes hold the delimiter that ended it (delimiters
 # are kept because the whole pattern sits inside a capture group).
 my @sentences=split(/(\n{2,}|!|\x{061f}|\x{06D4}|\x{2022}|\x{000d}|\s{2,}|\x{2026}|\x{002e})/);
 ###my @sentences=split(/(\n{2,}|!|\x{002e})/); #Disabled variant with fewer delimiters.

 my $seg_id = 1; #Number given to the next segment that gets printed.

 # Walk the fields two at a time: a sentence plus its closing delimiter.
 for (my $pos = 0; $pos < @sentences; $pos += 2) {
     my $text  = $sentences[$pos];
     my $delim = $sentences[$pos + 1]; #Undefined when the text ends without a delimiter.

     $text =~ s/^\s*(.*?)\s*$/$1/g; #Trim white space at both ends.

     # Fragments of three characters or fewer, or of white space only, are dropped.
     next if length($text) <=3 || $text =~ m/^\s+$/;

     # Newline, carriage-return and bullet delimiters are not printed; any
     # other delimiter is written directly after the sentence.
     my $ending = ($delim =~ m/[\n\x{000d}\x{2022}]/) ? "" : "$delim";

     if ($printxml) {
         print "<SEG id=\"$seg_id\">$text" . $ending . "</SEG>\n";
     } else {
         print "$text" . $ending . "\n";
     }
     $seg_id++;
 }

 if ($printxml) {
    print "</DOC>\n"; #Close DOC tag.
 }

 close; #No handle named: this closes the currently selected output handle.
	#!/usr/bin/perl -w -CD
	binmode STDOUT, ":utf8";
	binmode STDIN, ":utf8";
	no warnings;


	#-----Description------------------------------------------------------
	#
	# Program:urdu-segmenter.pl
	# Written by: Danish Munir
	# Purpose:breaks urdu text into sentences
	#
	# Syntax: urdu-segmenter.pl [filename]
	#or program_that_outputs_urdu_text | urdu-segmenter.pl [-x] -s [docid(optional)]
	# This script takes a utf8 encoded file with Urdu text as input
	# and outputs to STDOUT, the text after segmenting it into sentences.
	#
	# The xml format of the output is as follows
	# <DOC docid = "Filename" lang = "URD">
	# <SEG id = "1">Urdu Sentence 1</SEG>
	# <SEG id = "2">Urdu Sentence 2</SEG>
	# <SEG id = "3">Urdu Sentence 3</SEG>
	# </DOC>
	#
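	# This script breaks urdu sentences at the following delimiters:
	# [dash]Unicode 06D4
	# [question]Unicode 061F
	# [ellipsis]Unicode 2026
	# [bullet]Unicode 2022
	# exclamation mark (!) and period (.)
	# newline characters (each newline is doubled before splitting, so every
	# line break ends a sentence) and any run of two or more whitespace characters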
	#-----------------------------------------------------------------------

	# Help screen: when the first argument is -h, or "help" preceded by one or
	# more dashes (-help, --help, ...), print the usage text to STDOUT and exit
	# before any input is read.  @ARGV is tested first so that a run without
	# arguments never matches against an undefined value.
	if (@ARGV && ($ARGV[0] =~ m/^-h$/ || $ARGV[0] =~ m/^-+help$/)){
	    print "\n
	urdu-segmenter.pl
	-----------------
	Syntax: urdu-segmenter.pl [filename]
	or urdu-segmenter.pl -x [filename]
	or program_that_outputs_urdu_text | urdu-segmenter.pl -s [docid(optional)]
	or program_that_outputs_urdu_text | urdu-segmenter.pl -s -x [docid(optional)]
	eg: more sourcefile1.txt | urdu-segmenter.pl -s Title

	XML tags are output if and only if the -x option is used

	This script takes a utf8 encoded file with Urdu text as input and outputs to STDOUT, the text after segmenting it into sentences.

	The xml format of the output is as follows
	<DOC docid = \"Filename\" lang = \"URD\">
	<SEG id=\"1\">Urdu Sentence 1</SEG>
	<SEG id=\"2\">Urdu Sentence 2</SEG>
	</DOC>

	This script breaks urdu sentences based on the following punctuations:

	multiple newline characters
	[dash]Unicode 06D4
	[question]Unicode 061F
	[ellipsis]Unicode 2026
	[bullet]Unicode 2022
	[exclamation]!
	[period].

	\n\n";
	    exit;
	}


	#Code Starts here
	# Work out the run mode from the leading options and slurp the whole input.
	#
	# Sets three package globals that the rest of the script reads:
	#   $printxml - 1 when -x was given (XML tags are printed), otherwise 0
	#   $filename - document id: the optional docid argument in -s (STDIN) mode,
	#               otherwise the input file name without its path and extension
	#   $_        - the complete input text
	#
	# -s and -x are accepted in either order, because the header comment
	# documents "[-x] -s" while the help text documents "-s -x".
	{
	    my @args = @ARGV;
	    my %seen = (s => 0, x => 0);

	    # Consume a leading -s and/or -x, each at most once, so that a repeated
	    # flag is still treated as the docid / file name like it always was.
	    while (@args && $args[0] =~ m/^-([sx])$/ && !$seen{$1}) {
	        $seen{$1} = 1;
	        shift @args;
	    }

	    $printxml = $seen{x} ? 1 : 0;

	    local $/ = undef; #Slurp mode: read the entire input at once.

	    if ($seen{s}) {
	        $filename = $args[0]; #Optional docid; stays undefined when omitted.
	        $_ = <STDIN>;
	    }
	    else {
	        my $path = $args[0];
	        defined $path
	            or die "No input file given; run with -h for usage.\n";

	        open(my $in, "<:utf8", $path) #Open the file passed, or exit upon error
	            or die "Cannot open file $path: $!";

	        $filename = $path;
	        $filename =~ s/.*\///;      #Strip the directory part ...
	        $filename =~ s/\.[^\.]*$//; #... and the extension.

	        $_ = <$in>;
	        close($in);
	    }
	}
	# Normalise the slurped text in $_ before it is split into sentences and,
	# in XML mode, print the opening DOC tag.
	s/\r//sgi;    #Drop carriage returns.
	s/\n/\n\n/sg; #Double every newline so each line break becomes a \n{2,} run.
	if ($printxml) {
	    print "<DOC docid = \"$filename\" lang = \"URD\">\n";
	}
	# \s* on both sides: a bullet counts as a break with or without
	# surrounding white space, and all of that white space is swallowed.
	s/\s*\x{2022}\s*/\n\n\n\n\n/g; #Replace bullets with sentence breaks.

	# NOTE(review): the two substitutions anchored with $ / ^ below have no /m
	# flag, so they only apply at the very start/end of the whole text, not to
	# each line - confirm whether per-line matching was intended.
	s/\t* +\t*$/ /g;
	s/[\n\x{000D}][ ]+[\n\x{000d}]/\n\n/sg; #This and the following 4 lines
	s/^[\t\x{0020}]+$/\n\n/g; #attempt to remove lines with
	#s/ +$/\n\n/g; #spaces only.

	#s/([\x{06d4}\x{061f}\n\x{000d}]) *[\n\x{000d}]*/$1/g;

	# The pipe is escaped so that it matches a literal | character.
	s/\|//g; #Remove pipe character from files.


	# Split the normalised text in $_ into alternating fields: even indexes hold
	# sentence text and odd indexes hold the delimiter that ended it (delimiters
	# are kept because the whole pattern sits inside a capture group).
	# The | characters are unescaped alternations: each alternative is one
	# delimiter.  Escaped, the pattern would only match one long literal string.
	my @sentences=split(/(\n{2,}|!|\x{061f}|\x{06D4}|\x{2022}|\x{000d}|\s{2,}|\x{2026}|\x{002e})/);
	###my @sentences=split(/(\n{2,}|!|\x{002e})/); #Disabled variant with fewer delimiters.

	my $seg_id = 1; #Number given to the next segment that gets printed.

	# Walk the fields two at a time: a sentence plus its closing delimiter.
	for (my $pos = 0; $pos < @sentences; $pos += 2) {
	    my $text  = $sentences[$pos];
	    my $delim = $sentences[$pos + 1]; #Undefined when the text ends without a delimiter.

	    # Trim white space at both ends; the non-greedy (.*?) keeps trailing
	    # white space out of the captured sentence.
	    $text =~ s/^\s*(.*?)\s*$/$1/g;

	    # Fragments of three characters or fewer, or of white space only, are dropped.
	    next if length($text) <=3 || $text =~ m/^\s+$/;

	    # Newline, carriage-return and bullet delimiters are not printed; any
	    # other delimiter is written directly after the sentence.
	    my $ending = ($delim =~ m/[\n\x{000d}\x{2022}]/) ? "" : "$delim";

	    if ($printxml) {
	        print "<SEG id=\"$seg_id\">$text" . $ending . "</SEG>\n";
	    } else {
	        print "$text" . $ending . "\n";
	    }
	    $seg_id++;
	}

	if ($printxml) {
	    print "</DOC>\n"; #Close DOC tag.
	}

	close; #No handle named: this closes the currently selected output handle.