Created
October 25, 2009 17:52
-
-
Save j2labs/218149 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/perl -w
# The -CD switch that used to be on the #! line has been removed: perl aborts
# with 'Too late for "-CD" option' when the script is started as
# "perl urdu-segmenter.pl ...". It was also redundant, because STDIN/STDOUT
# get a UTF-8 layer right here and every file is opened with an explicit
# "<:utf8" layer.
binmode STDOUT, ":utf8";
binmode STDIN, ":utf8";
# Warnings are switched off for the rest of the file (this overrides -w).
no warnings;
#-----Description------------------------------------------------------ | |
# | |
# Program:urdu-segmenter.pl | |
# Written by: Danish Munir | |
# Purpose:breaks urdu text into sentences | |
# | |
# Syntax: urdu-segmenter.pl [-x] filename
#     or: program_that_outputs_urdu_text | urdu-segmenter.pl -s [-x] [docid(optional)]
# This script takes a utf8 encoded file with Urdu text as input | |
# and outputs to STDOUT, the text after segmenting it into sentences. | |
# | |
# With the -x option the output is wrapped in xml tags as follows
# <DOC docid = "Filename" lang = "URD">
# <SEG id="1">Urdu Sentence 1</SEG>
# <SEG id="2">Urdu Sentence 2</SEG>
# <SEG id="3">Urdu Sentence 3</SEG>
# </DOC>
# Without -x each sentence is printed on its own line with no tags.
# | |
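# This script breaks urdu sentences at the following delimiters:
# [dash]Unicode 06D4 (Arabic full stop)
# [question]Unicode 061F (Arabic question mark)
# [ellipsis]Unicode 2026
# [bullet]Unicode 2022
# [exclamation mark] and [period]Unicode 002E
# newline characters and runs of two or more whitespace characters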
#----------------------------------------------------------------------- | |
# Build the help message shown for -h / -help / --help.
# Takes no arguments and returns the whole message as a single string that
# starts and ends with two blank lines.
sub usage_text {
    return <<'END_HELP';


urdu-segmenter.pl
-----------------
Syntax: urdu-segmenter.pl [filename]
or urdu-segmenter.pl -x [filename]
or program_that_outputs_urdu_text | urdu-segmenter.pl -s [docid(optional)]
or program_that_outputs_urdu_text | urdu-segmenter.pl -s -x [docid(optional)]
eg: more sourcefile1.txt | urdu-segmenter.pl -s Title
The -x option is used to output xml tags, if and only if the -x option is used
This script takes a utf8 encoded file with Urdu text as input and outputs to STDOUT, the text after segmenting it into sentences.
The xml format of the output is as follows
<DOC docid = "Filename" lang = "URD">
<SEG id="1">Urdu Sentence 1</SEG>
<SEG id="2">Urdu Sentence 2</SEG>
</DOC>
This script breaks urdu sentences based on the following punctuations:
multiple newline characters
[dash]Unicode 06D4
[question]Unicode 061F
[ellipsis]Unicode 2026
[bullet]Unicode 2022
[exclamation]Unicode 0021
[period]Unicode 002E


END_HELP
}

# Print the help text and stop when the first argument is -h or any
# -help / --help spelling. The defined() guard keeps the match from running
# on undef when the script is started without arguments.
if (defined $ARGV[0] && $ARGV[0] =~ m/^(?:-h|-+help)$/) {
    print usage_text();
    exit;
}
#Code Starts here
# Decide where the text comes from and whether XML tags are wanted, then
# slurp the complete input into $_.
#
# Accepted forms (help options are handled before this point):
#   -s [-x] [docid]  or  -x -s [docid]   read STDIN; the docid is optional
#   [-x] filename                        read the named UTF-8 file
#
# The results are left in package variables (not lexicals) because the rest
# of the script reads them by these names:
#   $printxml  1 when -x was given, otherwise 0
#   $filename  docid for the <DOC> tag: the optional docid argument in -s
#              mode, otherwise the file name without path and extension
#   $_         the whole input text
{
    my @args       = @ARGV;
    my $from_stdin = 0;
    $printxml = 0;

    if (@args && $args[0] =~ m/^-s$/) {
        $from_stdin = 1;
        shift @args;
        if (@args && $args[0] =~ m/^-x$/) {
            $printxml = 1;
            shift @args;
        }
    }
    elsif (@args && $args[0] =~ m/^-x$/) {
        $printxml = 1;
        shift @args;
        # The header comment documents "[-x] -s", so accept that order too.
        if (@args && $args[0] =~ m/^-s$/) {
            $from_stdin = 1;
            shift @args;
        }
    }

    # Slurp mode, limited to this block so $/ is restored afterwards.
    local $/ = undef;

    if ($from_stdin) {
        $filename = $args[0];    # optional docid, may be undef
        $_ = <STDIN>;
    }
    else {
        my $path = $args[0];
        # open() with an undefined name would silently open an empty
        # anonymous temp file, so a missing argument must be rejected here.
        defined $path
            or die "No input file given. Run urdu-segmenter.pl -h for the syntax.\n";
        open(my $in, "<:utf8", $path)    #Open the file passed, or exit upon error
            or die "Cannot open file $path: $!";
        $_ = <$in>;
        close($in);

        $filename = $path;
        $filename =~ s/.*\///;           # strip the directory part
        $filename =~ s/\.[^\.]*$//;      # strip the extension
    }
}
# Normalise raw input so that the splitter sees uniform sentence breaks.
#
# Parameter: $text - the decoded input text (undef is returned unchanged).
# Returns:   the normalised text.
#
# Steps, in order:
#   1. carriage returns are removed;
#   2. every newline is doubled, so each line break becomes a sentence break;
#   3. a bullet (U+2022) with its surrounding whitespace becomes five newlines;
#   4. trailing blanks/tabs at the very end of the text collapse to one space;
#   5. a run of spaces between two newlines collapses to two newlines;
#   6. text consisting only of tabs/spaces becomes two newlines;
#   7. pipe characters are removed.
sub normalize_text {
    my ($text) = @_;
    return $text unless defined $text;

    $text =~ tr/\r//d;
    $text =~ s/\n/\n\n/g;
    $text =~ s/\s*\x{2022}\s*/\n\n\n\n\n/g;    #Replace bullets with sentence breaks.

    # NOTE(review): the next substitution and the one two lines below have
    # no /m flag, so "$" and "^" anchor to the whole text rather than to each
    # line. Kept exactly as in the original.
    $text =~ s/\t* +\t*$/ /g;
    # Carriage returns are already gone, so only \n can delimit here.
    $text =~ s/\n +\n/\n\n/g;
    $text =~ s/^[\t\x{0020}]+$/\n\n/g;

    # The original s/|//g was an empty alternation that matched the empty
    # string and removed nothing; tr deletes the literal pipe characters.
    $text =~ tr/|//d;                          #Remove pipe character from files.

    return $text;
}

$_ = normalize_text($_);

# Open the XML wrapper when -x was requested.
if ($printxml) {
    print "<DOC docid = \"$filename\" lang = \"URD\">\n";
}
# Split normalised text into finished sentences.
#
# Parameter: $text - text to segment (undef yields an empty list).
# Returns:   a list of strings, one per kept sentence. Each is the trimmed
#            sentence followed by its closing delimiter, unless that
#            delimiter contains a newline, carriage return or bullet.
#
# split() is given a capturing pattern, so the result alternates between a
# sentence (even index) and the delimiter that ended it (odd index).
# Sentences of three characters or fewer after trimming are discarded
# together with their delimiter.
sub segment_text {
    my ($text) = @_;
    return () unless defined $text;

    my @pieces = split(
        /(\n{2,}|!|\x{061f}|\x{06D4}|\x{2022}|\x{000d}|\s{2,}|\x{2026}|\x{002e})/,
        $text
    );

    my @segments;
    for (my $i = 0; $i < @pieces; $i += 2) {
        my $sentence = $pieces[$i];
        $sentence =~ s/^\s*(.*?)\s*$/$1/g;      # trim leading/trailing white space

        # Too short, or white space only: drop sentence and delimiter.
        next if length($sentence) <= 3 || $sentence =~ m/^\s+$/;

        # The last sentence may have no delimiter after it, so do not read
        # past the end of the array.
        my $delimiter = $i + 1 < @pieces ? $pieces[$i + 1] : '';

        # Newline, carriage-return and bullet delimiters are not printed.
        $delimiter = '' if $delimiter =~ m/[\n\x{000d}\x{2022}]/;

        push @segments, $sentence . $delimiter;
    }
    return @segments;
}

# Emit every sentence on its own line, wrapped in a numbered <SEG> tag when
# XML output was requested. Segment ids start at 1.
my $segment_id = 1;
for my $segment (segment_text($_)) {
    if ($printxml) {
        print "<SEG id=\"$segment_id\">$segment</SEG>\n";
    }
    else {
        print "$segment\n";
    }
    $segment_id++;
}
# Close the XML wrapper when -x was requested.
if ($printxml) {
    print "</DOC>\n";                    #Close DOC tag.
}
# A bare close() acts on the currently selected handle, which is STDOUT here.
# Name it explicitly and check the result so that a failed flush (for example
# a full disk) is reported instead of silently truncating the output.
close(STDOUT) or die "Cannot close STDOUT: $!";
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment