/BppR/g;
$_ =~ s/BbrR{3,}/\n/g;
$_ =~ s/BppR{3,}/\n/g;
$_ =~ s/<.*?>//g;
$_ =~ s/BbrR/
\n/g;
$_ =~ s/BppR/
\n/g;
$_ =~ s/ / /g;
$_ =~ s/\x0a/_/g; # for empty lines
$_ =~ s/\n/_/g; # for empty lines
$_ =~ s/_{3,}/\n/g;
$_ =~ s/_/\n/g;
# $_ =~ s/[|_]//g; # use this line for extraneous chars
$_ =~ s/ {3,}/ /g;
$_ =~ s/\.{3,}/\./g;
}
### If the user uploads a file:
elsif ($use_file eq "true") {
$_ = ""; # Clears any residue from the web form # n/a right now
#open(UPFILE, $uploaded_file);
#open(UPFILE, "buckwalter/iso639.ar.html");
open(UPFILE, "win1256.html");
while (my $line =
\nINPUT STRING: $token
\n";
$lookup_word = get_lookup($token); # returns the Arabic string without vowels/diacritics and converted to transliteration
print "LOOK-UP WORD: $lookup_word
\n"; $tokens++; $types{$lookup_word}++;
if ( exists($found{$lookup_word}) ) {
print $found{$lookup_word}; # no need to re-analyse it
}
elsif ( exists($notfound{$lookup_word}) ) { # we keep %found and %notfound separate because %notfound can have additional lookups
print $notfound{$lookup_word}; $freqnotfound{$lookup_word}++;
}
else {
if ( @solutions = analyze($lookup_word) ) { # if lookup word has 1 or more solutions
foreach $solution (@solutions) {
$found{$lookup_word} .= $solution;
}
print $found{$lookup_word};
}
else {
$notfound{$lookup_word} = " Comment: $lookup_word NOT FOUND
\n";
if ( @alternatives = get_alternatives($lookup_word) ) {
foreach $alt (@alternatives) {
$notfound{$lookup_word} .= " ALTERNATIVE: $alt
\n";
if ( exists($found{$alt}) ) {
$notfound{$lookup_word} .= $found{$alt};
}
else {
if ( @solutions = analyze($alt) ) {
foreach $solution (@solutions) {
$notfound{$lookup_word} .= $solution;
}
}
else {
$notfound{$lookup_word} .= " Comment: $alt NOT FOUND
\n";
}
}
}# end foreach
}# end if
print $notfound{$lookup_word}; $freqnotfound{$lookup_word}++;
}
}#end else
}
else {
# it's not an Arabic word
@nonArabictokens = tokenize_nonArabic($token); # tokenize it on white space
foreach $item (@nonArabictokens) {
print "
\nINPUT STRING: $item
\n Comment: Non-Alphabetic Data
\n" unless ($item eq " " or $item eq "");
}
}
}#end foreach
#print STDOUT "\nNEWLINE\n";
#}#end while
# ====================================================
# print out not-found words by frequency:
#open (OUT, ">notFound") || die "cannot open: $!";
#print STDERR "\n\n========= Some stats ============================\n";
@types = keys %types; $types = @types;
#print STDERR "Tokens: $tokens -- Types: $types\n";
#print STDERR "\n========= Frequency count of Not-Found =========\n";
my @items = keys %freqnotfound;
foreach my $item (sort { $freqnotfound{$b} <=> $freqnotfound{$a} } @items) {
$rank++;
#print STDERR "$item\t$freqnotfound{$item}\n" unless ( $rank > 25 );
# print OUT "$item\t$freqnotfound{$item}\n"; # unless ( $rank > 25 );
}
#close OUT;
# ============================
sub analyze { # returns a list of 1 or more solutions
$this_word = shift @_; @solutions = (); $cnt = 0;
segmentword($this_word); # get a list of valid segmentations
foreach $segmentation (@segmented) {
($prefix,$stem,$suffix) = split ("\t",$segmentation); #print $segmentation, "\n";
if (exists($prefix_hash{$prefix})) {
if (exists($stem_hash{$stem})) {
if (exists($suffix_hash{$suffix})) {
# all 3 components exist in their respective lexicons, but are they compatible? (check the $cat pairs)
foreach $prefix_value (@{$prefix_hash{$prefix}}) {
($prefix, $voc_a, $cat_a, $gloss_a, $pos_a) = split (/\t/, $prefix_value);
foreach $stem_value (@{$stem_hash{$stem}}) {
#($stem, $voc_b, $cat_b, $gloss_b, $pos_b) = split (/\t/, $stem_value);
($stem, $voc_b, $cat_b, $gloss_b, $pos_b, $lemmaID) = split (/\t/, $stem_value);
if ( exists($hash_AB{"$cat_a"." "."$cat_b"}) ) {
foreach $suffix_value (@{$suffix_hash{$suffix}}) {
($suffix, $voc_c, $cat_c, $gloss_c, $pos_c) = split (/\t/, $suffix_value);
if ( exists($hash_AC{"$cat_a"." "."$cat_c"}) ) {
if ( exists($hash_BC{"$cat_b"." "."$cat_c"}) ) {
#$cnt++; push (@solutions, " SOLUTION $cnt: ($voc_a$voc_b$voc_c) $pos_a$pos_b$pos_c\n (GLOSS): $gloss_a + $gloss_b + $gloss_c\n");
$cnt++; push (@solutions, " SOLUTION $cnt: ($voc_a$voc_b$voc_c) [$lemmaID] $pos_a$pos_b$pos_c
\n (GLOSS): $gloss_a + $gloss_b + $gloss_c
\n");
}
}
}
}
}
}# end foreach $prefix_value
}
}# end if (exists($stem_hash{$stem}))
}
}# end foreach $segmentation
return (@solutions);
}
# ============================
sub get_alternatives { # returns a list of alternative spellings
$word = shift @_; @alternatives = ();
$temp = $word;
if ($temp =~ m/Y'$/) { # Y_w'_Y'
$temp =~ s/Y/y/g; # y_w'_y'
push (@alternatives, $temp); # y_w'_y' -- pushed
if ($temp =~ s/w'/&/) { # y_&__y'
push (@alternatives, $temp); # y_&__y' -- pushed
}
$temp = $word; # Y_w'_Y'
$temp =~ s/Y/y/g; # y_w'_y'
$temp =~ s/y'$/}/; # y_w'_}
push (@alternatives, $temp); # y_w'_} -- pushed
if ($temp =~ s/w'/&/) { # y_&__}
push (@alternatives, $temp); # y_&__} -- pushed
}
}
elsif ($temp =~ m/y'$/) { # Y_w'_y'
if ($temp =~ s/Y/y/g) { # Y_w'_y'
push (@alternatives, $temp); # y_w'_y' -- pushed
}
if ($temp =~ s/w'/&/) { # y_w'_y'
push (@alternatives, $temp); # y_&__y' -- pushed
}
$temp = $word; # Y_w'_y'
$temp =~ s/Y/y/g; # y_w'_y'
$temp =~ s/y'$/}/; # y_w'_}
push (@alternatives, $temp); # y_w'_} -- pushed
if ($temp =~ s/w'/&/) { # y_&__}
push (@alternatives, $temp); # y_&__} -- pushed
}
}
elsif ($temp =~ s/Y$/y/) { # Y_w'_y
$temp =~ s/Y/y/g; # y_w'_y
push (@alternatives, $temp); # y_w'_y -- pushed
if ($temp =~ s/w'/&/) { # y_&__y
push (@alternatives, $temp); # y_&__y -- pushed
}
}
elsif ($temp =~ m/y$/) { # Y_w'_y
$temp =~ s/Y/y/g; # y_w'_y
if ($temp =~ s/w'/&/) { # y_&__y
push (@alternatives, $temp); # y_&__y -- pushed
}
$temp = $word; # Y_w'_y
$temp =~ s/Y/y/g; # y_w'_y
$temp =~ s/y$/Y/g; # y_w'_Y
push (@alternatives, $temp); # y_w'_Y -- pushed
if ($temp =~ s/w'/&/) { # y_&__Y
push (@alternatives, $temp); # y_&__Y -- pushed
}
}
elsif ($temp =~ m/h$/) { # Y_w'_h
if ($temp =~ s/Y/y/g) { # y_w'_h
push (@alternatives, $temp); # y_w'_h -- pushed
}
if ($temp =~ s/w'/&/) { # y_&__h
push (@alternatives, $temp); # y_&__h -- pushed
}
$temp =~ s/h$/p/; # y_w'_p
push (@alternatives, $temp); # y_&__p -- pushed
}
elsif ($temp =~ m/p$/) { # Y_w'_h
if ($temp =~ s/Y/y/g) { # y_w'_h
push (@alternatives, $temp); # y_w'_h -- pushed
}
if ($temp =~ s/w'/&/) { # y_&__h
push (@alternatives, $temp); # y_&__h -- pushed
}
$temp =~ s/p$/h/; # y_w'_p
push (@alternatives, $temp); # y_&__p -- pushed
}
elsif ($temp =~ s/Y/y/g) { # Y_w'__
push (@alternatives, $temp); # y_w'__ -- pushed
if ($temp =~ s/w'/&/) { # y_&___
push (@alternatives, $temp); # y_&___ -- pushed
}
}
elsif ($temp =~ s/w'/&/) { # y_w'__
push (@alternatives, $temp); # y_&___ -- pushed
}
else {
# nothing
}
return @alternatives;
}
# ============================
sub tokenize_nonArabic { # tokenize non-Arabic strings by splitting them on white space
$nonArabic = shift @_;
$nonArabic =~ s/^\s+//; $nonArabic =~ s/\s+$//; # remove leading & trailing space
@nonArabictokens = split (/\s+/, $nonArabic);
return @nonArabictokens;
}
# ============================
sub tokenize { # returns a list of tokens
$line = shift @_;
chomp($line);
$line =~ s/^\s+//; $line =~ s/\s+$//; $line =~ s/\s+/ /g; # remove or minimize white space
@tokens = split (/([^\x81\x8D\x8E\x90\xC1-\xD6\xD8-\xDF\xE1\xE3-\xE6\xEC-\xED\xF0-\xF3\xF5\xF6\xF8\xFA]+)/,$line);
return @tokens;
}
# ================================
sub get_lookup { # creates a suitable lookup version of the Arabic input string (removes diacritics; transliterates)
$input_str = shift @_;
$tmp_word = $input_str; # we need to modify the input string for lookup
$tmp_word =~ s/\xDC//g; # remove kashida/taTwiyl (U+0640)
$tmp_word =~ s/[\xF0-\xF3\xF5\xF6\xF8\xFA]//g; # remove fatHatAn and all vowels/diacritics (ðñòóõöøú)
$tmp_word =~ tr/\x81\x8D\x8E\x90\xA1\xBA\xBF\xC1\xC2\xC3\xC4\xC5\xC6\xC7\xC8\xC9\xCA\xCB\xCC\xCD\xCE\xCF\xD0\xD1\xD2\xD3\xD4\xD5\xD6\xD8\xD9\xDA\xDB\xDC\xDD\xDE\xDF\xE1\xE3\xE4\xE5\xE6\xEC\xED\xF0\xF1\xF2\xF3\xF5\xF6\xF8\xFA/PJRG,;?'|>&<}AbptvjHxd*rzs\$SDTZEg_fqklmnhwYyFNKaui~o/; # convert to transliteration
return $tmp_word;
}
# ============================
sub segmentword { # returns a list of valid segmentations
$str = shift @_;
@segmented = ();
$prefix_len = 0;
$suffix_len = 0;
$str_len = length($str);
while ( $prefix_len <= 4 ) {
$prefix = substr($str, 0, $prefix_len);
$stem_len = ($str_len - $prefix_len);
$suffix_len = 0;
while (($stem_len >= 1) and ($suffix_len <= 6)) {
$stem = substr($str, $prefix_len, $stem_len);
$suffix = substr($str, ($prefix_len + $stem_len), $suffix_len);
push (@segmented, "$prefix\t$stem\t$suffix");
$stem_len--;
$suffix_len++;
}
$prefix_len++;
}
return @segmented;
}
# ==============================================================
sub load_dict { # loads a dict into a hash table where the key is $entry and its value is a list (each $entry can have multiple values)
%temp_hash = (); $entries = 0; $lemmaID = "";
$filename = shift @_;
open (IN, $filename) || die "cannot open: $!";
#print STDERR "loading $filename ...";
while (