#!/usr/bin/perl # Written by Jon Dehdari 2004-2007 # Perl 5.8 # Perstem: Stemmer and Morphological Parser for Persian # The license is the GPL v.2 (www.fsf.org) # Usage: perl perstem.pl [options] < input > output # Issues: punctuation vs tokenization use strict; #use warnings; #use diagnostics; use Getopt::Long; my $version = "0.9.7"; my $date = "2007-05-12"; my $copyright = "(c) 2004-2007 Jon Dehdari - GPL v2"; my $title = "Perstem: Persian stemmer $version, $date - $copyright"; my ( $dont_stem, $input_type, $output_type, $no_roman, $pos, $recall, $show_links, $show_only_root, $tokenize, $unvowel, $zwnj ) = undef; my ( $pos_v, $pos_n, $pos_aj ) = undef; my $ar_chars = "EqHSTDZLVU"; #my $al = "AbptVjcHxdLrzJsCSDTZEGfqkglmnuhiaoe\x5d\x7cPkMXIUN~"; #my $longvowel = "Aui]"; my %resolve; my $usage = <<"END_OF_USAGE"; ${title} Usage: perl $0 [options] < input > output Function: Stemmer and morphological analyzer for the Persian language (Farsi). Inflexional morphemes are separated from their roots. Options: -d, --nostem Don't stem -- mostly for character-set conversion -h, --help Print usage -i, --input Input character encoding type {cp1256,isiri3342,utf8,unihtml} -l, --links Show morphological links -n, --noroman Delete all non-Arabic script characters (eg. HTML tags) -o, --output Output character encoding type {arabtex,cp1256,isiri3342,utf8,unihtml} -p, --pos Tag words for parts of speech -r, --recall Increase recall by parsing ambiguous affixes -t, --tokenize Tokenize punctuation -u, --unvowel Remove short vowels -v, --version Print version ($version) -w, --root Return only word roots -z, --zwnj Insert Zero Width Non-Joiners where they should be END_OF_USAGE # -s, --stoplist Use external stopword list GetOptions( 'd|nostem' => \$dont_stem, 'h|help|?' => sub { print $usage; exit; }, 'i|input:s' => \$input_type, 'l|links' => \$show_links, 'n|noroman' => \$no_roman, 'o|output:s' => \$output_type, 'p|pos' => \$pos, 'r|recall' => \$recall, # 's|stoplist:s' => \$resolve_file, 't|tokenize' => \$tokenize, 'u|unvowel' => \$unvowel, 'v|version' => sub { print "$version\n"; exit; }, 'w|root' => \$show_only_root, 'z|zwnj' => \$zwnj, ) or die $usage; $input_type and $input_type =~ s/.*1256/cp1256/; # equates win1256 with cp1256 $output_type and $output_type =~ s/.*1256/cp1256/; # equates win1256 with cp1256 $input_type and $input_type =~ tr/[A-Z]/[a-z]/; # recognizes more enctype spellings $output_type and $output_type =~ tr/[A-Z]/[a-z]/; # recognizes more enctype spellings $input_type and $input_type =~ tr/-//; # eg. UTF-8 & utf8 $output_type and $output_type =~ tr/-//; # eg. UTF-8 & utf8 ### Open Resolve section while (my $resolve = ) { next if $resolve =~ /^#/; chomp $resolve; my @resolve = split /\t/, $resolve; %resolve = ( %resolve, "$resolve[0]" => "$resolve[1]" , ); } ### A hack for what Perl should've already done: support at runtime BOTH utf8 & other input types if ($input_type and $input_type eq "utf8") { # UTF-8 use encoding "utf8"; open STDIN, "<:encoding(UTF-8)" ; } else { unimport encoding "utf8";} while ($_ = <> ) { next if ( /^$/ | /^\s+$/ | /^#/ ); # Skips empty or commented-out lines $_ =~ tr/\r/\n/d; # Deletes lame DOS carriage returns $_ =~ s/\n/ ==20==/; # Converts newlines to temporary placeholder ==20== (after \x20) @_ = split(/(?/\n/g; $_ =~ s/

/\n/g; $_ =~ tr/\x01-\x09\x1b-\x1f\x21-\x2d\x2f-\x5a\x5c\x5e-\x9f//d; # Deletes all chars below xa0 except: 0a,20,2e,5b,5d # $_ =~ s/<\.>//g; # Deletes all dots in HTML tags # $_ =~ s/<.*?>//g; # Deletes all HTML tags on 1 line # $_ =~ s/<.*?//g; # Deleses 1st part of line-spanning HTML tags # $_ =~ s/.*?>//g; # Deletes 2nd part of line-spanning HTML tags } if ($input_type eq "utf8") { $_ =~ tr/ابپتثجچحخدذرزژسشصضطظعغفقكگلمنوهيَُِآ☿ةکیءىۀئؤًّ،؛؟٪‍‌/AbptVjcHxdLrzJsCSDTZEGfqkglmnuhiaoe\x5d\x7cPkiMiXIUN~,;?%*\-/; } elsif ($input_type eq "unihtml") { my %unihtml2roman = ( 'ا' => 'A', '☿' => '|', "ب" => 'b', 'ة' => 'P', 'پ' => 'p', 'ت' => 't', 'ث' => 'V', 'ج' => 'j', 'چ' => 'c', 'ح' => 'H', 'خ' => 'x', 'د' => 'd', 'ذ' => 'L', 'ر' => 'r', 'ز' => 'z', 'ژ' => 'J', 'س' => 's', 'ش' => 'C', 'ص' => 'S', 'ض' => 'D', 'ط' => 'T', 'ظ' => 'Z', 'ع' => 'E', 'غ' => 'G', 'ف' => 'f', 'ق' => 'q', 'ك' => 'k', 'ک' => 'k', 'گ' => 'g', 'ل' => 'l', 'م' => 'm', 'ن' => 'n', 'و' => 'u', 'ه' => 'h', 'ي' => 'i', 'ی' => 'i', 'ى' => 'A', 'َ' => 'a', 'ُ' => 'o', 'ِ' => 'e', 'ّ' => '~', 'آ' => ']', 'ء' => 'M', 'ً' => 'N', 'أ' => '|', 'ؤ' => 'U', 'إ' => '|', 'ئ' => 'I', 'ۀ' => 'X', '٪' => '%', '،' => ',', '؛' => ';', '؟' => '?', '‌' => "-", ' ' => ' ', '.' => '.', ':' => ':', ); my @charx = split(/(?=\&\#)|(?=\s)|(?=\n)/, $_); $_ = ""; foreach my $charx (@charx) { my $text_from_new = $unihtml2roman{$charx}; $_ = $_ . $text_from_new; } # Ends foreach } # Ends elsif ($input_type eq "unihtml") elsif ($input_type eq "cp1256") { $_ =~ tr/\xc7\xc8\x81\xca\xcb\xcc\x8d\xcd\xce\xcf\xd0\xd1\xd2\x8e\xd3\xd4\xd5\xd6\xd8\xd9\xda\xdb\xdd\xde\xdf\x90\xe1\xe3\xe4\xe6\xe5\xed\xf3\xf5\xf6\xc2\xff\xc9\x98\xc1\xc0\xc6\xc4\xf0\xf8\xa1\xba\xbf\xab\xbb\x9d\xec/AbptVjcHxdLrzJsCSDTZEGfqkglmnuhiaoe\x5d\x7cPkMXIUN~,;?{}\-i/; } elsif ($input_type eq "isiri3342") { $_ =~ tr/\xc1\xc3\xc4\xc5\xc6\xc7\xc8\xc9\xca\xcb\xcc\xcd\xce\xcf\xd0\xd1\xd2\xd3\xd4\xd5\xd6\xd7\xd8\xd9\xda\xdb\xdc\xdd\xde\xdf\xe0\xfe\xf0\xf2\xf1\xc0\xc1\xfc\xda\xe1\xc2\xfb\xfa\xf3\xf6\xac\xbb\xbf\xa5\xe7\xe6\xa1/AbptVjcHxdLrzJsCSDTZEGfqKglmnuhyaoe\x5d\x7cPkiMIUN~,;?%{}\-/; } $_ =~ s/\bA/|/g; # eg. AirAn -> |irAn } # if ($input_type) if ( $unvowel ) { $_ =~ s/\b([aeo])/|/g; # Inserts alef before words that begin with short vowel $_ =~ s/\bA/]/g; # Changes long 'aa' at beginning of word to alef madda $_ =~ s/[aeo~]//g; # Finally, removes all other short vowels and tashdids } if ( $zwnj ) { #Inserts ZWNJ's where they should have been originally, but weren't $_ =~ s/(?\S*?)(?:\S{3}(? 'ا', '|' => 'ا', 'b' => 'ب', 'p' => 'پ', 't' => 'ت', 'V' => 'ث', 'j' => 'ج', 'c' => 'چ', 'H' => 'ح', 'x' => 'خ', 'd' => 'د', 'L' => 'ذ', 'r' => 'ر', 'z' => 'ز', 'J' => 'ژ', 's' => 'س', 'C' => 'ش', 'S' => 'ص', 'D' => 'ض', 'T' => 'ط', 'Z' => 'ظ', 'E' => 'ع', 'G' => 'غ', 'f' => 'ف', 'q' => 'ق', 'k' => 'ک', 'K' => 'ك', 'g' => 'گ', 'l' => 'ل', 'm' => 'م', 'n' => 'ن', 'u' => 'و', 'v' => 'و', 'w' => 'و', 'h' => 'ه', 'X' => 'ۀ', 'i' => 'ی', 'I' => 'ئ', 'a' => 'َ', 'o' => 'ُ', 'e' => 'ِ', '~' => 'ّ', ',' => '،', ';' => '؛', '?' => '؟', ']' => 'آ', 'M' => 'ء', 'N' => 'ً', 'U' => 'ؤ', '-' => '‌', ' ' => ' ', '_' => '_', '+' => '+', "\n" => '
', '.' => '‫.‪', ); my @charx = split(//, $_); $_ = ""; foreach my $charx (@charx) { my $newchar = $roman2unihtml{$charx}; $_ = $_ . $newchar; } # Ends foreach } # Ends elsif (unihtml) elsif ($output_type eq "cp1256") { $_ =~ tr/AbptVjcHxdLrzJsCSDTZEGfqKglmnuhyaoe\x5d\x7cPkMXIUN~,;?{}\-i/\xc7\xc8\x81\xca\xcb\xcc\x8d\xcd\xce\xcf\xd0\xd1\xd2\x8e\xd3\xd4\xd5\xd6\xd8\xd9\xda\xdb\xdd\xde\xdf\x90\xe1\xe3\xe4\xe6\xe5\xed\xf3\xf5\xf6\xc2\xff\xc9\x98\xc1\xc0\xc6\xc4\xf0\xf8\xa1\xba\xbf\xab\xbb\x9d\xec/; # $_ =~ s/\x2e/\xfe\x2e\xfd/g; # Corrects periods to be RTL embedded; broken } elsif ($output_type eq "isiri3342") { $_ =~ tr/AbptVjcHxdLrzJsCSDTZEGfqKglmnuhyaoe\x5d\x7cPkiMIUN~,;?%{}\-/\xc1\xc3\xc4\xc5\xc6\xc7\xc8\xc9\xca\xcb\xcc\xcd\xce\xcf\xd0\xd1\xd2\xd3\xd4\xd5\xd6\xd7\xd8\xd9\xda\xdb\xdc\xdd\xde\xdf\xe0\xfe\xf0\xf2\xf1\xc0\xc1\xfc\xda\xe1\xc2\xfb\xfa\xf3\xf6\xac\xbb\xbf\xa5\xe7\xe6\xa1/; } elsif ($output_type eq "arabtex") { my %roman2arabtex = ( 'A' => 'A', '|' => 'a', 'b' => 'b', 'p' => 'p', 't' => 't', 'V' => '_t', 'j' => 'j', 'c' => '^c', 'H' => '.h', 'x' => 'x', 'd' => 'd', 'L' => '_d', 'r' => 'r', 'z' => 'z', 'J' => '^z', 's' => 's', 'C' => '^s', 'S' => '.s', 'D' => '.d', 'T' => '.t', 'Z' => '.z', 'E' => '`', 'G' => '.g', 'f' => 'f', 'q' => 'q', 'K' => 'k', 'k' => 'k', 'g' => 'g', 'l' => 'l', 'm' => 'm', 'n' => 'n', 'u' => 'U', 'v' => 'w', 'w' => 'w', 'h' => 'h', 'X' => 'H-i', 'i' => 'I', 'I' => '\'y', 'a' => 'a', 'o' => 'o', 'e' => 'e', 'P' => 'T', '~' => '', ',' => ',', ';' => ';', '?' => '?', ']' => '^A', 'M' => '\'', 'N' => 'aN', 'U' => 'U\'', '{' => '\lq ', '}' => '\rq ', '-' => '\hspace{0ex}', '.' => '.', ' ' => ' ', '_' => '_', '+' => '+', ); my @charx = split(//, $_); $_ = ""; foreach my $charx (@charx) { my $newchar = $roman2arabtex{$charx}; $_ = $_ . $newchar; } # Ends foreach # $_ = $_ . '\\\\'; # Appends LaTeX newline '\\' after each line } # Ends elsif (arabtex) if ($output_type eq "utf8" && m/[^ .\n]/) { # If utf8 & non-empty binmode(STDOUT, ":utf8"); # Uses the :utf8 output layer s/==20==/\n/g && print "$_" or print "$_ "; } elsif ( /[^ .\n]/ ) { # if arabic-script line is non-empty s/==20==/\n/g && print "$_" or print "$_ "; } } # if ($output_type) -- for non-roman input elsif ( /[^ .\n]/ ) { # if roman-script line is non-empty s/==20==/\n/g && print "$_" or print "$_ "; } } # ends foreach @_ } # ends while (<>) ### Resolve section ## The format of the Resolve section ( __DATA__ ) is as follows: ## 1. Mokassar (broken plurals): 'ktb ktAb' OR 'ktb ktAb_+PL' ## 2. Preparsed (speed): 'krdn kr_+dn' ## 3. Don't stem (false positive): 'bArAn bArAn' ## 4. Stop word (delete): 'u ' __DATA__ #u #dr #bh #|z #kh #|in #mi #rA #bA #hA #]n #ik #hm #mn #tu #|u #mA #CmA #tA #digr #iA #|mA #|gr #hr #ps #ch #iki #hic #uli #nh #|st #hA #bi #|i #br u u dr dr bh bh |z |z kh kh |in |in mi mi rA rA bA bA hA hA ]n ]n ik ik hm hm mn mn tu tu |u |u mA mA CmA CmA tA tA digr digr iA iA |mA |mA |gr |gr hr hr ps ps ch ch iki iki hic hic uli uli nh nh |st |st hA hA bi bi |i |i br br |iCAn |iCAn ]nhA ]nhA ]nAn ]nAn bArAn bArAn thrAn thrAn tim tim hfth hfth kihAn kihAn Hti Hti zndgi zndgi sAzmAn sAzmAn EnuAn EnuAn nZAm nZAm jhAn jhAn pAiAn pAiAn biCtr biCtr miAn miAn frhngi frhngi tnhA tnhA |ntxAbAt |ntxAbAt |stfAdh |stfAdh iAzdh iAzdh duAzdh duAzdh pAnzdh pAnzdh sizdh sizdh CAnzdh CAnzdh nuzdh nuzdh frxndh frxndh ]mrikA ]mrikA rIis rIis xndh xndh lndn lndn mEdn mEdn tmdn tmdn |rdn |rdn grdn grdn lAdn lAdn kudn kudn mAdh mAdh miliArd miliArd kilumtr kilumtr jAdh jAdh |st |st bud bud br br ktb ktAb |fkAr fkr |EDA EDu |fGAnstAn |fGAnstAn pArlmAn pArlmAn mrA mn rA trA tu rA cist ch |st krdn kr_+dn Cdh C_+d_+h krdh kr_+d_+h mrdm mrd_+m dAdh dA_+d_+h budh bu_+d_+h zbAnhAi zbAn_+hA_+e zbAnhA zbAn_+hA budh bu_+d_+h gLCth gLC_+t_+h budnd bud_+nd dACth dAC_+t_+h krdnd krd_+nd rui ru_+e kCurhAi kCur_+hA_+e kCurhA kCur_+hA sui su_+e grfth grf_+t_+h Cdn C_+dn ]indh ]i_+ndh dftr dftr dfAtr dfAtr dktr dktr sAxth sAx_+t_+h ]mdh ]m_+d_+h rAi rA_+e jAi jA_+e uqt uqt gLACth gLAC_+t_+h budn bu_+dn nCdh n+_C_+d_+h didn di_+dn didh di_+d_+h dAdn dA_+dn zdh z_+d_+h zdnd z_+d_+nd dAdnd dAd_+nd |slAmi |slAm_+i knnd kn_+nd knd kn_+d Cud Cu_+d dhd dh_+d dArd dAr_+d xuAhd xuAh_+d nist n+_|st kjAst kjA+_|st ]mrikAii ]mrikA_+i |nsAni |nsAn_+i