#!/usr/bin/env perl
## Tokenizes sentences
## Changes this:  He/She thought, "Is 9.5 my favorite number?"
## To this:       He / She thought , " Is 9.5 my favorite number ? "

use strict;
use warnings;
use utf8;                             # the punctuation classes below are literal UTF-8
use open qw(:std :encoding(UTF-8));   # decode STDIN and files read via <>, encode STDOUT/STDERR

# NOTE: the deprecated `use encoding "utf8"` pragma was replaced by the two
# pragmas above; its default mode is no longer supported as of Perl 5.26.

# Punctuation that gets split off as a separate token.  Closing/terminal marks
# and opening brackets/quotes are kept in two classes because the period rule
# runs between them (same order as the substitutions always had).
my $CLOSING_PUNCT = qr/[,،;؛:?!"\])}»”؟]/;
my $OPENING_PUNCT = qr/[({\[“„«「『]/;

# tokenize($line) -> $tokenized_line
#
# Takes one line of text (normally ending in a newline) and returns a copy in
# which punctuation is surrounded by single spaces, e.g.
#   He/She thought, "Is 9.5 my favorite number?"
# becomes
#   He / She thought , " Is 9.5 my favorite number ? "
# The argument is not modified.
sub tokenize {
  my ($line) = @_;

  $line =~ s/($CLOSING_PUNCT)/ $1 /g;
  # A period is split off only when NOT followed by a word character, so
  # 9.2, .5 and www.example.com stay intact.
  $line =~ s/\.(?!\w)/ . /g;
  $line =~ s/($OPENING_PUNCT)/ $1 /g;
  $line =~ s{/}{ / }g;

  # Whitespace clean-up; must run after the rules above, which add padding.
  $line =~ s/\s+$/\n/;      # trailing whitespace (incl. the newline) -> one newline
  $line =~ s/^ +//;         # leading spaces
  $line =~ s/ {2,}/ /g;     # runs of spaces -> one space

  return $line;
}

# Filter STDIN (or the files named on the command line) to STDOUT.
while (my $line = <>) {
  # Lines starting with '#' are comments: pass them through untouched.
  if ($line =~ /^#/) {
    print $line;
    next;
  }
  print tokenize($line);
}

