# Input: one or more lexical entries of the form
#
#   lex(Word, CATEGORY).
#
# Word is a string; CATEGORY is actually a colon-separated pair of
# all-caps categories.
#
# We need to convert Word into a Prolog atom: a string that starts
# with a lowercase letter and only contains lowercase, uppercase,
# numerals, and the underscore.
#
# In addition, I get rid of the umlauts, etc. at this point, mostly
# because I don't understand all of the encoding issues involved (i.e.
# which tools support which encodings to which degree).
#
# So all non-first-26-alphabetics/non-numerics (umlauted letters,
# hyphens, etc.) are converted into things like "_27_".  Then, if the
# result doesn't start with a lowercase letter (which includes the case
# where it is empty), the string "zz__" is prepended.  As a result, we
# have a reversible transformation for Word.  CATEGORY is lowercased
# first, and its underscores are passed through unescaped.
#
# The worst case for input is probably something like:
#
#   lex(,, $,_NMC).
#
# which becomes
#
#   lex(zz___44_, zz___36__44__nmc).
#
# Lines that are not of the form shown above are not converted: they
# are dropped from the output, and a warning is written to STDERR for
# every such line that is not blank.

use strict;
use warnings;

# Turn $text into a Prolog atom.
#
#   $text            - the raw string to convert
#   $keep_underscore - true:  underscores are copied through as they are
#                      false: underscores are escaped like any other
#                             non-alphanumeric character
#
# Every character outside the permitted set is replaced by "_N_", where
# N is the value ord() reports for that character.  Returns the
# converted string, prefixed with "zz__" unless it already starts with
# a lowercase ASCII letter.
sub to_prolog_atom {
    my ($text, $keep_underscore) = @_;

    my $unsafe = $keep_underscore ? qr/[^A-Za-z0-9_]/ : qr/[^A-Za-z0-9]/;
    $text =~ s/($unsafe)/'_' . ord($1) . '_'/eg;

    # Tested as "does not start with a-z" rather than "first character
    # is not a-z", so that an empty string also receives the prefix and
    # never becomes an empty (invalid) atom.
    $text = "zz__$text" if $text !~ /\A[a-z]/;

    return $text;
}

# Convert one input line of the form "lex(Word, CATEGORY)." into the
# corresponding output line (including the trailing newline).
# Returns nothing (undef in scalar context) when the line does not have
# that form.
sub convert_lex_line {
    my ($line) = @_;

    # The first (.*) is greedy on purpose: Word may itself contain ", "
    # (or be a lone comma), so the split happens at the LAST ", ".
    my ($word, $cat) = $line =~ /^lex\((.*), (.*)\)\.$/
        or return;

    # Categories are lowercased before escaping; their underscores are kept.
    $cat =~ tr/A-Z/a-z/;

    return 'lex(' . to_prolog_atom($word, 0) . ', ' . to_prolog_atom($cat, 1) . ").\n";
}

while (my $line = <>) {
    my $converted = convert_lex_line($line);

    if (defined $converted) {
        print $converted;
        next;
    }

    # Do not emit a bogus "lex(, )." clause for input we cannot parse.
    if ($line =~ /\S/) {
        chomp $line;
        warn "line $.: not a lex(Word, CATEGORY). entry, skipped: $line\n";
    }
}

# I've never figured out how to type umlauts in my version of emacs,
# but I can copy-and-paste them without problems.  So these stay here
# in case of future need.
#
# $word =~ s/Ä/Ae/g;
# $word =~ s/Ö/Oe/g;
# $word =~ s/Ü/Ue/g;
# $word =~ s/ä/ae/g;
# $word =~ s/ö/oe/g;
# $word =~ s/ü/ue/g;
# $word =~ s/ß/ss/g;