#
# Parse a cyin file and transform it into a format useful for annotation.
#
# Input records are tab-separated.  Field 1 is a word, optionally prefixed
# with "*" or "?".  It may be followed by fields holding variant forms
# (yi4ti3zi4), then by one or more transcription fields.
# NOTE(review): the last field is presumably a frequency count -- confirm.
#
# Output is one tab-separated line per word or variant:
#     word <TAB> flag <TAB> lexical entry
# where flag is "*", "?" or "-".  Diagnostics about repeated words go to
# /dev/stderr.
#
# NOTE(review): the copy of this script that was reviewed had been collapsed
# onto one line and was truncated between "for (j=yi+1; j" and the first
# diagnostic printf.  The statements marked "reconstructed" below were
# restored from the surrounding comments and code and must be checked
# against the original script.
#
BEGIN {
    FS = "\t";
    OFS = "\t";
}

{
    # Lines beginning with a star (or a question mark) are special.  We
    # use a flag st for now; later the array star will map the current
    # word to the value of st.  The marker itself is stripped from $1.
    if ($1 ~ /^\*/) {
        $1 = substr($1, 2);
        st = "*";
    } else if ($1 ~ /^\?/) {
        $1 = substr($1, 2);
        st = "?";
    } else
        st = "-";

    # Now we need to deal with yi4ti3zi4.  Starting at field 2, skip every
    # field that contains a character outside the range "!".."z" (Chinese
    # characters, but also blanks and "{|}~").  Afterwards yi is the number
    # of the first field containing only plain ASCII text as used in the
    # transcriptions.  An empty field, or running past NF, also stops the
    # scan because the empty string cannot match the bracket expression.
    for (yi = 2; $yi ~ /[^!-z]/; yi++)
        ;

    # lx holds the lexical entry (transcription), which is the
    # concatenation of the contents of all fields from yi to NF-1.
    # NOTE(review): reconstructed -- the separator used when joining the
    # fields was lost; a single space is assumed here.
    lx = $yi;
    for (j = yi + 1; j < NF; j++)
        lx = lx " " $j;

    # NOTE(review): reconstructed -- fq is assumed to be the last field,
    # since the lexical entry stops at NF-1.
    fq = $NF;

    # Record the entry under the head word and under each variant form,
    # i.e. fields 1 .. yi-1.
    # NOTE(review): reconstructed loop bounds -- confirm.
    for (i = 1; i < yi; i++) {
        if ($i in lex) {
            # The word was seen before: report it and say which of the
            # stored values disagree with the current record.  The pieces
            # are printed without newlines so they form a single line.
            # NOTE(review): reconstructed -- original message text was lost.
            printf "warning: line %d: %s already seen", FNR, $i > "/dev/stderr";
            if (star[$i] != st)
                printf ", inconsistent stars" > "/dev/stderr";
            if (lex[$i] != lx)
                printf ", inconsistent entries" > "/dev/stderr";
            if (freq[$i] != fq)
                printf ", inconsistent frequencies" > "/dev/stderr";
            printf "\n" > "/dev/stderr";
        }
        # printf "debug: processing w=%s, lex[w]=%s, star[w]=%s, freq[w]=%d\n", $i, lx, st, fq > "/dev/stderr";

        # The most recent record wins.
        star[$i] = st;
        lex[$i] = lx;
        freq[$i] = fq;
    }
}

END {
    # Frequencies are only needed for the consistency check above.
    delete freq;

    # Iteration order of "for (w in lex)" is unspecified, so the output
    # is not sorted.
    for (w in lex)
        print w, star[w], lex[w];
}