##########################################################################
# POS.of.current.token tier extractor for Wagon:
# Written by Kyuchul Yoon ( kyoon@ling.osu.edu )
# Extracts from a set of .TextGrid.lab files the 15.POS.of.current.token data field for Wagon training
# The script assumes that you already have the TextGrid files labelled by professional K-ToBI labelers.
# The script will read in all the TextGrid.lab files one by one from the directory 12.distance.in.eojeols.from.sentence.beginning&end
# and write the output files into 10.wagon-features\15.POS.of.current.token
# The extension of the output files is .wagon.15
# Assumes that you have a token/POS column in a file 055.POS-big-original\ejk-tok.POS
# This script splits a column of token/POS pairs into the POSs of each sentence, where sentences are separated by SFN (POS)
##########################################################################

form Select files
	word tokenFile_(with_path) 055.POS-big-original\ejk-tok.POS
	word subFolderToProcess 10.wagon-features\12.distance.in.eojeols.from.sentence.beginning&end
	word fileExtOfDoneFiles wagon.12
	word outputSubFolder 10.wagon-features\15.POS.of.current.token
	word tierNameToAdd 0.PhrCat.ult
	choice outputFileExt: 1
		button wagon.15
endform

# Load the token/POS column, one token/POS pair per string
Read Strings from raw text file... 'tokenFile$'
Rename... tokenFile
# Remember how many lines the token file has, so that the SFN scan below cannot run past its end
numTokenLines = Get number of strings

# Get the list of filenames of the already processed TextGrid files
Create Strings as file list... fileList 'subFolderToProcess$'\*.'fileExtOfDoneFiles$'
Sort
numFiles = Get number of strings
pause 'numFiles' labeled textgrids identified. Continue?

# lineNumPOS  : line number in the .POS file used while counting the POSs of one sentence; it is also
#               reported when the numbers of POSs and intervals disagree.
# iLineNumPOS : line number used for extracting one token at a time from the .POS file.
# numPOS      : number of POSs in one sentence of the .POS file.
lineNumPOS = 1
iLineNumPOS = 1
numPOS = 0

# Loop through each file
for iFile to numFiles
	select Strings fileList
	# Get the name of a TextGrid file
	doneFile$ = Get string... iFile
	filePrefix$ = doneFile$ - fileExtOfDoneFiles$
	Read from file... 'subFolderToProcess$'\'doneFile$'
	Rename... textGrid
	numIntervals = Get number of intervals... 1

	# Get the number of tiers so that an additional tier can be added at the end
	numTiers = Get number of tiers
	Duplicate tier... 1 (numTiers+1) 'tierNameToAdd$'

	# Set the first/last interval text of the new tier to naught
	Set interval text... (numTiers+1) 1 
	Set interval text... (numTiers+1) numIntervals 

	########################################################################
	##### Count the # of POSs in a sentence (terminated by "SFN") from the original ???-tok.POS file #####
	##### This will be compared with the number of intervals from the textgrid file, i.e. numIntervals - 2 #####
	##### If they're not the same, then there must be something wrong with the .POS file. Check it #####
	########################################################################
	select Strings tokenFile
	# Initialize the number of POSs for one sentence
	numPOS = 0
	token$ = Get string... lineNumPOS
	rightThree$ = right$(token$, 3)
	# The bound on lineNumPOS stops the scan at the last line of the token file; without it a
	# final sentence that lacks its SFN terminator would keep the loop scanning past the end of the list.
	while rightThree$ <> "SFN" and lineNumPOS < numTokenLines
		numPOS = numPOS + 1
		lineNumPOS = lineNumPOS + 1
		token$ = Get string... lineNumPOS
		rightThree$ = right$(token$, 3)
	endwhile
	# Count the terminating line itself
	numPOS = numPOS + 1

	### Check if the two are the same. If not, give useful information
	if numPOS <> (numIntervals-2)
		# Print the diagnostics before pausing, so that they are available even if the user stops at the pause.
		# doneFile$ is the name of the file being processed (iFile is only its numeric index).
		printline 'doneFile$'
		printline 'lineNumPOS'
		pause The numbers of POSs and intervals DO NOT MATCH!!
	endif
	################### End of # of POS counting block ###########################

	for iToken from 2 to (numIntervals-1)
		# If the token is followed by "#", then take the letters between / and #
		# Otherwise, take what's on the RHS of the slash
		select Strings tokenFile
		token$ = Get string... iLineNumPOS
		lengthOfToken = length(token$)
		positionOfSlash = index(token$, "/")
		restAfterSlash$ = right$(token$, (lengthOfToken-positionOfSlash))
		lengthOfRestAfterSlash = length(restAfterSlash$)
		rightMostLetter$ = right$(restAfterSlash$, 1)
		if rightMostLetter$ = "#"
			realPOS$ = left$(restAfterSlash$, (lengthOfRestAfterSlash-1))
		else
			realPOS$ = restAfterSlash$
		endif

		# Put the POS label into each interval
		select TextGrid textGrid
		Set interval text... (numTiers+1) iToken 'realPOS$'
		iLineNumPOS = iLineNumPOS + 1
		# Keep the counting position in step with the extraction position for the next sentence
		lineNumPOS = iLineNumPOS
	endfor

	# Select the TextGrid explicitly: when the token loop above does not run at all (2 or fewer
	# intervals) the selection is still the token list, and the Remove below must never delete that.
	select TextGrid textGrid
	Edit
	pause
	Write to text file... 'outputSubFolder$'\'filePrefix$''outputFileExt$'
	Remove
endfor

# Clean up the helper objects created by this script
select Strings fileList
plus Strings tokenFile
Remove

#### END OF SCRIPT ####