##################################################################################
# 22.phrasal.category.ultimate.of.current.token tier extractor for Wagon
# Written by Kyuchul Yoon ( kyoon@ling.osu.edu )
#
# Purpose:
#   For every file in subFolderToProcess with the extension fileExtOfDoneFiles,
#   duplicate tier 1 as a new last tier named tierNameToAdd, blank its first and
#   last intervals, and fill intervals 2 .. (numIntervals-1) with the ultimate
#   phrasal category (PhrCat) of the corresponding token.
#   The result is written to outputSubFolder with the extension outputFileExt
#   (default .wagon.22).
#
# Input assumptions:
#   - The TextGrid files have already been labelled by K-ToBI labelers.
#   - The token file (default 056.PhrCat-big-original\ytn-tok.PhrCat2.txt) has one
#     token per line, with columns separated by spaces and/or tabs.
#     The SECOND column is the ultimate PhrCat and is what gets copied.
#   - A sentence ends at the line whose last three characters are SFN
#     (e.g. a line like: 0 S PERIOD/SFN).
#   - Sentences in the token file appear in the same order as the sorted file list.
#
# Sanity check:
#   The number of token lines in each sentence is compared with numIntervals - 2.
#   If they differ, the file index and the token-file line number are printed and
#   the script pauses. Continuing after a mismatch leaves the token stream
#   misaligned for the remaining files, because the token pointer always advances
#   by numIntervals - 2.
##################################################################################

form Select files
    word tokenFile_(with_path) 056.PhrCat-big-original\ytn-tok.PhrCat2.txt
    word subFolderToProcess 10.wagon-features\21.POS.of.+3.following.token
    word fileExtOfDoneFiles wagon.21
    word outputSubFolder 10.wagon-features\22.phrasal.category.ultimate.of.current.token
    word tierNameToAdd 0.PhrCat.ult
    choice outputFileExt: 1
    button wagon.22
endform

Read Strings from raw text file... 'tokenFile$'
Rename... tokenFile
# Remember the size of the token list so that we never read past its end.
numTokenLines = Get number of strings

# Get the list of filenames to process
Create Strings as file list... fileList 'subFolderToProcess$'\*.'fileExtOfDoneFiles$'
Sort
numFiles = Get number of strings
pause 'numFiles' labeled textgrids identified. Continue?

# lineNumPhrCat  : line pointer used while counting the tokens of one sentence;
#                  also reported when the counts disagree.
# iLineNumPhrCat : line pointer used while copying one token at a time.
# numPhrCat      : number of token lines in the current sentence.
lineNumPhrCat = 1
iLineNumPhrCat = 1
numPhrCat = 0

# Empty text passed through a variable, so that the blanking commands below do
# not depend on trailing white space surviving in this script file.
emptyText$ = ""

# Loop through each file
for iFile to numFiles
    select Strings fileList
    # Get the name of the file to process
    doneFile$ = Get string... iFile
    # Strip the old extension; the dot before it is kept.
    filePrefix$ = doneFile$ - fileExtOfDoneFiles$
    Read from file... 'subFolderToProcess$'\'doneFile$'
    Rename... textGrid
    numIntervals = Get number of intervals... 1

    # Add the new tier after the existing ones, as a copy of tier 1
    numTiers = Get number of tiers
    Duplicate tier... 1 (numTiers+1) 'tierNameToAdd$'
    # Blank the first and last intervals of the new tier
    Set interval text... (numTiers+1) 1 'emptyText$'
    Set interval text... (numTiers+1) numIntervals 'emptyText$'

    #########################################################################
    # Count the token lines of the current sentence, i.e. up to and including
    # the line that ends in SFN. The count should equal numIntervals - 2.
    #########################################################################
    select Strings tokenFile
    numPhrCat = 0
    sentenceClosed = 0
    while sentenceClosed = 0 and lineNumPhrCat <= numTokenLines
        token$ = Get string... lineNumPhrCat
        numPhrCat = numPhrCat + 1
        if right$ (token$, 3) = "SFN"
            sentenceClosed = 1
        else
            lineNumPhrCat = lineNumPhrCat + 1
        endif
    endwhile
    if sentenceClosed = 0
        exit Token file ended before a sentence-final SFN line was found (file number 'iFile': 'doneFile$').
    endif

    # Report a mismatch BEFORE pausing, so the information is visible even if
    # the user decides to stop the script in the pause window.
    if numPhrCat <> (numIntervals-2)
        printline 'iFile'
        printline 'lineNumPhrCat'
        pause The numbers of PhrCats and intervals DO NOT MATCH!!
    endif
    ################### End of PhrCat counting block ########################

    for iToken from 2 to (numIntervals-1)
        if iLineNumPhrCat > numTokenLines
            exit Token file has no line 'iLineNumPhrCat' (file number 'iFile': 'doneFile$').
        endif
        select Strings tokenFile
        token$ = Get string... iLineNumPhrCat

        #####################################################################
        # Extract the second white-space-separated column of token$.
        # Spaces and tabs are both accepted as separators.
        #####################################################################
        # End of the first column: whichever of space/tab occurs first.
        iSpaces = index (token$, " ")
        iTabs = index (token$, tab$)
        if iSpaces = 0 or (iTabs > 0 and iTabs < iSpaces)
            iFirstWhite = iTabs
        else
            iFirstWhite = iSpaces
        endif

        if iFirstWhite = 0
            # Only one column on this line: there is no ultimate PhrCat.
            ultPhrCat$ = ""
        else
            lengthOfToken = length (token$)
            tempUltPhrCat$ = mid$ (token$, iFirstWhite, lengthOfToken - iFirstWhite + 1)
            # Strip the leading spaces/tabs one character at a time.
            while left$ (tempUltPhrCat$, 1) = " " or left$ (tempUltPhrCat$, 1) = tab$
                tempUltPhrCat$ = right$ (tempUltPhrCat$, length (tempUltPhrCat$) - 1)
            endwhile
            # End of the second column: again whichever separator occurs first.
            iSpaces2 = index (tempUltPhrCat$, " ")
            iTabs2 = index (tempUltPhrCat$, tab$)
            if iSpaces2 = 0 or (iTabs2 > 0 and iTabs2 < iSpaces2)
                indexToCut = iTabs2
            else
                indexToCut = iSpaces2
            endif
            if indexToCut = 0
                # The second column is the last one on the line.
                ultPhrCat$ = tempUltPhrCat$
            else
                ultPhrCat$ = left$ (tempUltPhrCat$, indexToCut - 1)
            endif
        endif
        ################ End of column extractor block ######################

        # Put the PhrCat label into the interval
        select TextGrid textGrid
        Set interval text... (numTiers+1) iToken 'ultPhrCat$'
        iLineNumPhrCat = iLineNumPhrCat + 1
        lineNumPhrCat = iLineNumPhrCat
    endfor

    #Edit
    #pause
    # Select the TextGrid explicitly: when the token loop above did not run
    # (fewer than three intervals) the token list would still be selected and
    # would be written and removed instead of the TextGrid.
    select TextGrid textGrid
    Write to text file... 'outputSubFolder$'\'filePrefix$''outputFileExt$'
    Remove
endfor

select Strings fileList
plus Strings tokenFile
Remove

# Quote substitution only replaces variable names, so compute the value first.
numTokensCopied = iLineNumPhrCat - 1
printline 'numTokensCopied'
#### END OF SCRIPT ####