#####################################################################################
# Phonological tone tier info extractor for Wagon: Written by Kyuchul Yoon ( kyoon@ling.osu.edu )
# Extracts from a set of .TextGrid.done files the data fields for Wagon training.
# Data fields include (1) romanized eojeols, (2) number of syllables for each eojeol,
# (3) type of boundary (AP, IP, or none)
# (4) number of syllables from an immediately preceding AP boundary (marked by "LHa")
# (5) number of syllables from an immediately preceding IP boundary
#     (marked by "HL%", "H%", "L%", or "LHL%")
# (6) number of syllables from the sentence beginning, and
# (7) number of syllables from the sentence ending.
# (8) morpheme identity **** Assumes that all "morphemes" have been split and have their own "interval"
#     in the TextGrid file (run the "specific-syllable-splitter.praat" script first!!)
#     WARNING!!!!! NEED TO CHECK THE POS OF EACH MORPHEME
# (9) distance from the previous COMMA & to the next COMMA
# The script assumes that you already have the TextGrid files labelled by professional K-ToBI labelers.
# The script will read in all the TextGrid.done files one by one from the subdirectory
# and write the output file into the Wagon/ subdirectory. (There should be a "Wagon" subdirectory.)
# The directory structure for this script should be like the following:
# root\                        : Script should be here
#     302000.txt\TextGridDone
#     302002.txt\TextGridDone
#     302004.txt\TextGridDone
#     ....
#     Wagon
# The output file is named <subfolder prefix>.TextGrid.done.<OutputFileExt> (extension "WAGON" by default).
# This script "SORTS" the fileList.
# Fixed a bug where the "??LHa" label could not be handled.
#####################################################################################
# Specify files and folders
# For cases where the interval tier (tier 1) and phonology tier (tier 2) have not been
# synchronized in point/interval placement.
# If the gaps are big, adjust the tolerance value (tierDiffTol, in seconds) in the form below.
form Select files
    word subFolderToProcess 303010.txt\TextGridDone
    word fileExtOfDoneFiles TextGrid.done.complete
    word outputSubFolder Wagon
    choice OutputFileExt: 1
        button WAGON
    real tierDiffTol 0.05
endform

# Compose the name for the output file.
# This must happen BEFORE the old output file is deleted; otherwise the delete
# command is issued with an undefined file name and stale rows are never removed.
outputFilePrefix$ = subFolderToProcess$ - ".txt\TextGridDone"
outputFileName$ = outputFilePrefix$ + ".TextGrid.done." + outputFileExt$
outputFile$ = outputSubFolder$ + "\" + outputFileName$

# If an old output file exists, delete it first and then write out the new file
filedelete 'outputFile$'

#################### Define morphemes to be identified in the feature columns #############
#### However, because of homophonic syllables, need to check with the POS labels afterwards
#### Should've put the token/POS pairs in the interval tier in the first place
########################################################################
# Topic markers (tagged as "PAU", AUxiliary Postposition)
morph1a$ = "eun"
morph1b$ = "neun"
morph1c$ = "do"
# Subject markers (tagged as "PCA", CAse Postposition)
morph2a$ = "i"
morph2b$ = "ga"
# Object markers (tagged as "PCA")
morph3a$ = "eul"
morph3b$ = "leul"
# etc. (tagged as "PAD", ADverbial Postposition, or "ECS", Coordinating & Subordinating Ending)
morph4$ = "go"
morph5$ = "myeo"
morph6$ = "yeo"
morph7$ = "myeon-seo"
# etc. (tagged as "PAN", AdNominal Postposition)
morph8$ = "eui"
#################### End of morpheme definition block #############################

# Get the (sorted) list of filenames of TextGrid.done files
Create Strings as file list... fileList 'subFolderToProcess$'/*.'fileExtOfDoneFiles$'
Sort
numFiles = Get number of strings

# Loop through each file
for iFile to numFiles
    select Strings fileList
    # Get the name for a TextGrid.done file and load it; the new TextGrid stays
    # selected for every query below and is removed at the end of this iteration.
    doneFile$ = Get string... iFile
    Read from file... 'subFolderToProcess$'\'doneFile$'

    # Number of intervals on the word tier (tier 1), i.e. the number of eojeols
    # plus the leading and trailing (unlabelled) intervals.
    numIntervals = Get number of intervals... 1
    # Get rid of the leading/following interval
    newNumIntervals = numIntervals - 2
    # Number of points on the phonology tier (tier 2); used to avoid querying
    # a point index beyond the last point.
    numPoints = Get number of points... 2

    ######## Total number of syllables for the whole sentence ########
    ######## Used to compute the distance from the sentence end ########
    totalNum = 0
    for i to newNumIntervals
        # Get the interval text (i.e., eojeol text), excluding the leading interval
        intervalText$ = Get label of interval... 1 (i+1)
        numTotalChar = length(intervalText$)
        numSyl = 1
        # "PERIOD", "NPERIOD", "PLUS", "COLON", "(R/L)DQUOTE", "(R/L)SQUOTE", "COMMA"
        # and "(R/L)PAREN" are unpronounced symbols and count as zero syllables.
        # Anything else (including NUMBER, HYPHEN or FOREIGN) counts as one syllable
        # plus one more for every "-" in the label.
        if (intervalText$ = "PERIOD" or intervalText$ = "NPERIOD" or intervalText$ = "COMMA"
        ... or intervalText$ = "PLUS" or intervalText$ = "COLON" or intervalText$ = "RPAREN"
        ... or intervalText$ = "RDQUOTE" or intervalText$ = "LDQUOTE" or intervalText$ = "LSQUOTE"
        ... or intervalText$ = "RSQUOTE" or intervalText$ = "LPAREN" )
            numSyl = numSyl - 1
        else
            while index(intervalText$, "-")
                indexOfHyphen = index(intervalText$, "-")
                # Keep only the characters to the right of the first "-"
                subNumTotalChar = numTotalChar - indexOfHyphen
                intervalText$ = right$(intervalText$, subNumTotalChar)
                numTotalChar = length(intervalText$)
                numSyl = numSyl + 1
            endwhile
        endif
        totalNum = totalNum + numSyl
    endfor
    ################# End of sentence length detection block ####################

    # Initialize the distances (in number of syllables) from an immediately
    # preceding AP, IP, COMMA, and the sentence beginning; all start at 0.
    distFromPrevAP = 0
    distFromPrevIP = 0
    distFromPrevCO = 0
    distFromSentBegin = 0

    ######### Compute the distance, totalNumBtwCO, to the following COMMA ###########
    # This is only for the stretch from the sentence beginning to the "first" COMMA.
    # Note that COMMA is not in the zero-syllable list here: it normally ends the
    # loop, but the very first label inspected is counted before the loop test.
    totalNumBtwCO = 0
    iEojeolCO = 2
    if iEojeolCO <= numIntervals
        intervalTextCO$ = Get label of interval... 1 iEojeolCO
    else
        intervalTextCO$ = ""
    endif
    numTotalCharCO = length(intervalTextCO$)
    repeat
        numSylCO = 1
        if (intervalTextCO$ = "PERIOD" or intervalTextCO$ = "NPERIOD" or intervalTextCO$ = "LPAREN"
        ... or intervalTextCO$ = "PLUS" or intervalTextCO$ = "COLON" or intervalTextCO$ = "RPAREN"
        ... or intervalTextCO$ = "RDQUOTE" or intervalTextCO$ = "LDQUOTE" or intervalTextCO$ = "LSQUOTE"
        ... or intervalTextCO$ = "RSQUOTE")
            numSylCO = numSylCO - 1
        else
            while index(intervalTextCO$, "-")
                indexOfHyphenCO = index(intervalTextCO$, "-")
                # Keep only the characters to the right of the first "-"
                subNumTotalCharCO = numTotalCharCO - indexOfHyphenCO
                intervalTextCO$ = right$(intervalTextCO$, subNumTotalCharCO)
                numTotalCharCO = length(intervalTextCO$)
                numSylCO = numSylCO + 1
            endwhile
        endif
        totalNumBtwCO = totalNumBtwCO + numSylCO
        # Move to the next interval; past the last interval behave as if an empty
        # label had been read, so the loop ends instead of querying out of range.
        iEojeolCO = iEojeolCO + 1
        if iEojeolCO <= numIntervals
            intervalTextCO$ = Get label of interval... 1 iEojeolCO
        else
            intervalTextCO$ = ""
        endif
        numTotalCharCO = length(intervalTextCO$)
    # Stop at the sentence end (empty label) or at the COMMA
    until (intervalTextCO$ = "" or intervalTextCO$ = "COMMA")
    ############# End of distance to "first" COMMA block ######################

    # Index of the next unconsumed point on the phonology tier
    iPhonoTier = 1

    ############# Loop through each interval (eojeol) and extract info ###############
    for iEojeol to newNumIntervals
        ######### Column: eojeol text #########
        intervalText$ = Get label of interval... 1 (iEojeol+1)
        fileappend 'outputFile$' 'intervalText$''tab$'

        ######### Column: morpheme identity #########
        # Print the label if it is one of the morphemes defined above, otherwise "0"
        if (intervalText$ = morph1a$ or intervalText$ = morph1b$ or intervalText$ = morph1c$
        ... or intervalText$ = morph2a$ or intervalText$ = morph2b$
        ... or intervalText$ = morph3a$ or intervalText$ = morph3b$
        ... or intervalText$ = morph4$ or intervalText$ = morph5$ or intervalText$ = morph6$
        ... or intervalText$ = morph7$ or intervalText$ = morph8$)
            fileappend 'outputFile$' 'intervalText$''tab$'
        else
            fileappend 'outputFile$' 0'tab$'
        endif

        ######### Column: number of syllables of the eojeol #########
        # Unpronounced symbols count as 0; otherwise 1 + number of hyphens
        # (e.g. "peu-lang-seu" is 3 syllables long).
        actualNumSyl = 1
        if (intervalText$ = "PERIOD" or intervalText$ = "NPERIOD" or intervalText$ = "COMMA"
        ... or intervalText$ = "PLUS" or intervalText$ = "COLON" or intervalText$ = "RPAREN"
        ... or intervalText$ = "RDQUOTE" or intervalText$ = "LDQUOTE" or intervalText$ = "LSQUOTE"
        ... or intervalText$ = "RSQUOTE" or intervalText$ = "LPAREN" )
            actualNumSyl = actualNumSyl - 1
        else
            # Work on a copy so that intervalText$ stays intact for the tests below
            restOfText$ = intervalText$
            numTotalChar = length(restOfText$)
            while index(restOfText$, "-")
                indexOfHyphen = index(restOfText$, "-")
                subNumTotalChar = numTotalChar - indexOfHyphen
                restOfText$ = right$(restOfText$, subNumTotalChar)
                numTotalChar = length(restOfText$)
                actualNumSyl = actualNumSyl + 1
            endwhile
        endif
        fileappend 'outputFile$' 'actualNumSyl''tab$'

        ######### Columns: distance from previous / to next COMMA #########
        distFromPrevCO = distFromPrevCO + actualNumSyl
        distFromNextCO = totalNumBtwCO - actualNumSyl
        fileappend 'outputFile$' 'distFromPrevCO''tab$''distFromNextCO''tab$'
        if intervalText$ = "COMMA"
            # A COMMA restarts the count from the previous COMMA ...
            distFromPrevCO = 0
            # ... and the syllables up to the next COMMA (or the sentence end)
            # have to be counted again, starting at the interval after this one.
            # (The counter starts at 0; the original started at 1 and subtracted
            # 1 after the loop, which is the same thing.)
            totalNumBtwCO = 0
            # iEojeol + 2 never exceeds numIntervals, so this read is always in range
            iEojeolCO = iEojeol + 2
            intervalTextCO$ = Get label of interval... 1 iEojeolCO
            numTotalCharCO = length(intervalTextCO$)
            repeat
                numSylCO2 = 1
                # If the interval text is one of these, it has no syllables
                if (intervalTextCO$ = "PERIOD" or intervalTextCO$ = "NPERIOD"
                ... or intervalTextCO$ = "LPAREN" or intervalTextCO$ = "PLUS"
                ... or intervalTextCO$ = "COLON" or intervalTextCO$ = "RPAREN"
                ... or intervalTextCO$ = "RDQUOTE" or intervalTextCO$ = "LDQUOTE"
                ... or intervalTextCO$ = "LSQUOTE" or intervalTextCO$ = "RSQUOTE")
                    numSylCO2 = numSylCO2 - 1
                else
                    # Otherwise, count one syllable plus one per hyphen
                    while index(intervalTextCO$, "-")
                        indexOfHyphenCO = index(intervalTextCO$, "-")
                        subNumTotalCharCO = numTotalCharCO - indexOfHyphenCO
                        intervalTextCO$ = right$(intervalTextCO$, subNumTotalCharCO)
                        numTotalCharCO = length(intervalTextCO$)
                        numSylCO2 = numSylCO2 + 1
                    endwhile
                endif
                totalNumBtwCO = totalNumBtwCO + numSylCO2
                # Move to the next interval; never query beyond the last interval
                # (this happened when the COMMA was the final eojeol).
                iEojeolCO = iEojeolCO + 1
                if iEojeolCO <= numIntervals
                    intervalTextCO$ = Get label of interval... 1 iEojeolCO
                else
                    intervalTextCO$ = ""
                endif
                numTotalCharCO = length(intervalTextCO$)
            # Keep going until either the end of the sentence or the next COMMA
            until (intervalTextCO$ = "" or intervalTextCO$ = "COMMA")
        else
            # Not a COMMA: the remaining distance to the next COMMA shrinks
            totalNumBtwCO = totalNumBtwCO - actualNumSyl
        endif

        ######### Columns: AP/IP boundary type & various distances #########
        # Compare the right edge of the interval with the time of the next point on
        # the phonology tier. If they are within tierDiffTol, the point's label is
        # printed as the boundary type; otherwise "0" is printed.
        boundaryLabel$ = "0"
        resetAP = 0
        resetIP = 0
        endTimeOfInterval = Get end point... 1 (iEojeol+1)
        # Once all points have been consumed there is nothing left to compare with
        pointLabel$ = ""
        if iPhonoTier <= numPoints
            pointLabel$ = Get label of point... 2 iPhonoTier
        endif
        # A "??LHa" point lies within the prosodic word: skip it and go to the next point
        while pointLabel$ = "??LHa"
            iPhonoTier = iPhonoTier + 1
            if iPhonoTier <= numPoints
                pointLabel$ = Get label of point... 2 iPhonoTier
            else
                pointLabel$ = ""
            endif
        endwhile
        if iPhonoTier <= numPoints
            timePoint = Get time of point... 2 iPhonoTier
            dif = abs(timePoint - endTimeOfInterval)
            if dif < tierDiffTol
                boundaryLabel$ = pointLabel$
                if pointLabel$ = "LHa"
                    # AP boundary: restart the AP distance and consume the point
                    resetAP = 1
                    iPhonoTier = iPhonoTier + 1
                elsif (pointLabel$ = "HL%" or pointLabel$ = "H%" or pointLabel$ = "L%" or pointLabel$ = "LHL%")
                    # IP boundary: restart both the IP and AP distances and consume the point
                    resetAP = 1
                    resetIP = 1
                    iPhonoTier = iPhonoTier + 1
                endif
                # NOTE(review): any other label is printed as-is but the point is NOT
                # consumed (same as the original script), so later intervals keep being
                # compared with this same point. Confirm whether that is intended.
            endif
        endif

        # The distances include the syllables of the current eojeol
        distFromPrevAP = distFromPrevAP + actualNumSyl
        distFromPrevIP = distFromPrevIP + actualNumSyl
        distFromSentBegin = distFromSentBegin + actualNumSyl
        distFromSentEnd = totalNum - distFromSentBegin
        fileappend 'outputFile$' 'boundaryLabel$''tab$''distFromPrevAP''tab$'
        ...'distFromPrevIP''tab$''distFromSentBegin''tab$''distFromSentEnd''tab$''newline$'

        # Restart the counters after a boundary has been printed
        if resetAP
            distFromPrevAP = 0
        endif
        if resetIP
            distFromPrevIP = 0
        endif
    endfor

    # If you want a blank line between sentences, uncomment the following line
    # fileappend 'outputFile$' 'newline$'

    # Remove the TextGrid that was read for this file
    Remove
endfor

select Strings fileList
Remove
#### END OF SCRIPT ####