#####################################################################################
# Phonological tone tier info extractor for Wagon: Written by Kyuchul Yoon ( kyoon@ling.osu.edu )
# Extracts from a set of .TextGrid.done files the data fields for Wagon training.
# Data fields include (1) romanized eojeols, (2) number of syllables for each eojeol,
# (3) type of boundary (AP, IP, or none),
# (4) number of syllables from an immediately preceding AP boundary (marked by "LHa"),
# (5) number of syllables from an immediately preceding IP boundary (marked by "HL%", "H%", or "L%"),
# (6) number of syllables from the sentence beginning,
# (7) number of syllables from the sentence ending, and
# (8) morpheme identity **** Assumes that all "morphemes" have been split and have their own "interval"
#     in the TextGrid file (run the "specific-syllable-splitter.praat" script first!!)
# The script assumes that you already have the TextGrid files labelled by professional K-ToBI labelers.
# The script will read in all the TextGrid.done files one by one from the subdirectory
# and write the output files into the Wagon/ subdirectory. (There should be a "Wagon" subdirectory.)
# The directory structure for this script should look like the following:
# root\                      : Script should be here
#     302000.txt\TextGridDone
#     302002.txt\TextGridDone
#     302004.txt\TextGridDone
#     ....
#     Wagon
# The output files have the extension .wagon
#####################################################################################
# Specify files and folders
# For cases where the interval tier (tier 1) and phonology tier (tier 2) have not been
# synchronized in point/interval placement.
# tierDiffTol is the largest time difference (in seconds) allowed between the right
# edge of a tier-1 interval and a tier-2 point for the two to count as the same
# boundary. If the gaps between the two tiers are big, increase this value.
form Select files
    word subFolderToProcess 302000.txt/TextGridDone
    word fileExtOfDoneFiles TextGrid.done.complete
    word outputSubFolder Wagon
    word outputFileName 302000.TextGrid.done.wagon
    real tierDiffTol 0.005
endform

# All records are APPENDED to this file, so re-running the script on the same
# output name accumulates records. Uncomment the next line to start from scratch.
# filedelete 'outputSubFolder$'/'outputFileName$'
outFile$ = outputSubFolder$ + "/" + outputFileName$

############### Define morphemes to be identified in the feature columns ##################
# Topic markers (tagged as "PAU", AUxiliary Postposition)
morph1a$ = "eun"
morph1b$ = "neun"
morph1c$ = "do"
# Subject markers (tagged as "PCA", CAse Postposition)
morph2a$ = "i"
morph2b$ = "ga"
# Object markers (tagged as "PCA")
morph3a$ = "eul"
morph3b$ = "leul"
# etc. (tagged as "PAD", ADverbial Postposition, or "ECS", Coordinating & Subordinating Ending)
morph4$ = "go"
morph5$ = "myeo"
morph6$ = "yeo"
morph7$ = "myeon-seo"
# etc. (tagged as "PAN", AdNominal Postposition)
morph8$ = "eui"
#########################################################################

# Get the list of filenames of the TextGrid.done files
Create Strings as file list... fileList 'subFolderToProcess$'/*.'fileExtOfDoneFiles$'
numFiles = Get number of strings

# Loop through each file (one file = one sentence)
for iFile to numFiles
    select Strings fileList
    doneFile$ = Get string... iFile
    Read from file... 'subFolderToProcess$'/'doneFile$'

    # Tier 1 holds the eojeols; its first and last intervals are leading/trailing
    # labels and are not eojeols, hence the "- 2".
    numIntervals = Get number of intervals... 1
    newNumIntervals = numIntervals - 2
    # Number of tone points on the phonology tier; used to keep iPhonoTier in range.
    numPoints = Get number of points... 2

    # First pass: total number of syllables in the sentence, needed for the
    # distance-from-sentence-end column. Syllables are hyphen-separated
    # (e.g. "peu-lang-seu" is 3 syllables), so the count is 1 + number of hyphens.
    totalNum = 0
    for i to newNumIntervals
        intervalText$ = Get label of interval... 1 (i+1)
        numSyl = 1 + length(intervalText$) - length(replace$(intervalText$, "-", "", 0))
        totalNum = totalNum + numSyl
    endfor

    # Distances (in syllables) from the immediately preceding AP boundary,
    # the immediately preceding IP boundary, and the sentence beginning all start at 0.
    distFromPrevAP = 0
    distFromPrevIP = 0
    distFromSentBegin = 0

    # Index of the next not-yet-consumed point on the phonology tier
    iPhonoTier = 1

    # Second pass: write one record per eojeol
    for iEojeol to newNumIntervals
        intervalText$ = Get label of interval... 1 (iEojeol+1)

        # Morpheme column: the eojeol itself if it is one of the morphemes defined above, else "0"
        if (intervalText$ = morph1a$ or intervalText$ = morph1b$ or intervalText$ = morph1c$
        ... or intervalText$ = morph2a$ or intervalText$ = morph2b$
        ... or intervalText$ = morph3a$ or intervalText$ = morph3b$
        ... or intervalText$ = morph4$ or intervalText$ = morph5$ or intervalText$ = morph6$
        ... or intervalText$ = morph7$ or intervalText$ = morph8$)
            morphField$ = intervalText$
        else
            morphField$ = "0"
        endif

        # Number of syllables of this eojeol (1 + number of hyphens)
        actualNumSyl = 1 + length(intervalText$) - length(replace$(intervalText$, "-", "", 0))

        # Compare the right edge of the eojeol with the next point on the phonology tier.
        endTimeOfInterval = Get end point... 1 (iEojeol+1)
        boundaryField$ = "0"
        searching = 1
        while searching and iPhonoTier <= numPoints
            timePoint = Get time of point... 2 iPhonoTier
            if timePoint <= endTimeOfInterval - tierDiffTol
                # This point lies before the current right edge by at least the tolerance.
                # Interval ends only increase, so it can never match a later eojeol:
                # skip it instead of letting it block every following boundary.
                iPhonoTier = iPhonoTier + 1
            else
                searching = 0
                if abs(timePoint - endTimeOfInterval) < tierDiffTol
                    # Close enough: this point marks the boundary of the eojeol.
                    # The point is consumed whatever its label is, so that a label
                    # other than "LHa", "HL%", "H%" or "L%" does not stall the pointer.
                    boundaryField$ = Get label of point... 2 iPhonoTier
                    iPhonoTier = iPhonoTier + 1
                endif
            endif
        endwhile

        # The distances include the current eojeol
        distFromPrevAP = distFromPrevAP + actualNumSyl
        distFromPrevIP = distFromPrevIP + actualNumSyl
        distFromSentBegin = distFromSentBegin + actualNumSyl
        distFromSentEnd = totalNum - distFromSentBegin

        # One record per eojeol, written by a single fileappend so that every record
        # has the same layout: eojeol, morpheme, syllables, boundary label (or 0),
        # distance from AP, from IP, from sentence beginning, from sentence end.
        fileappend 'outFile$' 'intervalText$''tab$''morphField$''tab$''actualNumSyl''tab$''boundaryField$''tab$''distFromPrevAP''tab$''distFromPrevIP''tab$''distFromSentBegin''tab$''distFromSentEnd''tab$''newline$'

        # Reset the counters after a boundary: an AP boundary resets the AP distance,
        # an IP boundary resets both the IP and the AP distance. Any other label
        # (or no boundary, "0") leaves the counters running.
        if boundaryField$ = "LHa"
            distFromPrevAP = 0
        elsif boundaryField$ = "HL%" or boundaryField$ = "H%" or boundaryField$ = "L%"
            distFromPrevAP = 0
            distFromPrevIP = 0
        endif
    endfor

    # Blank line between sentences
    fileappend 'outFile$' 'newline$'

    # The TextGrid read above is still the selected object; remove it
    Remove
endfor

select Strings fileList
Remove
#### END OF SCRIPT ####