#################################################################
# eojeolFrequency.praat (Written by Kyuchul Yoon, kyoon@ynu.ac.kr)
# Given a text file, this script takes each line, tokenizes words
# by the space, lists the eojeols and counts the token
# frequency of each eojeol.
#
# Input  : inFile       - text file read as a Strings object (one string per line)
# Output : outFile      - start time, a "tokenType<TAB>tokenFreq" header, one
#                         "<type><TAB><count>" row per distinct token (in order
#                         of first appearance), then the end time.
#                         Output is APPENDED to the file if it already exists.
#          progressFile - one progress message every progressLines lines and
#                         every progressTokens tokens (also appended).
#
# Runs of spaces and leading/trailing spaces do not produce tokens:
# empty strings are never counted as an eojeol.
#################################################################

form Specify parameters
	word inFile_(with_.txt) test.txt
	word outFile_(to_be_created) tokenFrequency.txt
	word progressFile_(to_be_created) progress.txt
	natural progressLines_(report_every_th_line) 1000
	natural progressTokens_(report_every_th_token) 1000
endform

# Check the start time and print the header for the output
timeStarted$ = date$()
fileappend 'outFile$' 'timeStarted$''newline$'
fileappend 'outFile$' tokenType'tab$'tokenFreq'newline$'

# Read the file to process
Read Strings from raw text file... 'inFile$'
Rename... fileObj
numLines = Get number of strings

################
### TOKENIZE ###
################
# Split every line on single spaces. The non-empty pieces are stored in the
# indexed string variables rawTokenized1$ .. rawTokenized<totalTokenCount>$.
totalTokenCount = 0
for iLine to numLines
	# Report progress every progressLines lines
	if iLine mod progressLines = 0
		fileappend 'progressFile$' 'iLine'th line of 'numLines' lines'newline$'
	endif

	select Strings fileObj
	lineText$ = Get string... iLine

	# Peel off one space-delimited piece at a time from the front of the line
	indexOfSpace = index(lineText$," ")
	while indexOfSpace <> 0
		piece$ = left$(lineText$,indexOfSpace-1)
		# An empty piece means a leading space or two adjacent spaces: skip it
		if piece$ <> ""
			totalTokenCount = totalTokenCount + 1
			rawTokenized'totalTokenCount'$ = piece$
		endif
		# Drop the piece and its delimiter, then look for the next space
		lineText$ = right$(lineText$,length(lineText$)-indexOfSpace)
		indexOfSpace = index(lineText$," ")
	endwhile

	# Whatever is left is the last token of the line; it is empty for a
	# blank line or a line that ended with a space, and then it is skipped
	if lineText$ <> ""
		totalTokenCount = totalTokenCount + 1
		rawTokenized'totalTokenCount'$ = lineText$
	endif
endfor

#####################
### PREPROCESSING ###
#####################
# The tokens used for counting are stored in tokenized1$ .. tokenized<n>$.
# Currently the raw tokens are copied unchanged (no case folding or other
# normalization is applied); this loop is the place to add such steps.
for iToken to totalTokenCount
	tokenized'iToken'$ = rawTokenized'iToken'$
endfor

######################
### TOKEN COUNTING ###
######################
# Distinct tokens (types) are kept in processedToken1$ .. processedToken<typeCount>$
# and their frequencies in tokenFreq1 .. tokenFreq<typeCount>.
typeCount = 0
for iToken to totalTokenCount
	# Report progress every progressTokens tokens
	if iToken mod progressTokens = 0
		fileappend 'progressFile$' 'iToken'th processed out of 'totalTokenCount' tokens'newline$'
	endif

	token$ = tokenized'iToken'$

	# Linear search through the types seen so far; stop at the first match.
	# For the very first token typeCount is 0, so the loop body never runs
	# and the token is registered as a new type below.
	flagFoundMatch = 0
	numComparisons = 0
	while (flagFoundMatch = 0 and numComparisons < typeCount)
		numComparisons = numComparisons + 1
		knownType$ = processedToken'numComparisons'$
		if token$ = knownType$
			# Existing type: just increase its frequency
			flagFoundMatch = 1
			tokenFreq'numComparisons' = tokenFreq'numComparisons' + 1
		endif
	endwhile

	# No match: register a new type with a frequency of one
	if flagFoundMatch = 0
		typeCount = typeCount + 1
		processedToken'typeCount'$ = token$
		tokenFreq'typeCount' = 1
	endif
endfor

#############################
### PRINT TOKEN FREQUENCY ###
#############################
# One "<type><TAB><frequency>" row per type, in order of first appearance
for i to typeCount
	dummy$ = processedToken'i'$
	dummy = tokenFreq'i'
	fileappend 'outFile$' 'dummy$''tab$''dummy''newline$'
endfor

timeEnded$ = date$()
fileappend 'outFile$' 'timeEnded$''newline$'
######################## END OF SCRIPT ########################