#################################################################
# tokenFrequency.praat (Written by Kyuchul Yoon, kyoon@ynu.ac.kr)
# Given a text file, this script takes each line, tokenizes words
# by the space, removes non-letter symbols, lists the words
# and counts the token frequency of each word.
#
# Input : inFile  - plain text file, read line by line.
# Output: one "<word><tab><count>" line per word type, in order of
#         first appearance, printed to the Info window and saved
#         to outFile (an existing outFile is overwritten).
#
# NOTE: the "removes non-letter symbols" step is currently a
#       pass-through (see the Preprocessing section below).
#       Comparison of words is case-sensitive.
#################################################################

form Specify parameters
	word inFile_(with_.txt) test.txt
	word outFile_(to_be_created) tokenFrequency.txt
endform

Read Strings from raw text file... 'inFile$'
Rename... fileObj
numLines = Get number of strings

# ---------------------------------------------------------------
# Tokenization.
# Split every line at single spaces and store the pieces in the
# indexed string variables rawTokenized1$, rawTokenized2$, ...
# Empty pieces (produced by blank lines or by leading, trailing or
# repeated spaces) are skipped so that they are not counted as words.
# ---------------------------------------------------------------
totalTokenCount = 0
for iLine to numLines
	select Strings fileObj
	lineText$ = Get string... iLine

	indexOfSpace = index(lineText$, " ")
	while indexOfSpace <> 0
		piece$ = left$(lineText$, indexOfSpace - 1)
		if piece$ <> ""
			totalTokenCount = totalTokenCount + 1
			rawTokenized'totalTokenCount'$ = piece$
		endif
		# Drop the piece and its trailing space, then look for the next space
		lineText$ = right$(lineText$, length(lineText$) - indexOfSpace)
		indexOfSpace = index(lineText$, " ")
	endwhile

	# Whatever is left after the last space is the final token of the line
	if lineText$ <> ""
		totalTokenCount = totalTokenCount + 1
		rawTokenized'totalTokenCount'$ = lineText$
	endif
endfor

# The Strings object is no longer needed; do not leave it in the object list
select Strings fileObj
Remove

# Now, we know the total number of tokens (totalTokenCount)
# and their identities (rawTokenized1$ ... rawTokenizedN$)
pause 'totalTokenCount' tokens identified. Continue?

# ---------------------------------------------------------------
# Preprocessing.
# The preprocessed tokens are stored in the indexed string variables
# tokenized1$, tokenized2$, ...
# NOTE: this is currently a plain copy; no symbols are removed here.
# Any token clean-up should be applied to rawToken$ inside this loop.
# ---------------------------------------------------------------
for iToken to totalTokenCount
	rawToken$ = rawTokenized'iToken'$
	tokenized'iToken'$ = rawToken$
endfor

# ---------------------------------------------------------------
# Counting.
# processedToken<k>$ holds the k-th distinct word (type) and
# tokenFreq<k> holds how many times it has been seen.
# Each token is compared against the types found so far; on a match
# the frequency of that type is increased, otherwise a new type is
# added with a frequency of one.
# ---------------------------------------------------------------
typeCount = 0
for iToken to totalTokenCount
	token$ = tokenized'iToken'$

	# Linear search through the known types; matchIndex stays 0 if none matches.
	# For the very first token typeCount is 0, so the loop body is skipped
	# and the token is registered as the first type below.
	matchIndex = 0
	iType = 0
	while matchIndex = 0 and iType < typeCount
		iType = iType + 1
		knownType$ = processedToken'iType'$
		if token$ = knownType$
			matchIndex = iType
		endif
	endwhile

	if matchIndex > 0
		tokenFreq'matchIndex' = tokenFreq'matchIndex' + 1
	else
		typeCount = typeCount + 1
		processedToken'typeCount'$ = token$
		tokenFreq'typeCount' = 1
	endif
endfor

pause 'typeCount' typeCount

# ---------------------------------------------------------------
# Report.
# Print "<word><tab><count>" for every type to the Info window,
# then save the Info window contents to outFile.
# ---------------------------------------------------------------
clearinfo
for iType to typeCount
	word$ = processedToken'iType'$
	freq = tokenFreq'iType'
	printline 'word$''tab$''freq'
endfor

# Start from an empty file so that repeated runs do not accumulate output
filedelete 'outFile$'
fappendinfo 'outFile$'