#################################################################
# tokenFrequency.praat (Written by Kyuchul Yoon, kyoon@ynu.ac.kr)
# Given a text file, this script takes each line, tokenizes words
# by the space, removes non-letter symbols, lists the words
# and counts the token frequency of each word.
#
# Inputs (form fields):
#   inFile          text file to read, one Strings item per line
#   outFile         receives a start time stamp, a header line, one
#                   "type <tab> frequency" line per word type, and
#                   an end time stamp (text is appended)
#   progressFile    receives progress messages (text is appended)
#   progressLines   a message is written every N-th line
#   progressTokens  a message is written every N-th token
#
# Tokens that are empty after clean-up (for example those produced
# by consecutive spaces, or made only of digits or punctuation) are
# not counted as a word type.
#################################################################

form Specify parameters
	word inFile_(with_.txt) Season01-Unicode.txt
	word outFile_(to_be_created) tokenFrequency.txt
	word progressFile_(to_be_created) progress.txt
	natural progressLines_(report_every_th_line) 1000
	natural progressTokens_(report_every_th_token) 1000
endform

# Check the start time and print the header for the output
timeStarted$ = date$()
fileappend 'outFile$' 'timeStarted$''newline$'
fileappend 'outFile$' tokenType'tab$'tokenFreq'newline$'

# Read the file to process and remember the object ID, so that the
# object can be re-selected unambiguously and removed at the end
Read Strings from raw text file... 'inFile$'
Rename... fileObj
stringsID = selected("Strings")
numLines = Get number of strings

################
### TOKENIZE ###
################

# Split every non-blank line at each single space.
# Result: totalTokenCount raw tokens in rawTokenized1$, rawTokenized2$, ...
# (a raw token may be empty when spaces are adjacent, leading or trailing)
totalTokenCount = 0
for iLine to numLines
	# Report progress every progressLines-th line
	if (iLine mod progressLines) = 0
		fileappend 'progressFile$' 'iLine'th line of 'numLines' lines'newline$'
	endif

	select stringsID
	lineText$ = Get string... iLine

	# Blank lines contribute no tokens
	if length(lineText$) <> 0
		indexOfSpace = index(lineText$," ")
		while indexOfSpace <> 0
			totalTokenCount = totalTokenCount + 1
			rawTokenized'totalTokenCount'$ = left$(lineText$,(indexOfSpace-1))
			# Keep only the text after the space that was just consumed
			lineText$ = right$(lineText$,(length(lineText$)-indexOfSpace))
			indexOfSpace = index(lineText$," ")
		endwhile
		# Whatever is left after the last space is the last token of the line
		totalTokenCount = totalTokenCount + 1
		rawTokenized'totalTokenCount'$ = lineText$
	endif
endfor

#####################
### PREPROCESSING ###
#####################

# Lowercase each raw token and drop every character that is neither
# a letter a-z nor the apostrophe.
# Result: tokenized1$, tokenized2$, ... (an entry may be the empty string)
for iToken to totalTokenCount
	rawToken$ = rawTokenized'iToken'$
	rawTokenLength = length(rawToken$)
	alphabeticToken$ = ""
	for iChar to rawTokenLength
		singleChar$ = mid$(rawToken$,iChar,1)
		# makeLowerCase sets lowercasedChar$
		call makeLowerCase 'singleChar$'
		# deleteNonAlphabets sets alphabeticChar$
		call deleteNonAlphabets 'lowercasedChar$'
		alphabeticToken$ = alphabeticToken$ + alphabeticChar$
	endfor

	# Report progress every progressTokens-th token
	if (iToken mod progressTokens) = 0
		fileappend 'progressFile$' 'iToken'th lowercased out of 'totalTokenCount' tokens'newline$'
	endif

	tokenized'iToken'$ = alphabeticToken$
endfor

######################
### TOKEN COUNTING ###
######################

# Build the list of word types (processedToken1$, processedToken2$, ...)
# and their frequencies (tokenFreq1, tokenFreq2, ...) by linear search.
typeCount = 0
for iToken to totalTokenCount
	# Report progress every progressTokens-th token
	if (iToken mod progressTokens) = 0
		fileappend 'progressFile$' 'iToken'th processed out of 'totalTokenCount' tokens'newline$'
	endif

	token$ = tokenized'iToken'$

	# Count the token only if it is a word, i.e. not empty after clean-up
	if token$ <> ""
		# Look for the token among the types found so far;
		# matchIndex stays 0 when the token is a new type
		matchIndex = 0
		iType = 0
		while (matchIndex = 0 and iType < typeCount)
			iType = iType + 1
			knownType$ = processedToken'iType'$
			if token$ = knownType$
				matchIndex = iType
			endif
		endwhile

		if matchIndex = 0
			# A new type: register it with a frequency of one
			typeCount = typeCount + 1
			processedToken'typeCount'$ = token$
			tokenFreq'typeCount' = 1
		else
			# A known type: increase its frequency
			previousFreq = tokenFreq'matchIndex'
			tokenFreq'matchIndex' = previousFreq + 1
		endif
	endif
endfor

#############################
### PRINT TOKEN FREQUENCY ###
#############################

# One line per type, in order of first appearance
for iType to typeCount
	typeName$ = processedToken'iType'$
	typeFreq = tokenFreq'iType'
	fileappend 'outFile$' 'typeName$''tab$''typeFreq''newline$'
endfor
timeEnded$ = date$()
fileappend 'outFile$' 'timeEnded$''newline$'

# Clean up: remove the Strings object created by this script
select stringsID
Remove

################################
### PROCEDURE: makeLowerCase ###
################################
# Argument: singleCharacter$ (expected to be one character)
# Output:   lowercasedChar$ (global)
#           the lowercase letter if the argument is one of A-Z,
#           otherwise the argument unchanged
procedure makeLowerCase singleCharacter$
	# Position of the character in the uppercase alphabet, 0 if absent
	upperCasePosition = index("ABCDEFGHIJKLMNOPQRSTUVWXYZ",singleCharacter$)
	# The length test keeps multi-character or empty arguments unchanged
	if (length(singleCharacter$) = 1 and upperCasePosition > 0)
		lowercasedChar$ = mid$("abcdefghijklmnopqrstuvwxyz",upperCasePosition,1)
	else
		lowercasedChar$ = singleCharacter$
	endif
endproc

#####################################
### PROCEDURE: deleteNonAlphabets ###
#####################################
# Argument: singleCharacter$ (expected to be one lowercased character)
# Output:   alphabeticChar$ (global)
#           the argument itself if it is one of a-z or the apostrophe,
#           otherwise the empty string
procedure deleteNonAlphabets singleCharacter$
	# Position of the character in the lowercase alphabet, 0 if absent
	lowerCasePosition = index("abcdefghijklmnopqrstuvwxyz",singleCharacter$)
	if (length(singleCharacter$) = 1 and lowerCasePosition > 0)
		alphabeticChar$ = singleCharacter$
	# The apostrophe is kept as is
	elsif singleCharacter$ = "'"
		alphabeticChar$ = "'"
	# Anything else is deleted by returning the null string
	else
		alphabeticChar$ = ""
	endif
endproc
######################## END OF SCRIPT ########################