######################################################################## # HillenbrandHodHawed.R # # From "Notes on Probability and Statistics for Analyzing the # Sounds of Languages" -- a companion textbook to Peter Ladefoged's # "Vowels and Consonants: An Introduction to the Sounds of Languages" # 2nd edition (Blackwell, 2005). # # (c) 2006, Grant McGuire and Mary E. Beckman # Department of Linguistics, Ohio State University # # File of R code for analyzing the and vowels in the # Hillenbrand et al. (1995) Northern Cities vowels that Ladefoged # plots in Figure 5.7 on p. 46 in "Vowels and Consonants" Chapter 5, # in conjunction with the analysis of the same vowels in these # words produced by the class of H286 in Autumn 2007. # # References and acknowledgments # # The data for this part of the term project are from: # James M. Hillenbrand, Laura A. Getty, Michael J. Clark, and # Kimberlee Wheeler (1995). Acoustic characteristics of # American English vowels. Journal of the Acoustical Society of # America, 97: 3099-3111. # They can be downloaded from: # http://homepages.wmich.edu/~hillenbr/voweldata.html # ######################################################################## # # Part 0 -- Here's what you could do to prepare to do the analyses that # relate the identification data to the production data for the 139 men, # women, and children in the Hillenbrand et al. study .... # # Download the file bigdata.dat file from the web site for the # Hillenbrand et al. (1995) vowel study cited above, and rename it as # bigdata.txt (or revise the code for reading it in). Alternatively, # download the renamed bigdata.txt file from the course web site. Put # this data file in the same folder where you have put this file. Open # up the R Console and then open this file from within the R Console. # Then set the working directory to the folder where you have put this # bigdata.txt file. On Mary's laptop, this is the folder that is called # C:\Lx286\Hillenbrand, so this setwd() command is for her machine. Edit # it to match the path name for the folder where you have the data on # your machine. setwd('c:/Lx286/Hillenbrand') # Read in the bigdata.txt file, skipping the header information in the # first 43 rows, and telling R to read the file "as is" and to treat # values of 0 as cells where no measurement is available, as stated in # the header information, line 42, where Jim Hillenbrand tells us: # "Note: An entry of zero means that the formant was not measurable." x=read.table("bigdata.txt", skip=43, as.is=T, na=0) # Also download the iddataMinusAsterisk.txt file from the course web # page and read those data into a data frame called "id". (This is a # file that is identical to the iddata.dat file on Jim Hillenbrand's # site, except that the asterisks in the far right column have been # removed so that all rows have the same number of elements.) id=read.table("iddataMinusAsterisk.txt", skip=19, as.is=T, header=T) # Then set the working directory again, to where you want the graphs # that you make to be stored. setwd('c:/Lx286/projectParts/termProjectPart3') # Assign names to the columns of the vowel formants and durations table. names(x)=c("filename","duration","f0","F1","F2","F3", "F1.1","F2.1","F3.1","F1.2","F2.2","F3.2","F1.3","F2.3","F3.3", "F1.4","F2.4","F3.4","F1.5","F2.5","F3.5","F1.6","F2.6","F3.6", "F1.7","F2.7","F3.7","F1.8","F2.8","F3.8") # Use the filename column to make three new columns for the talker ID # (the three part string such as m01, for the first man), the speaker # type (from the set "m" for men, "w" for women, "b" for boys, and "g" # for girls), and the vowel type (from the set "iy" for the vowel [i] in # the word , "ih" for the vowel [I] in the word , and so on). x$talker=substring(x$filename, 1, 3) x$sex=substring(x$filename, 1, 1) x$vow=substring(x$filename, 4, 5) # The substring command takes a character string (such as the filename # "b01ae") as its first argument, and outputs the substring that starts # at the position specifiedby the second argument and ends at the position # specified by the third argument. So substring(x$filename, 4, 5) means # "take the 4th through 5th elements in each filename" -- i.e., the "ae" # in "b01ae" and so on. # Do the same thing for the id data frame. id$talker=substring(id$Filename, 1, 3) id$sex=substring(id$Filename, 1, 1) id$vow=substring(id$Filename, 4, 5) # Also strip off the ".w" at the end of the Filename variable. id$Filename=substring(id$Filename, 1, 5) # Pick out the subset of rows of x that are for the vowels [O] ("aw") in # the word and [A] ("ah") in the word . This will be useful # for doing the histograms. aw=subset(x, vow=="aw")[,c("filename","duration","F1","F2","talker","sex")] ah=subset(x, vow=="ah")[,c("filename","duration","F1","F2","talker","sex")] # Add two columns to each of these data frames for the percent correct # identification and the percent mis-identified as the other vowel. aw$ID=NA ah$ID=NA aw$misID=NA ah$misID=NA for (i in 1:dim(aw)[1]) { aw$ID[i]=round(id$aw[id$talker==aw$talker[i] & id$vow=="aw"]*5) aw$misID[i]=round(id$ah[id$talker==aw$talker[i] & id$vow=="aw"]*5) ah$ID[i]=round(id$ah[id$talker==ah$talker[i] & id$vow=="ah"]*5) ah$misID[i]=round(id$aw[id$talker==ah$talker[i] & id$vow=="ah"]*5) } # In each of the commands in the loop, you're picking out the relevant # number from the row for that vowel produced by that talker and the # column for identification as that vowel or as the other vowel, and # then multiplying by 5 (which is equivalent to dividing by 20 listeners # to get the proportion of responses that are correct or that are incorrect # in the relevant way, and then multiplying by 100 to convert to a percent). # Add three more columns for the normalized values for the F1, F2, and # the duration, with the normalization algorithm being the percent of # range calculation that Group Awesome used for their second interim report. aw$normF1=NA ah$normF1=NA aw$normF2=NA ah$normF2=NA aw$normDur=NA ah$normDur=NA for (i in 1:dim(aw)[1]) { temp=subset(x, talker==aw$talker[i]) minF1=min(na.omit(temp$F1)) maxF1=max(na.omit(temp$F1)) minF2=min(na.omit(temp$F2)) maxF2=max(na.omit(temp$F2)) minDur=min(na.omit(temp$duration)) maxDur=max(na.omit(temp$duration)) aw$normF1[i]=round(100*(aw$F1[i]-minF1)/(maxF1-minF1)) aw$normF2[i]=round(100*(aw$F2[i]-minF1)/(maxF2-minF2)) aw$normDur[i]=round(100*(aw$duration[i]-minDur)/(maxDur-minDur)) ah$normF1[i]=round(100*(ah$F1[i]-minF1)/(maxF1-minF1)) ah$normF2[i]=round(100*(ah$F2[i]-minF1)/(maxF2-minF2)) ah$normDur[i]=round(100*(ah$duration[i]-minDur)/(maxDur-minDur)) } ######################################################################## # Part 1 -- Making overlaid histograms and doing the t-tests that would # go with each of the histograms. There are sets of sample code for both # the raw values and the range-normalized values, for you to choose from. # Set up plotting pretty window for making overlaid histograms. windows(width=4, height=2, pointsize=12) par(family="serif", oma=rep(0,4), mar=c(3,3,0.1,0.1), mgp=c(1.8,0.5,0)) # Figure out what the range is for the raw F1 values. min(c(aw$F1,ah$F1)) # [1] 305 max(c(aw$F1,ah$F1)) # [1] 590 # Use these values to determine that the histogram should have bins # ranging from 300 to 600 Hz. breaksF1=seq(550,1350,25) # Make a histogram of the first formant of and draw a box around it. hist(aw$F1, breaks=breaksF1, main="", xlab="first formant (Hz)", ylab="number of tokens",ylim=c(0,20),col="grey80");box() # Tell R to overlay a new plot. par(new=T) # Make a histogram of the first formant of . hist(ah$F1, breaks=breaksF1, main="", xlab="", ylab="", ylim=c(0,20),density=30) # Add a legend key in the top left corner of the graph. legend("topright",c("hawed","hod"),density=c(-1,30),fill=c("grey80","black")) # Add cursors at the two mean values. abline(v=c(mean(na.omit(aw$F1)),mean(na.omit(893))),col=c("grey60","black"),lty=c(1,2)) # Save the plot. savePlot("HillenbrandHodHawedF1",type="jpg") # Do a paired t-test to see whether the non-zero difference in mean F1 values # between the two vowels for each of the 139 talkers could plausibly be # attributed to chance, given the large overlap in the distributions. t.test(aw$F1, ah$F1, paired=T, alternative="less") # Now do the same thing for the range-normalized first formant values. min(na.omit(c(aw$normF1,ah$normF1))) # [1] 35 max(na.omit(c(aw$normF1,ah$normF1))) # [1] 100 breaksF1=seq(30,100,5) hist(aw$normF1, breaks=breaksF1, main="", xlab="normalized first formant (%)", ylab="number of tokens",ylim=c(0,132),col="grey80");box() par(new=T) hist(ah$normF1, breaks=breaksF1, main="", xlab="", ylab="", ylim=c(0,132),density=30) legend("topleft",c("hawed","hod"),density=c(-1,30),fill=c("grey80","black")) abline(v=c(mean(na.omit(aw$normF1)),mean(na.omit(ah$normF1))),col=c("grey60","black"),lty=c(1,2)) savePlot("HillenbrandHawedHodNormF1",type="wmf") t.test(aw$normF1, ah$normF1, paired=T, alternative="less") # Now do the same thing for the raw second formant values. min(na.omit(c(aw$F2,ah$F2))) # [1] 856 max(na.omit(c(aw$F2,ah$F2))) # [1] 2042 breaksF2=seq(850,2050,50) hist(aw$F2, breaks=breaksF2, main="", xlab="second formant (Hz)", ylab="number of tokens",ylim=c(0,21),col="grey80");box() par(new=T) hist(ah$F2, breaks=breaksF2, main="", xlab="", ylab="", ylim=c(0,21),density=30) legend("topright",c("hawed","hod"),density=c(-1,30),fill=c("grey80","black")) abline(v=c(mean(na.omit(aw$F2)),mean(na.omit(ah$F2))),col=c("grey60","black"),lty=c(1,2)) savePlot("HillenbrandHawedHodF2",type="wmf") t.test(aw$F2, ah$F2, paired=T, alternative="less") # Now do the same thing for the range-normalized second formant values. min(na.omit(c(aw$normF2,ah$normF2))) # [1] 25 max(na.omit(c(aw$normF2,ah$normF2))) # [1] 95 breaksF2=seq(25,95,5) hist(aw$normF2, breaks=breaksF2, main="", xlab="normalized second formant (%)", ylab="number of tokens",ylim=c(0,45),col="grey80");box() par(new=T) hist(ah$normF2, breaks=breaksF2, main="", xlab="", ylab="", ylim=c(0,45),density=30) legend("topright",c("hawed","hod"),density=c(-1,30),fill=c("grey80","black")) abline(v=c(mean(na.omit(aw$normF2)),mean(na.omit(ah$normF2))),col=c("grey60","black"),lty=c(1,2)) savePlot("HillenbrandHawedHodNormF2",type="wmf") t.test(aw$normF2, ah$normF2, paired=T, alternative="less") # Now do the same thing for the raw duration values. min(c(aw$duration,ah$duration)) # [1] 154 max(c(aw$duration,ah$duration)) # [1] 465 breaksDur=seq(150,475,25) hist(aw$duration, breaks=breaksDur, main="", xlab="duration (ms)", ylab="number of tokens",ylim=c(0,28),col="grey80");box() par(new=T) hist(ah$duration, breaks=breaksDur, main="", xlab="", ylab="", ylim=c(0,28),density=30) legend("topright",c("hawed","hod"),density=c(-1,30),fill=c("grey80","black")) abline(v=c(mean(na.omit(aw$duration)),mean(na.omit(ah$duration))),col=c("grey60","black"),lty=c(1,2)) savePlot("HillenbrandHawedHodDuration",type="wmf") t.test(aw$duration, ah$duration, paired=T, alternative="greater") # Now do the same thing for the range-normalized duration values. min(na.omit(c(aw$normDur,ah$normDur))) # [1] 5 max(na.omit(c(aw$normDur,ah$normDur))) # [1] 100 breaksDur=seq(5,100,5) hist(aw$normDur, breaks=breaksDur, main="", xlab="normalized duration (%)", ylab="number of tokens",ylim=c(0,68),col="grey80");box() par(new=T) hist(ah$normDur, breaks=breaksDur, main="", xlab="", ylab="", ylim=c(0,68),density=30) legend("topleft",c("hawed","hod"),density=c(-1,30),fill=c("grey80","black")) abline(v=c(mean(na.omit(aw$normDur)),mean(na.omit(ah$normDur))),col=c("grey60","black"),lty=c(1,2)) savePlot("HillenbrandHawedHodNormDuration",type="wmf") t.test(aw$normDur, ah$normDur, paired=T, alternative="less") ######################################################################## # Part 2 -- Evaluating the perception patterns. # min(na.omit(c(aw$ID,ah$ID))) # [1] 0 max(na.omit(c(aw$ID,ah$ID))) # [1] 100 breaksID=seq(0,100,5) hist(aw$ID, breaks=breaksID, main="", xlab="correct identification (%)", ylab="number of tokens",ylim=c(0,60),col="grey80");box() par(new=T) hist(ah$ID, breaks=breaksID, main="", xlab="", ylab="", ylim=c(0,60),density=30) legend("top",c("hawed","hod"),density=c(-1,30),fill=c("grey80","black")) # Given that these 20 listeners had a choice of 12 different words to choose # from in identifying any token, the probability of choosing the intended # word just by chance is 1/12 = 8.3%, meaning that 8.3% of the 20 listeners # should have identified the vowel correctly just by chance. Add a # at this chance ID rate. abline(v=(1/12)*100, lty=1);text(5,30,"chance correct ID", srt=90) # The binomial test says that there is a 0.02 chance of 5 out of the 20 # identifying the word correctly just by chance, and a 0.005 chance of 6 # of the 20 identifying the word correctly just by chance, as you can see # by running the following two lines of code: binom.test(5,20,p=(1/12)) binom.test(6,20,p=(1/12)) # So let's also put a cursor at 6 = 30%, to identify which tokens were # identified correctly at significantly better than chance level, if we # say that less than 1 in 100 is significantly better than chance. abline(v=30, lty=2);text(27, 30, "better than chance", srt=90) # Do you want to use this at least 30% correct identification as the # criterion for differentiating distinguishers from non-distinguishers? # If so, here's what you could do to pick them out in the scatterplots # that you may want to show instead of the histograms (or in addition # to the histograms. missed.aw=subset(aw, misID>=30) missed.ah=subset(ah, misID>=30) ######################################################################## # Part 3 -- Making scatterplots of F1 against F2 and duration against F2, # if you want to see if pairs of the values together are more predictive. # Set up x-axis and y-axis ranges from the values you chose for the # ranges of the histograms. xlimF2=c(2050,850) ylimF1=c(1350,550) ylimDur=c(120,475) xlimNormF2=c(71,25) ylimNormF1=c(100,34) ylimNormDur=c(5,100) # Set up a plotting pretty window for making a scatterplot. windows(width=4, height=4, pointsize=12) par(family="serif", oma=rep(0,4), mar=c(3,3,0.1,0.1), mgp=c(1.8,0.5,0), type="s") # Make a scatterplot showing the vowel space for the vowel in . plot(aw$F2, aw$F1, xlim=xlimF2, ylim=ylimF1, pch=19, col="grey60", ylab="first formant (Hz)", xlab="second formant (Hz)") # Overlay points for the vowel in . points(ah$F2, ah$F1, pch=1) legend("bottomright", c("hawed ","hod "), pch=c(19,1), col=c("grey60","black")) # Draw red circles around the points where was misidentified # as by at least 6 listeners. points(missed.aw$F2, missed.aw$F1, pch=1, col="red", cex=1.5) # Draw red squares around the points where was misidentified # as by at least 6 listeners. points(missed.ah$F2, missed.ah$F1, pch=22, col="red", cex=1.5) # Alternatively, use the range-normalized values, like this. plot(aw$normF2, aw$normF1, xlim=xlimNormF2, ylim=ylimNormF1, pch=19, col="grey60", ylab="range-normalized first formant (%)", xlab="range-normalized second formant (%)") points(ah$normF2, ah$normF1, pch=1) legend("topleft", c("hawed ","hod "), pch=c(19,1), col=c("grey60","black")) points(missed.aw$normF2, missed.aw$normF1, pch=1, col="red", cex=1.5) points(missed.ah$normF2, missed.ah$normF1, pch=22, col="red", cex=1.5) # Make another scatterplot showing the relationship between F2 and duration. plot(aw$F2, aw$duration, xlim=xlimF2, ylim=ylimDur, pch=19, col="grey60", ylab="duration of vowel (ms)", xlab="second formant (Hz)") points(ah$F2, ah$duration, pch=1) legend("bottomleft", c("hawed ","hod "), pch=c(19,1), col=c("grey60","black")) # Draw red squares around the points where was misidentified # as by at least 10 listeners, and red circles around the points # where was misidentified as by at least 6 listeners. points(missed.aw$F2, missed.aw$duration, pch=22, col="red", cex=1.5) points(missed.ah$F2, missed.ah$duration, pch=1, col="red", cex=1.5) # Alternatively, use the range-normalized values, like this. plot(aw$normF2, aw$normDur, xlim=xlimNormF2, ylim=ylimNormDur, pch=19, col="grey60", ylab="range-normalized duration (%)", xlab="range-normalized second formant (%)") points(ah$normF2, ah$normDur, pch=1) legend("bottomright", c("hawed ","hod "), pch=c(19,1), col=c("grey60","black")) points(missed.aw$normF2, missed.aw$normDur, pch=1, col="red", cex=1.5) points(missed.ah$normF2, missed.ah$normDur, pch=22, col="red", cex=1.5) # Make another scatterplot showing the relationship between F1 and duration. plot(aw$F1, aw$duration, xlim=ylimF1, ylim=ylimDur, pch=19, col="grey60", ylab="duration of vowel (ms)", xlab="first formant (Hz)") points(ah$F1, ah$duration, pch=1) legend("bottomleft", c("hawed ","hod "), pch=c(19,1), col=c("grey60","black")) points(missed.aw$F1, missed.aw$duration, pch=22, col="red", cex=1.5) points(missed.ah$F1, missed.ah$duration, pch=1, col="red", cex=1.5) # Alternatively, use the range-normalized values, like this. plot(aw$normF1, aw$normDur, xlim=ylimNormF1, ylim=ylimNormDur, pch=19, col="grey60", ylab="range-normalized duration (%)", xlab="range-normalized first formant (%)") points(ah$normF1, ah$normDur, pch=1) legend("bottomright", c("hawed ","hod "), pch=c(19,1), col=c("grey60","black")) points(missed.aw$normF1, missed.aw$normDur, pch=1, col="red", cex=1.5) points(missed.ah$normF1, missed.ah$normDur, pch=22, col="red", cex=1.5) ######################################################################## # Part 4 -- If the percent of missed values is continuous, you might # look at the relationship between the normalized values and the missed # tokens by making scatterplots of perception results against production # differences. To do this, start by setting up a new data frame that # calculates differences and also takes a mean correct identification. talkers=data.frame(talker=aw$talker, F2=ah$F2-aw$F2, F1=ah$F1-aw$F1, normF2=ah$normF2-aw$normF2, normF1=ah$normF1-aw$normF1, duration=aw$duration-ah$duration, normDur=aw$normDur-ah$normDur, ID=((aw$ID+ah$ID)/2), misID=((aw$misID+ah$misID)/2)) summary(talkers[,c(2:9)]) # F2 F1 normF2 normF1 # Min. : 9.0 Min. :-82.0 Min. : 1.00 Min. :-16.00 # 1st Qu.:247.0 1st Qu.: 57.5 1st Qu.:15.00 1st Qu.: 15.00 # Median :321.0 Median :108.0 Median :19.00 Median : 25.00 # Mean :329.4 Mean :125.7 Mean :19.50 Mean : 24.22 # 3rd Qu.:404.0 3rd Qu.:161.5 3rd Qu.:24.00 3rd Qu.: 33.50 # Max. :791.0 Max. :468.0 Max. :42.00 Max. : 66.00 # NA's : 6.0 NA's : 6.00 # duration normDur ID misID # Min. :-97.00 Min. :-63.00 Min. : 45.00 Min. : 0.000 # 1st Qu.: -4.00 1st Qu.: -3.50 1st Qu.: 80.00 1st Qu.: 2.500 # Median : 20.00 Median : 14.00 Median : 90.00 Median : 5.000 # Mean : 17.53 Mean : 13.23 Mean : 86.04 Mean : 9.748 # 3rd Qu.: 38.00 3rd Qu.: 32.00 3rd Qu.: 95.00 3rd Qu.:15.000 # Max. :119.00 Max. : 80.00 Max. :100.00 Max. :47.500 F1diff=c(-82,468) F2diff=c(9,791) normF1diff=c(-16,66) normF2diff=c(1,42) durDiff=c(-97,119) normDurDiff=c(-63,80) ID=c(45,100) misID=c(0,48) # Make scatterplots showing the relationships between F2 difference and # percent correct identification talker by talker, and so on. plot(talkers$F2, talkers$ID, xlim=F2diff, ylim=ID, pch=19, xlab="difference in F2 (Hz)", ylab="percent correctly identified") abline(lm(ID ~ F2, data=talkers), lty=2) summary(lm(ID ~ F2, data=talkers)) plot(talkers$normF2, talkers$ID, xlim=normF2diff, ylim=ID, pch=19, xlab="difference in normalized F2 (%)", ylab="percent correctly identified") abline(lm(ID ~ normF2, data=talkers), lty=2) summary(lm(ID ~ normF2, data=talkers)) plot(talkers$F1, talkers$ID, xlim=F1diff, ylim=ID, pch=19, xlab="difference in F1 (Hz)", ylab="percent correctly identified") abline(lm(ID ~ F1, data=talkers), lty=2) summary(lm(ID~ F1, data=talkers)) plot(talkers$normF1, talkers$ID, xlim=normF1diff, ylim=ID, pch=19, xlab="difference in normalized F1 (%)", ylab="percent correctly identified") abline(lm(ID ~ normF1, data=talkers), lty=2) summary(lm(ID ~ normF1, data=talkers)) plot(talkers$duration, talkers$ID, xlim=durDiff, ylim=ID, pch=19, xlab="duration difference (ms)", ylab="percent correctly identified") abline(lm(ID ~ duration, data=talkers), lty=2) summary(lm(ID ~ duration, data=talkers)) plot(talkers$normDur, talkers$ID, xlim=normDurDiff, ylim=ID, pch=19, xlab="ranged normalized duration difference (%)", ylab="percent correctly identified") abline(lm(ID ~ normDur, data=talkers), lty=2) summary(lm(ID ~ normDur, data=talkers)) ######################################################################## # Part 5 -- Here is code for adding a Euclidean distance in the raw Hz # vowel space and also in the range-normalized vowel space, if you think # the difference in the two formants together might be more predictive # than the difference in either one alone. talkers$Euclid=((talkers$F2^2)+(talkers$F1^2))^0.5 summary(talkers$Euclid) # Min. 1st Qu. Median Mean 3rd Qu. Max. NA's # 15.0 265.2 349.0 360.5 448.6 800.2 6.0 plot(talkers$Euclid, talkers$ID, xlim=c(15,801), ylim=ID, pch=19, xlab="Euclidean distance in vowel space (Hz)", ylab="percent correctly identified") abline(lm(ID ~ Euclid, data=talkers), lty=2) summary(lm(ID ~ Euclid, data=talkers)) talkers$normEuclid=((talkers$normF2^2)+(talkers$normF1^2))^0.5 summary(talkers$normEuclid) # Min. 1st Qu. Median Mean 3rd Qu. Max. NA's # 2.236 25.500 31.620 32.670 41.110 71.310 6.000 plot(talkers$normEuclid, talkers$ID, xlim=c(2,72), ylim=ID, pch=19, xlab="Euclidean distance in normalized vowel space (%)", ylab="percent correctly identified") abline(lm(ID ~ normEuclid, data=talkers), lty=2) summary(lm(ID ~ normEuclid, data=talkers))