######################################################################## # HillenbrandHighFrontVowels.R # # From "Notes on Probability and Statistics for Analyzing the # Sounds of Languages" -- a companion textbook to Peter Ladefoged's # "Vowels and Consonants: An Introduction to the Sounds of Languages" # 2nd edition (Blackwell, 2005). # # (c) 2006, Grant McGuire and Mary E. Beckman # Department of Linguistics, Ohio State University # # File of R code for doing some analyses of the Hillenbrand et al. # (1995) Northern Cities vowels that Ladefoged plots in Figure 5.? # on p. ? in "Vowels and Consonants" Chapter 5. # # References and acknowledgments # # The data for this analysis problem are from: # James M. Hillenbrand, Laura A. Getty, Michael J. Clark, and # Kimberlee Wheeler (1995). Acoustic characteristics of # American English vowels. Journal of the Acoustical Society of # America, 97: 3099-3111. # They can be downloaded from: # http://homepages.wmich.edu/~hillenbr/voweldata.html # ######################################################################## # # Part 0 -- In preparation for doing the data analyses .... # # Download the file bigdata.dat file from the web site for the # Hillenbrand et al. (1995) vowel study cited above. Put it in # the same folder where you have put this file. Open up the R # Console and then open this file from within the R Console. # Then set the working directory to the folder where you have put # this bigdata.dat file that you downloaded from Jim Hillenbrand's # web site for the Hillenbrand et al. (1995) study of Detroit vowels. # On Mary's laptop, this is the folder called C:\Lx286\Hillenbrand, # so this setwd() command is for her machine. Edit it to match the # path name for the folder where you have the data on your machine. setwd('c:/Lx286/Hillenbrand') # Read in the bigdata.dat file, skipping the header information in the # first 43 rows, and telling R to read the file "as is" and to treat # values of 0 as cells where no measurement is available, as stated in # the header information, line 42, where Jim Hillenbrand tells us: # "Note: An entry of zero means that the formant was not measurable." x=read.table("bigdata.dat", skip=43, as.is=T, na=0) # Note: If you prefer to download the file as bigdata.txt from the # course web site, replace the above command with the following. x=read.table("bigdata.txt", skip=43, as.is=T, na=0) # Also download the iddataMinusAsterisk.txt file from the course web # page and read those data into a data frame called "id". (This is a # file that is identical to the iddata.dat file on Jim Hillenbrand's # site, except that the asterisks in the far right column have been # removed so that all rows have the same number of elements.) id=read.table("iddataMinusAsterisk.txt", skip=19, as.is=T, header=T) # Then set the working directory again, to where you want the graphs # that you make to be stored. setwd('c:/Lx286/topic5') # Assign names to the columns of the vowel formants and durations table. names(x)=c("filename","duration","f0","F1","F2","F3", "F1.1","F2.1","F3.1","F1.2","F2.2","F3.2","F1.3","F2.3","F3.3", "F1.4","F2.4","F3.4","F1.5","F2.5","F3.5","F1.6","F2.6","F3.6", "F1.7","F2.7","F3.7","F1.8","F2.8","F3.8") # Use the filename column to make two new columns for the speaker type # (from the set "m" for men, "w" for women, "b" for boys, and "g" for # girls) and the vowel type (from the set "iy" for the vowel [i] in # the word , "ih" for the vowel [I] in the word , and so on). x$sex=substring(x$filename, 1, 1) x$vow=substring(x$filename, 4, 5) # The substring command takes a character string (such as the filename # "b01ae") as its first argument, and outputs the substring that starts # at the position specifiedby the second argument and ends at the position # specified by the third argument. So substring(x$filename, 4, 5) means # "take the 4th through 5th elements in each filename" -- i.e., the "ae" # in "b01ae" and so on. # Do the same thing for the id data frame. id$sex=substring(id$Filename, 1, 1) id$vow=substring(id$Filename, 4, 5) # Also strip off the ".w" at the end of the Filename variable. id$Filename=substring(id$Filename, 1, 5) # Pick out the subset of rows of x that are for the vowels [i] ("iy") in # the word and [I] ("ih") in the word . iy=subset(x, vow=="iy") ih=subset(x, vow=="ih") # Then figure out which tokens of and (if any) were # misidentified by a large number of the 20 listeners who provided the # judgements for the iddata.dat file. Let's do this by first getting a # summary of the counts in the "iy" response column for the vowel [i] of # and of the "ih" response column for the vowel [I] of , by # applying the summary() command on a factor() [i.e., a categorical # variable] that we make from the numbers in the appropriate column of # the subset of rows of id for each of the two vowels. summary(factor(subset(id, vow=="iy")$iy)) # 19 20 # 12 127 summary(factor(subset(id, vow=="ih")$ih)) # 10 13 16 17 18 19 20 # 1 1 1 3 11 20 102 # See whether any were misidentifications of [i] for [I] or vice versa. subset(id, vow=="ih" & ih<20)$iy # [1] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 length(subset(id, vow=="ih" & ih<20)$iy) # [1] 37 sum(subset(id, vow=="ih" & ih<20)$iy) # [1] 0 # This says that there were 37 cases where was misidentified as # one of the other words, but none of these was a misidentification of # for . subset(id, vow=="iy" & iy<20)$ih # [1] 0 0 0 0 1 0 0 1 0 1 0 1 length(subset(id, vow=="iy" & iy<20)$ih) # [1] 12 sum(subset(id, vow=="iy" & iy<20)$ih) # [1] 4 # This says that four of the tokens of were misidentified as # by one of the listeners. # Then make a new data frame that picks out just these four cases where # there were misindentifications of as . missed.iy.utts=subset(id, vow=="iy" & iy==19 & ih==1)$Filename missed.iy=data.frame(filename=missed.iy.utts, F1=NA, F2=NA, duration=NA) for (i in 1:dim(missed.iy)[1]) { missed.iy[i,c("F1","F2","duration")]=subset(x, filename==as.character(missed.iy$filename[i]))[,c("F1","F2","duration")] } # Here's how to see which points they are. missed.iy # filename F1 F2 duration # 1 m41iy 394 2300 221 # 2 w26iy 375 2577 336 # 3 w39iy 430 2606 338 # 4 b05iy 475 3203 242 ######################################################################## # # Part 1 -- Making overlaid histograms and doing relevant t-tests. # # Set up plotting pretty window for making overlaid histograms. windows(width=4, height=2, pointsize=12) par(family="serif", oma=rep(0,4), mar=c(3,3,0.1,0.1), mgp=c(1.8,0.5,0)) # Figure out what the F1 range is. min(c(iy$F1,ih$F1)) # [1] 305 max(c(iy$F1,ih$F1)) # [1] 590 # Use these values to determine that the histogram should have bins # ranging from 300 to 600 Hz. breaksF1=seq(300,600,25) # Make a histogram of the first formant of [i] and draw a box around it. hist(iy$F1, breaks=breaksF1, main="", xlab="first formant (Hz)", ylab="number of tokens",ylim=c(0,38),col="grey80");box() # Tell R to overlay a new plot. par(new=T) # Make a histogram of the first formant of [I]. hist(ih$F1, breaks=breaksF1, main="", xlab="", ylab="", ylim=c(0,38),density=30) # Add a legend key in the top left corner of the graph. legend("topleft",c("[i]","[I]"),density=c(-1,30),fill=c("grey80","black")) # Draw cursors for the four tokens of [i] that were misidentified as [I]. abline(v=missed.iy$F1, col="red") # Save the plot to a jpg format file or to a wmf ("Windows Metafile") # formant file, to read into your html file or your doc file. savePlot("HillenbrandHighFrontF1",type="jpg") savePlot("HillenbrandHighFrontF1",type="wmf") # Do a t-test to see whether the difference in mean F1 values between the # two vowels could plausibly be attributed to chance, given the large overlap # in the distributions. t.test(iy$F1, ih$F1, alternative="less") # Now do the same thing for the second formant values. min(c(iy$F2,ih$F2)) # [1] 1810 max(c(iy$F2,ih$F2)) # [1] 3451 breaksF2=seq(1800,3500,100) hist(iy$F2, breaks=breaksF2, main="", xlab="second formant (Hz)", ylab="number of tokens",ylim=c(0,21),col="grey80");box() par(new=T) hist(ih$F2, breaks=breaksF2, main="", xlab="", ylab="", ylim=c(0,21),density=30) legend("topright",c("[i]","[I]"),density=c(-1,30),fill=c("grey80","black")) abline(v=missed.iy$F2, col="red") savePlot("HillenbrandHighFrontF2",type="jpg") savePlot("HillenbrandHighFrontF2",type="wmf") t.test(iy$F2, ih$F2, alternative="greater") # Note that here the alternative is the opposite, because the mean F2 for [i] # is higher than the mean F2 for [I]. That is, the F1 and F2 are a good bit # more separated in the [i] as compared to the [I]. # Now do the same thing for the duration values. min(c(iy$duration,ih$duration)) # [1] 134 max(c(iy$duration,ih$duration)) # [1] 433 breaksDur=seq(100,450,25) hist(iy$duration, breaks=breaksDur, main="", xlab="duration (ms)", ylab="number of tokens",ylim=c(0,28),col="grey80");box() par(new=T) hist(ih$duration, breaks=breaksDur, main="", xlab="", ylab="", ylim=c(0,28),density=30) legend("topright",c("[i]","[I]"),density=c(-1,30),fill=c("grey80","black")) # Draw vertical cursors at the durations of the four tokens of that # were misidentified as by at least one listener. abline(v=missed.iy$dur, col="red") savePlot("HillenbrandHighFrontDuration",type="jpg") savePlot("HillenbrandHighFrontDuration",type="wmf") t.test(iy$duration, ih$duration, alternative="greater") # Now evaluate the effect of speaker type on these same values, starting # with the predicted effects of sex on the first and second formants. males=subset(x, (sex=="m" | sex=="b") & (vow=="iy" | vow=="ih")) females=subset(x, (sex=="w" | sex=="g") & (vow=="iy" | vow=="ih")) hist(males$F1, breaks=breaksF1, main="", xlab="first formant (Hz)", ylab="number of tokens",ylim=c(0,42),col="grey80");box() par(new=T) hist(females$F1, breaks=breaksF1, main="", xlab="", ylab="", ylim=c(0,42), density=30) legend("topleft",c("M","F"),density=c(-1,30),fill=c("grey80","black")) savePlot("HillenbrandHighFrontF1bySex",type="jpg") savePlot("HillenbrandHighFrontF1bySex",type="wmf") t.test(males$F1, females$F1, alternative="less") hist(males$F2, breaks=breaksF2, main="", xlab="second formant (Hz)", ylab="number of tokens",ylim=c(0,24),col="grey80");box() par(new=T) hist(females$F2, breaks=breaksF2, main="", xlab="", ylab="", ylim=c(0,24), density=30) legend("topright",c("M","F"),density=c(-1,30),fill=c("grey80","black")) savePlot("HillenbrandHighFrontF2bySex",type="jpg") savePlot("HillenbrandHighFrontF2bySex",type="wmf") t.test(males$F2, females$F2, alternative="less") # Note that here the alternative is "less" because we expect men and boys to # have lower F2 values than women and girls. hist(males$duration, breaks=breaksDur, main="", xlab="duration (ms)", ylab="number of tokens",ylim=c(0,28),col="grey80");box() par(new=T) hist(females$duration, breaks=breaksDur, main="", xlab="", ylab="", ylim=c(0,28), density=30) legend("topright",c("M","F"),density=c(-1,30),fill=c("grey80","black")) savePlot("HillenbrandHighFrontDurationBySex",type="jpg") savePlot("HillenbrandHighFrontDurationBySex",type="wmf") t.test(males$duration, females$duration, alternative="two.sided") # Here the alternative is "two.sided" meaning that we're evaluating differences # in both directions, since we have no a priori reason to expect men to talk # faster than women or vice versa. ######################################################################## # # Part 2 -- Making scatterplots. # # Set up x-axis and y-axis ranges from the values you chose for the # ranges of the histograms. xlimF2=c(3500,1800) ylimF1=c(600,300) # Set up a plotting pretty window for making a scatterplot. windows(width=4, height=4, pointsize=12) par(family="serif", oma=rep(0,4), mar=c(3,3,0.1,0.1), mgp=c(1.8,0.5,0), type="s") # Make a scatterplot showing the vowel space for the vowel in . plot(iy$F2, iy$F1, xlim=xlimF2, ylim=ylimF1, pch="i", xlab="second formant (Hz)", ylab="first formant (Hz)") # Overlay points for the vowel in . points(ih$F2, ih$F1, pch="I", col="grey60") # Draw big red circles around the four points where was # misidentified as by at least one listener. points(missed.iy$F2, missed.iy$F1, col="red", cex=2) # Save the scatterplots. savePlot("HillenbrandHighFrontScatterplot",type="jpg") savePlot("HillenbrandHighFrontScatterplot",type="wmf") # Make four new data frames, so as to separate out the men from the boys # before evaluating the effects of sex. boys=subset(males, sex=="b") men=subset(males, sex="m") # Now make a scatterplot showing the vowel space for the men, using black # character "m" for the data points. plot(men$F2, men$F1, xlim=xlimF2, ylim=ylimF1, pch="m", xlab="second formant (Hz)", ylab="first formant (Hz)") # Overlay points for the women and girls using grey "f" characters, and ... points(females$F2, females$F1, pch="f", col="grey60") # separately for the boys using red "m" characters. points(boys$F2, boys$F1, pch="m", col="red") savePlot("HillenbrandHighFrontScatterplotBySex",type="jpg") savePlot("HillenbrandHighFrontScatterplotBySex",type="wmf")