# classVowelsHistograms.R
#
# Script for reading in everyone's Table file for the term project
# on vowel spaces, part 2, and then doing things like making a histogram
# of all the F1 and F2 values, and doing a t-test to see if they're
# different.
#
# (c) by the class of Linguistics H286, Autumn 2007.

# Start by setting directory to where the data are and figuring out what
# Table files are there and reading them into one big data frame.
setwd('c:/Lx286/projectParts/termProjectPart2')

# dir() lists the files in the directory; grep() keeps the names that
# contain the literal text ".Table".  fixed = TRUE makes the "." a real
# dot rather than the regular-expression wildcard "any character".
filenames <- grep(".Table", dir(), value = TRUE, fixed = TRUE)
if (length(filenames) == 0) {
  stop("No .Table files found in ", getwd(), call. = FALSE)
}

# Read every file the same way: tab-separated with a header row, "?"
# treated as a missing value, strings kept as character (not factor), and
# only the five columns used below retained, so that all the pieces line
# up when they are stacked.  Stacking once with do.call(rbind, ...) also
# works when there is only a single file.
#
# This is what the first file's rows should look like:
#    speaker  word vowel   F1   F2
# 1    Alexa  heed     i  384 2693
# 2    Alexa   hid     I  526 2364
# 3    Alexa hayed     e  540 2555
# 4    Alexa  head     E  752 2075
# 5    Alexa   had     a  876 2094
# 6    Alexa   hod     A 1007 1464
# 7    Alexa   HUD     v  695 1587
# 8    Alexa hawed     O  803 1223
# 9    Alexa  hoed     o  542 1199
# 10   Alexa  whod     u  387 1264
formants <- do.call(
  rbind,
  lapply(filenames, function(path) {
    one_speaker <- read.table(path, sep = "\t", header = TRUE,
                              na.strings = "?", as.is = TRUE)
    one_speaker[, c("speaker", "word", "vowel", "F1", "F2")]
  })
)

# Make histograms of the F1 values in the vowels [i] in "heed" versus
# [I] in "hid", as an example of how to make overlaid histograms.

# Start by setting up a pretty plotting window for the two histograms.
# NOTE(review): windows() is only available in R on Windows; on other
# platforms dev.new() would be the replacement.
windows(width = 4, height = 2, pointsize = 12)
par(family = "serif", oma = rep(0, 4), mar = c(3, 3, 0.1, 0.1),
    mgp = c(1.8, 0.5, 0))

# Pick out the rows that are for the two vowels, and figure out the
# limits for the F1 and F2 histograms.
iy <- subset(formants, vowel == "i")
ih <- subset(formants, vowel == "I")

# Figure out what the first and last bins should be in the plot.
# Smallest and largest F1 for each vowel, to choose the histogram bins.
# na.rm = TRUE skips any measurement that is missing (NA).
min(ih$F1, na.rm = TRUE)
# [1] 415.77
max(ih$F1, na.rm = TRUE)
# [1] 669
min(iy$F1, na.rm = TRUE)
# [1] 271
max(iy$F1, na.rm = TRUE)
# [1] 2095.405

# Hmmm, something is very wrong here.  That last F1 value is way too
# high.  That's what we would expect the F2 to be.  So let's check it.
# Start by figuring out whose vowel that is.  which.max() picks the row
# holding the largest F1 directly, so we do not have to match the value
# against its rounded printed form.
iy[which.max(iy$F1), ]
#    speaker word vowel       F1       F2
# 12 Brandon heed     i 2095.405 3000.205

# We will need to remeasure, and correct this mistracking.  But for now,
# let's just remove Brandon's [i] from the plot.
iy <- iy[iy$speaker != "Brandon", ]
max(iy$F1, na.rm = TRUE)
# [1] 743
# This looks more like it ...

# Now make the overlaid histograms.  The first call sets up the axes and
# draws the [i] values in pink; the second call uses add = TRUE to draw
# the [I] values as hatched bars on top of the same axes.
hist(iy$F1, breaks = seq(250, 750, 50), main = "",
     xlab = "first formant (Hz)", ylab = "number of tokens",
     ylim = c(0, 10), col = "pink")
box()
hist(ih$F1, breaks = seq(250, 750, 50), density = 30, add = TRUE)

# You can see that there is very little overlap and most of the [i] values
# are below the [I] values, as expected.  But let's do a t-test anyway.
t.test(iy$F1, ih$F1, alternative = "less")
#         Welch Two Sample t-test
#
# t = -4.4884, df = 29.024, p-value = 5.234e-05
# alternative hypothesis: true difference in means is less than 0
# 95 percent confidence interval:
#       -Inf -86.23508
# sample estimates:
# mean of x mean of y
#  369.8883  508.6528
#
# These results say that the mean F1 for [i] is 370, more than 100 Hz
# less than 508, which is the mean F1 for [I], and there is only about a
# 0.00005234 chance that a difference this large could have come about by
# chance if the two sets of F1 values were from the same population.

# Here's the same thing for the F1 in "hawed" versus "hod".
O <- subset(formants, word == "hawed")
A <- subset(formants, word == "hod")

# Figure out what the first and last bins should be in the plot.
min(O$F1, na.rm = TRUE)
# [1] 587
max(O$F1, na.rm = TRUE)
# [1] 1600.5
min(A$F1, na.rm = TRUE)
# [1] 634.69
max(A$F1, na.rm = TRUE)
# [1] 1029.819

# Hmmm, that 1600.5 seems awfully high.  Again, let's see whose vowel
# that is.
# Show the row of O with the largest F1.  which.max() finds it directly,
# without testing a floating-point value for exact equality.
O[which.max(O$F1), ]
#    speaker  word vowel     F1    F2
# 87      cj hawed     O 1600.5 564.8

# Again, we should check it, and listen to see why, but for now, let's
# just take CJ out of the O data frame, to get on with the graph.
O <- O[O$speaker != "cj", ]
max(O$F1, na.rm = TRUE)
# [1] 877

# Overlaid histograms: the first call sets up the axes and draws the F1
# values in A in pink; the second call uses add = TRUE to draw the F1
# values in O as hatched bars on top of the same axes.
hist(A$F1, breaks = seq(550, 1050, 50), main = "",
     xlab = "first formant (Hz)", ylab = "number of tokens",
     ylim = c(0, 7), col = "pink")
box()
hist(O$F1, breaks = seq(550, 1050, 50), density = 30, add = TRUE)

# Now, there is a lot more overlap, so let's check with a t-test.
t.test(O$F1, A$F1, alternative = "less")