# classChartsHodHawed.R # # File of R code for analyzing the and vowels in the # productions by the class, and for relating the perception results # to the formant and durations values, as in the HillenbrandHodHawed.R # code that we used to look at these vowels and the identification in # the Hillenbrand et al. (1995) corpus of 139 Detroiters' productions. # # (c) by the class of Linguistics H286, Autumn 2007. # Start by setting the working directory to where you have put the # perception results file and read in that file. setwd('c:/Lx286/projectParts') ID=read.table("Au2007perception.txt", header=T) # Remind yourself what the variables are in this data frame. names(ID) # [1] "listenerDistinguishes" "listener" "talkerDistinguishes" # [4] "talker" "stimulus" "response" # Then set the working directory to where you have put the file # classVowels.txt, which combines the duration values from the Table # files that Group Awesome provided with the corrected formant values # that Group A gave us (and the corrections that Mary made in the other # vowels besides those in and ). setwd('c:/Lx286/projectParts/termProjectPart3') # Read in the production results data file. x=read.table("classVowels.txt", header=T) # Remind yourself what the variables are in this data frame. names(x) # [1] "speaker" "word" "vowel" "F1" "F2" "duration" # You could add a column for whether the speaker is a distinguisher or not, # like this. x$distinguish="yes" x$distinguish[x$speaker=="Brandon" | x$speaker=="cj" | x$speaker=="Caroline" | x$speaker=="Joe" | x$speaker=="Leo" | x$speaker=="Mike" | x$speaker=="Milo"]="no" # Extract the subset of rows for the vowel and the vowel. aw=subset(x, word=="hawed") ah=subset(x, word=="hod") # Add two columns to each of these data frames for the percent correct # identification and the percent mis-identified as the other vowel. aw$ID=NA ah$ID=NA aw$misID=NA ah$misID=NA for (i in 1:dim(aw)[1]) { temp=subset(ID, talker==as.character(aw$speaker[i]) & stimulus=="hawed") aw$ID[i]=round(sum(temp$response=="hawed")*5) aw$misID[i]=round(sum(temp$response=="hod")*5) temp=subset(ID, talker==as.character(aw$speaker[i]) & stimulus=="hod") ah$ID[i]=round(sum(temp$response=="hod")*5) ah$misID[i]=round(sum(temp$response=="hawed")*5) } # In set of three commands in the loop, you're picking out the relevant rows # of the ID data frame for that speaker and that target stimulus that the # speaker produced, and then counting the number of rows where the listener # identified the word correctly (or mis-identified it as the other word), and # then multiplying by 5 (which is equivalent to dividing by 20 listeners # to get the proportion of responses that are correct or that are incorrect, # and then multiplying by 100 to convert to a percent). # Add three more columns for the normalized values for the F1, F2, and # the duration, with the normalization algorithm being the percent of # range calculation that Group Awesome used for their second interim report. aw$normF1=NA ah$normF1=NA aw$normF2=NA ah$normF2=NA aw$normDur=NA ah$normDur=NA for (i in 1:dim(aw)[1]) { temp=subset(x, speaker==as.character(aw$speaker[i])) minF1=min(na.omit(temp$F1)) maxF1=max(na.omit(temp$F1)) minF2=min(na.omit(temp$F2)) maxF2=max(na.omit(temp$F2)) minDur=min(na.omit(temp$duration)) maxDur=max(na.omit(temp$duration)) aw$normF1[i]=round(100*(aw$F1[i]-minF1)/(maxF1-minF1)) aw$normF2[i]=round(100*(aw$F2[i]-minF1)/(maxF2-minF2)) aw$normDur[i]=round(100*(aw$duration[i]-minDur)/(maxDur-minDur)) ah$normF1[i]=round(100*(ah$F1[i]-minF1)/(maxF1-minF1)) ah$normF2[i]=round(100*(ah$F2[i]-minF1)/(maxF2-minF2)) ah$normDur[i]=round(100*(ah$duration[i]-minDur)/(maxDur-minDur)) } ######################################################################## # Part 1 -- Making overlaid histograms and doing the t-tests that would # go with each of the histograms. This includes only one bit of sample # code, since you've got a lot of code in the HillenbrandHodHawed.R file. # Set up plotting pretty window for making overlaid histograms. windows(width=4, height=2, pointsize=12) par(family="serif", oma=rep(0,4), mar=c(3,3,0.1,0.1), mgp=c(1.8,0.5,0)) # Figure out what ranges to use for F1, F2, and duration. summary(c(aw$F1,ah$F1)) # Min. 1st Qu. Median Mean 3rd Qu. Max. # 587.0 692.0 762.0 776.9 860.0 1030.0 summary(c(aw$F2,ah$F2)) # Min. 1st Qu. Median Mean 3rd Qu. Max. # 992 1126 1221 1260 1408 1604 summary(c(aw$duration,ah$duration)) # Min. 1st Qu. Median Mean 3rd Qu. Max. # 0.1450 0.2340 0.2655 0.2662 0.2907 0.4010 summary(c(aw$normF1,ah$normF1)) # Min. 1st Qu. Median Mean 3rd Qu. Max. # 64.0 77.0 92.0 87.2 100.0 100.0 summary(c(aw$normF2,ah$normF2)) # Min. 1st Qu. Median Mean 3rd Qu. Max. # 40.00 55.75 71.50 68.95 81.00 95.00 summary(c(aw$normDur,ah$normDur)) # Min. 1st Qu. Median Mean 3rd Qu. Max. # 33.00 65.50 79.00 78.10 90.25 100.00 # Use these values to determine the bins for the different histograms. breaksF1=seq(550,1050,50) breaksF2=seq(950,1650,50) breaksDur=seq(0.1,0.450,0.05) breaksNormF1=seq(60,100,5) breaksNormF2=seq(40,95,5) breaksNormDur=seq(30,100,5) # Make a histogram of the first formant of and draw a box around it. hist(aw$F1, breaks=breaksF1, main="", xlab="first formant (Hz)", ylab="number of tokens",ylim=c(0,7),col="grey80");box() # Tell R to overlay a new plot. par(new=T) # Make a histogram of the first formant of . hist(ah$F1, breaks=breaksF1, main="", xlab="", ylab="", ylim=c(0,7),density=30) # Add a legend key in the top left corner of the graph. legend("topright",c("hawed","hod"),density=c(-1,30),fill=c("grey80","black")) # Add cursors at the two mean values. abline(v=c(mean(na.omit(aw$F1)),mean(na.omit(893))),col=c("grey60","black"),lty=c(1,2)) # Save the plot. savePlot("classHodHawedF1",type="jpg") # Do a paired t-test to see whether the non-zero difference in mean F1 values # between the two vowels for each of the 139 talkers could plausibly be # attributed to chance, given the large overlap in the distributions. t.test(aw$F1, ah$F1, paired=T, alternative="less") hist(aw$normF1, breaks=breaksNormF1, main="", xlab="normalized first formant (%)", ylab="number of tokens",ylim=c(0,12),col="grey80");box() par(new=T) hist(ah$normF1, breaks=breaksNormF1, main="", xlab="", ylab="", ylim=c(0,12),density=30) legend("topleft",c("hawed","hod"),density=c(-1,30),fill=c("grey80","black")) abline(v=c(mean(na.omit(aw$normF1)),mean(na.omit(ah$normF1))),col=c("grey60","black"),lty=c(1,2)) savePlot("classHawedHodNormF1",type="wmf") t.test(aw$normF1, ah$normF1, paired=T, alternative="less") # You can use the code from the HillenbrandHodHawed.R for the other two # pairs of histograms, if this seems a useful test to use. ######################################################################## # Part 2 -- Evaluating the perception patterns. # summary(c(aw$ID,ah$ID)) # Min. 1st Qu. Median Mean 3rd Qu. Max. # 0.0 45.0 62.5 59.5 80.0 95.0 summary(c(aw$misID,ah$misID)) # Min. 1st Qu. Median Mean 3rd Qu. Max. # 0.0 20.0 30.0 35.5 50.0 80.0 breaksID=seq(0,100,10) hist(aw$ID, breaks=breaksID, main="", xlab="correct identification (%)", ylab="number of tokens",ylim=c(0,6),col="grey80");box() par(new=T) hist(ah$ID, breaks=breaksID, main="", xlab="", ylab="", ylim=c(0,6),density=30) legend("topleft",c("hawed","hod"),density=c(-1,30),fill=c("grey80","black")) # Given that these 20 listeners had a choice of 2 different words to choose # from in identifying any token, the probability of choosing the intended # word just by chance is 1/2 = 50%, meaning that 50% of the 20 listeners # should have identified the vowel correctly just by chance. Add a cursor # at this chance ID rate. abline(v=50, lty=1);text(47,3,"chance correct ID", srt=90) # The binomial test says that there is a bit more than 0.01 chance of 16 out # of the 20 identifying the word correctly just by chance, as you can see by # running the following line of code: binom.test(16,20,p=(1/2)) # So let's also put a cursor at 16/20 = 80%, to identify which tokens were # identified correctly at significantly better than chance level, if we say # that about 1 in 100 is significantly better than chance. abline(v=80, lty=2);text(77, 3, "better than chance", srt=90) # Set up a plotting pretty window for making a scatterplot. windows(width=4, height=4, pointsize=12) par(family="serif", oma=rep(0,4), mar=c(3,3,0.1,0.1), mgp=c(1.8,0.5,0), type="s") ######################################################################## # Part 3 -- Making scatterplots of F1 against F2 and duration against F2, # if you want to see if pairs of the values together are more predictive. # Set up x-axis and y-axis ranges from the values you chose for the # ranges of the histograms. xlimF2=c(1650,950) ylimF1=c(1050,550) ylimDur=c(0.1,0.450) xlimNormF2=c(95,40) ylimNormF1=c(100,60) ylimNormDur=c(30,100) # Set up data frames for subsets of the aw and ah that correspond to # speakers who do not distinguish. awNon=subset(aw, distinguish=="no") ahNon=subset(ah, distinguish=="no") # Set up a plotting pretty window for making a scatterplot. windows(width=4, height=4, pointsize=12) par(family="serif", oma=rep(0,4), mar=c(3,3,0.1,0.1), mgp=c(1.8,0.5,0), type="s") # Make a scatterplot showing the vowel space for the vowel in . plot(aw$F2, aw$F1, xlim=xlimF2, ylim=ylimF1, pch=19, col="grey60", ylab="first formant (Hz)", xlab="second formant (Hz)") # Overlay points for the vowel in . points(ah$F2, ah$F1, pch=1) legend("bottomright", c("hawed ","hod "), pch=c(19,1), col=c("grey60","black")) # Draw red circles around the points for the nondistinguishers. points(awNon$F2, awNon$F1, pch=1, col="red", cex=1.5) # Draw red squares around the points where was misidentified # as by at least 6 listeners. points(ahNon$F2, ahNon$F1, pch=22, col="red", cex=1.5) # Save the figure to a file. savePlot("classHodHawedF1F2", type="jpg") # Do the same thing for the normalized vowel space. plot(aw$normF2, aw$normF1, xlim=xlimNormF2, ylim=ylimNormF1, pch=19, col="grey60", ylab="first formant (Hz)", xlab="second formant (Hz)") # Overlay points for the vowel in . points(ah$normF2, ah$normF1, pch=1) legend("topright", c("hawed ","hod "), pch=c(19,1), col=c("grey60","black")) # Draw red circles around the points for the nondistinguishers. points(awNon$normF2, awNon$normF1, pch=1, col="red", cex=1.5) # Draw red squares around the points where was misidentified # as by at least 6 listeners. points(ahNon$normF2, ahNon$normF1, pch=22, col="red", cex=1.5) # Save the figure to a file. savePlot("classHodHawedNormF1F2", type="jpg") ######################################################################## # Part 4 -- Since the percent correct ID varies continuously, you might # look at the relationship by making scatterplots of perception results # against production differences. To do this, start by setting up a new # data frame that calculates differences and also takes a mean correct ID. talkers=data.frame(talker=aw$speaker, F2=ah$F2-aw$F2, F1=ah$F1-aw$F1, normF2=ah$normF2-aw$normF2, normF1=ah$normF1-aw$normF1, duration=aw$duration-ah$duration, normDur=aw$normDur-ah$normDur, ID=((aw$ID+ah$ID)/2), misID=((aw$misID+ah$misID)/2)) summary(talkers[,c(2:9)]) # F2 F1 normF2 normF1 # Min. :-110.0 Min. :-42.0 Min. :-5.0 Min. :-10.00 # 1st Qu.: 48.5 1st Qu.: 26.5 1st Qu.: 4.0 1st Qu.: 4.75 # Median : 79.5 Median : 48.5 Median : 6.0 Median : 12.50 # Mean : 118.2 Mean : 69.9 Mean : 8.4 Mean : 13.40 # 3rd Qu.: 172.8 3rd Qu.:104.3 3rd Qu.:11.0 3rd Qu.: 20.75 # Max. : 365.0 Max. :212.0 Max. :30.0 Max. : 36.00 # duration normDur ID misID # Min. :-0.00400 Min. :-4.00 Min. : 0.00 Min. : 0.00 # 1st Qu.: 0.00575 1st Qu.: 6.75 1st Qu.:57.50 1st Qu.:34.38 # Median : 0.02300 Median :14.50 Median :61.25 Median :37.50 # Mean : 0.02645 Mean :18.90 Mean :59.50 Mean :35.50 # 3rd Qu.: 0.03825 3rd Qu.:26.75 3rd Qu.:65.00 3rd Qu.:42.50 # Max. : 0.08900 Max. :67.00 Max. :80.00 Max. :47.50 F1diff=c(-42,212) F2diff=c(-110,365) normF1diff=c(-10,36) normF2diff=c(-5,30) durDiff=c(-0.004,0.089) normDurDiff=c(-4,67) ID=c(0,80) # Make scatterplots showing the relationships between F2 difference and # percent correct identification talker by talker, and so on. plot(talkers$F2, talkers$ID, xlim=F2diff, ylim=ID, pch=19, xlab="difference in F2 (Hz)", ylab="percent correctly identified") abline(lm(ID ~ F2, data=talkers), lty=2) summary(lm(ID ~ F2, data=talkers)) plot(talkers$F1, talkers$ID, xlim=F1diff, ylim=ID, pch=19, xlab="difference in F1 (Hz)", ylab="percent correctly identified") abline(lm(ID ~ F1, data=talkers), lty=2) summary(lm(ID~ F1, data=talkers)) # and so on, following the lead in the HillenbrandHodHawed.R file. ######################################################################## # Part 5 -- Here is code for adding a Euclidean distance in the raw Hz # vowel space and also in the range-normalized vowel space, if you think # the difference in the two formants together might be more predictive # than the difference in either one alone. talkers$Euclid=((talkers$F2^2)+(talkers$F1^2))^0.5 summary(talkers$Euclid) # Min. 1st Qu. Median Mean 3rd Qu. Max. # 20.00 61.68 107.40 152.70 195.70 422.10 plot(talkers$Euclid, talkers$ID, xlim=c(20,422), ylim=ID, pch=19, xlab="Euclidean distance in vowel space (Hz)", ylab="percent correctly identified") abline(lm(ID ~ Euclid, data=talkers), lty=2) summary(lm(ID ~ Euclid, data=talkers)) talkers$normEuclid=((talkers$normF2^2)+(talkers$normF1^2))^0.5 summary(talkers$normEuclid) # Min. 1st Qu. Median Mean 3rd Qu. Max. # 1.414 8.744 14.560 17.460 23.500 46.860 plot(talkers$normEuclid, talkers$ID, xlim=c(1,47), ylim=ID, pch=19, xlab="Euclidean distance in normalized vowel space (%)", ylab="percent correctly identified") abline(lm(ID ~ normEuclid, data=talkers), lty=2) summary(lm(ID ~ normEuclid, data=talkers))