# qmlDay3_part1.R # # R code and notes for the fourth 1.5 hour session of week-long course # in quantitative methods in linguistics, given at the "mini-institute" # after the LSA Summer Meeting, 14-18 July 2008 # # (c) 2008, Mary E. Beckman, Cynthia Clopper, Shari Speer (Ohio State # University, Department of Linguistics) # # Today, we only need the Hillenbrand et al. (1995) data set. Start by # downloading the data set from: # http://homepages.wmich.edu/~hillenbr/voweldata.html # if you don't have it already on your hard drive, on a memory stick, # or the like. Then set the working directory to the path to where the # file is and read it into R. Remember to adjust the path name for the # machine and operating system that you're working on. For example, on # Mary's laptop PC running Windows XP, the path is: setwd("C:/Lx286/Hillenbrand") # but when she is working on the Mac at the front of the room in Derby 29, # she might put the file on the Desktop, in which case, the path is: setwd('/Users/buckeye/Desktop') hill=read.table("vowdata.dat",skip=30,na.strings="0")[,1:7] names(hill)=c("filename","duration","f0","F1","F2","F3","F4") # Use substr() and factor() to split the hill$filename column into three # different variables, one for each of the component parts. hill$group=factor(substr(as.character(hill$filename),1,1)) hill$subject=factor(substr(as.character(hill$filename),1,3)) hill$vowel=factor(substr(as.character(hill$filename),4,5)) # Review/elaboration of last of points from third day ... # Pick out the subset of rows of the Hillenbrand et al. (1995) corpus that # contains values for the tense and lax high front vowels. iy=subset(hill, vowel=="iy") ih=subset(hill, vowel=="ih") brksF1=seq(300,600,50) brksF4=seq(3000,6000,500) # Set up a plotting window to plot the various histograms and scatterplots # relative to each other in a visually satisfying way. windows(width=8,height=8) par(oma=rep(0,4),mar=c(3,3,0.2,0.2),mgp=c(1.8,0.5,0),mfrow=c(2,2)) # Make overlaid histograms of the F4 values. F4.iy=hist(iy$F4, breaks=brksF4, freq=FALSE, ylim=c(0,0.0012), main="", col="gray40",xlab="Hillenbrand et al. (1995) [i:] and [I] F4 frequency (Hz)") F4.ih=hist(ih$F4, breaks=brksF4, freq=FALSE, density=30, add=T);box() legend("topright",c("heed","hid"),fill=c("gray40","black"),density=c(-1,30)) # Make overlaid histograms of the F1 values. F1.iy=hist(iy$F1, breaks=brksF1, freq=FALSE, ylim=c(0,0.009), main="", col="gray40",xlab="Hillenbrand et al. (1995) [i:] and [I] F1 frequency (Hz)") F1.ih=hist(ih$F1, breaks=brksF1, freq=FALSE, density=30, add=T);box() legend("topright",c("heed","hid"),fill=c("gray40","black"),density=c(-1,30)) # Plot the distribution of F1 values in the tense and lax high front vowels # in the Hillenbrand et al. dataset against the distribution of F4 values. plot(iy$F4, iy$F1, xlim=c(brksF4[1], brksF4[length(brksF4)]), ylim=c(brksF1[1],brksF1[length(brksF1)]), pch=19, col="gray40", xlab="F4 (Hz)",ylab="F1 (Hz)") points(ih$F4, ih$F1) # Plot horizontal barplot to emphasize relationship of F1 histogram to the # values that are plotted along the y dimension of the scatterplot to the left. barplot(F1.iy$intensities,horiz=T,xlim=c(0,0.009),space=0,col="gray40", xlab="Density",ylab="Histogram above, rotated");box();par(new=T) barplot(F1.ih$intensities,horiz=T,xlim=c(0,0.009),space=0,col="black",density=30) legend("bottomright",c("hid","heed"),fill=c("black","gray40"),density=c(30,-1)) # Calculate covariances. cov(iy$F1,iy$F4,use="pairwise.complete.obs") cov(ih$F1,ih$F4,use="pairwise.complete.obs") # Calculate correlations. cor(iy$F1,iy$F4,use="pairwise.complete.obs") cor(ih$F1,ih$F4,use="pairwise.complete.obs") # Test hypothesis that the correlation is significantly different from 0. cor.test(iy$F1,iy$F4) cor.test(ih$F1,ih$F4) # Do the same thing using regression. iy.lm=lm(F1~F4, iy);summary(iy.lm) ih.lm=lm(F1~F4, ih);summary(ih.lm) # Make a new plot that fills the whole screen, and overlay the regressions. par(mfrow=c(1,1)) plot(iy$F4, iy$F1, xlim=c(brksF4[1], brksF4[length(brksF4)]), ylim=c(brksF1[1],brksF1[length(brksF1)]), pch=19, col="gray40", xlab="F4 (Hz)",ylab="F1 (Hz)") abline(iy.lm,col="gray40") # The above abline() command is the same as the following: abline(a=iy.lm$coefficients[1],b=iy.lm$coefficients[2],col="gray40") points(ih$F4, ih$F1);abline(ih.lm,lty=3) legend("topleft",c("hid","heed"),col=c("black","gray40"),pch=c(1,19),lty=c(3,1)) # Redo using multiple regression. Start by picking out the subset of high # vowels and remaking the vowel variable so that it has only two levels. # The summary() command here is to show you the effect of this remaking ... hiVow=subset(hill, vowel=="iy" | vowel=="ih");summary(hiVow$vowel) hiVow$vowel=factor(hiVow$vowel);summary(hiVow$vowel) hiVow.lm=lm(F1 ~ F4 + vowel, hiVow);summary(hiVow.lm) # Remake the scatterplot, this time overlaying lines calculated from the # coefficients of the multiple regression. plot(iy$F4, iy$F1, xlim=c(brksF4[1], brksF4[length(brksF4)]), ylim=c(brksF1[1],brksF1[length(brksF1)]), pch=19, col="gray40", xlab="F4 (Hz)",ylab="F1 (Hz)") abline(a=hiVow.lm$coefficients[1]+hiVow.lm$coefficients[3], b=hiVow.lm$coefficients[2],col="gray40") points(ih$F4, ih$F1) abline(a=hiVow.lm$coefficients[1],b=hiVow.lm$coefficients[2],lty=3) legend("topleft",c("hid","heed"),col=c("black","gray40"),pch=c(1,19),lty=c(3,1)) # Make a new data frame in which we replace the F4 with the median F4 for # the speaker, and then do the plot again. hiVow2=data.frame(talker=iy$subject,F4=NA,iyF1=iy$F1,ihF1=NA) for(i in 1:dim(hiVow2)[1]) { x=subset(hill,subject==as.character(hiVow2[i,"talker"])) hiVow2[i,"F4"]=median(x$F4,na.rm=T) hiVow2[i,"ihF1"]=x$F1[which(x$vowel=="ih")] } plot(hiVow2$F4,hiVow2$iyF1,xlim=c(3000,5300), ylim=c(brksF1[1],brksF1[length(brksF1)]), pch=19, col="gray40", xlab="median F4 for talker (Hz)",ylab="F1 (Hz)") points(hiVow2$F4, hiVow2$ihF1) legend("topleft",c("hid","heed"),col=c("black","gray40"),pch=c(1,19),lty=c(3,1)) # Find the talker with the outlier median F4 value, and perhaps listen # to her files and think about why her F4 values are what they are. hiVow2[which(hiVow2$F4==max(hiVow2$F4),] subset(hill, subject=="w05")$F4 # Here's a kludge-y way to redo so that can make a regression to overlay. v1=hiVow2[,c("talker","F4","iyF1")];names(v1)[3]="F1";v1$vowel="iy" v2=hiVow2[,c("talker","F4","ihF1")];names(v2)[3]="F1";v2$vowel="ih" hiVow3=rbind(v1,v2);hiVow3$vowel=factor(hiVow3$vowel) hiVow3.lm=lm(F1 ~ F4 + vowel, hiVow);summary(hiVow3.lm) abline(a=hiVow3.lm$coefficients[1]+hiVow3.lm$coefficients[3], b=hiVow3.lm$coefficients[2],col="gray40") abline(a=hiVow3.lm$coefficients[1],b=hiVow3.lm$coefficients[2],lty=3) # Another way to do this is to assume vocal tract lengths are more # or less constant within a talker group. Start by making the three # groups that you want to plot and evaluate. hiVow3$group=substr(as.character(hiVow3$talker),1,1) hiVow3$group[which(hiVow3$group=="g" | hiVow3$group=="b")]="c" hiVow3$group=factor(hiVow3$group) hiVow2$group=substr(as.character(hiVow2$talker),1,1) hiVow2$group[which(hiVow2$group=="g" | hiVow2$group=="b")]="c" hiVow2$group=factor(hiVow2$group) # Make histograms. m=subset(hiVow2, group=="m") w=subset(hiVow2, group=="w") ch=subset(hiVow2, group=="c") par(mfrow=c(3,1)) hist(m$iyF1,breaks=brksF1, freq=FALSE, ylim=c(0,0.013),main="", col="gray40",xlab="Adult men's [i:] and [I] F1 frequency (Hz)") plot(function(x)dnorm(x,mean=mean(m$iyF1),sd=sd(m$iyF1)), brksF1[1],brksF1[length(brksF1)],add=T) hist(m$ihF1, breaks=brksF1, freq=FALSE, density=30, add=T);box() plot(function(x)dnorm(x,mean=mean(m$ihF1),sd=sd(m$ihF1)), brksF1[1],brksF1[length(brksF1)],add=T,lty=3) legend("topright",c("heed","hid"),fill=c("gray40","black"),density=c(-1,30)) hist(w$iyF1,breaks=brksF1, freq=FALSE, ylim=c(0,0.013),main="", col="gray40",xlab="Adult women's [i:] and [I] F1 frequency (Hz)") plot(function(x)dnorm(x,mean=mean(w$iyF1),sd=sd(w$iyF1)), brksF1[1],brksF1[length(brksF1)],add=T) hist(w$ihF1, breaks=brksF1, freq=FALSE, density=30, add=T);box() plot(function(x)dnorm(x,mean=mean(w$ihF1),sd=sd(w$ihF1)), brksF1[1],brksF1[length(brksF1)],add=T,lty=3) hist(ch$iyF1,breaks=brksF1, freq=FALSE, ylim=c(0,0.013),main="", col="gray40",xlab="Children's [i:] and [I] F1 frequency (Hz)") plot(function(x)dnorm(x,mean=mean(ch$iyF1),sd=sd(ch$iyF1)), brksF1[1],brksF1[length(brksF1)],add=T) hist(ch$ihF1, breaks=brksF1, freq=FALSE, density=30, add=T);box() plot(function(x)dnorm(x,mean=mean(ch$ihF1),sd=sd(ch$ihF1)), brksF1[1],brksF1[length(brksF1)],add=T,lty=3) # Do the corresponding regression (= ANOVA) with only main effects. summary(lm(F1 ~ vowel + group, hiVow3)) summary(aov(F1 ~ vowel + group, hiVow3)) # Do the corresponding regression (= ANOVA) with an interaction factor. summary(lm(F1 ~ vowel * group, hiVow3)) summary(aov(F1 ~ vowel * group, hiVow3))