# qmlDay2.R
#
# R code and notes for second 1.5 hour session of week-long course in
# quantitative methods in linguistics, given at the "mini-institute"
# after the LSA Summer Meeting, 14-18 July 2008
#
# (c) 2008, Mary E. Beckman, Cynthia Clopper, Shari Speer (Ohio State
# University, Department of Linguistics)
#
# Again, start by setting the working directory to the path for each of the
# two data sets and reading them into R.
#
# stringsAsFactors = TRUE is spelled out because R >= 4.0.0 no longer
# converts text columns to factors by default; with it, categorical columns
# such as bres$real and bres$mod are factors, as they were when this script
# was written.
setwd("C:/Lx795Q/quantitativeKAJ/CD")
bres <- read.table("BresDative.txt", header = TRUE, stringsAsFactors = TRUE)

# For the Hillenbrand file: skip the first 30 lines, read cells containing
# "0" as missing (NA), and keep only the first seven columns.
setwd("C:/Lx286/Hillenbrand")
hill <- read.table("vowdata.dat", skip = 30, na.strings = "0")[, 1:7]
names(hill) <- c("filename", "dur", "f0", "F1", "F2", "F3", "F4")

# Note ways to refer to first three cells of first column of Hillenbrand.
hill[1:3, 1]
hill[1:3, "filename"]
hill$filename[1:3]

# The filename variable in the Hillenbrand et al. dataset is composed
# as follows:
#
#   character 1:     m=man, w=woman, b=boy, g=girl
#   characters 2-3:  talker number
#   characters 4-5:  vowel (ae="had", ah="hod", aw="hawed", eh="head",
#                    er="heard", ei="haid", ih="hid", iy="heed",
#                    oa=/o/ as in "boat", oo="hood", uh="hud", uw="who'd")
#
# Use substr() and factor() to split the hill$filename column into three
# different variables, one for each of the component parts.
# (subject keeps characters 1-3, i.e. the group letter plus talker number.)
hill$group <- factor(substr(as.character(hill$filename), 1, 1))
hill$subject <- factor(substr(as.character(hill$filename), 1, 3))
hill$vowel <- factor(substr(as.character(hill$filename), 4, 5))

# Review of last points from first day ...
# When is a variable nominal versus ordinal versus ratio?  Relate to
# different types of values in R data frame.
summary(as.numeric(substr(as.character(hill$filename), 2, 3)))
summary(as.factor(substr(as.character(hill$filename), 2, 3)))
summary(hill$group)
summary(factor(hill$group, levels = c("m", "w", "b", "g")))
# For basis of ordering here, cf. the group medians of F1.  na.rm = TRUE
# keeps a group from dropping out of the sorted result if any of its F1
# cells were read as NA.
sort(tapply(hill$F1, list(hill$group), median, na.rm = TRUE))

# Do a summary of bres$real to see how many "PP" and how many "NP"
# realizations there were.
# Also, divide by the row dimension of the data frame, to convert to
# proportions.
# factor() makes the tally work whether the column was read in as a factor
# or as plain character strings.
summary(factor(bres$real))
summary(factor(bres$real)) / nrow(bres)

# Make barplots of distribution of counts and proportions of "PP" and
# "NP" realizations in the Bresnan et al. corpus.
barplot(table(bres$real),
        xlab = "realization of recipient",
        ylab = "raw count -- i.e., frequency")
barplot(summary(factor(bres$real)) / nrow(bres), ylim = c(0, 1),
        xlab = "realization of recipient",
        ylab = "proportion -- i.e., density")

# Now do the same for the two different subsets of the corpus, using the
# table() function.
x <- table(bres$real, bres$mod)
barplot(x, beside = TRUE, legend.text = rownames(x),
        xlab = "corpus", ylab = "raw count -- i.e., frequency")
# Column-wise proportions: each column (corpus subset) is divided by its own
# total.  prop.table() keeps the row and column names of x, so the bars are
# labelled without having to set names by hand.
y <- prop.table(x, margin = 2)
barplot(y, beside = TRUE, legend.text = rownames(y),
        xlab = "corpus", ylab = "proportion -- i.e., probability")

# Make histogram of F4 values in the Hillenbrand et al. corpus.
x <- hist(hill$F4,
          xlab = "Hillenbrand et al. (1995) vowels, F4 frequency (Hz)",
          main = "")
x$breaks
x$counts
x$mids

# With freq = FALSE the bars show densities; density times bin width,
# summed over the bins, is the total area under the histogram.  The bin
# widths are taken from the breaks rather than typed in by hand.
x <- hist(hill$F4, freq = FALSE,
          xlab = "Hillenbrand et al. (1995) vowels, F4 frequency (Hz)",
          main = "")
sum(x$density * diff(x$breaks))
x <- hist(hill$F4, breaks = seq(2500, 6000, 500), freq = FALSE,
          xlab = "Hillenbrand et al. (1995) vowels, F4 frequency (Hz)",
          main = "")
sum(x$density * diff(x$breaks))

# Compare normal distribution (see formula Johnson, p. 12) that has
# the same mean and standard deviation.
mu <- mean(hill$F4, na.rm = TRUE)
sig <- sd(hill$F4, na.rm = TRUE)
# Inside curve(), x is the plotting variable, not the histogram object above.
curve(dnorm(x, mean = mu, sd = sig), from = 2500, to = 6000, add = TRUE)

# Compare distributions of men, women, and children.
# All three histograms share the same breaks so that the bars line up.
brks <- seq(2500, 6000, 500)

# Men: solid blue bars; this call sets up the axes for the overlays below.
x <- subset(hill, group == "m")
mu <- mean(x$F4, na.rm = TRUE)
sig <- sd(x$F4, na.rm = TRUE)
hist(x$F4, breaks = brks, freq = FALSE, col = "blue", ylim = c(0, 0.0015),
     xlab = "Hillenbrand et al. (1995) vowels, F4 frequency (Hz)",
     main = "")
curve(dnorm(x, mean = mu, sd = sig), from = 2500, to = 6000,
      add = TRUE, col = "blue")

# Women: red shaded bars drawn on top of the existing plot.
x <- subset(hill, group == "w")
mu <- mean(x$F4, na.rm = TRUE)
sig <- sd(x$F4, na.rm = TRUE)
hist(x$F4, breaks = brks, freq = FALSE, density = 50, col = "red",
     ylim = c(0, 0.0015), add = TRUE, main = "")
curve(dnorm(x, mean = mu, sd = sig), from = 2500, to = 6000,
      add = TRUE, col = "red")

# Children (every group other than "m" and "w"): sparsely shaded bars.
x <- subset(hill, !(group %in% c("m", "w")))
mu <- mean(x$F4, na.rm = TRUE)
sig <- sd(x$F4, na.rm = TRUE)
hist(x$F4, breaks = brks, freq = FALSE, density = 10,
     ylim = c(0, 0.0015), add = TRUE, main = "")
curve(dnorm(x, mean = mu, sd = sig), from = 2500, to = 6000,
      add = TRUE, col = "black")

# Test to see if men have significantly different F4.
t.test(hill$F4[hill$group == "m"],
       hill$F4[hill$group != "m"])

# Test to see if women have significantly lower F4 than children.
t.test(hill$F4[hill$group == "w"],
       hill$F4[!(hill$group %in% c("m", "w"))],
       alternative = "less")