###########################################################################
# vowelsInMHR.R
#
# From "Notes on Probability and Statistics for Analyzing the
# Sounds of Languages" -- a companion textbook to Peter Ladefoged's
# "Vowels and Consonants: An Introduction to the Sounds of Languages"
# 2nd edition (Blackwell, 2005).
#
# (c) 2007, Grant McGuire, Fangfang Li, Eunjong Kong, & Mary E. Beckman
#     Department of Linguistics, Ohio State University
#
# R code for evaluating the distribution of vowels in different types of
# syllables in the wordlist derived from the Moe, Hopkins, and Rush (1982)
# corpus.  The Moe et al. study is described in the following book, which
# also prints the word list and associated frequencies.
#
#     Alden J. Moe, Carol J. Hopkins, & R. Timothy Rush (1982).  The
#     vocabulary of first-grade children.  Springfield, IL: Thomas.
#
# This book should be cited whenever you use the wordlist data file.
#
###########################################################################

# Start by downloading the file Moe.wp from the course web site and reading
# it into R.
#
# The first setwd() call points R at the directory that holds the data file.
# Replace the path in its argument with the path that matches how you have
# organized your own computer.
setwd("C:/children/LexiconAdult/english")

# Moe.wp is read as a tab-separated table whose first line holds the
# column names.
mhr <- read.table(file = "Moe.wp", sep = "\t", header = TRUE)

# Switch the working directory again; everything after this point runs
# relative to the report directory.
setwd("C:/Lx286/dataAnalysisReports/reportNo5")

# List the columns that were read in.
names(mhr)
# [1] "orth" "pron" "freq" "noLetters" "noPhons" "noSyls"

###########################################################################
# Count the vowel type frequencies in stressed syllables versus unstressed
# syllables in the MHR ....
#
# Set up a vector of different vowel types.
vowels <- c(
  "c", "U", "W", "a", "O",
  "e", "u", "o", "&", "E",
  "Y", "i", "I", "x", "R"
)

# Set up a table for the counts, with the vowel types as row names.
# Count every match of `pattern` across all strings in `pron`.
#
# pattern: a regular expression (here a vowel symbol, optionally followed
#          by a stress digit).
# pron:    a character vector of pronunciation strings; NA entries are
#          ignored.
# Returns the total number of non-overlapping matches, summed over `pron`.
count_vowel_tokens <- function(pattern, pron) {
  pron <- pron[!is.na(pron)]
  hits <- gregexpr(pattern, pron)
  # gregexpr() reports "no match" as a single -1, so only positive start
  # positions are real matches.
  sum(vapply(hits, function(m) sum(m > 0L), integer(1)))
}

# One row per vowel type; the three columns hold the number of vowel tokens
# in any syllable, in stressed syllables, and in unstressed syllables.
V.table <- data.frame(
  all = rep(0, length(vowels)),
  stressed = 0,
  unstressed = 0
)
rownames(V.table) <- vowels

# Loop through the rows of the table, counting the syllables for the three
# columns.  Every occurrence of the vowel in every word is counted directly,
# so a word contributes once per token no matter how many times the vowel
# recurs in it (the count is not capped at a fixed number of repetitions).
pron <- as.character(mhr$pron)
for (v in rownames(V.table)) {
  # The vowel symbol anywhere in the pronunciation.
  V.table[v, "all"] <- count_vowel_tokens(v, pron)
  # The vowel symbol immediately followed by 1 or 2.
  V.table[v, "stressed"] <- count_vowel_tokens(paste0(v, "[12]"), pron)
  # The vowel symbol immediately followed by 0.
  V.table[v, "unstressed"] <- count_vowel_tokens(paste0(v, "0"), pron)
}

# Do a Chi-squared test to see if the distribution of vowels is independent
# of the syllable type.
chisq.test(V.table[, c("stressed", "unstressed")])
# Pearson's Chi-squared test
#
# data: V.table[, c("stressed", "unstressed")]
# X-squared = 4419.355, df = 14, p-value < 2.2e-16

# Also, evaluate the distribution overall against a model in which every
# vowel is equally likely to occur in any syllable in the MHR words.  The
# model column is the rounded mean count per vowel type, recycled down all
# rows; chisq.test() then treats the two columns as a
# length(vowels) x 2 contingency table.
V.model <- data.frame(
  all = V.table[, "all"],
  model = round(sum(V.table$all) / length(vowels))
)
chisq.test(V.model)
# Pearson's Chi-squared test
#
# data: V.model
# X-squared = 2902.707, df = 14, p-value < 2.2e-16

# Make a barplot for each of the columns for both of the Chi-squared tests.
# Open a new graphics window and lay out four panels side by side in one
# row, with a serif font and an outer bottom margin for the shared caption.
windows(height = 4, width = 6, pointsize = 12)
par(
  family = "serif",
  oma = c(1, 0, 0, 0),
  mar = c(3, 2.2, 2, 0.1),
  mgp = c(1.8, 0.5, 0),
  mfrow = c(1, 4)
)

# Display labels for the bars, one per vowel type.
vowels2 <- c(
  "oj", "U", "aw", "a", "O",
  "e", "u", "o", "ae", "E",
  "aj", "i", "I", "^", "r"
)

# Shared x-axis range so that the four panels are directly comparable.
xlim <- c(0, 2200)

# The four panels, left to right: the counts to draw and the panel title.
panels <- list(
  list(counts = V.model[, "model"], title = "model syllables"),
  list(counts = V.table[, "all"], title = "all syllables"),
  list(counts = V.table[, "stressed"], title = "stressed syllables"),
  list(counts = V.table[, "unstressed"], title = "unstressed syllables")
)

for (panel in panels) {
  # Counts and labels are both reversed so that the first vowel type ends
  # up as the topmost bar of the horizontal chart.
  barplot(rev(panel$counts), horiz = TRUE, xlim = xlim)
  box()
  mtext(panel$title, line = 0.5)
  # Right-aligned labels just left of the axis; xpd = TRUE lets them be
  # drawn outside the plot region.  The y positions are hand-set, one per
  # bar at a spacing of 1.2.
  text(
    rep(-1, length(vowels2)),
    seq(0.8, 18, 1.2),
    rev(vowels2),
    xpd = TRUE,
    adj = c(1, 0.5),
    cex = 1.5
  )
}

# Caption shared by all four panels, written into the outer bottom margin.
mtext(
  "type counts in Moe, Hopkins, and Rush (1982) words",
  side = 1,
  line = -1,
  outer = TRUE
)

# Save the contents of the graphics window as a JPEG file.
savePlot("vowelsInMHRsyllables", type = "jpg")