########################################################################
# Nettle1995.R
#
# From "Notes on Probability and Statistics for Analyzing the
# Sounds of Languages" -- a companion textbook to Peter Ladefoged's
# "Vowels and Consonants: An Introduction to the Sounds of Languages"
# 2nd edition (Blackwell, 2005).
#
# (c) 2007, Grant McGuire, Fangfang Li, Eunjong Kong, & Mary E. Beckman
#           Department of Linguistics, Ohio State University
#
# R code for analyzing the data in Table 1 on p. 362 of Nettle (1995).
#
# The data for this analysis problem are from:
#   Daniel Nettle (1995). Segmental inventory size, word length, and
#   communicative efficiency. Linguistics 33, 359-367.
#
# Note that Nettle based his counts on the descriptions of the segments
# in the dictionary that he consulted.  Since the tones of Thai are
# indicated in the orthography in large part by special "consonant"
# symbols that are prefixed before the syllable-initial consonant, and
# there are two such symbols (for "high" and "mid" class tones) which
# contrast with no extra symbol (for "low" class tones), Nettle counted
# 21 consonants * 3 tone classes = 66 "consonants" plus 9 vowels = 71
# segments, whereas Maddieson & Precoda counted 21 consonants + 9 vowels
# = 30 segments.  We have "corrected" the count to the one given in
# Maddieson & Precoda's UPSID data base plus 9 more, to count long and
# short vowels separately, since that is how Nettle must be counting the
# vowels of Nahuatl in order to get the count that he does.
#
# NOTE(review): the arithmetic quoted above for Nettle's Thai count does
# not add up as written (21 * 3 = 63, and 63 + 9 = 72) -- confirm the
# intended figures against Nettle (1995), Table 1.
#
########################################################################
# Part 1 -- Analyzing the data from Table 1 on p. 362 in Nettle (1995).
#
# Download the file Nettle1995.txt from the course web page, and then
# set the working directory to the folder where you have put the file.
# (Edit the path below so that it points to that folder on your machine.)
setwd("C:/Lx286/dataAnalysisReports/reportNo5")

# Read in the data from the file you downloaded.
# NOTE(review): the download instructions earlier in this script name the
# file "Nettle1995.txt", but the file read here is
# "correctedNettle1995.txt" -- confirm which name the course page uses.
x <- read.table("correctedNettle1995.txt", header = TRUE)
names(x)
# [1] "Language"    "NoSegs"      "MeanWordLen" "Dictionary"  "Population"
# [6] "withTone"    "noVows"

# The first four columns are from Nettle (1995), Table 1, on p. 362, the
# 5th column of the table is the estimated number of speakers, as listed
# in the Ethnologue database at http://www.ethnologue.com/web.asp
# The 6th column is like column 2, except that we have expanded the
# numbers for Thai and Mandarin by counting all of the vowels in syllables
# with distinct tones as different vowels.  That is, for Thai we have
# multiplied 9 (short) vowels by 4 tones and 9 (long) vowels by 5 tones,
# and added these two to the 21 consonants to get 9*4+9*5+21=102 segments
# altogether.  For Mandarin, similarly, we have multiplied the number of
# vowels listed in UPSID by the number of tones and added that to the
# number of consonants.
# The last column is the number of vowels.  These numbers are taken from
# UPSID for Hawaiian, Turkish, German, and Georgian, and from other
# published sources for Thai, Italian, Hindi, !Xu, and Mandarin.  Note
# that we are counting the number of vowels when vowels with different
# tones are deemed to be different vowels.

# Regress the mean word length against the number of segments.
x.lm1 <- lm(MeanWordLen ~ NoSegs, data = x)

# Have R print a summary of the results.
summary(x.lm1)
# Residuals:
#     Min      1Q  Median      3Q     Max
# -2.6030 -0.5072  0.1900  0.4440  1.9032
#
# Coefficients:
#             Estimate Std. Error t value Pr(>|t|)
# (Intercept)  7.55411    0.75716   9.977 8.64e-06 ***
# NoSegs      -0.03336    0.01553  -2.148    0.064 .
# ---
# Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#
# Residual standard error: 1.332 on 8 degrees of freedom
# Multiple R-Squared: 0.3657,     Adjusted R-squared: 0.2864
# F-statistic: 4.613 on 1 and 8 DF,  p-value: 0.064

# Set up a pretty plotting window for making a scatterplot.
# windows() opens a new on-screen graphics device (Windows builds of R).
windows(width = 4, height = 4, pointsize = 12)
# pty = "s" asks for a square plotting region.  (The script previously
# passed type = "s", which is not a graphical parameter of par().)
par(family = "serif", oma = rep(0, 4), mar = c(3, 3, 0.1, 0.1),
    mgp = c(1.8, 0.5, 0), pty = "s")

# Make a scatterplot and label the data points.
plot(x$NoSegs, x$MeanWordLen, xlim = c(0, 120), ylim = c(3, 9), pch = 19,
     xlab = "number of segments",
     ylab = "mean word length (number of segments)")
# adj = c(1, 0) puts each label to the upper left of its point.
text(x$NoSegs, x$MeanWordLen, x$Language, adj = c(1, 0))

# Add the regression line from the model fitted above.
abline(x.lm1)

# Save the plot in a file to embed into your report.
savePlot("Nettle1995plot", type = "jpg")

########################################################################
# Part 2 -- Seeing the effect of factoring in tone contrasts for the two
#           languages with lexical tone.
#
# Do the same sequence of commands to make a scatterplot and apply a
# regression model, but this time using the number of segments in the
# column "withTone" as the independent variable.
x.lm2 <- lm(MeanWordLen ~ withTone, data = x)
summary(x.lm2)
# Residuals:
#     Min      1Q  Median      3Q     Max
# -0.9509 -0.5571 -0.2236  0.4874  1.4448
#
# Coefficients:
#              Estimate Std. Error t value Pr(>|t|)
# (Intercept)  8.170675   0.468530  17.439 1.19e-07 ***
# withTone    -0.040239   0.007985  -5.039  0.00100 **
# ---
# Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#
# Residual standard error: 0.8189 on 8 degrees of freedom
# Multiple R-Squared: 0.7604,     Adjusted R-squared: 0.7305
# F-statistic: 25.39 on 1 and 8 DF,  p-value: 0.001003

# This plot() call redraws the current window, replacing the Part 1 plot.
plot(x$withTone, x$MeanWordLen, xlim = c(0, 120), ylim = c(3, 9), pch = 19,
     xlab = "number of segments, counting tone contrasts",
     ylab = "mean word length (number of segments)")
text(x$withTone, x$MeanWordLen, x$Language, adj = c(1, 0))

# Add the regression line from the model fitted above.
abline(x.lm2)

# Save the plot in a file to embed into your report.
savePlot("Nettle1995withTone", type = "jpg")

########################################################################
# Part 3 -- Modeling the relationship if there were minimum redundancy
#           in the system, after adjusting for Peter Ladefoged's implicit
#           claim about the constraints from syllable structure on p. 4
#           of his book.

# Calculate the number of consonants by subtracting the number of
# vowels from the number of segments.  The "noVows" column counts vowels
# with different tones as different vowels, so the matching segment count
# is the one in "withTone" (the count that includes tone contrasts).
x$noCons <- x$withTone - x$noVows

# Add a placeholder variable for the mean length of words as estimated
# by the model that will be described in the next comment.
x$meanWordModeled <- 0

# Assume a lexicon of 45,000 words, as in Nagy & Anderson's (1984)
# estimate of the number of words an American high school senior knows.
# Calculate average length of words one would get if each language were
# maximally efficient in using segments to get the shortest possible
# words, given the following constraints: (1) the only possible syllable
# types are V and CV and (2) V can only occur at the beginning of a
# word, so that there are no V.V sequences.  That is, the language
# should use up the shortest words first, before going on to use the
# next shortest template.
noWords <- 45000

# Loop through the list of languages, filling in the meanWordModeled
# estimate after calculating the lengths of words in a 45,000 word
# lexicon that follows the above constraints.
#
# Under those constraints there is exactly one word template per length:
#   1 = V, 2 = CV, 3 = VCV, 4 = CVCV, 5 = VCVCV, 6 = CVCVCV, ...
# so a template of length len offers (C*V)^(len %/% 2) word shapes,
# times V more when len is odd (the word-initial bare vowel).  Only the
# number of shapes per template is needed to get the mean, so the
# lexicon is tallied by count rather than built as a vector of lengths.
for (i in seq_len(nrow(x))) {
  # Number of vowels and number of consonants in the current language.
  V <- x$noVows[i]
  C <- x$noCons[i]

  # Without at least one vowel and one consonant the templates can never
  # supply enough words, so flag the language instead of looping forever.
  if (is.na(V) || is.na(C) || V < 1 || C < 1) {
    warning("Cannot model word lengths for ", x$Language[i],
            ": need at least one vowel and one consonant.",
            call. = FALSE)
    x$meanWordModeled[i] <- NA
    next
  }

  remaining <- noWords  # words still to be placed in the lexicon
  totalSegs <- 0        # summed length of the words placed so far
  available <- 0        # word shapes offered by the templates visited
  len <- 0              # length (in segments) of the current template

  # Use up each template, shortest first, until the lexicon is full.
  while (remaining > 0) {
    len <- len + 1
    nShapes <- (C * V)^(len %/% 2)
    if (len %% 2 == 1) {
      nShapes <- V * nShapes
    }
    used <- min(nShapes, remaining)
    totalSegs <- totalSegs + len * used
    remaining <- remaining - used
    available <- available + nShapes
  }

  # Report how many word shapes exist up to the longest template needed.
  print(paste(x$Language[i], available))

  # Mean length of the noWords shortest possible words.
  x$meanWordModeled[i] <- totalSegs / noWords
}

# Add the model points and a regression line for them.
points(x$withTone, x$meanWordModeled, pch = 20, col = "grey40")
abline(lm(meanWordModeled ~ withTone, data = x), col = "grey40")
legend("topright", "model", pch = 20, col = "grey40", lty = 1)
savePlot("Nettle1995withToneWithModel", type = "jpg")