Last login: Wed Jul 16 10:35:54 on ttyp1 Welcome to Darwin! DB029Podium:~ buckeye$ python Python 2.5.2 (r252:60911, Feb 22 2008, 07:57:53) [GCC 4.0.1 (Apple Computer, Inc. build 5363)] on darwin Type "help", "copyright", "credits" or "license" for more information. >>> >>> import nltk >>> import nltk.corpus >>> nltk.corpus.brown.tagged_words()[17] ('evidence', 'NN') >>> DB029Podium:~ buckeye$ python Python 2.5.2 (r252:60911, Feb 22 2008, 07:57:53) [GCC 4.0.1 (Apple Computer, Inc. build 5363)] on darwin Type "help", "copyright", "credits" or "license" for more information. >>> import nltk >>> import nltk.corpus >>> brown = nltk.corpus.brown.words() >>> brown[17] 'evidence' >>> brown[16:19] ['no', 'evidence', "''"] >>> brown[:19] ['The', 'Fulton', 'County', 'Grand', 'Jury', 'said', 'Friday', 'an', 'investigation', 'of', "Atlanta's", 'recent', 'primary', 'election', 'produced', '``', 'no', 'evidence', "''"] >>> brown[-1] '.' >>> brown[-10] 'the' >>> brown[-10:-1] ['the', 'figure', 'inside', 'the', 'coral-colored', 'boucle', 'dress', 'was', 'stupefying'] >>> brown[-10:] ['the', 'figure', 'inside', 'the', 'coral-colored', 'boucle', 'dress', 'was', 'stupefying', '.'] >>> brown[-10:0] [] >>> brown[:10] ['The', 'Fulton', 'County', 'Grand', 'Jury', 'said', 'Friday', 'an', 'investigation', 'of'] >>> brown[0:10] ['The', 'Fulton', 'County', 'Grand', 'Jury', 'said', 'Friday', 'an', 'investigation', 'of'] >>> inaugural = nltk.corpus.inaugural.words() >>> inaugural[0:10] ['Fellow', '-', 'Citizens', 'of', 'the', 'Senate', 'and', 'of', 'the', 'House'] >>> print nltk.corpus.inaugural.words > >>> brown = nltk.corpus.brown.tagged_words() >>> brown[:20] [('The', 'AT'), ('Fulton', 'NP-TL'), ('County', 'NN-TL'), ('Grand', 'JJ-TL'), ('Jury', 'NN-TL'), ('said', 'VBD'), ('Friday', 'NR'), ('an', 'AT'), ('investigation', 'NN'), ('of', 'IN'), ("Atlanta's", 'NP$'), ('recent', 'JJ'), ('primary', 'NN'), ('election', 'NN'), ('produced', 'VBD'), ('``', '``'), ('no', 'AT'), ('evidence', 'NN'), 
("''", "''"), ('that', 'CS')] >>> inaugural = nltk.corpus.inaugural.tagged_words() Traceback (most recent call last): File "<stdin>", line 1, in <module> AttributeError: 'PlaintextCorpusReader' object has no attribute 'tagged_words' >>> print nltk.corpus.inaugural.items() __main__:1: DeprecationWarning: Function _get_items() has been deprecated. Use corpus.files() instead Traceback (most recent call last): File "<stdin>", line 1, in <module> TypeError: 'tuple' object is not callable >>> print nltk.corpus.inaugural.items ('1789-Washington.txt', '1793-Washington.txt', '1797-Adams.txt', '1801-Jefferson.txt', '1805-Jefferson.txt', '1809-Madison.txt', '1813-Madison.txt', '1817-Monroe.txt', '1821-Monroe.txt', '1825-Adams.txt', '1829-Jackson.txt', '1833-Jackson.txt', '1837-VanBuren.txt', '1841-Harrison.txt', '1845-Polk.txt', '1849-Taylor.txt', '1853-Pierce.txt', '1857-Buchanan.txt', '1861-Lincoln.txt', '1865-Lincoln.txt', '1869-Grant.txt', '1873-Grant.txt', '1877-Hayes.txt', '1881-Garfield.txt', '1885-Cleveland.txt', '1889-Harrison.txt', '1893-Cleveland.txt', '1897-McKinley.txt', '1901-McKinley.txt', '1905-Roosevelt.txt', '1909-Taft.txt', '1913-Wilson.txt', '1917-Wilson.txt', '1921-Harding.txt', '1925-Coolidge.txt', '1929-Hoover.txt', '1933-Roosevelt.txt', '1937-Roosevelt.txt', '1941-Roosevelt.txt', '1945-Roosevelt.txt', '1949-Truman.txt', '1953-Eisenhower.txt', '1957-Eisenhower.txt', '1961-Kennedy.txt', '1965-Johnson.txt', '1969-Nixon.txt', '1973-Nixon.txt', '1977-Carter.txt', '1981-Reagan.txt', '1985-Reagan.txt', '1989-Bush.txt', '1993-Clinton.txt', '1997-Clinton.txt', '2001-Bush.txt', '2005-Bush.txt') >>> poppy = nltk.corpus.inaugural.words('1989-Bush.txt') >>> poppy[:10] ['Mr', '.', 'Chief', 'Justice', ',', 'Mr', '.', 'President', ',', 'Vice'] >>> shrub = nltk.corpus.inaugural.words('2005-Bush.txt') >>> shrub[:10] ['Inaugural', 'Addresses', 'of', 'the', 'Presidents', 'of', 'the', 'United', 'States', '.'] >>> for language in nltk.corpus.udhr.items: ... print language ... 
/Library/Frameworks/Python.framework/Versions/2.5/lib/python2.5/site-packages/nltk/corpus/util.py:53: DeprecationWarning: Function _get_items() has been deprecated. Use corpus.files() instead return getattr(self, attr) Abkhaz-Cyrillic+Abkh Abkhaz-UTF8 Achehnese-Latin1 Achuar-Shiwiar-Latin1 Adja-UTF8 Afaan_Oromo_Oromiffa-Latin1 Afrikaans-Latin1 Aguaruna-Latin1 Akuapem_Twi-UTF8 Albanian_Shqip-Latin1 Amahuaca Amahuaca-Latin1 Amarakaeri-Latin1 Amharic-Afenegus6..60375 Amuesha-Yanesha-UTF8 Arabela-Latin1 Arabic_Alarabia-Arabic Armenian-DallakHelv Asante-UTF8 Ashaninca-Latin1 Asheninca-Latin1 Asturian_Bable-Latin1 Aymara-Latin1 Azeri_Azerbaijani_Cyrillic-Az.Times.Cyr.Normal0117 Azeri_Azerbaijani_Latin-Az.Times.Lat0117 Balinese-Latin1 Bambara-UTF8 Baoule-UTF8 Basque_Euskara-Latin1 Batonu_Bariba-UTF8 Belorus_Belaruski-Cyrillic Belorus_Belaruski-UTF8 Bemba-Latin1 Bengali-UTF8 Beti-UTF8 Bhojpuri-Agra Bichelamar-Latin1 Bikol_Bicolano-Latin1 Bora-Latin1 Bosnian_Bosanski-Cyrillic Bosnian_Bosanski-Latin2 Bosnian_Bosanski-UTF8 Breton-Latin1 Bugisnese-Latin1 Bulgarian_Balgarski-Cyrillic Bulgarian_Balgarski-UTF8 Burmese_Myanmar-UTF8 Burmese_Myanmar-WinResearcher Cakchiquel-Latin1 Campa_Pajonalino-Latin1 Candoshi-Shapra-Latin1 Caquinte-Latin1 Cashibo-Cacataibo-Latin1 Cashinahua-Latin1 Catalan-Latin1 Catalan_Catala-Latin1 Cebuano-Latin1 Chamorro-Latin1 Chayahuita-Latin1 Chechewa_Nyanja-Latin1 Chickasaw-Latin1 Chinanteco-Ajitlan-Latin1 Chinanteco-UTF8 Chinese_Mandarin-GB2312 Chinese_Mandarin-HZ Chinese_Mandarin-UTF8 Chuuk_Trukese-Latin1 Cokwe-Latin1 Corsican-Latin1 Croatian_Hrvatski-Latin2 Czech-Latin2 Czech-Latin2-err Czech-UTF8 Czech_Cesky-Latin2 Czech_Cesky-UTF8 Dagaare-UTF8 Dagbani-UTF8 Dangme-UTF8 Danish_Dansk-Latin1 Dendi-UTF8 Ditammari-UTF8 Dutch_Nederlands-Latin1 Edo-Latin1 English-Latin1 Esperanto-T61 Esperanto-UTF8 Estonian_Eesti-Latin1 Ewe_Eve-UTF8 Fante-UTF8 Faroese-Latin1 Farsi_Persian-UTF8 Farsi_Persian-v2-UTF8 Fijian-Latin1 Filipino_Tagalog-Latin1 Finnish_Suomi-Latin1 
Fon-UTF8 French_Francais-Latin1 Frisian-Latin1 Friulian_Friulano-Latin1 Ga-UTF8 Gagauz_Gagauzi-UTF8 Galician_Galego-Latin1 Garifuna_Garifuna-Latin1 German_Deutsch-Latin1 Gonja-UTF8 Greek_Ellinika-Greek Greek_Ellinika-UTF8 Greenlandic_Inuktikut-Latin1 Guarani-Latin1 Guen_Mina-UTF8 Gujarati-UTF8 HaitianCreole_Kreyol-Latin1 HaitianCreole_Popular-Latin1 Hani-Latin1 Hausa_Haoussa-Latin1 Hawaiian-UTF8 Hebrew_Ivrit-Hebrew Hebrew_Ivrit-UTF8 Hiligaynon-Latin1 Hindi-UTF8 Hindi_web-UTF8 Hmong_Miao-Sichuan-Guizhou-Yunnan-Latin1 Hmong_Miao-SouthernEast-Guizhou-Latin1 Hmong_Miao_Northern-East-Guizhou-Latin1 Hrvatski_Croatian-Latin2 Huasteco-Latin1 Huitoto_Murui-Latin1 Hungarian_Magyar-Latin1 Hungarian_Magyar-Latin2 Hungarian_Magyar-UTF8 Hungarian_Magyar-Unicode Ibibio_Efik-Latin1 Icelandic_Yslenska-Latin1 Ido-Latin1 Igbo-UTF8 Iloko_Ilocano-Latin1 Indonesian-Latin1 Interlingua-Latin1 Inuktikut_Greenlandic-Latin1 IrishGaelic_Gaeilge-Latin1 Italian-Latin1 Italian_Italiano-Latin1 Japanese_Nihongo-EUC Japanese_Nihongo-JIS Japanese_Nihongo-SJIS Japanese_Nihongo-UTF8 Javanese-Latin1 Jola-Fogny_Diola-UTF8 Kabye-UTF8 Kannada-UTF8 Kaonde-Latin1 Kapampangan-Latin1 Kasem-UTF8 Kazakh-Cyrillic Kazakh-UTF8 Kiche_Quiche-Latin1 Kicongo-Latin1 Kimbundu_Mbundu-Latin1 Kinyamwezi_Nyamwezi-Latin1 Kinyarwanda-Latin1 Kituba-Latin1 Korean_Hankuko-UTF8 Kpelewo-UTF8 Krio-UTF8 Kurdish-UTF8 Lamnso_Lam-nso-UTF8 Lao-UTF8 Latin_Latina-Latin1 Latin_Latina-v2-Latin1 Latvian-Latin1 Limba-UTF8 Lingala-Latin1 Lithuanian_Lietuviskai-Baltic Lozi-Latin1 Luba-Kasai_Tshiluba-Latin1 Luganda_Ganda-Latin1 Lunda_Chokwe-lunda-Latin1 Luvale-Latin1 Luxembourgish_Letzebuergeusch-Latin1 Macedonian-UTF8 Madurese-Latin1 Magahi-Agra Magahi-UTF8 Makonde-Latin1 Malagasy-Latin1 Malay_BahasaMelayu-Latin1 Maltese-UTF8 Mam-Latin1 Maninka-UTF8 Maori-Latin1 Mapudungun_Mapuzgun-Latin1 Mapudungun_Mapuzgun-UTF8 Marathi-UTF8 Marshallese-Latin1 Matses-Latin1 Mayan_Yucateco-Latin1 Mazahua_Jnatrjo-UTF8 Mazateco-Latin1 Mende-UTF8 
Mikmaq_Micmac-Mikmaq-Latin1 Minangkabau-Latin1 Miskito_Miskito-Latin1 Mixteco-Latin1 Mongolian_Khalkha-Cyrillic Mongolian_Khalkha-UTF8 Moore_More-UTF8 Nahuatl-Latin1 Navaho_Dine-Navajo-Navaho-font Ndebele-Latin1 Nepali-UTF8 Ngangela_Nyemba-Latin1 NigerianPidginEnglish-Latin1 Nomatsiguenga-Latin1 NorthernSotho_Pedi-Sepedi-Latin1 Norwegian-Latin1 Norwegian_Norsk-Bokmal-Latin1 Norwegian_Norsk-Nynorsk-Latin1 Nyanja_Chechewa-Latin1 Nyanja_Chinyanja-Latin1 Nzema-UTF8 OccitanAuvergnat-Latin1 OccitanLanguedocien-Latin1 Oromiffa_AfaanOromo-Latin1 Osetin_Ossetian-UTF8 Oshiwambo_Ndonga-Latin1 Otomi_Nahnu-Latin1 Paez-Latin1 Palauan-Latin1 Peuhl-UTF8 Picard-Latin1 Pipil-Latin1 Polish-Latin2 Polish_Polski-Latin2 Ponapean-Latin1 Portuguese_Portugues-Latin1 Pulaar-UTF8 Punjabi_Panjabi-UTF8 Purhepecha-UTF8 Qechi_Kekchi-Latin1 Quechua-Latin1 Quichua-Latin1 Rarotongan_MaoriCookIslands-Latin1 Rhaeto-Romance_Rumantsch-Latin1 Romani-Latin1 Romani-UTF8 Romanian-Latin2 Romanian_Romana-Latin2 Rukonzo_Konjo-Latin1 Rundi_Kirundi-Latin1 Runyankore-rukiga_Nkore-kiga-Latin1 Russian-Cyrillic Russian-UTF8 Russian_Russky-Cyrillic Russian_Russky-UTF8 Sami_Lappish-UTF8 Sammarinese-Latin1 Samoan-Latin1 Sango_Sangho-Latin1 Sanskrit-UTF8 Saraiki-UTF8 Sardinian-Latin1 ScottishGaelic_GaidhligAlbanach-Latin1 Seereer-UTF8 Serbian_Srpski-Cyrillic Serbian_Srpski-Latin2 Serbian_Srpski-UTF8 Sharanahua-Latin1 Shipibo-Conibo-Latin1 Shona-Latin1 Sinhala-UTF8 Siswati-Latin1 Slovak-Latin2 Slovak_Slovencina-Latin2 Slovenian_Slovenscina-Latin2 SolomonsPidgin_Pijin-Latin1 Somali-Latin1 Soninke_Soninkanxaane-UTF8 Sorbian-Latin2 SouthernSotho_Sotho-Sesotho-Sutu-Sesutu-Latin1 Spanish-Latin1 Spanish_Espanol-Latin1 Sukuma-Latin1 Sundanese-Latin1 Sussu_Soussou-Sosso-Soso-Susu-UTF8 Swaheli-Latin1 Swahili_Kiswahili-Latin1 Swedish_Svenska-Latin1 Tahitian-UTF8 Tamil-UTF8 Tenek_Huasteco-Latin1 Tetum-Latin1 Themne_Temne-UTF8 Tigrinya_Tigrigna-VG2Main Tiv-Latin1 Toba-UTF8 Tojol-abal-Latin1 TokPisin-Latin1 Tonga-Latin1 
Tongan_Tonga-Latin1 Totonaco-Latin1 Trukese_Chuuk-Latin1 Turkish_Turkce-Turkish Turkish_Turkce-UTF8 Tzeltal-Latin1 Tzotzil-Latin1 Uighur_Uyghur-Latin1 Uighur_Uyghur-UTF8 Ukrainian-Cyrillic Ukrainian-UTF8 Umbundu-Latin1 Urarina-Latin1 Uzbek-Latin1 Vietnamese-ALRN-UTF8 Vietnamese-TCVN Vietnamese-UTF8 Vietnamese-VIQR Vietnamese-VPS Vlach-Latin1 Walloon_Wallon-Latin1 Wama-UTF8 Waray-Latin1 Wayuu-Latin1 Welsh_Cymraeg-Latin1 WesternSotho_Tswana-Setswana-Latin1 Wolof-Latin1 Xhosa-Latin1 Yagua-Latin1 Yao-Latin1 Yapese-Latin1 Yoruba-UTF8 Zapoteco-Latin1 Zapoteco-SanLucasQuiavini-Latin1 Zhuang-Latin1 Zulu-Latin1 >>> from nltk.corpus import treebank >>> print treebank.items ('wsj_0001.mrg', 'wsj_0002.mrg', 'wsj_0003.mrg', 'wsj_0004.mrg', 'wsj_0005.mrg', 'wsj_0006.mrg', 'wsj_0007.mrg', 'wsj_0008.mrg', 'wsj_0009.mrg', 'wsj_0010.mrg', 'wsj_0011.mrg', 'wsj_0012.mrg', 'wsj_0013.mrg', 'wsj_0014.mrg', 'wsj_0015.mrg', 'wsj_0016.mrg', 'wsj_0017.mrg', 'wsj_0018.mrg', 'wsj_0019.mrg', 'wsj_0020.mrg', 'wsj_0021.mrg', 'wsj_0022.mrg', 'wsj_0023.mrg', 'wsj_0024.mrg', 'wsj_0025.mrg', 'wsj_0026.mrg', 'wsj_0027.mrg', 'wsj_0028.mrg', 'wsj_0029.mrg', 'wsj_0030.mrg', 'wsj_0031.mrg', 'wsj_0032.mrg', 'wsj_0033.mrg', 'wsj_0034.mrg', 'wsj_0035.mrg', 'wsj_0036.mrg', 'wsj_0037.mrg', 'wsj_0038.mrg', 'wsj_0039.mrg', 'wsj_0040.mrg', 'wsj_0041.mrg', 'wsj_0042.mrg', 'wsj_0043.mrg', 'wsj_0044.mrg', 'wsj_0045.mrg', 'wsj_0046.mrg', 'wsj_0047.mrg', 'wsj_0048.mrg', 'wsj_0049.mrg', 'wsj_0050.mrg', 'wsj_0051.mrg', 'wsj_0052.mrg', 'wsj_0053.mrg', 'wsj_0054.mrg', 'wsj_0055.mrg', 'wsj_0056.mrg', 'wsj_0057.mrg', 'wsj_0058.mrg', 'wsj_0059.mrg', 'wsj_0060.mrg', 'wsj_0061.mrg', 'wsj_0062.mrg', 'wsj_0063.mrg', 'wsj_0064.mrg', 'wsj_0065.mrg', 'wsj_0066.mrg', 'wsj_0067.mrg', 'wsj_0068.mrg', 'wsj_0069.mrg', 'wsj_0070.mrg', 'wsj_0071.mrg', 'wsj_0072.mrg', 'wsj_0073.mrg', 'wsj_0074.mrg', 'wsj_0075.mrg', 'wsj_0076.mrg', 'wsj_0077.mrg', 'wsj_0078.mrg', 'wsj_0079.mrg', 'wsj_0080.mrg', 'wsj_0081.mrg', 'wsj_0082.mrg', 
'wsj_0083.mrg', 'wsj_0084.mrg', 'wsj_0085.mrg', 'wsj_0086.mrg', 'wsj_0087.mrg', 'wsj_0088.mrg', 'wsj_0089.mrg', 'wsj_0090.mrg', 'wsj_0091.mrg', 'wsj_0092.mrg', 'wsj_0093.mrg', 'wsj_0094.mrg', 'wsj_0095.mrg', 'wsj_0096.mrg', 'wsj_0097.mrg', 'wsj_0098.mrg', 'wsj_0099.mrg', 'wsj_0100.mrg', 'wsj_0101.mrg', 'wsj_0102.mrg', 'wsj_0103.mrg', 'wsj_0104.mrg', 'wsj_0105.mrg', 'wsj_0106.mrg', 'wsj_0107.mrg', 'wsj_0108.mrg', 'wsj_0109.mrg', 'wsj_0110.mrg', 'wsj_0111.mrg', 'wsj_0112.mrg', 'wsj_0113.mrg', 'wsj_0114.mrg', 'wsj_0115.mrg', 'wsj_0116.mrg', 'wsj_0117.mrg', 'wsj_0118.mrg', 'wsj_0119.mrg', 'wsj_0120.mrg', 'wsj_0121.mrg', 'wsj_0122.mrg', 'wsj_0123.mrg', 'wsj_0124.mrg', 'wsj_0125.mrg', 'wsj_0126.mrg', 'wsj_0127.mrg', 'wsj_0128.mrg', 'wsj_0129.mrg', 'wsj_0130.mrg', 'wsj_0131.mrg', 'wsj_0132.mrg', 'wsj_0133.mrg', 'wsj_0134.mrg', 'wsj_0135.mrg', 'wsj_0136.mrg', 'wsj_0137.mrg', 'wsj_0138.mrg', 'wsj_0139.mrg', 'wsj_0140.mrg', 'wsj_0141.mrg', 'wsj_0142.mrg', 'wsj_0143.mrg', 'wsj_0144.mrg', 'wsj_0145.mrg', 'wsj_0146.mrg', 'wsj_0147.mrg', 'wsj_0148.mrg', 'wsj_0149.mrg', 'wsj_0150.mrg', 'wsj_0151.mrg', 'wsj_0152.mrg', 'wsj_0153.mrg', 'wsj_0154.mrg', 'wsj_0155.mrg', 'wsj_0156.mrg', 'wsj_0157.mrg', 'wsj_0158.mrg', 'wsj_0159.mrg', 'wsj_0160.mrg', 'wsj_0161.mrg', 'wsj_0162.mrg', 'wsj_0163.mrg', 'wsj_0164.mrg', 'wsj_0165.mrg', 'wsj_0166.mrg', 'wsj_0167.mrg', 'wsj_0168.mrg', 'wsj_0169.mrg', 'wsj_0170.mrg', 'wsj_0171.mrg', 'wsj_0172.mrg', 'wsj_0173.mrg', 'wsj_0174.mrg', 'wsj_0175.mrg', 'wsj_0176.mrg', 'wsj_0177.mrg', 'wsj_0178.mrg', 'wsj_0179.mrg', 'wsj_0180.mrg', 'wsj_0181.mrg', 'wsj_0182.mrg', 'wsj_0183.mrg', 'wsj_0184.mrg', 'wsj_0185.mrg', 'wsj_0186.mrg', 'wsj_0187.mrg', 'wsj_0188.mrg', 'wsj_0189.mrg', 'wsj_0190.mrg', 'wsj_0191.mrg', 'wsj_0192.mrg', 'wsj_0193.mrg', 'wsj_0194.mrg', 'wsj_0195.mrg', 'wsj_0196.mrg', 'wsj_0197.mrg', 'wsj_0198.mrg', 'wsj_0199.mrg') >>> print treebank.parsed_sents('wsj_0003.mrg')[0] (S (S-TPC-1 (NP-SBJ (NP (NP (DT A) (NN form)) (PP (IN of) (NP (NN 
asbestos)))) (RRC (ADVP-TMP (RB once)) (VP (VBN used) (NP (-NONE- *)) (S-CLR (NP-SBJ (-NONE- *)) (VP (TO to) (VP (VB make) (NP (NNP Kent) (NN cigarette) (NNS filters)))))))) (VP (VBZ has) (VP (VBN caused) (NP (NP (DT a) (JJ high) (NN percentage)) (PP (IN of) (NP (NN cancer) (NNS deaths))) (PP-LOC (IN among) (NP (NP (DT a) (NN group)) (PP (IN of) (NP (NP (NNS workers)) (RRC (VP (VBN exposed) (NP (-NONE- *)) (PP-CLR (TO to) (NP (PRP it))) (ADVP-TMP (NP (QP (RBR more) (IN than) (CD 30)) (NNS years)) (IN ago)))))))))))) (, ,) (NP-SBJ (NNS researchers)) (VP (VBD reported) (SBAR (-NONE- 0) (S (-NONE- *T*-1)))) (. .)) >>> sent = "Buckeye corpora are available at zero cost" >>> sent.split() ['Buckeye', 'corpora', 'are', 'available', 'at', 'zero', 'cost'] >>> mytagger = nltk.DefaultTagger("NN") >>> mytagger.tag(sent.split()) [('Buckeye', 'NN'), ('corpora', 'NN'), ('are', 'NN'), ('available', 'NN'), ('at', 'NN'), ('zero', 'NN'), ('cost', 'NN')] >>> mytagger = nltk.DefaultTagger("bozo") >>> mytagger.tag(sent.split()) [('Buckeye', 'bozo'), ('corpora', 'bozo'), ('are', 'bozo'), ('available', 'bozo'), ('at', 'bozo'), ('zero', 'bozo'), ('cost', 'bozo')] >>> train_sents = nltk.corpus.treebank.tagged_sents()[:3000] >>> test_sents = nltk.corpus.treebank.tagged_sents()[3000:] >>> len(test_sents) 914 >>> len(train_sents) 3000 >>> unigram_tagger = nltk.UnigramTagger(train_sents) >>> unigram_tagger.tag(sent.split()) [('Buckeye', None), ('corpora', None), ('are', 'VBP'), ('available', 'JJ'), ('at', 'IN'), ('zero', None), ('cost', 'NN')] >>> unigram_tagger = nltk.UnigramTagger(train_sents,backoff=DefaultTagger("bozo")) Traceback (most recent call last): File "<stdin>", line 1, in <module> NameError: name 'DefaultTagger' is not defined >>> unigram_tagger = nltk.UnigramTagger(train_sents,backoff=nltk.DefaultTagger("bozo")) >>> unigram_tagger.tag(sent.split()) [('Buckeye', 'bozo'), ('corpora', 'bozo'), ('are', 'VBP'), ('available', 'JJ'), ('at', 'IN'), ('zero', 'bozo'), ('cost', 'NN')] >>> nltk.tag.accuracy 
>>> nltk.tag.accuracy(unigram_tagger,test_sents) 0.82495143535506155 >>> unigram_tagger = nltk.UnigramTagger(train_sents,backoff=nltk.DefaultTagger("NN")) >>> nltk.tag.accuracy(unigram_tagger,test_sents) 0.84964385927045116 >>> trigram_tagger = nltk.TrigramTagger(train_sents,backoff=nltk.DefaultTagger("bozo")) >>> KeyboardInterrupt >>> trigram_tagger = nltk.TrigramTagger(train_sents,backoff=nltk.DefaultTagger("NP")) >>> nltk.tag.accuracy(unigram_tagger,test_sents) 0.84964385927045116 >>> nltk.tag.accuracy(trigram_tagger,test_sents) 0.049384847830779191 >>> bigram_tagger = nltk.BigramTagger(train_sents,backoff=nltk.DefaultTagger("bozo")) >>> bigram_tagger = nltk.BigramTagger(train_sents,backoff=nltk.DefaultTagger("NP")) ^[[A>>> bigram_tagger = nltk.BigramTagger(train_sents,backoff=nltk.DefaultTagger("NN")) >>> nltk.tag.accuracy(bigram_tagger,test_sents) 0.71810921649039494 >>> bigram_tagger = nltk.BigramTagger(train_sents,backoff=unigram_tagger) >>> nltk.tag.accuracy(bigram_tagger,test_sents) 0.85335635657241526 >>>