Python自然语言处理NLP(二)

学习地址

语料库

  • 古腾堡gutenberg

    1
    2
    3
    4
    5
    6
    7
    8
    9
    from nltk.corpus import gutenberg

    # Word tokens of Jane Austen's "Emma" (a previous comment mislabeled it
    # as "Jane Eyre"; austen-emma.txt is Austen's Emma).
    emmma = gutenberg.words('austen-emma.txt')
    # print(type(emmma), len(emmma))  # 192427

    # For every Gutenberg text, print three ratios:
    #   avg word length, avg sentence length (in words),
    #   lexical diversity (tokens per distinct lower-cased word form).
    for fileid in gutenberg.fileids():  # austen-emma.txt, austen-persuasion.txt, ...
        num_char = len(gutenberg.raw(fileid))    # total characters
        num_words = len(gutenberg.words(fileid)) # total word tokens
        num_sents = len(gutenberg.sents(fileid)) # total sentences
        num_vocab = len({w.lower() for w in gutenberg.words(fileid)})
        print(fileid, ':', num_char // num_words, num_words // num_sents, num_words // num_vocab)
  • 网络聊天webtext

    1
    2
    3
    from nltk.corpus import webtext

    # Preview each web-text sample: its id plus the first 50 characters.
    for sample_id in webtext.fileids():
        preview = webtext.raw(sample_id)[:50]
        print(sample_id, ': ', preview)
  • 即时聊天语料库

    1
    2
    3
    4
    5
    6
    from nltk.corpus import nps_chat

    # List every chat transcript file, then inspect one post from one room.
    for fileid in nps_chat.fileids():
        print(fileid)  # e.g. 10-19-20s_706posts.xml, 10-19-30s_705posts.xml, ...
    chatroom = nps_chat.posts('10-19-20s_706posts.xml')
    print(type(chatroom), len(chatroom))
    # A post is a list of word tokens, e.g.
    # ['i', 'do', "n't", 'want', 'hot', 'pics', 'of', 'a', 'female', ...]
    print(chatroom[123])
  • 布朗语料库

    1
    2
    3
    4
    5
    6
    7
    8
    9
    10
    11
    from nltk.corpus import brown

    # The Brown corpus can be sliced by file id or by category.
    for fileid in brown.fileids():  # ca01, ca02, ca03, ...
        print(fileid)
    for x in brown.categories():  # adventure, belles_lettres, editorial, ...
        print(x)
    news_text1 = brown.words(categories='news')
    print(news_text1)  # ['The', 'Fulton', 'County', 'Grand', 'Jury', 'said', ...]
    news_text2 = brown.words(fileids=['cg22'])
    print(news_text2)  # ['Does', 'our', 'society', 'have', 'a', 'runaway', ',', ...]
    news_text3 = brown.sents(categories=['news', 'editorial', 'reviews'])
    print(news_text3)
  • 路透社语料库

    1
    2
    3
    from nltk.corpus import reuters

    # Reuters corpus size: number of documents and number of topic categories.
    doc_count = len(reuters.fileids())
    topic_count = len(reuters.categories())
    print(doc_count)    # 10788
    print(topic_count)  # 90
  • 就职演讲语料

1
2
3
4
5
6
7
8
9
10
import nltk  # required for ConditionalFreqDist (was missing -> NameError)
from nltk.corpus import inaugural

# For each inaugural address (condition = the word, sample = the 4-digit
# year taken from the file id), count how many word tokens start with
# 'america' or 'citizen', case-insensitively.
cfd = nltk.ConditionalFreqDist(
    (target, fileid[:4])
    for fileid in inaugural.fileids()
    for w in inaugural.words(fileid)
    for target in ['america', 'citizen']
    if w.lower().startswith(target)
)
cfd.plot()  # draw frequency-by-year curves, one line per target word

Figure_39b8d1601601d5d69b.png

条件频率分布

nltk有两种方法可以生成频率图,一种是上图所示代码,另外一种用表格形式生成

1
2
3
4
5
6
7
8
9
10
11
12
13
import nltk  # required for ConditionalFreqDist (was missing -> NameError)
from nltk.corpus import udhr

# Tabulate word-length frequencies of the Universal Declaration of Human
# Rights in English vs. German. Condition = language, sample = word length.
cfd = nltk.ConditionalFreqDist(
    (lang, len(word))
    for lang in ['English', 'German_Deutsch']
    for word in udhr.words(lang + '-Latin1')
)
# range(10) yields the same integer samples 0..9 as the original
# np.arange(10) did, without depending on an un-imported numpy.
cfd.tabulate(conditions=['English', 'German_Deutsch'], samples=range(10), cumulative=False)

- - - - - - - - - - - - -
0 1 2 3 4 5 6 7 8 9
English 0 185 340 358 114 169 117 157 118 80
German_Deutsch 0 171 92 351 103 177 119 97 103 62

词典

  • 停用词:高频词汇但是没有什么实际意义,如 'a', 'an', 'the' 等,在实际应用中可以将这些高频词汇过滤掉,剩下一些关键词更有研究意义。

    1
    2
    3
    4
    def content_fraction(text):
        """Return the tokens of *text* with English stopwords removed.

        Fix: the original did ``stopwords = stopwords.words('english')``,
        which makes ``stopwords`` a local variable read before assignment
        (UnboundLocalError) and shadows the corpus module.
        """
        from nltk.corpus import stopwords
        stop_set = set(stopwords.words('english'))  # set: O(1) membership test
        return [w for w in text if w.lower() not in stop_set]
  • 发音词典

    1
    2
    3
    4
    5
    6
    7
    8
    9
    10
    11
    12
    13
    14
    15
    from nltk.corpus import cmudict

    # CMU Pronouncing Dictionary: each entry is a (word, phoneme-list) pair.
    entries = cmudict.entries()
    # print(len(entries))  # 133737
    for entry in entries[39943:39951]:
        print(entry)
    - - - - - - - - - - - - - - -
    ('explorer', ['IH0', 'K', 'S', 'P', 'L', 'AO1', 'R', 'ER0'])
    ('explorers', ['IH0', 'K', 'S', 'P', 'L', 'AO1', 'R', 'ER0', 'Z'])
    ('explores', ['IH0', 'K', 'S', 'P', 'L', 'AO1', 'R', 'Z'])
    ('exploring', ['IH0', 'K', 'S', 'P', 'L', 'AO1', 'R', 'IH0', 'NG'])
    ('explosion', ['IH0', 'K', 'S', 'P', 'L', 'OW1', 'ZH', 'AH0', 'N'])
    ('explosions', ['IH0', 'K', 'S', 'P', 'L', 'OW1', 'ZH', 'AH0', 'N', 'Z'])
    ('explosive', ['IH0', 'K', 'S', 'P', 'L', 'OW1', 'S', 'IH0', 'V'])
    ('explosively', ['EH2', 'K', 'S', 'P', 'L', 'OW1', 'S', 'IH0', 'V', 'L', 'IY0'])
  • 词汇工具from nltk.corpus import toolbox

同义词

1
2
3
4
5
6
7
8
9
10
11
12
13
14
from nltk.corpus import wordnet as wn

# Explore WordNet: synsets, a definition, example sentences, and lemmas.
sweet_synsets = wn.synsets('sweet')
print(sweet_synsets)                               # every synset containing 'sweet'
print(wn.synset('sweet.n.01').definition())        # gloss of one synset
print(wn.synset('car.n.01').examples())            # example sentence for car.n.01
print(wn.synset('car.n.01').lemmas())              # lemmas belonging to the synset
print(wn.lemma('car.n.01.automobile').synset())    # lemma -> its owning synset

- - - - - - - - - - - - - - - - - -
[Synset('sweet.n.01'), Synset('dessert.n.01'), Synset('sweet.n.03'), Synset('sweet.n.04'), Synset('sweetness.n.02'), Synset('sweet.a.01'), Synset('angelic.s.03'), Synset('dulcet.s.02'), Synset('sweet.s.04'), Synset('gratifying.s.01'), Synset('odoriferous.s.03'), Synset('sweet.a.07'), Synset('fresh.a.06'), Synset('fresh.s.09'), Synset('sugared.s.01'), Synset('sweetly.r.01')]
English phonetician; one of the founders of modern phonetics (1845-1912)
['he needs a car to get to work']
[Lemma('car.n.01.car'), Lemma('car.n.01.auto'), Lemma('car.n.01.automobile'), Lemma('car.n.01.machine'), Lemma('car.n.01.motorcar')]
Synset('car.n.01')