Python自然语言处理NLP(二)

学习地址

语料库

  • 古腾堡gutenberg

    1
    2
    3
    4
    5
    6
    7
    8
    9
    from nltk.corpus import gutenberg

    # Word tokens of Jane Austen's "Emma" (a previous comment mislabeled it
    # as "Jane Eyre"; austen-emma.txt is Austen's Emma).
    emmma = gutenberg.words('austen-emma.txt')
    # print(type(emmma), len(emmma))  # 192427

    # For every Gutenberg text, print three ratios:
    #   avg word length, avg sentence length (in words),
    #   lexical diversity (tokens per distinct lower-cased word form).
    for fileid in gutenberg.fileids():  # austen-emma.txt, austen-persuasion.txt, ...
        num_char = len(gutenberg.raw(fileid))    # total characters
        num_words = len(gutenberg.words(fileid)) # total word tokens
        num_sents = len(gutenberg.sents(fileid)) # total sentences
        num_vocab = len({w.lower() for w in gutenberg.words(fileid)})
        print(fileid, ':', num_char // num_words, num_words // num_sents, num_words // num_vocab)
  • 网络聊天webtext

    1
    2
    3
    from nltk.corpus import webtext

    # Preview each web-text sample: its id plus the first 50 characters.
    for sample_id in webtext.fileids():
        preview = webtext.raw(sample_id)[:50]
        print(sample_id, ': ', preview)
  • 即时聊天语料库

    1
    2
    3
    4
    5
    6
    from nltk.corpus import nps_chat

    # List every chat transcript file, then inspect one post from one room.
    for fileid in nps_chat.fileids():
        print(fileid)  # e.g. 10-19-20s_706posts.xml, 10-19-30s_705posts.xml, ...
    chatroom = nps_chat.posts('10-19-20s_706posts.xml')
    print(type(chatroom), len(chatroom))
    # A post is a list of word tokens, e.g.
    # ['i', 'do', "n't", 'want', 'hot', 'pics', 'of', 'a', 'female', ...]
    print(chatroom[123])
  • 布朗语料库

    1
    2
    3
    4
    5
    6
    7
    8
    9
    10
    11
    from nltk.corpus import brown

    # The Brown corpus can be sliced by file id or by category.
    for fileid in brown.fileids():  # ca01, ca02, ca03, ...
        print(fileid)
    for x in brown.categories():  # adventure, belles_lettres, editorial, ...
        print(x)
    news_text1 = brown.words(categories='news')
    print(news_text1)  # ['The', 'Fulton', 'County', 'Grand', 'Jury', 'said', ...]
    news_text2 = brown.words(fileids=['cg22'])
    print(news_text2)  # ['Does', 'our', 'society', 'have', 'a', 'runaway', ',', ...]
    news_text3 = brown.sents(categories=['news', 'editorial', 'reviews'])
    print(news_text3)
  • 路透社语料库

    1
    2
    3
    from nltk.corpus import reuters

    # Reuters corpus size: number of documents and number of topic categories.
    doc_count = len(reuters.fileids())
    topic_count = len(reuters.categories())
    print(doc_count)    # 10788
    print(topic_count)  # 90
  • 就职演讲语料

1
2
3
4
5
6
7
8
9
10
import nltk  # required for ConditionalFreqDist (was missing -> NameError)
from nltk.corpus import inaugural

# For each inaugural address (condition = the word, sample = the 4-digit
# year taken from the file id), count how many word tokens start with
# 'america' or 'citizen', case-insensitively.
cfd = nltk.ConditionalFreqDist(
    (target, fileid[:4])
    for fileid in inaugural.fileids()
    for w in inaugural.words(fileid)
    for target in ['america', 'citizen']
    if w.lower().startswith(target)
)
cfd.plot()  # draw frequency-by-year curves, one line per target word

Figure_39b8d1601601d5d69b.png

条件频率分布

nltk有两种方法可以生成频率图,一种是上图所示代码,另外一种用表格形式生成

1
2
3
4
5
6
7
8
9
10
11
12
13
import nltk  # required for ConditionalFreqDist (was missing -> NameError)
from nltk.corpus import udhr

# Tabulate word-length frequencies of the Universal Declaration of Human
# Rights in English vs. German. Condition = language, sample = word length.
cfd = nltk.ConditionalFreqDist(
    (lang, len(word))
    for lang in ['English', 'German_Deutsch']
    for word in udhr.words(lang + '-Latin1')
)
# range(10) yields the same integer samples 0..9 as the original
# np.arange(10) did, without depending on an un-imported numpy.
cfd.tabulate(conditions=['English', 'German_Deutsch'], samples=range(10), cumulative=False)

- - - - - - - - - - - - -
0 1 2 3 4 5 6 7 8 9
English 0 185 340 358 114 169 117 157 118 80
German_Deutsch 0 171 92 351 103 177 119 97 103 62

词典

  • 停用词:高频词汇但是没有什么实际意义,如 'a', 'an', 'the' 等,在实际应用中可以将这些高频词汇过滤掉,剩下一些关键词更有研究意义。

    1
    2
    3
    4
    def content_fraction(text):
        """Return the tokens of *text* with English stopwords removed.

        Fix: the original did ``stopwords = stopwords.words('english')``,
        which makes ``stopwords`` a local variable read before assignment
        (UnboundLocalError) and shadows the corpus module.
        """
        from nltk.corpus import stopwords
        stop_set = set(stopwords.words('english'))  # set: O(1) membership test
        return [w for w in text if w.lower() not in stop_set]
  • 发音词典

    1
    2
    3
    4
    5
    6
    7
    8
    9
    10
    11
    12
    13
    14
    15
    from nltk.corpus import cmudict

    # CMU Pronouncing Dictionary: each entry is a (word, phoneme-list) pair.
    entries = cmudict.entries()
    # print(len(entries))  # 133737
    for entry in entries[39943:39951]:
        print(entry)
    - - - - - - - - - - - - - - -
    ('explorer', ['IH0', 'K', 'S', 'P', 'L', 'AO1', 'R', 'ER0'])
    ('explorers', ['IH0', 'K', 'S', 'P', 'L', 'AO1', 'R', 'ER0', 'Z'])
    ('explores', ['IH0', 'K', 'S', 'P', 'L', 'AO1', 'R', 'Z'])
    ('exploring', ['IH0', 'K', 'S', 'P', 'L', 'AO1', 'R', 'IH0', 'NG'])
    ('explosion', ['IH0', 'K', 'S', 'P', 'L', 'OW1', 'ZH', 'AH0', 'N'])
    ('explosions', ['IH0', 'K', 'S', 'P', 'L', 'OW1', 'ZH', 'AH0', 'N', 'Z'])
    ('explosive', ['IH0', 'K', 'S', 'P', 'L', 'OW1', 'S', 'IH0', 'V'])
    ('explosively', ['EH2', 'K', 'S', 'P', 'L', 'OW1', 'S', 'IH0', 'V', 'L', 'IY0'])
  • 词汇工具from nltk.corpus import toolbox

同义词

1
2
3
4
5
6
7
8
9
10
11
12
13
14
from nltk.corpus import wordnet as wn

# Explore WordNet: synsets, a definition, example sentences, and lemmas.
sweet_synsets = wn.synsets('sweet')
print(sweet_synsets)                               # every synset containing 'sweet'
print(wn.synset('sweet.n.01').definition())        # gloss of one synset
print(wn.synset('car.n.01').examples())            # example sentence for car.n.01
print(wn.synset('car.n.01').lemmas())              # lemmas belonging to the synset
print(wn.lemma('car.n.01.automobile').synset())    # lemma -> its owning synset

- - - - - - - - - - - - - - - - - -
[Synset('sweet.n.01'), Synset('dessert.n.01'), Synset('sweet.n.03'), Synset('sweet.n.04'), Synset('sweetness.n.02'), Synset('sweet.a.01'), Synset('angelic.s.03'), Synset('dulcet.s.02'), Synset('sweet.s.04'), Synset('gratifying.s.01'), Synset('odoriferous.s.03'), Synset('sweet.a.07'), Synset('fresh.a.06'), Synset('fresh.s.09'), Synset('sugared.s.01'), Synset('sweetly.r.01')]
English phonetician; one of the founders of modern phonetics (1845-1912)
['he needs a car to get to work']
[Lemma('car.n.01.car'), Lemma('car.n.01.auto'), Lemma('car.n.01.automobile'), Lemma('car.n.01.machine'), Lemma('car.n.01.motorcar')]
Synset('car.n.01')