Python自然语言处理NLP(三)

2019-09-24

NLP

学习地址

原始英文文本

从网络上下载 - urllib.request.urlopen

# Download a plain-text document from the web.
from urllib.request import urlopen
import requests
url='http://www.gutenberg.org/files/2554/2554-0.txt'
# Alternative using the requests package:
# response=requests.get(url)
# print(response.text)
# Use the response as a context manager so the HTTP connection is closed
# even if reading or decoding raises.
with urlopen(url) as response:
    # NOTE: decoding with 'utf8' keeps the leading BOM ('\ufeff'), which is why
    # the first token printed in the output below is '\ufeffThe';
    # 'utf-8-sig' would strip it (and shift the offsets quoted in this post by one).
    raw=response.read().decode('utf8') # decode the bytes to inspect the content
print(type(raw),len(raw)) # <class 'str'> 1176967
print(raw[:1000])


# Tokenization: split the downloaded text into a flat list of tokens.
from nltk import word_tokenize

tokens = word_tokenize(raw)
token_count = len(tokens)
print(type(tokens), token_count)  # <class 'list'> 257727
print(tokens[:10])


# Build an nltk.Text object from the token list.
# Only `word_tokenize` was imported from nltk earlier, so the package itself
# must be imported before `nltk.Text` can be referenced (otherwise NameError).
import nltk

texts = nltk.Text(tokens)
print(type(texts), len(texts))  # <class 'nltk.text.Text'> 257727
print(texts[1024:1062])
a = texts.collocation_list(num=20)  # collect the 20 most common collocations
# print(a)


# Trim the text to the body of the book using markers found in the content.
start_marker = 'PART I'
end_marker = "End of Project Gutenberg’s Crime"

head = raw.find(start_marker)   # first occurrence of the start marker
tail = raw.rfind(end_marker)    # rfind: position of the last match
print(head, tail)  # 5336 1157812

# Keep only the text between the two markers; both calls return -1 when the
# marker is missing, so check the printed positions before trusting the slice.
raw = raw[head:tail]
print(raw.find(start_marker))  # 0


# HTML parsing: fetch a news page and extract its text.
from urllib.request import urlopen
from bs4 import BeautifulSoup
url='https://news.163.com/19/0924/16/EPRPQI0H0001875O.html'
# Close the HTTP response deterministically once the body has been read.
with urlopen(url) as response:
    html = response.read().decode('GBK')  # mind the page encoding
# Parse the document once and reuse the tree for both extractions
# (the markup was previously parsed twice).
soup = BeautifulSoup(html, 'lxml')
raw = soup.get_text()  # text of the whole page
tokens = word_tokenize(raw)  # all tokens of the page, but rather noisy
bs = soup.find('div', class_='post_text').get_text()  # text of the article body only
# print(bs) # the selected section
# print(type(bs),len(bs)) # <class 'str'> 957


- - - - - - - - - - - - - - - - - - - - 
 The Project Gutenberg EBook of Crime and Punishment, by Fyodor Dostoevsky

This eBook is for the 
<class 'list'> 257727
['\ufeffThe', 'Project', 'Gutenberg', 'EBook', 'of', 'Crime', 'and', 'Punishment', ',', 'by']
['an', 'exceptionally', 'hot', 'evening', 'early', 'in', 'July', 'a', 'young', 'man', 'came', 'out', 'of', 'the', 'garret', 'in', 'which', 'he', 'lodged', 'in', 'S.', 'Place', 'and', 'walked', 'slowly', ',', 'as', 'though', 'in', 'hesitation', ',', 'towards', 'K.', 'bridge', '.', 'He', 'had', 'successfully']

读取本地文件

1
2
3

# Read the whole local file at `path`; the `with` block closes the handle
# even if read() raises, which the manual open()/close() pair did not.
with open(path) as fid:
    conten = fid.read()

Unicode字符

正则表达式

规范化文本

词干提取器

# Stemmers: compare the Porter and Lancaster stemmers on the same words.
a = 'she is a nice girl, best ever i have been meeting'
porter = nltk.PorterStemmer()
lancaster = nltk.LancasterStemmer()
# Whitespace split only: punctuation stays attached to the word ('girl,').
a = a.split()
print(a)  # printed once, matching the three lines of output shown below
b = [porter.stem(w) for w in a]
c = [lancaster.stem(w) for w in a]
print(b)
print(c)

- - - - - - - - - - - - - - -
['she', 'is', 'a', 'nice', 'girl,', 'best', 'ever', 'i', 'have', 'been', 'meeting']
['she', 'is', 'a', 'nice', 'girl,', 'best', 'ever', 'i', 'have', 'been', 'meet']
['she', 'is', 'a', 'nic', 'girl,', 'best', 'ev', 'i', 'hav', 'been', 'meet']

词性归并

# Lemmatization with the WordNet lemmatizer.
wnl = nltk.WordNetLemmatizer()
a = 'do does done'.split()
# lemmatize() is called without a part-of-speech argument; the result shown
# below ('does' -> 'doe', 'done' unchanged) suggests the words are treated as
# nouns. NOTE(review): passing pos='v' should lemmatize them as verbs - confirm
# against the NLTK documentation.
b = list(map(wnl.lemmatize, a))
print(b)

- - - - - - - - - 
['do', 'doe', 'done']

分割

# Sentence segmentation on a text from the Gutenberg corpus.
# To list the files available in the corpus:
# for file in nltk.corpus.gutenberg.fileids():
#     print(file)
gutenberg = nltk.corpus.gutenberg
text = gutenberg.raw('chesterton-thursday.txt')
sents = nltk.sent_tokenize(text)
print(sents[2:5])  # the third through fifth sentences

# Lists and strings: split a sentence into words, then join the words back.
sentence = 'I have a pen, I have an apple, an apple-pen'
words = sentence.split()
print(words)
# Joining on a single space restores the original sentence here because the
# words were separated by single spaces.
a = ' '.join(words)
print(a)


- - - - - - - - - - - - - - - - - -
['Like the white lock of Whistler, that lit our aimless gloom,\nMen showed their own white feather as proudly as a plume.', 'Life was a fly that faded, and death a drone that stung;\nThe world was very old indeed when you and I were young.', 'They twisted even decent sin to shapes not to be named:\nMen were ashamed of honour; but we were not ashamed.']
['I', 'have', 'a', 'pen,', 'I', 'have', 'an', 'apple,', 'an', 'apple-pen']
I have a pen, I have an apple, an apple-pen

使用模拟退火进行评分