
NLTK

1. NLTK Basics

import nltk

nltk.download()
NLTK Downloader
---------------------------------------------------------------------------
    d) Download   l) List    u) Update   c) Config   h) Help   q) Quit
---------------------------------------------------------------------------
Downloader> d

Download which package (l=list; x=cancel)?
  Identifier> l
Packages:
  [ ] dolch............... Dolch Word List
  [-] indian.............. Indian Language POS-Tagged Corpus
  [ ] mwa_ppdb............ The monolingual word aligner (Sultan et al.
                           2015) subset of the Paraphrase Database.
  [-] panlex_swadesh...... PanLex Swadesh Corpora
  [-] perluniprops........ perluniprops: Index of Unicode Version 7.0.0
                           character properties in Perl
  [-] stopwords........... Stopwords Corpus

Collections:
  [-] all-corpora......... All the corpora
  [-] all-nltk............ All packages available on nltk_data gh-pages
                           branch
  [-] all................. All packages
  [-] book................ Everything used in the NLTK Book
  [-] popular............. Popular packages
  [ ] third-party......... Third-party data packages

([*] marks installed packages; [-] marks out-of-date or corrupt packages)

Download which package (l=list; x=cancel)?
  Identifier> x

---------------------------------------------------------------------------
    d) Download   l) List    u) Update   c) Config   h) Help   q) Quit
---------------------------------------------------------------------------
Downloader> q
True
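The True above is nltk.download()'s return value. Instead of the interactive menu, packages can also be fetched non-interactively by identifier; a minimal sketch covering the data used later in this tutorial:

import nltk

# fetch packages by identifier; each call returns True on success
nltk.download('brown')                       # Brown corpus (section 1)
nltk.download('punkt')                       # tokenizer models (section 2)
nltk.download('wordnet')                     # WordNet, for lemmatization (section 3.2)
nltk.download('averaged_perceptron_tagger')  # POS tagger model (section 4)
nltk.download('stopwords')                   # stopword lists (section 5)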
import nltk
from nltk.corpus import brown # requires the brown corpus to be downloaded
# the Brown University corpus
# list the categories it contains
print(brown.categories())
['adventure', 'belles_lettres', 'editorial', 'fiction', 'government', 'hobbies', 'humor', 'learned', 'lore', 'mystery', 'news', 'religion', 'reviews', 'romance', 'science_fiction']
# check the size of the brown corpus
print('Total sentences: {}'.format(len(brown.sents())))
print('Total words: {}'.format(len(brown.words())))
Total sentences: 57340
Total words: 1161192
brown.words()[:10]
['The',
 'Fulton',
 'County',
 'Grand',
 'Jury',
 'said',
 'Friday',
 'an',
 'investigation',
 'of']
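The corpus readers also accept a categories argument to restrict the view to a single genre; a minimal sketch:

# restrict the word list to one category
news_words = brown.words(categories='news')
print(len(news_words))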
# word frequency counts
from nltk import FreqDist
dist = FreqDist(brown.words())
print('Number of unique words:', len(dist))
print('First 10 words:', list(dist.keys())[:10])  # key order is not meaningful here
print("Occurrences of 'the':", dist['the'])
Number of unique words: 56057
First 10 words: ['Ebbetts', '6-ounce', 'rabbi', 'dictator', 'detach', 'Navigation', "o'clock", "Motors'", 'nubile', 'graunt']
Occurrences of 'the': 62713
# find words longer than 5 characters that occur more than 500 times
freq_words = [w for w in dist.keys() if len(w) > 5 and dist[w] > 500]
freq_words
['against',
 'should',
 'between',
 'another',
 'through',
 'himself',
 'little',
 'before',
 'people',
 'thought',
 'American',
 'around',
 'because',
 'without']
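FreqDist can also report the highest-frequency entries directly via most_common; a minimal sketch:

# the n most frequent tokens (punctuation and stopwords dominate raw counts)
print(dist.most_common(5))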

2. Tokenization

2.1 English Word Tokenization with NLTK

sentence = "Python is a widely used high-level programming language for general-purpose programming."
tokens = nltk.word_tokenize(sentence) # requires the punkt tokenizer model
print(tokens)
['Python', 'is', 'a', 'widely', 'used', 'high-level', 'programming', 'language', 'for', 'general-purpose', 'programming', '.']

2.2 Sentence Splitting

texts = 'Python is a widely used high-level programming language for general-purpose programming, created by Guido van Rossum and first released in 1991. An interpreted language, Python has a design philosophy that emphasizes code readability (notably using whitespace indentation to delimit code blocks rather than curly brackets or keywords), and a syntax that allows programmers to express concepts in fewer lines of code than might be used in languages such as C++ or Java.[23][24] The language provides constructs intended to enable writing clear programs on both a small and large scale.'
sentences = nltk.sent_tokenize(texts)
len(sentences)
3
sentences
['Python is a widely used high-level programming language for general-purpose programming, created by Guido van Rossum and first released in 1991.',
 'An interpreted language, Python has a design philosophy that emphasizes code readability (notably using whitespace indentation to delimit code blocks rather than curly brackets or keywords), and a syntax that allows programmers to express concepts in fewer lines of code than might be used in languages such as C++ or Java.',
 '[23][24] The language provides constructs intended to enable writing clear programs on both a small and large scale.']
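Sentence splitting and word tokenization are commonly chained; a minimal sketch tokenizing each sentence from above:

# tokenize each sentence separately
for s in sentences:
    print(nltk.word_tokenize(s)[:6])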

2.3 Chinese Word Segmentation with jieba

# install with: pip install jieba
import jieba

seg_list = jieba.cut("欢迎来到小象学院", cut_all=True)
print("Full mode: " + "/ ".join(seg_list))  # full mode: all possible words

seg_list = jieba.cut("欢迎来到小象学院", cut_all=False)
print("Precise mode: " + "/ ".join(seg_list))  # precise mode (the default)
Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\LIA892\AppData\Local\Temp\jieba.cache
Loading model cost 0.809 seconds.
Prefix dict has been built succesfully.


Full mode: 欢迎/ 迎来/ 来到/ 小象/ 学院
Precise mode: 欢迎/ 来到/ 小/ 象/ 学院
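Besides full and precise modes, jieba provides a search-engine mode and list-returning variants; a minimal sketch on the same sentence:

# search-engine mode further splits long words, useful for indexing
seg_list = jieba.cut_for_search("欢迎来到小象学院")
print("Search mode: " + "/ ".join(seg_list))

# jieba.lcut returns a list directly instead of a generator
print(jieba.lcut("欢迎来到小象学院"))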

3. Word Form Normalization

3.1 Stemming

# PorterStemmer
from nltk.stem.porter import PorterStemmer

porter_stemmer = PorterStemmer()
print(porter_stemmer.stem('looked'))
print(porter_stemmer.stem('went'))
look
went
input1 = 'List listed lists listing listings'
words1 = input1.lower().split(' ')
[porter_stemmer.stem(w) for w in words1]
['list', 'list', 'list', 'list', 'list']
# SnowballStemmer
from nltk.stem import SnowballStemmer

snowball_stemmer = SnowballStemmer('english')
print(snowball_stemmer.stem('looked'))
print(snowball_stemmer.stem('looking'))
look
look
# LancasterStemmer
from nltk.stem.lancaster import LancasterStemmer

lancaster_stemmer = LancasterStemmer()
print(lancaster_stemmer.stem('looked'))
print(lancaster_stemmer.stem('looking'))
look
look
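The three stemmers implement different rule sets and can disagree on the same input; a minimal sketch comparing them, reusing the stemmer objects defined above:

# compare Porter, Snowball, and Lancaster on the same words
for w in ['university', 'maximum', 'presumably']:
    print(w, '->',
          porter_stemmer.stem(w),
          snowball_stemmer.stem(w),
          lancaster_stemmer.stem(w))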

3.2 Lemmatization

from nltk.stem import WordNetLemmatizer # requires the wordnet corpus

wordnet_lemmatizer = WordNetLemmatizer()
print(wordnet_lemmatizer.lemmatize('cats'))
print(wordnet_lemmatizer.lemmatize('boxes'))
print(wordnet_lemmatizer.lemmatize('are'))
print(wordnet_lemmatizer.lemmatize('went'))
cat
box
are
went
# specifying the part of speech gives more accurate lemmatization
# lemmatize treats words as nouns by default
print(wordnet_lemmatizer.lemmatize('are', pos='v'))
print(wordnet_lemmatizer.lemmatize('went', pos='v'))
be
go

4. Part-of-Speech (POS) Tagging

import nltk

words = nltk.word_tokenize('Python is a widely used programming language.')
print(nltk.pos_tag(words)) # requires the averaged_perceptron_tagger model
[('Python', 'NNP'), ('is', 'VBZ'), ('a', 'DT'), ('widely', 'RB'), ('used', 'VBN'), ('programming', 'NN'), ('language', 'NN'), ('.', '.')]
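POS tags pair naturally with the lemmatizer from section 3.2, which otherwise treats every word as a noun. A minimal sketch; the treebank_to_wordnet helper is our own, not part of NLTK:

from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer

def treebank_to_wordnet(tag):
    # map a Penn Treebank tag prefix to a WordNet POS constant;
    # fall back to noun, matching lemmatize's own default
    if tag.startswith('J'):
        return wordnet.ADJ
    if tag.startswith('V'):
        return wordnet.VERB
    if tag.startswith('R'):
        return wordnet.ADV
    return wordnet.NOUN

lemmatizer = WordNetLemmatizer()
for word, tag in nltk.pos_tag(words):
    print(word, lemmatizer.lemmatize(word, pos=treebank_to_wordnet(tag)))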

5. Stopword Removal

from nltk.corpus import stopwords # requires the stopwords corpus

filtered_words = [word for word in words if word not in stopwords.words('english')]
print('Original words:', words)
print('After stopword removal:', filtered_words)
Original words: ['Python', 'is', 'a', 'widely', 'used', 'programming', 'language', '.']
After stopword removal: ['Python', 'widely', 'used', 'programming', 'language', '.']
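Note that the membership test above is case-sensitive, so capitalized stopwords such as 'You' pass through (visible in the next section's output). A minimal sketch that lowercases before comparing, using a set for faster lookup:

# lowercase each token before the stopword check
stop_set = set(stopwords.words('english'))
filtered = [w for w in words if w.lower() not in stop_set]
print(filtered)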

6. A Typical Text Preprocessing Pipeline

import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

# raw text
raw_text = 'Life is like a box of chocolates. You never know what you\'re gonna get.'

# tokenize
raw_words = nltk.word_tokenize(raw_text)

# lemmatize
wordnet_lemmatizer = WordNetLemmatizer()
words = [wordnet_lemmatizer.lemmatize(raw_word) for raw_word in raw_words]

# remove stopwords
filtered_words = [word for word in words if word not in stopwords.words('english')]

print('Raw text:', raw_text)
print('Preprocessed result:', filtered_words)
Raw text: Life is like a box of chocolates. You never know what you're gonna get.
Preprocessed result: ['Life', 'like', 'box', 'chocolate', '.', 'You', 'never', 'know', "'re", 'gon', 'na', 'get', '.']
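The same steps can be wrapped in a reusable function; a minimal sketch (the preprocess name and the lowercased stopword check are our own choices, not from the original):

import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

def preprocess(text):
    # tokenize, lemmatize, and drop stopwords (case-insensitively)
    lemmatizer = WordNetLemmatizer()
    stop_set = set(stopwords.words('english'))
    tokens = nltk.word_tokenize(text)
    lemmas = [lemmatizer.lemmatize(t) for t in tokens]
    return [w for w in lemmas if w.lower() not in stop_set]

print(preprocess(raw_text))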