import nltk
nltk.download()
NLTK Downloader
---------------------------------------------------------------------------
d) Download l) List u) Update c) Config h) Help q) Quit
---------------------------------------------------------------------------
Downloader> d
Download which package (l=list; x=cancel)?
Identifier> l
Packages:
[ ] dolch............... Dolch Word List
[-] indian.............. Indian Language POS-Tagged Corpus
[ ] mwa_ppdb............ The monolingual word aligner (Sultan et al.
2015) subset of the Paraphrase Database.
[-] panlex_swadesh...... PanLex Swadesh Corpora
[-] perluniprops........ perluniprops: Index of Unicode Version 7.0.0
character properties in Perl
[-] stopwords........... Stopwords Corpus
Collections:
[-] all-corpora......... All the corpora
[-] all-nltk............ All packages available on nltk_data gh-pages
branch
[-] all................. All packages
[-] book................ Everything used in the NLTK Book
[-] popular............. Popular packages
[ ] third-party......... Third-party data packages
([*] marks installed packages; [-] marks out-of-date or corrupt packages)
Download which package (l=list; x=cancel)?
Identifier> x
---------------------------------------------------------------------------
d) Download l) List u) Update c) Config h) Help q) Quit
---------------------------------------------------------------------------
Downloader> q
True
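The interactive menu above ends by returning True. In a script you can skip the menu entirely, since nltk.download() also accepts a package identifier directly; a minimal sketch fetching everything this section uses:
# non-interactive downloads for the packages referenced later
import nltk
for pkg in ['brown', 'punkt', 'wordnet', 'stopwords',
            'averaged_perceptron_tagger']:
    nltk.download(pkg)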
import nltk
from nltk.corpus import brown # requires the brown corpus to be downloaded
# the Brown corpus, compiled at Brown University
# list the categories the corpus contains
print(brown.categories())
['adventure', 'belles_lettres', 'editorial', 'fiction', 'government', 'hobbies', 'humor', 'learned', 'lore', 'mystery', 'news', 'religion', 'reviews', 'romance', 'science_fiction']
# inspect the brown corpus
print('{} sentences in total'.format(len(brown.sents())))
print('{} words in total'.format(len(brown.words())))
57340 sentences in total
1161192 words in total
brown.words()[:10]
['The',
'Fulton',
'County',
'Grand',
'Jury',
'said',
'Friday',
'an',
'investigation',
'of']
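The corpus readers also take a categories argument, so counts can be restricted to one genre; a quick sketch using the news category listed above (brown.sents() accepts the same argument):
# words from the news portion of the corpus only
news_words = brown.words(categories='news')
print('news words:', len(news_words))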
# word frequency statistics
from nltk import FreqDist
dist = FreqDist(brown.words())
print('Number of unique words:', len(dist))
print('First 10 words:', list(dist.keys())[:10]) # key order depends on the Python version
print("Occurrences of 'the':", dist['the'])
Number of unique words: 56057
First 10 words: ['Ebbetts', '6-ounce', 'rabbi', 'dictator', 'detach', 'Navigation', "o'clock", "Motors'", 'nubile', 'graunt']
Occurrences of 'the': 62713
# find words longer than 5 characters that occur more than 500 times
freq_words = [w for w in dist.keys() if len(w) > 5 and dist[w] > 500]
freq_words
['against',
'should',
'between',
'another',
'through',
'himself',
'little',
'before',
'people',
'thought',
'American',
'around',
'because',
'without']
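Since FreqDist subclasses collections.Counter, most_common() is a more direct way to rank words than slicing keys():
# ten most frequent tokens, with their counts
print(dist.most_common(10))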
sentence = "Python is a widely used high-level programming language for general-purpose programming."
tokens = nltk.word_tokenize(sentence) # requires the punkt tokenizer model
print(tokens)
['Python', 'is', 'a', 'widely', 'used', 'high-level', 'programming', 'language', 'for', 'general-purpose', 'programming', '.']
texts = 'Python is a widely used high-level programming language for general-purpose programming, created by Guido van Rossum and first released in 1991. An interpreted language, Python has a design philosophy that emphasizes code readability (notably using whitespace indentation to delimit code blocks rather than curly brackets or keywords), and a syntax that allows programmers to express concepts in fewer lines of code than might be used in languages such as C++ or Java.[23][24] The language provides constructs intended to enable writing clear programs on both a small and large scale.'
sentences = nltk.sent_tokenize(texts)
len(sentences)
3
sentences
['Python is a widely used high-level programming language for general-purpose programming, created by Guido van Rossum and first released in 1991.',
'An interpreted language, Python has a design philosophy that emphasizes code readability (notably using whitespace indentation to delimit code blocks rather than curly brackets or keywords), and a syntax that allows programmers to express concepts in fewer lines of code than might be used in languages such as C++ or Java.',
'[23][24] The language provides constructs intended to enable writing clear programs on both a small and large scale.']
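Note that the citation markers '[23][24]' from the source text end up glued to the front of the third sentence. A sketch that strips such markers with a regular expression before splitting:
# remove Wikipedia-style citation markers like [23], then split again
import re
clean_text = re.sub(r'\[\d+\]', '', texts)
print(nltk.sent_tokenize(clean_text)[2])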
# install with: pip install jieba
import jieba
seg_list = jieba.cut("欢迎来到小象学院", cut_all=True)
print("Full mode: " + "/ ".join(seg_list)) # full mode (cut_all=True)
seg_list = jieba.cut("欢迎来到小象学院", cut_all=False)
print("Accurate mode: " + "/ ".join(seg_list)) # accurate mode (cut_all=False)
Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\LIA892\AppData\Local\Temp\jieba.cache
Loading model cost 0.809 seconds.
Prefix dict has been built successfully.
Full mode: 欢迎/ 迎来/ 来到/ 小象/ 学院
Accurate mode: 欢迎/ 来到/ 小/ 象/ 学院
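jieba also provides a search-engine mode and list-returning variants (jieba.cut_for_search and jieba.lcut are part of its public API); a short sketch:
# search-engine mode splits long words further, useful for indexing;
# lcut returns a plain list instead of a generator
print("Search mode: " + "/ ".join(jieba.cut_for_search("欢迎来到小象学院")))
print(jieba.lcut("欢迎来到小象学院"))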
# PorterStemmer
from nltk.stem.porter import PorterStemmer
porter_stemmer = PorterStemmer()
print(porter_stemmer.stem('looked'))
print(porter_stemmer.stem('went'))
look
went
input1 = 'List listed lists listing listings'
words1 = input1.lower().split(' ')
[porter_stemmer.stem(w) for w in words1]
['list', 'list', 'list', 'list', 'list']
# SnowballStemmer
from nltk.stem import SnowballStemmer
snowball_stemmer = SnowballStemmer('english')
print(snowball_stemmer.stem('looked'))
print(snowball_stemmer.stem('looking'))
look
look
# LancasterStemmer
from nltk.stem.lancaster import LancasterStemmer
lancaster_stemmer = LancasterStemmer()
print(lancaster_stemmer.stem('looked'))
print(lancaster_stemmer.stem('looking'))
look
look
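The three stemmers can disagree on the same input, with Lancaster generally the most aggressive; a quick side-by-side comparison:
# compare Porter, Snowball, and Lancaster on the same words
for w in ['maximum', 'presumably', 'crying']:
    print(w, '->', porter_stemmer.stem(w),
          snowball_stemmer.stem(w), lancaster_stemmer.stem(w))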
from nltk.stem import WordNetLemmatizer # requires the wordnet corpus
wordnet_lemmatizer = WordNetLemmatizer()
print(wordnet_lemmatizer.lemmatize('cats'))
print(wordnet_lemmatizer.lemmatize('boxes'))
print(wordnet_lemmatizer.lemmatize('are'))
print(wordnet_lemmatizer.lemmatize('went'))
cat
box
are
went
# specifying the part of speech gives a more accurate lemma
# lemmatize() defaults to pos='n' (noun)
print(wordnet_lemmatizer.lemmatize('are', pos='v'))
print(wordnet_lemmatizer.lemmatize('went', pos='v'))
be
go
import nltk
words = nltk.word_tokenize('Python is a widely used programming language.')
#print(words)
print(nltk.pos_tag(words)) # requires the averaged_perceptron_tagger model
[('Python', 'NNP'), ('is', 'VBZ'), ('a', 'DT'), ('widely', 'RB'), ('used', 'VBN'), ('programming', 'NN'), ('language', 'NN'), ('.', '.')]
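pos_tag returns Penn Treebank tags, while lemmatize() expects WordNet pos codes, so combining the two needs a small mapping. A minimal sketch, reusing wordnet_lemmatizer from above; to_wordnet_pos is a hypothetical helper covering the common tag prefixes:
from nltk.corpus import wordnet
def to_wordnet_pos(treebank_tag):
    # hypothetical helper: map Penn Treebank tag prefixes to WordNet codes
    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    if treebank_tag.startswith('V'):
        return wordnet.VERB
    if treebank_tag.startswith('R'):
        return wordnet.ADV
    return wordnet.NOUN  # noun is also lemmatize()'s default
print([wordnet_lemmatizer.lemmatize(w, pos=to_wordnet_pos(t))
       for w, t in nltk.pos_tag(words)])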
from nltk.corpus import stopwords # requires the stopwords corpus
filtered_words = [word for word in words if word not in stopwords.words('english')]
print('Original words:', words)
print('After stopword removal:', filtered_words)
Original words: ['Python', 'is', 'a', 'widely', 'used', 'programming', 'language', '.']
After stopword removal: ['Python', 'widely', 'used', 'programming', 'language', '.']
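The check above is case-sensitive and leaves punctuation in place (note the '.' that survives). A sketch that handles both:
import string
# lower-case before the stopword lookup and drop pure punctuation tokens
stop = set(stopwords.words('english'))
print([w for w in words
       if w.lower() not in stop and w not in string.punctuation])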
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
# raw text
raw_text = 'Life is like a box of chocolates. You never know what you\'re gonna get.'
# tokenization
raw_words = nltk.word_tokenize(raw_text)
# lemmatization
wordnet_lemmatizer = WordNetLemmatizer()
words = [wordnet_lemmatizer.lemmatize(raw_word) for raw_word in raw_words]
# remove stopwords
filtered_words = [word for word in words if word not in stopwords.words('english')]
print('Raw text:', raw_text)
print('Preprocessing result:', filtered_words)
Raw text: Life is like a box of chocolates. You never know what you're gonna get.
Preprocessing result: ['Life', 'like', 'box', 'chocolate', '.', 'You', 'never', 'know', "'re", 'gon', 'na', 'get', '.']
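The whole pipeline can be wrapped into one reusable function; a minimal sketch combining the steps above:
def preprocess(text):
    # tokenize, lemmatize, then drop lower-cased stopwords
    lemmatizer = WordNetLemmatizer()
    stop = set(stopwords.words('english'))
    lemmas = [lemmatizer.lemmatize(t) for t in nltk.word_tokenize(text)]
    return [w for w in lemmas if w.lower() not in stop]
print(preprocess(raw_text))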