tynbl.github.io

主题模型及LDA

import jieba
import gensim
from gensim import corpora, models
# Five short Chinese text samples (they read as film reviews) that form the
# toy corpus; they are gathered into `ch_texts` and tokenized further down.
# Each literal starts with a single space character, which is kept as-is.
ch_text1 = ' 非常失望,剧本完全敷衍了事,主线剧情没突破大家可以理解,可所有的人物都缺乏动机,正邪之间、妇联内部都没什么火花。团结-分裂-团结的三段式虽然老套但其实也可以利用积攒下来的形象魅力搞出意思,但剧本写得非常肤浅、平面。场面上调度混乱呆板,满屏的铁甲审美疲劳。只有笑点算得上差强人意。'
ch_text2 = ' 2015年度最失望作品。以为面面俱到,实则画蛇添足;以为主题深刻,实则老调重弹;以为推陈出新,实则俗不可耐;以为场面很high,实则high劲不足。气!上一集的趣味全无,这集的笑点明显刻意到心虚。全片没有任何片段给我有紧张激动的时候,太弱了,跟奥创一样。'
ch_text3 = ' 《铁人2》中勾引钢铁侠,《妇联1》中勾引鹰眼,《美队2》中勾引美国队长,在《妇联2》中终于……跟绿巨人表白了,黑寡妇用实际行动告诉了我们什么叫忠贞不二;而且为了治疗不孕不育连作战武器都变成了两支验孕棒(坚决相信快银没有死,后面还得回来)'
ch_text4 = ' 虽然从头打到尾,但是真的很无聊啊。'
ch_text5 = ' 剧情不如第一集好玩了,全靠密集笑点在提神。僧多粥少的直接后果就是每部寡姐都要换着队友谈恋爱,这特么比打斗还辛苦啊,真心求放过~~~(结尾彩蛋还以为是洛基呢,结果我呸!)'

# Gather the raw samples in order, then segment each one into a list of
# tokens with jieba (cut_all=False). jieba.cut yields tokens lazily, so each
# result is materialized into a list before being stored.
ch_texts = [
    ch_text1,
    ch_text2,
    ch_text3,
    ch_text4,
    ch_text5,
]
doc_set = []
for raw_text in ch_texts:
    tokens = list(jieba.cut(raw_text, cut_all=False))
    doc_set.append(tokens)
Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\LIA892\AppData\Local\Temp\jieba.cache
Loading model cost 1.062 seconds.
Prefix dict has been built succesfully.
doc_set
[[' ',
  '非常',
  '失望',
  ',',
  '剧本',
  '完全',
  '敷衍了事',
  ',',
  '主线',
  '剧情',
  '没',
  '突破',
  '大家',
  '可以',
  '理解',
  ',',
  '可',
  '所有',
  '的',
  '人物',
  '都',
  '缺乏',
  '动机',
  ',',
  '正邪',
  '之间',
  '、',
  '妇联',
  '内部',
  '都',
  '没什么',
  '火花',
  '。',
  '团结',
  '-',
  '分裂',
  '-',
  '团结',
  '的',
  '三段式',
  '虽然',
  '老套',
  '但',
  '其实',
  '也',
  '可以',
  '利用',
  '积攒',
  '下来',
  '的',
  '形象',
  '魅力',
  '搞',
  '出',
  '意思',
  ',',
  '但',
  '剧本',
  '写得',
  '非常',
  '肤浅',
  '、',
  '平面',
  '。',
  '场面',
  '上',
  '调度',
  '混乱',
  '呆板',
  ',',
  '满屏',
  '的',
  '铁甲',
  '审美疲劳',
  '。',
  '只有',
  '笑',
  '点算',
  '得',
  '上',
  '差强人意',
  '。'],
 [' ',
  '2015',
  '年度',
  '最',
  '失望',
  '作品',
  '。',
  '以为',
  '面面俱到',
  ',',
  '实则',
  '画蛇添足',
  ';',
  '以为',
  '主题深刻',
  ',',
  '实则',
  '老调重弹',
  ';',
  '以为',
  '推陈出新',
  ',',
  '实则',
  '俗不可耐',
  ';',
  '以为',
  '场面',
  '很',
  'high',
  ',',
  '实则',
  'high',
  '劲',
  '不足',
  '。',
  '气',
  '!',
  '上',
  '一集',
  '的',
  '趣味',
  '全无',
  ',',
  '这集',
  '的',
  '笑',
  '点',
  '明显',
  '刻意',
  '到',
  '心虚',
  '。',
  '全片',
  '没有',
  '任何',
  '片段',
  '给',
  '我',
  '有',
  '紧张',
  '激动',
  '的',
  '时候',
  ',',
  '太弱',
  '了',
  ',',
  '跟',
  '奥创',
  '一样',
  '。'],
 [' ',
  '《',
  '铁人',
  '2',
  '》',
  '中',
  '勾引',
  '钢铁',
  '侠',
  ',',
  '《',
  '妇联',
  '1',
  '》',
  '中',
  '勾引',
  '鹰眼',
  ',',
  '《',
  '美队',
  '2',
  '》',
  '中',
  '勾引',
  '美国',
  '队长',
  ',',
  '在',
  '《',
  '妇联',
  '2',
  '》',
  '中',
  '终于',
  '…',
  '…',
  '跟',
  '绿巨人',
  '表白',
  '了',
  ',',
  '黑寡妇',
  '用',
  '实际行动',
  '告诉',
  '了',
  '我们',
  '什么',
  '叫',
  '忠贞不二',
  ';',
  '而且',
  '为了',
  '治疗',
  '不孕',
  '不育',
  '连',
  '作战',
  '武器',
  '都',
  '变成',
  '了',
  '两支',
  '验孕',
  '棒',
  '(',
  '坚决',
  '相信',
  '快银',
  '没有',
  '死',
  ',',
  '后面',
  '还',
  '得',
  '回来',
  ')'],
 [' ', '虽然', '从头', '打到', '尾', ',', '但是', '真的', '很', '无聊', '啊', '。'],
 [' ',
  '剧情',
  '不如',
  '第一集',
  '好玩',
  '了',
  ',',
  '全靠',
  '密集',
  '笑点',
  '在',
  '提神',
  '。',
  '僧多粥少',
  '的',
  '直接',
  '后果',
  '就是',
  '每部',
  '寡姐',
  '都',
  '要',
  '换',
  '着',
  '队友',
  '谈恋爱',
  ',',
  '这特',
  '么',
  '比',
  '打斗',
  '还',
  '辛苦',
  '啊',
  ',',
  '真心',
  '求',
  '放过',
  '~',
  '~',
  '~',
  '(',
  '结尾',
  '彩蛋',
  '还',
  '以为',
  '是',
  '洛基',
  '呢',
  ',',
  '结果',
  '我',
  '呸',
  '!',
  ')']]
# Build the token -> integer-id vocabulary from all tokenized documents, then
# encode every document as a bag of words: a list of (token_id, count) pairs.
dictionary = corpora.Dictionary(doc_set)
corpus = list(map(dictionary.doc2bow, doc_set))
corpus
[[(0, 1),
  (1, 1),
  (2, 1),
  (3, 1),
  (4, 1),
  (5, 1),
  (6, 1),
  (7, 1),
  (8, 1),
  (9, 1),
  (10, 1),
  (11, 1),
  (12, 1),
  (13, 4),
  (14, 2),
  (15, 4),
  (16, 1),
  (17, 1),
  (18, 1),
  (19, 1),
  (20, 1),
  (21, 1),
  (22, 1),
  (23, 1),
  (24, 1),
  (25, 1),
  (26, 1),
  (27, 1),
  (28, 2),
  (29, 1),
  (30, 1),
  (31, 2),
  (32, 1),
  (33, 1),
  (34, 1),
  (35, 2),
  (36, 1),
  (37, 1),
  (38, 1),
  (39, 1),
  (40, 1),
  (41, 1),
  (42, 1),
  (43, 1),
  (44, 1),
  (45, 2),
  (46, 1),
  (47, 2),
  (48, 1),
  (49, 1),
  (50, 1),
  (51, 1),
  (52, 1),
  (53, 1),
  (54, 1),
  (55, 1),
  (56, 1),
  (57, 2),
  (58, 2),
  (59, 6),
  (60, 1),
  (61, 2)],
 [(13, 3),
  (15, 4),
  (17, 1),
  (44, 1),
  (46, 1),
  (47, 1),
  (49, 1),
  (59, 7),
  (62, 1),
  (63, 1),
  (64, 1),
  (65, 4),
  (66, 1),
  (67, 1),
  (68, 1),
  (69, 1),
  (70, 1),
  (71, 1),
  (72, 1),
  (73, 1),
  (74, 1),
  (75, 1),
  (76, 1),
  (77, 1),
  (78, 1),
  (79, 1),
  (80, 1),
  (81, 1),
  (82, 1),
  (83, 1),
  (84, 1),
  (85, 1),
  (86, 1),
  (87, 1),
  (88, 1),
  (89, 1),
  (90, 3),
  (91, 1),
  (92, 1),
  (93, 1),
  (94, 2),
  (95, 1),
  (96, 4),
  (97, 1),
  (98, 1),
  (99, 1),
  (100, 1),
  (101, 1),
  (102, 1),
  (103, 1),
  (104, 1)],
 [(35, 1),
  (44, 1),
  (54, 2),
  (59, 5),
  (60, 1),
  (69, 3),
  (78, 1),
  (90, 1),
  (95, 1),
  (105, 1),
  (106, 1),
  (107, 1),
  (108, 4),
  (109, 4),
  (110, 1),
  (111, 1),
  (112, 2),
  (113, 1),
  (114, 1),
  (115, 1),
  (116, 1),
  (117, 1),
  (118, 1),
  (119, 1),
  (120, 4),
  (121, 1),
  (122, 1),
  (123, 1),
  (124, 1),
  (125, 1),
  (126, 1),
  (127, 1),
  (128, 1),
  (129, 1),
  (130, 1),
  (131, 1),
  (132, 3),
  (133, 1),
  (134, 1),
  (135, 1),
  (136, 1),
  (137, 1),
  (138, 1),
  (139, 1),
  (140, 1),
  (141, 1),
  (142, 1),
  (143, 1),
  (144, 1),
  (145, 1),
  (146, 1),
  (147, 1),
  (148, 1),
  (149, 1),
  (150, 3),
  (151, 1)],
 [(15, 1),
  (39, 1),
  (44, 1),
  (59, 1),
  (74, 1),
  (152, 1),
  (153, 1),
  (154, 1),
  (155, 1),
  (156, 1),
  (157, 1),
  (158, 1)],
 [(12, 1),
  (13, 1),
  (15, 1),
  (35, 1),
  (44, 1),
  (59, 4),
  (65, 1),
  (69, 1),
  (72, 1),
  (102, 1),
  (107, 2),
  (149, 1),
  (156, 1),
  (159, 1),
  (160, 1),
  (161, 1),
  (162, 1),
  (163, 3),
  (164, 1),
  (165, 1),
  (166, 1),
  (167, 1),
  (168, 1),
  (169, 1),
  (170, 1),
  (171, 1),
  (172, 1),
  (173, 1),
  (174, 1),
  (175, 1),
  (176, 1),
  (177, 1),
  (178, 1),
  (179, 1),
  (180, 1),
  (181, 1),
  (182, 1),
  (183, 1),
  (184, 1),
  (185, 1),
  (186, 1),
  (187, 1),
  (188, 1),
  (189, 1),
  (190, 1),
  (191, 1),
  (192, 1),
  (193, 1),
  (194, 1)]]

gensim LDA模型

# Fit a 5-topic LDA model on the bag-of-words corpus; id2word lets gensim
# print topics as tokens instead of integer ids.
# NOTE(review): no punctuation / stop-word filtering happens before this
# point, so the recorded topics are led by tokens such as "," "。" and "的".
# No random_state is passed either, so a re-run is presumably not guaranteed
# to reproduce the exact weights shown -- confirm against the gensim docs.
lda_model = models.LdaModel(
    corpus=corpus,
    id2word=dictionary,
    num_topics=5,
)
lda_model.show_topics()
[(0,
  '0.054*"," + 0.029*"。" + 0.027*"的" + 0.017*"以为" + 0.015*"实则" + 0.013*";" + 0.013*"上" + 0.010*" " + 0.010*"可以" + 0.010*"-"'),
 (1,
  '0.064*"," + 0.026*"。" + 0.023*"的" + 0.017*" " + 0.015*"~" + 0.013*"都" + 0.013*"啊" + 0.012*"了" + 0.012*"虽然" + 0.011*"以为"'),
 (2,
  '0.042*"," + 0.031*"。" + 0.019*"的" + 0.018*"实则" + 0.018*"以为" + 0.016*";" + 0.012*" " + 0.011*"了" + 0.010*"high" + 0.010*"上"'),
 (3,
  '0.036*"," + 0.023*"》" + 0.023*"中" + 0.023*"《" + 0.017*"2" + 0.017*"了" + 0.017*"勾引" + 0.014*"妇联" + 0.012*";" + 0.012*"…"'),
 (4,
  '0.032*"," + 0.019*"》" + 0.017*"中" + 0.017*"勾引" + 0.016*"《" + 0.015*"还" + 0.015*"了" + 0.014*"2" + 0.012*"~" + 0.010*"…"')]