data_analysis.py
# -*- coding: utf-8 -*-
# author: inspurer(月小水长)
# pc_type lenovo
# create_time: 2019/8/3 21:34
# file_name: data_analysis.py
# github https://github.com/inspurer
# QQ email: [email protected]
# WeChat official account: 月小水长 (ID: inspurer)
import pandas as pd
import jieba
# Load the scraped answers and segment the Chinese text with jieba,
# joining tokens with spaces so CountVectorizer can split on whitespace
df = pd.read_csv("answers.csv")

def chinese_word_cut(text):
    return " ".join(jieba.cut(text))

df["answer"] = df.answer.apply(chinese_word_cut)
from sklearn.feature_extraction.text import CountVectorizer

# Vectorization: bag-of-words counts capped at n_features terms; terms in
# more than half of the answers or in fewer than 10 answers are dropped.
# (The built-in 'english' stop word list has little effect on Chinese tokens.)
n_features = 1000
tf_vectorizer = CountVectorizer(strip_accents='unicode',
                                max_features=n_features,
                                stop_words='english',
                                max_df=0.5,
                                min_df=10)
tf = tf_vectorizer.fit_transform(df.answer)
from sklearn.decomposition import LatentDirichletAllocation
n_topics = 5
# LDA fitting; the n_topics parameter was renamed to n_components in
# scikit-learn 0.19 and removed in 0.21
lda = LatentDirichletAllocation(n_components=n_topics, max_iter=50,
                                learning_method='online',
                                learning_offset=50.,
                                random_state=0)
lda.fit(tf)
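# Optional sanity check (a sketch, not in the original script): scikit-learn's
# LDA exposes perplexity(); lower values generally indicate a better fit
print("Perplexity: %.2f" % lda.perplexity(tf))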
def print_top_words(model, feature_names, n_top_words):
    # For each topic, print the n_top_words highest-weighted terms
    for topic_idx, topic in enumerate(model.components_):
        print("Topic %d:" % topic_idx)
        print(" ".join([feature_names[i]
                        for i in topic.argsort()[:-n_top_words - 1:-1]]))
n_top_words = 20
# get_feature_names() was removed in scikit-learn 1.2; use get_feature_names_out()
tf_feature_names = tf_vectorizer.get_feature_names_out()
print_top_words(lda, tf_feature_names, n_top_words)
import pyLDAvis
import pyLDAvis.sklearn
data = pyLDAvis.sklearn.prepare(lda, tf, tf_vectorizer)
pyLDAvis.show(data)  # visualize the topic model in the browser
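# Optional sketch: save the visualization to a standalone HTML file instead
# of launching a local server (the output filename here is an assumption).
# Note that newer pyLDAvis releases (>= 3.4) renamed pyLDAvis.sklearn to
# pyLDAvis.lda_model, so the import above may need updating there.
pyLDAvis.save_html(data, "lda_vis.html")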