Date: 2023-03-15 22:40:01 | Source: E-commerce
This analysis takes the reviews that users leave on an e-commerce platform and preprocesses the text: word segmentation, part-of-speech tagging, and stop-word removal. Sentiment analysis is then run on the preprocessed data, and an LDA topic model extracts the key information in the reviews, in order to understand users' needs and opinions, their reasons for buying, and the product's strengths and weaknesses, leading finally to suggestions for improving the product.

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
import jieba.posseg as psg
import warnings

warnings.filterwarnings("ignore")
%matplotlib inline

path = '/home/mw/input/data/emotion_analysi7147'
reviews = pd.read_csv(path + '/reviews.csv')
print(reviews.shape)
reviews.head()
# Drop duplicate records (rows whose values are identical across the retained columns)
reviews = reviews[['content', 'content_type']].drop_duplicates()
content = reviews['content']
reviews.shape
reviews
Data cleaning

# Remove English letters, digits, and the brand/product words 京東, 美的, 電熱水器, 熱水器
# (fixed: the trailing '|' in the original pattern matched the empty string and has been removed)
strinfo = re.compile('[0-9a-zA-Z]|京東|美的|電熱水器|熱水器')
content = content.apply(lambda x: strinfo.sub('', x))
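A quick check on a made-up string (the sample text is hypothetical, purely to illustrate what the cleaning step removes):

sample = '美的熱水器ABC123,京東物流很快'  # hypothetical review fragment
print(strinfo.sub('', sample))  # -> ',物流很快'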
Word segmentation, POS tagging, and stop-word removal

# Segment each review into (word, POS flag) pairs
worker = lambda s: [(x.word, x.flag) for x in psg.cut(s)]  # simple custom segmentation function
seg_word = content.apply(worker)
seg_word.head()
# Convert the words into a dataframe: one column for the word, one for the ID of the review
# it belongs to, and (added later) one for the word's position within that review
n_word = seg_word.apply(lambda x: len(x))  # number of words in each review

n_content = [[x+1]*y for x, y in zip(list(seg_word.index), list(n_word))]
index_content = sum(n_content, [])  # flatten the nested list: the review ID of each word

seg_word = sum(seg_word, [])
word = [x[0] for x in seg_word]    # the word
nature = [x[1] for x in seg_word]  # its part of speech

content_type = [[x]*y for x, y in zip(list(reviews['content_type']), list(n_word))]
content_type = sum(content_type, [])  # the review type (pos/neg) of each word

result = pd.DataFrame({"index_content": index_content,
                       "word": word,
                       "nature": nature,
                       "content_type": content_type})
result.head()
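The sum(nested, []) idiom used above simply concatenates a list of lists into one flat list, as a one-line check shows (illustrative only):

sum([[1, 1], [2], [3, 3, 3]], [])  # -> [1, 1, 2, 3, 3, 3]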
# Remove punctuation: the POS tag 'x' marks punctuation
result = result[result['nature'] != 'x']

# Remove stop words
stop_path = open(path + "/stoplist.txt", 'r', encoding='UTF-8')
stop = stop_path.readlines()
stop = [x.replace('\n', '') for x in stop]  # fixed: the original stripped '/n' rather than the newline '\n'
word = list(set(word) - set(stop))
result = result[result['word'].isin(word)]
result.head()
# Build a column recording each word's position within its review
n_word = list(result.groupby(by=['index_content'])['index_content'].count())
index_word = [list(np.arange(0, y)) for y in n_word]  # word positions within each review
index_word = sum(index_word, [])  # flatten, aligned with the review IDs

result['index_word'] = index_word
result.head()
Extracting reviews that contain nouns

# Keep only reviews containing at least one noun, i.e. a POS tag that includes 'n'
ind = result[['n' in x for x in result['nature']]]['index_content'].unique()
result = result[[x in ind for x in result['index_content']]]
result.head()
Plotting the word cloud

import matplotlib.pyplot as plt
from wordcloud import WordCloud

frequencies = result.groupby('word')['word'].count()
frequencies = frequencies.sort_values(ascending=False)
backgroud_Image = plt.imread(path + '/pl.jpg')
# Upload a Chinese font to the Kesci workspace yourself
font_path = '/home/kesci/work/data/fonts/MSYHL.TTC'
wordcloud = WordCloud(font_path=font_path,  # set a Chinese font, otherwise characters render as boxes
                      max_words=100,
                      background_color='white',
                      mask=backgroud_Image)  # word-cloud shape
my_wordcloud = wordcloud.fit_words(frequencies)
plt.imshow(my_wordcloud)
plt.axis('off')
plt.show()
As the word cloud shows, the segmentation results after preprocessing match expectations. Words such as 安裝 (installation), 師傅 (installer), 售后 (after-sales), 物流 (logistics), and 服務(wù) (service) appear with high frequency, so we can tentatively conclude that these are the aspects of the product users care about most.

# Save the results
result.to_csv("./word.csv", index=False, encoding='utf-8')
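The frequency claim can be read directly off the frequencies series computed for the word cloud; a minimal check (an addition, not part of the original notebook):

print(frequencies.head(10))  # the ten most frequent words, expected to include 安裝, 師傅, 售后, 物流, 服務(wù)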
2. Dictionary Matching

word = pd.read_csv("./word.csv")

# Load the positive/negative evaluation and emotion word lists
# (fixed: the original passed sep="/n" instead of the newline separator "\n")
pos_comment = pd.read_csv(path + "/正面評(píng)價(jià)詞語(中文).txt", header=None, sep="\n",
                          encoding='utf-8', engine='python')
neg_comment = pd.read_csv(path + "/負(fù)面評(píng)價(jià)詞語(中文).txt", header=None, sep="\n",
                          encoding='utf-8', engine='python')
pos_emotion = pd.read_csv(path + "/正面情感詞語(中文).txt", header=None, sep="\n",
                          encoding='utf-8', engine='python')
neg_emotion = pd.read_csv(path + "/負(fù)面情感詞語(中文).txt", header=None, sep="\n",
                          encoding='utf-8', engine='python')

# Merge the emotion words with the evaluation words
positive = set(pos_comment.iloc[:, 0]) | set(pos_emotion.iloc[:, 0])
negative = set(neg_comment.iloc[:, 0]) | set(neg_emotion.iloc[:, 0])

# Drop words that appear in both the positive and the negative lists
intersection = positive & negative
positive = list(positive - intersection)
negative = list(negative - intersection)

positive = pd.DataFrame({"word": positive, "weight": [1]*len(positive)})
negative = pd.DataFrame({"word": negative, "weight": [-1]*len(negative)})
posneg = pd.concat([positive, negative])  # DataFrame.append was removed in pandas 2.0

# Merge the segmentation results with the sentiment word table to locate sentiment words
data_posneg = posneg.merge(word, left_on='word', right_on='word', how='right')
data_posneg = data_posneg.sort_values(by=['index_content', 'index_word'])
data_posneg.head()
Correcting sentiment polarity

# Load the negation word list
notdict = pd.read_csv(path + "/not.csv")
not_terms = set(notdict['term'])  # fixed: `x in notdict['term']` checks the Series index, not its values

# New column holding the sentiment value after negation correction
data_posneg = data_posneg.reset_index(drop=True)  # align row labels with the positional ids used below
data_posneg['amend_weight'] = data_posneg['weight']
data_posneg['id'] = np.arange(0, len(data_posneg))

# Keep only the words that carry a sentiment value
only_inclination = data_posneg.dropna().reset_index(drop=True)
index = only_inclination['id']

for i in np.arange(0, len(only_inclination)):
    # Extract the review containing the i-th sentiment word
    review = data_posneg[data_posneg['index_content'] == only_inclination['index_content'][i]]
    review.index = np.arange(0, len(review))
    # Position of the i-th sentiment word within that review
    affective = only_inclination['index_word'][i]
    if affective == 1:
        # One preceding word: a single negation flips the sign
        ne = int(review['word'][affective - 1] in not_terms)
    elif affective > 1:
        # Two preceding words: an odd number of negations flips the sign
        ne = sum([w in not_terms for w in review['word'][[affective - 1, affective - 2]]]) % 2
    else:
        ne = 0
    if ne == 1:
        data_posneg.loc[index[i], 'amend_weight'] = -data_posneg.loc[index[i], 'weight']

# Refresh the sentiment-only data, re-deriving it from data_posneg so the corrections are kept
only_inclination = data_posneg.dropna()

# Sum the corrected sentiment values within each review
emotional_value = only_inclination.groupby(['index_content'], as_index=False)['amend_weight'].sum()

# Drop reviews whose net sentiment value is 0
emotional_value = emotional_value[emotional_value['amend_weight'] != 0]
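To make the flip rule concrete, here is a toy walk-through with made-up words and weights: for a sentiment word at position index_word = 2, the two preceding words are inspected, and an odd number of negation words flips the sign.

toy_not = {'不', '沒'}                 # assume these are in the negation list
toy_review = ['師傅', '不', '滿意']    # '滿意' is the sentiment word, weight +1
affective = 2                          # position of '滿意' in the review
ne = sum(w in toy_not for w in toy_review[affective-2:affective]) % 2
weight = -1 if ne == 1 else 1          # one negation -> the sign flips
print(weight)                          # -> -1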
Checking the sentiment analysis results

# Assign reviews with a sentiment value > 0 the type pos, and < 0 the type neg
emotional_value['a_type'] = ''
emotional_value.loc[emotional_value['amend_weight'] > 0, 'a_type'] = 'pos'
emotional_value.loc[emotional_value['amend_weight'] < 0, 'a_type'] = 'neg'
emotional_value.head()
# Inspect the sentiment analysis results
result = emotional_value.merge(word, left_on='index_content', right_on='index_content', how='left')
result.head()
result = result[['index_content', 'content_type', 'a_type']].drop_duplicates()
result.head()
Assuming that no user selects the positive-review tag while writing negative content, we compare each review's original type with the type produced by the sentiment analysis and build a confusion matrix to measure the accuracy of the dictionary-based approach.

# Cross-tabulation: a special pivot table of group frequencies
confusion_matrix = pd.crosstab(result['content_type'], result['a_type'], margins=True)
confusion_matrix.head()
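Beyond the single accuracy figure computed next, per-class precision and recall can be obtained with scikit-learn's classification_report; a minimal sketch (an addition to the original analysis):

from sklearn.metrics import classification_report
print(classification_report(result['content_type'], result['a_type']))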
(confusion_matrix.iat[0, 0] + confusion_matrix.iat[1, 1]) / confusion_matrix.iat[2, 2]

# Extract the positive and negative reviews
ind_pos = list(emotional_value[emotional_value['a_type'] == 'pos']['index_content'])
ind_neg = list(emotional_value[emotional_value['a_type'] == 'neg']['index_content'])
posdata = word[[i in ind_pos for i in word['index_content']]]
negdata = word[[i in ind_neg for i in word['index_content']]]

# Plot word clouds
import matplotlib.pyplot as plt
from wordcloud import WordCloud

# Positive sentiment word cloud
freq_pos = posdata.groupby('word')['word'].count()
freq_pos = freq_pos.sort_values(ascending=False)
backgroud_Image = plt.imread(path + '/pl.jpg')
wordcloud = WordCloud(font_path=font_path,
                      max_words=100,
                      background_color='white',
                      mask=backgroud_Image)
pos_wordcloud = wordcloud.fit_words(freq_pos)
plt.imshow(pos_wordcloud)
plt.axis('off')
plt.show()

# Negative sentiment word cloud
freq_neg = negdata.groupby(by=['word'])['word'].count()
freq_neg = freq_neg.sort_values(ascending=False)
neg_wordcloud = wordcloud.fit_words(freq_neg)
plt.imshow(neg_wordcloud)
plt.axis('off')
plt.show()
# Write out the results, one review per row
posdata.to_csv("./posdata.csv", index=False, encoding='utf-8')
negdata.to_csv("./negdata.csv", index=False, encoding='utf-8')
The positive word cloud shows that positive sentiment words such as 不錯(cuò) (not bad), 滿意 (satisfied), and 好評(píng) (good review) occur with high frequency, with no negative words mixed in, which indicates that the sentiment analysis extracts positive reviews well.

3. Sentiment Classification with TF-IDF and LinearSVC

reviews.head()
reviews['content_type'] = reviews['content_type'].map(lambda x: 1.0 if x == 'pos' else 0.0)
reviews.head()
from sklearn.feature_extraction.text import TfidfVectorizer as TFIDF  # turn raw text into a TF-IDF feature matrix
from sklearn.svm import LinearSVC
from sklearn.calibration import CalibratedClassifierCV
from sklearn.model_selection import train_test_split

# Split the labelled data into training and validation sets
train_X, valid_X, train_y, valid_y = train_test_split(reviews['content'], reviews['content_type'],
                                                      test_size=0.2, random_state=42)
train_X.shape, train_y.shape, valid_X.shape, valid_y.shape

# Build the model
model_tfidf = TFIDF(min_df=5, max_features=5000, ngram_range=(1, 3), use_idf=1, smooth_idf=1)
# Learn the IDF vector
model_tfidf.fit(train_X)
# Transform the documents into a matrix: one row per document, one column per feature term
train_vec = model_tfidf.transform(train_X)

# Train the model; CalibratedClassifierCV wraps LinearSVC so that predict_proba is available
model_SVC = LinearSVC()
clf = CalibratedClassifierCV(model_SVC)
clf.fit(train_vec, train_y)

# Transform the validation documents
valid_vec = model_tfidf.transform(valid_X)
# Validate
pre_valid = clf.predict_proba(valid_vec)
pre_valid[:5]

pre_valid = clf.predict(valid_vec)
print('Positive:', sum(pre_valid == 1))
print('Negative:', sum(pre_valid == 0))

from sklearn.metrics import accuracy_score
score = accuracy_score(pre_valid, valid_y)
print("Accuracy:", score)
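To see which n-grams drive the classifier, the weights of a plain LinearSVC can be mapped back to the vectorizer's feature names. A minimal sketch, assuming scikit-learn >= 1.0 (older versions use get_feature_names instead of get_feature_names_out):

svc = LinearSVC().fit(train_vec, train_y)              # uncalibrated model, for inspection only
feature_names = np.array(model_tfidf.get_feature_names_out())
order = np.argsort(svc.coef_[0])                       # weights from most negative to most positive
print('Most negative n-grams:', feature_names[order[:10]])
print('Most positive n-grams:', feature_names[order[-10:]])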
4. The LDA Model

import re
import itertools
from gensim import corpora, models

# Load the sentiment-labelled data
posdata = pd.read_csv("./posdata.csv", encoding='utf-8')
negdata = pd.read_csv("./negdata.csv", encoding='utf-8')

# Build the dictionaries
pos_dict = corpora.Dictionary([[i] for i in posdata['word']])  # positive
neg_dict = corpora.Dictionary([[i] for i in negdata['word']])  # negative

# Build the corpora
pos_corpus = [pos_dict.doc2bow(j) for j in [[i] for i in posdata['word']]]  # positive
neg_corpus = [neg_dict.doc2bow(j) for j in [[i] for i in negdata['word']]]  # negative
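Note that [[i] for i in posdata['word']] wraps every word as its own one-word document, so each corpus entry is a single (token_id, count) pair; a quick look confirms the encoding (illustrative only):

print(pos_corpus[:3])  # e.g. [[(0, 1)], [(1, 1)], [(2, 1)]]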
5. Finding the Optimal Number of Topics

# Cosine similarity between two term-frequency vectors
def cos(vector1, vector2):
    dot_product = 0.0
    normA = 0.0
    normB = 0.0
    for a, b in zip(vector1, vector2):
        dot_product += a*b
        normA += a**2
        normB += b**2
    if normA == 0.0 or normB == 0.0:
        return None
    else:
        return dot_product / ((normA*normB)**0.5)

# Search for the optimal topic number via average inter-topic cosine similarity
def lda_k(x_corpus, x_dict):
    # Initialise the list of average cosine similarities
    mean_similarity = []
    mean_similarity.append(1)

    # Train a model for each candidate topic number and measure inter-topic similarity
    for i in np.arange(2, 11):
        lda = models.LdaModel(x_corpus, num_topics=i, id2word=x_dict)  # train the LDA model
        term = lda.show_topics(num_words=50)  # fixed: the redundant wrapping loop has been removed

        # Extract the top words of each topic
        top_word = []
        for k in np.arange(i):
            top_word.append([''.join(re.findall('"(.*)"', t))
                             for t in term[k][1].split('+')])  # list every word

        # Build term-frequency vectors
        word = sum(top_word, [])   # all words
        unique_word = set(word)    # de-duplicated words

        # Topic-word matrix: one row per topic, one column per unique word
        mat = []
        for j in np.arange(i):
            top_w = top_word[j]
            mat.append(tuple([top_w.count(k) for k in unique_word]))

        p = list(itertools.permutations(list(np.arange(i)), 2))
        l = len(p)
        top_similarity = [0]
        for w in np.arange(l):
            vector1 = mat[p[w][0]]
            vector2 = mat[p[w][1]]
            top_similarity.append(cos(vector1, vector2))

        # Average cosine similarity across all topic pairs
        mean_similarity.append(sum(top_similarity)/l)
    return mean_similarity

# Compute the average inter-topic cosine similarity for each candidate topic number
pos_k = lda_k(pos_corpus, pos_dict)
neg_k = lda_k(neg_corpus, neg_dict)

# Plot the similarity curves
from matplotlib.font_manager import FontProperties
font = FontProperties(size=14)
fig = plt.figure(figsize=(10, 8))
ax1 = fig.add_subplot(211)
ax1.plot(pos_k)
ax1.set_xlabel('Topic-number search, positive-review LDA', fontproperties=font)
ax2 = fig.add_subplot(212)
ax2.plot(neg_k)
ax2.set_xlabel('Topic-number search, negative-review LDA', fontproperties=font)
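As a worked check of the cos helper defined above: with vector1 = (1, 0, 1) and vector2 = (1, 1, 0), the dot product is 1 and both squared norms are 2, so the similarity is 1 / sqrt(4) = 0.5.

print(cos((1, 0, 1), (1, 1, 0)))  # -> 0.5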
The plots show that for the positive reviews the average inter-topic cosine similarity reaches its minimum when the number of topics is 2 or 3, so we run LDA on the positive reviews with 3 topics; for the negative reviews the minimum also occurs at 3 topics, so we use 3 topics there as well.

# LDA topic analysis
pos_lda = models.LdaModel(pos_corpus, num_topics=3, id2word=pos_dict)
neg_lda = models.LdaModel(neg_corpus, num_topics=3, id2word=neg_dict)
pos_lda.print_topics(num_words=10)
[(0,
'0.031*"服務(wù)" + 0.025*"好評(píng)" + 0.021*"信賴" + 0.020*"售后" + 0.019*"人員" + 0.016*"太" + 0.016*"送" + 0.015*"品牌" + 0.014*"電話" + 0.013*"質(zhì)量"'),
(1,
'0.029*"很快" + 0.028*"不錯(cuò)" + 0.026*"值得" + 0.023*"客服" + 0.017*"物流" + 0.017*"差" + 0.014*"速度" + 0.012*"態(tài)度" + 0.012*"贊" + 0.011*"收到"'),
(2,
'0.115*"安裝" + 0.050*"滿意" + 0.038*"師傅" + 0.028*"送貨" + 0.017*"東西" + 0.013*"購物" + 0.012*"家里" + 0.011*"裝" + 0.010*"真的" + 0.010*"預(yù)約"')]
neg_lda.print_topics(num_words = 10)
[(0,
'0.022*"東西" + 0.019*"裝" + 0.016*"加熱" + 0.016*"燒水" + 0.015*"漏水" + 0.013*"真的" + 0.011*"產(chǎn)品" + 0.010*"錢" + 0.009*"電話" + 0.009*"價(jià)格"'),
(1,
'0.140*"安裝" + 0.033*"師傅" + 0.032*"太" + 0.019*"收費(fèi)" + 0.019*"打電話" + 0.018*"貴" + 0.017*"慢" + 0.016*"太慢" + 0.012*"材料" + 0.011*"高"'),
(2,
'0.031*"垃圾" + 0.029*"售后" + 0.027*"差" + 0.023*"安裝費(fèi)" + 0.019*"客服" + 0.018*"小時(shí)" + 0.017*"不好" + 0.017*"收" + 0.012*"人員" + 0.012*"坑人"')]
7. Visualising the Model Training Results

import pyLDAvis
import pyLDAvis.gensim  # fixed: the submodule must be imported explicitly; in pyLDAvis >= 3.0 it is named pyLDAvis.gensim_models

# The three arguments could also be read back from disk; they were saved earlier
vis = pyLDAvis.gensim.prepare(pos_lda, pos_corpus, pos_dict)
# Open the visualisation in a browser window
# pyLDAvis.show(vis)
# Render inside the notebook's output cell
pyLDAvis.display(vis)
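The interactive view can also be written to a standalone HTML file with pyLDAvis.save_html (the output filename here is arbitrary):

pyLDAvis.save_html(vis, './pos_lda_vis.html')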
Putting together the topics and their high-frequency terms, the Midea electric water heater's strengths are an affordable price, good value for money, attractive appearance, and good service. The main complaints, by contrast, concern high installation fees and poor after-sales service. Users' reasons for buying can therefore be summarised as: Midea is a major, trustworthy brand, and its electric water heaters are affordable and good value for money.

Keywords: data, analysis, reviews, sentiment, product