Abstract: When working with data, we often use principal component analysis (PCA) to extract the important information, but the results of the analysis and the reasons behind them can be hard to interpret. This article walks through visualization methods for PCA that can effectively help us understand and assign meaning to the analysis.


This note demonstrates that, once combined with interactive data visualization, traditional principal component analysis can also work very well on big data.

library(magrittr)
library(FactoMineR)
library(factoextra)
library(dplyr)
library(highcharter)
library(RColorBrewer)
Set3 <- brewer.pal(12, "Set3")   # define the Set3 palette  
load('data/yelp1.rdata')         # loading yelp data
load('data/empath.rdata')        # loading Empath class scores


 

1. Data Preparation

Using the code from last week, we first tidy up the data.

# number of biz per category
CA = biz$cat %>% strsplit('|',T) %>% unlist %>% table %>% 
  data.frame %>% 'names<-'(c('name','nbiz'))
# number of review per category
CA$nrev = CA$name %>% sapply(function(z){sum(
  review$bid %in% biz$bid[grep(z,biz$cat,fixed=T)] )})
# average number of reviews per business
CA$avg.rev = CA$nrev / CA$nbiz    
CA = CA[order(-CA$nrev),]  # order CA by no. review
rownames(CA)= CA$name 
CA$name = NULL
#View(CA)
# category-business matrix
mxBC = rownames(CA) %>% sapply(function(z)
  grepl(z,biz$cat,fixed=T)); dim(mxBC) 
[1] 11537   508
rownames(mxBC) = biz$bid
# class weights: the total weight of a class in the corpus
# order the score matrix by class weights
scores = scores[,order(-colSums(scores))]
wClass = colSums(scores)  # class weights

 

We then average the review sentiment scores (10 items) and the review content scores (194 items) over the 508 business categories, and store the results, respectively, in two matrices:

  • sx [508 x 10] : 10 Average Sentiment Scores per business category
  • wx [508 x 194] : 194 Average Class Weights per business category

# Avg. Sentiment Scores by Category  [508 x 10]
sx = apply(mxBC,2,function(v){
  i = review$bid %in% rownames(mxBC)[v]
  colMeans(senti[i,]) }) %>% t
dim(sx)
[1] 508  10
# Avg. Class Weights by Category    [508 x 194]
wx = apply(mxBC,2,function(v){
  i = review$bid %in% rownames(mxBC)[v]
  colMeans(scores[i,]) }) %>% t
dim(wx)
[1] 508 194


 

2. Principal Component Analysis

We first run principal component analysis on the sentiment matrix (sx).

ncp=10  # number of components to keep
pcx = PCA(sx,ncp=ncp,graph=F) 
barplot(pcx$eig[1:ncp,3],names=1:ncp,main="Accumulated Variance",
        xlab="No. Components", ylab="% of Variance")
abline(h=seq(0,100,10),col='lightgray')

According to the chart above, the first two principal components already account for nearly 80% of the variance.
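
As a quick numeric check of that figure, here is a minimal sketch that only uses the pcx object computed above; in FactoMineR's result, pcx$eig is the eigenvalue table and its third column holds the cumulative percentage of variance:

# print the first rows of the eigenvalue table; column 3 is the
# cumulative percentage of variance explained
round(pcx$eig[1:3, ], 2)
# the axis labels of the charts further below show dim1 ~ 61.6% and
# dim2 ~ 18.0%, i.e. roughly 80% cumulative for the first two components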
But when we try to label the business categories on the plane of the first two principal components ...

fviz_pca_biplot(pcx)

Because there are so many categories (508 in total), most of them are almost impossible to make out.
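
Before turning to interactive charts, one possible static workaround is to thin the biplot. This is only a sketch; it relies on factoextra's select.ind argument, which restricts the plot to the individuals that contribute most to the chosen components:

# draw only the 30 business categories that contribute most to the
# first two components, and repel the labels to reduce overlap
fviz_pca_biplot(pcx, select.ind = list(contrib = 30), repel = TRUE)

Even so, a static plot can only show a small slice of the 508 categories at a time, which is why the interactive approach below is more practical here.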
 

3. Data Visualization

Over the last couple of years, almost every R plotting package has gained the ability to produce interactive web output. Below we first write a helper function that makes it easier to examine the PCA results.

# a helper function that generates Interactive PCA charts
bipcx = function(pcx, d1, d2, nvar, nobs, t1="", t2="",
                 main="Principle Component Anaylysis", 
                 obs='obs.', col.o='gold', ratio=0.7) {
  # variable (loading) coordinates on the two chosen dimensions
  dfvar = pcx$var$coord %>% 
    {data.frame(name=rownames(.),x=.[,d1],y=.[,d2] )}
  # keep the nvar variables best represented (largest cos2) on this plane
  dfvar = head(dfvar[order(-rowSums(pcx$var$cos2[,c(d1,d2)])),], nvar)
  # observation (individual) coordinates on the same dimensions
  dfobs = pcx$ind$coord %>% 
    {data.frame(name=rownames(.),x=.[,d1],y=.[,d2])}
  # keep the nobs observations best represented on this plane
  dfobs = head(dfobs[order(-rowSums(pcx$ind$cos2[,c(d1,d2)])),], nobs)
  # rescale the variable arrows so they fit among the observations
  dfvar[-1] = ratio*dfvar[-1]*max(abs(dfobs[,-1]))/max(abs(dfvar[-1])) 
  # convert each variable into a line segment starting from the origin
  lsvar = dfvar %>% group_by_("name") %>%
    do(data = list(c(0, 0), c(.$x, .$y))) %>% list_parse()
  highchart() %>%
    hc_colors(substr(Set3, 0, 7)) %>% 
    hc_plotOptions( 
      line = list(
        marker=list(enabled=F),
        tooltip=list(pointFormat="{series.name}")),
      scatter = list(marker=list(radius=4, symbol="circle"))
      ) %>%
    hc_tooltip(headerFormat = "",valueDecimals=1,borderWidth=2) %>%
    hc_add_series_list(lsvar) %>%
    hc_add_series(data = list_parse(dfobs), 
      name = obs, type = "scatter", color = hex_to_rgba(col.o, 0.65),
      tooltip = list(headerFormat="",pointFormat="{point.name}")) %>%
    hc_chart(zoomType = "xy") %>%
    hc_add_theme(hc_theme_flatdark()) %>% 
    hc_title(text=main) %>% 
    hc_xAxis(title=list(
      text=sprintf("dim%d(%.2f%%) %s",d1,pcx$eig[d1,2],t1),
      style=list(color="white")))%>% 
    hc_yAxis(title=list(
      text=sprintf("dim%d(%.2f%%) %s",d2,pcx$eig[d2,2],t2),
      style=list(color="white"))) %>% 
    hc_legend(align="right", verticalAlign="top",layout="vertical")
  }


 

4. PCA on the Sentiment Matrix

Using the bipcx() function defined above, we can now clearly see how the business categories are distributed (since the later categories have few reviews, we only plot the top 300) on the plane of the first and second principal components ...

bipcx(pcx,1,2,10,300,t1="Strength",t2="Valence",obs='Biz Category',
      main="PCA on Sentiment Scores",ratio=0.5)
[Interactive chart: PCA on Sentiment Scores; x: dim1 (61.56%) Strength, y: dim2 (18.02%) Valence; sentiment variables shown: anger, anticipation, disgust, fear, joy, negative, positive, sadness, surprise, trust]


... and on the plane of the second and third principal components.

bipcx(pcx,3,2,10,300,t1="Arousal",t2="Valence",obs='Biz Category',
      main="PCA on Sentiment Scores")
[Interactive chart: PCA on Sentiment Scores; x: dim3 (6.20%) Arousal, y: dim2 (18.02%) Valence; sentiment variables shown: anger, anticipation, disgust, fear, joy, negative, positive, sadness, surprise, trust]


From the charts above we can see that the first, second, and third principal components correspond, respectively, to three dimensions of emotion (a quick loading check is sketched right after this list):

  • Strength
  • Valence
  • Arousal
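
A simple way to sanity-check this interpretation, using nothing beyond the pcx object already computed, is to look at which sentiment variables load most heavily on each of the first three components (pcx$var$coord holds the variable coordinates returned by FactoMineR):

# for each of the first three components, list the sentiment variables
# ordered by the size of their loading on that component
for (d in 1:3) {
  cat(sprintf("Dim %d:\n", d))
  print(round(sort(pcx$var$coord[, d], decreasing = TRUE), 2))
}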


 

5. PCA on the Content Matrix

The content matrix has many more dimensions (194) than the sentiment matrix (10). Even when we take only the top 250 business categories and the top 100 content classes ...

ncp=30
# only take large categories and large classes
pcx = PCA(wx[1:250,1:100],ncp=ncp,graph=F) 
par(cex=0.8)
barplot(pcx$eig[1:ncp,3],names=1:ncp,main="Accumulated Variance",
        xlab="No. Components", ylab="% of Variance")
abline(h=seq(0,100,10),col='lightgray')  # 12 PC's cover ~75% of variance


... after running the PCA, the first 12 principal components still cover only about 75% of the variance. With this many data points and dimensions, interactive charts make it much easier to observe the relationships between the original dimensions and the data points. Below we take the first 12 principal components in pairs and, for each plane, plot the 12 content classes and 100 business categories with the largest variation on that plane. In these planes we can spot relationships that are hard to see with simple descriptive statistics.

bipcx(pcx,1,2,12,100,obs='Biz Category',
      main="PCA on LIWC Classes, Dim. 1 & 2",ratio=0.5)
[Interactive chart: PCA on LIWC Classes, Dim. 1 & 2; x: dim1 (17.80%), y: dim2 (10.52%); classes shown: business, celebration, cooking, eating, fun, giving, leisure, party, restaurant, smell, toy, youth]
bipcx(pcx,3,4,12,100,obs='Biz Category',main="PCA on LIWC Classes, Dim. 3 & 4")
[Interactive chart: PCA on LIWC Classes, Dim. 3 & 4; x: dim3 (9.38%), y: dim4 (7.22%); classes shown: affection, healing, love, meeting, negative_emotion, office, pain, positive_emotion, shame, shopping, suffering, wealthy]
bipcx(pcx,5,6,12,100,obs='Biz Category',main="PCA on LIWC Classes, Dim. 5 & 6")
[Interactive chart: PCA on LIWC Classes, Dim. 5 & 6; x: dim5 (5.11%), y: dim6 (4.47%); classes shown: art, beauty, cleaning, emotional, hate, hearing, listen, sleep, swearing_terms, tourism, traveling, vacation]
bipcx(pcx,7,8,12,100,obs='Biz Category',main="PCA on LIWC Classes, Dim. 7 & 8")
[Interactive chart: PCA on LIWC Classes, Dim. 7 & 8; x: dim7 (3.85%), y: dim8 (3.37%); classes shown: achievement, appearance, blue_collar_job, car, celebration, children, cleaning, dance, domestic_work, hygiene, occupation, wedding]
bipcx(pcx,9,10,12,100,obs='Biz Category',main="PCA on LIWC Classes, Dim. 9 & 10")
[Interactive chart: PCA on LIWC Classes, Dim. 9 & 10; x: dim9 (3.00%), y: dim10 (2.57%); classes shown: animal, driving, furniture, hate, home, ocean, sleep, strength, swimming, traveling, vacation, water]
bipcx(pcx,11,12,12,100,obs='Biz Category',main="PCA on LIWC Classes, Dim. 11 & 12")
[Interactive chart: PCA on LIWC Classes, Dim. 11 & 12; x: dim11 (2.30%), y: dim12 (2.15%); classes shown: cleaning, computer, domestic_work, healing, home, hygiene, politeness, reading, social_media, sports, trust, writing]


If we recombine these principal components, we may be able to uncover even more interesting patterns hidden in the data.
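
As one possible way to act on this idea, here is a minimal sketch (assuming the pcx object from the content-matrix PCA above is still in memory) that clusters the business categories on their first 12 component scores; categories that land in the same cluster have reviews with similar content profiles:

# k-means on the scores of the first 12 principal components;
# the choice of 6 clusters is arbitrary and only for illustration
set.seed(42)                          # make the clustering reproducible
pcScores = pcx$ind$coord[, 1:12]      # 250 categories x 12 component scores
km = kmeans(pcScores, centers = 6, nstart = 25)
# show a few of the categories assigned to each cluster
split(rownames(pcScores), km$cluster) %>% lapply(head, 8)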

 

Reposted from: RPubs. Author: Tony Chuo

