主成份分析的資料視覺化

摘要： 我們在處理資料時，為了萃取資料的重要資訊常常會使用主成份分析，不過有時候卻難以解釋主成分分析的結果與成因。此篇教導了主成份分析的視覺化方法，可以有效地幫助我們了解並給予主成份分析背後的意義

``````library(magrittr)
library(FactoMineR)
library(factoextra)
library(dplyr)
library(highcharter)
library(RColorBrewer)            #
Set3 <- brewer.pal(12, "Set3")   # define the Set3 palette

1. 資料整理

``````# number of biz per category
CA = biz\$cat %>% strsplit('|',T) %>% unlist %>% table %>%
data.frame %>% 'names<-'(c('name','nbiz'))
# number of review per category
CA\$nrev = CA\$name %>% sapply(function(z){sum(
review\$bid %in% biz\$bid[grep(z,biz\$cat,fixed=T)] )})
# average number of reviews per business
CA\$avg.rev = CA\$nrev / CA\$nbiz
CA = CA[order(-CA\$nrev),]  # order CA by no. review
rownames(CA)= CA\$name
CA\$name = NULL
#View(CA)
mxBC = rownames(CA) %>% sapply(function(z)
grepl(z,biz\$cat,fixed=T)); dim(mxBC) ``````
``[1] 11537   508``
``````rownames(mxBC) = biz\$bid
# class weights: the total weight of a class in the corpus
# order the score matrix by class weights
scores = scores[,order(-colSums(scores))]
wClass = colSums(scores)  # class weights``````

• `sx [508 x 10]` : 10 Average Sentiment Scores per business category
• `wx [508 x 194]` : 194 Average Class Weights per business category

``````# Avg. Sentiment Scores by Category [508 x 194]
sx = apply(mxBC,2,function(v){
i = review\$bid %in% rownames(mxBC)[v]
colMeans(senti[i,]) }) %>% t
dim(sx)``````
``[1] 508  10``
``````# Avg. Class Weights by Category    [508 x 194]
wx = apply(mxBC,2,function(v){
i = review\$bid %in% rownames(mxBC)[v]
colMeans(scores[i,]) }) %>% t
dim(wx)``````
``[1] 508 194``

2. 主成份分析

``````ncp=10  # number of components to keep
pcx = PCA(sx,ncp=ncp,graph=F)
barplot(pcx\$eig[1:ncp,3],names=1:ncp,main="Accumulated Variance",
xlab="No. Components", ylab="% of Variance")
abline(h=seq(0,100,10),col='lightgray')``````

``fviz_pca_biplot(pcx)``

3. 資料視覺化

``````# a helper function that generates Interactive PCA charts
bipcx = function(pcx, d1, d2, nvar, nobs, t1="", t2="",
main="Principle Component Anaylysis",
obs='obs.', col.o='gold', ratio=0.7) {
dfvar = pcx\$var\$coord %>%
{data.frame(name=rownames(.),x=.[,d1],y=.[,d2] )}
dfobs = pcx\$ind\$coord %>%
{data.frame(name=rownames(.),x=.[,d1],y=.[,d2])}
dfvar[-1] = ratio*dfvar[-1]*max(abs(dfobs[,-1]))/max(abs(dfvar[-1]))
lsvar = dfvar %>% group_by_("name") %>%
do(data = list(c(0, 0), c(.\$x, .\$y))) %>% list_parse()
highchart() %>%
hc_colors(substr(Set3, 0, 7)) %>%
hc_plotOptions(
line = list(
marker=list(enabled=F),
tooltip=list(pointFormat="{series.name}")),
) %>%
name = obs, type = "scatter", color = hex_to_rgba(col.o, 0.65),
hc_chart(zoomType = "xy") %>%
hc_title(text=main) %>%
hc_xAxis(title=list(
text=sprintf("dim%d(%.2f%%) %s",d1,pcx\$eig[d1,2],t1),
style=list(color="white")))%>%
hc_yAxis(title=list(
text=sprintf("dim%d(%.2f%%) %s",d2,pcx\$eig[d2,2],t2),
style=list(color="white"))) %>%
hc_legend(align="right", verticalAlign="top",layout="vertical")
}``````

4. 情緒矩陣 的 主成份分析

``````bipcx(pcx,1,2,10,300,t1="Strength",t2="Valence",obs='Biz Category',
main="PCA on Sentiment Scores",ratio=0.5)``````

``````bipcx(pcx,3,2,10,300,t1="Arousal",t2="Valence",obs='Biz Category',
main="PCA on Sentiment Scores")``````

• 強度 (Strength)
• 正負值 (Valence)
• 激發程度 (Arousal)

5. 內容矩陣 的 主成份分析

``````ncp=30
# only take large categories and large classes
pcx = PCA(wx[1:250,1:100],ncp=ncp,graph=F)
par(cex=0.8)
barplot(pcx\$eig[1:ncp,3],names=1:ncp,main="Accumulated Variance",
xlab="No. Components", ylab="% of Variance")
abline(h=seq(0,100,10),col='lightgray')  # 12 PC's cover ~75% of variance``````

``````bipcx(pcx,1,2,12,100,obs='Biz Category',
main="PCA on LIWC Classes, Dim. 1 & 2",ratio=0.5)``````
``bipcx(pcx,3,4,12,100,obs='Biz Category',main="PCA on LIWC Classes, Dim. 3 & 4")``
``bipcx(pcx,5,6,12,100,obs='Biz Category',main="PCA on LIWC Classes, Dim. 5 & 6")``
``bipcx(pcx,7,8,12,100,obs='Biz Category',main="PCA on LIWC Classes, Dim. 7 & 8")``
``bipcx(pcx,9,10,12,100,obs='Biz Category',main="PCA on LIWC Classes, Dim. 9 & 10")``
``bipcx(pcx,11,12,12,100,obs='Biz Category',main="PCA on LIWC Classes, Dim. 11 & 12")``

0

• 找不到回應