Try   HackMD

助理救星-穿梭於SPSS和R之間


title: 語法示範
author: Wei Hsu
date: 2020/2/15

# 安裝套件
packages <- c("sjlabelled", "sjmisc", "sjPlot", "rio", "foreign"
              ) 

for(p in packages){
  if(!require(p,character.only = TRUE)) install.packages(p)
  library(p,character.only = TRUE)
}

# Environment
print(paste("R", getRversion()))
print("-------------")
for (package_name in packages) {
    print(paste(package_name, packageVersion(package_name)))
}
  
# -------------------------------------------------------------------

## 執行環境

setwd("/Users") # 設定檔案路徑

### 載入套件
library(sjlabelled)
library(sjmisc)
library(sjPlot)
library(rio)
library(foreign)
library(dplyr)

# 各種讀檔方式
r <- import("data107.sav")
h <- haven::read_sav("data107.sav")
s <- sjlabelled::read_data("data107.sav")
f <- foreign::read.spss(file = "data107.sav", to.data.frame = T)

# 只保留示範用的變項
df <- s %>%
  select(a1, a2, b2a, c7, c7o, starts_with("d3"), e1, e4, e5, e6, e8, e9)

# -------------------------------------------------------------------

# 描述統計
view_df(df,
        show.na = T, # 顯示無效值個數
        show.frq = T, # 顯示次數
        show.prc = T # 顯示百分比 
        )

# -------------------------------------------------------------------

## 變數命名
df2 <- rename_variables(df,
                        a1 = "days",
                        c7 = "spend",
                        e1 = "nationality", 
                        e4 = "age", 
                        e5 = "inc", 
                        e6 = "edu", 
                        e8 = "sex", 
                        e9 = "religion"
                        )
# 變數標籤
df2$spend <- set_label(df2$spend, label = "總花費")

# 選項標籤
df2$edu <- set_labels(df2$edu, labels = c("高中職以下" = 1, 
                                          "大專" = 2,
                                          "碩博士" = 3, 
                                          "其他" = 4,
                                          "未回答" = 99
                                          )
                      )

# -------------------------------------------------------------------

## 缺失值處理

# 將`99`設為na
df2$age <- set_na(df2$age, na = 99)
## 批次處理
# df2 <- set_na(df2, na = list(inc = 99, edu = c(4, 99), religion = c(7, 99)))
df2$edu <- set_na(df2$edu, na = c(4, 99))
df2$religion <- set_na(df2$religion, na = c(7, 99))
df2$inc <- set_na(df2$inc, na = 99)

# -------------------------------------------------------------------

## 重新編碼

# 反向編碼
reverse <- rec(df2$d3b, rec = "rev")

# 重新處理labels
d3b <- set_labels(df2$d3b, labels = c("沒有搭乘", "非常不滿意", "不滿意", "普通",
                                     "滿意", "非常滿意")
                 )
reverse <- rec(d3b, rec = "rev")
frq(reverse)

# 但有時候還是會遇到編碼問題QQ
# 以下示範兩種變項編碼(recode)的方式:

df2$male <- rec(df2$sex, rec = "1=1; 2=0", 
             val.labels = c('男姓', '女性'), as.num = T)
frq(df2$male)

# 另一種方式則是在編碼的時候同時定義選項的標籤,這個方法只要在數值後面加上`[選項標籤]`即可,這樣看起來雖然比較便利,但在中文的處理上卻很容易出現程式錯誤,
# 對於windows系統中文字的顯示較常會有無法辨識的問題,同樣的語法在Mac上就沒有問題。

# recode
df2$male <- rec(df2$sex,
                rec = c("1=1[男性]; 2=0[女性]"), as.num = T)
frq(df2$male)

df2$male <- rec(df2$sex,
                rec = c("1=1[male]; 2=0[feamle]"), as.num = T)
frq(df2$male)


# 對於windows系統中文字的顯示較常會有無法辨識的問題

多重編碼
```{r}
df2$agegroup <- rec(df2$age, 
                    rec = c("1:2=1; 3:5=2; 6:7=3; else=NA"),
                    val.labels = c("青年", "壯年", "老年"),
                    var.label = "年齡分組", 
                    as.num = F #輸出為`factor`
                    )
frq(df2$agegroup)

# -------------------------------------------------------------------

# 數值變項分組
# 將旅客在台灣的停留天數以每10天進行分組

df2$daygroup <- group_var(df2$days, size = 10, as.num = FALSE)
levels(df2$daygroup) <- group_labels(df2$days, size = 10)
frq(df2$daygroup)

# -------------------------------------------------------------------

# 資料探索和視覺化
## 繪製直方圖
df3 <- df2 %>%
  filter(c7o == 16) %>%
  copy_labels(df2) 

plot_frq(df2$days, type = "hist", axis.title = "天數") +
  theme_sjplot(base_family = 'NotoSansCJKtc-Black')

df3$ln_spend = log(df3$spend + 1)
plot_frq(df3$ln_spend, type = "hist", normal.curve = TRUE, axis.title = "ln(總花費)",
         normal.curve.color = "red",
         normal.curve.size = 2) +
  theme_sjplot(base_family = 'NotoSansCJKtc-Black')

## 交叉表

df2$b2a <- set_labels(df2$b2a, labels = c("觀光", "業務", "國際會議或展覽", 
                                          "探親或訪友", "求學", "醫療", "其他"))
tab_xtab(df2$edu, df2$b2a, show.row.prc = T, show.col.prc = T, encoding = "UTF-8") #製表

## 分組長條圖

plot_grpfrq(df2$b2a, df2$edu, show.values = F) +
  theme_sjplot(base_family='NotoSansCJKtc-Black')  # 橫向比較
plot_grpfrq(df2$b2a, df2$edu, bar.pos = "stack", show.values = F) +
  theme_sjplot(base_family = 'NotoSansCJKtc-Black') # 堆疊

# -------------------------------------------------------------------

# 題組探索

like <- find_var(df2, pattern = "d3", out = "df")
like <- set_na(like, na = 0)

# create labels
levels_5 <- c("非常不滿意", "不滿意", "普通", "滿意", "非常滿意")

# create item labels
items <- names(like)

tab_stackfrq(like, value.labels = levels_5, var.labels = items, show.na = T)


plot_likert(like, 
            values = "sum.outside",
            show.prc.sign = TRUE) +
  theme_sjplot(base_family = 'NotoSansCJKtc-Black')

# -------------------------------------------------------------------

# 模型檢視

df3 <- as_factor(df3, sex, religion, edu, inc, age, b2a) # 設定類別變數
m1 <- lm(ln_spend ~ sex + edu + age, data = df3)
m2 <- lm(ln_spend ~ sex + edu + age + b2a, data = df3)

tab_model(m1, m2, show.reflvl = T) # # 模型表格
plot_model(m2, vline.color = "red", show.values = TRUE, 
           value.offset = .3) +
  theme_sjplot(base_family = 'NotoSansCJKtc-Black') # 模型視覺化