tags: R, Data Processing, Data Preprocessing, Regex, Regular Expressions, Text Analysis, Text Mining

Text Mining (Preprocessing)

Reference: ebook (Text Mining with R)

Dataset: Project Gutenberg

Regex in R

| Match | Symbol | Example |
| --- | --- | --- |
| Starts with | `^` | `^第[0-9]章` |
| Ends with | `$` | `區$` |
| Any one character from a set | `[]` | `[鄉區]` |
| Numbers 1-19 | `[1-9]\|1[0-9]` | matches 1 through 19 |
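
A quick sanity check of these patterns in base R (hypothetical strings, not from the dataset):

x <- c("第1章", "信義區", "南投鄉")
grepl("^第[0-9]章", x)                        # TRUE FALSE FALSE
grepl("區$", x)                               # FALSE TRUE FALSE
grepl("^([1-9]|1[0-9])$", c("5", "12", "20")) # TRUE TRUE FALSE; anchored so only whole numbers 1-19 match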

Find more examples

String Manipulation (package: stringr)

str_detect(string, pattern)

  • Tests whether each string matches the pattern
  • Returns a logical vector (TRUE/FALSE)
library(dplyr)
library(stringr)
library(janeaustenr) # austen_books

original_books <- austen_books() %>%
  group_by(book) %>%
  mutate(linenumber = row_number(),
         chapter = cumsum(str_detect(text, regex("^chapter [\\divxlc]", ignore_case = TRUE)))) %>%
  ungroup()
## # A tibble: 73,422 x 4
##    text                    book                linenumber chapter
##    <chr>                   <fct>                    <int>   <int>
##  1 "SENSE AND SENSIBILITY" Sense & Sensibility          1       0
##  2 ""                      Sense & Sensibility          2       0
##  3 "by Jane Austen"        Sense & Sensibility          3       0
##  4 ""                      Sense & Sensibility          4       0
##  5 "(1811)"                Sense & Sensibility          5       0
##  6 ""                      Sense & Sensibility          6       0
##  7 ""                      Sense & Sensibility          7       0
##  8 ""                      Sense & Sensibility          8       0
##  9 ""                      Sense & Sensibility          9       0
## 10 "CHAPTER 1"             Sense & Sensibility         10       1
## # … with 73,412 more rows
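
On a toy vector, the logical return is easier to see:

str_detect(c("CHAPTER 1", "It was a dark night"), regex("^chapter", ignore_case = TRUE))
## [1]  TRUE FALSE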

str_count(string, pattern)

  • Counts how many times the pattern occurs in each string
str_count('aaa444sssddd', "a") # 3

str_split(string, pattern, n)

  • Splits a string
val <- "abc,123,234,iuuu"
s1 <- str_split(val, ",") # split at every comma
s2 <- str_split(val, ",", 2) # split into at most two pieces
## "abc"  "123"  "234"  "iuuu"
## "abc"  "123,234,iuuu"
strsplit(df$sentence, "[。!;?!?;]") # split into sentences on full- or half-width exclamation marks, question marks, and semicolons, plus the full-width period
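
For instance, on a toy sentence (df$sentence is assumed to hold raw text):

strsplit("今天天氣很好。你要出門嗎?我不想!", "[。!;?!?;]")
## [[1]]
## [1] "今天天氣很好" "你要出門嗎"   "我不想"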

str_extract(string, pattern)

  • Extracts the matched text
val <- c("abca4", 123, "cba2")
str_extract(val, "\\d") # first matching digit in each string
str_extract(val, "[a-z]+") # first run of lowercase letters (NA if none)
str_extract_all(val, "\\d") # all matching digits, as a list

[1] "4" "1" "2"
[1] "abca" NA "cba"

[[1]]
[1] "4"

[[2]]
[1] "1" "2" "3"

[[3]]
[1] "2"

substr(x, start, stop)

  • Extracts a substring by character position
substr(df$區域別, 1, 4) # characters 1 through 4 of each value

grep(pattern, x)

  • Finds which elements contain the pattern
  • Returns their indices
grep("鄉$", df$區域別) # elements whose last character is 鄉
##   [1] 163 164 165 166 167 168 169 170 175 176 177 178 179 180 181 182 183
##  [18] 191 192 193 194 195 196 197 198 199 200 201 210 211 212 213 214 215

gsub(pattern, replacement, x)

  • Replaces matched text
gsub("區$", "鄉", df$區域別) # find values ending in 區 and replace that 區 with 鄉

scan

  • Reads a file into a vector; here, a newline-separated lexicon file is read into a character vector
scan(file = "./dict/lexicon.txt", what = character(), 
     sep = '\n', encoding = 'UTF-8', fileEncoding = 'UTF-8')

Untidy Data to Tidy Data (package: tidyr)

1. gather(data, key, value, ...)

Collapses the selected columns into key-value pairs: the column names become values of the key column, and the cell values fill the value column.

Example 1

library(tidyr)
library(dplyr)

preg2 <- preg %>% 
  gather(treatment, n, treatmenta:treatmentb) %>%          # collapse treatmenta and treatmentb into treatment, values into n
  mutate(treatment = gsub("treatment", "", treatment)) %>% # strip the "treatment" prefix, keeping only a and b
  arrange(name, treatment)
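
preg is not defined in this note; a minimal version of the table from Wickham's Tidy Data paper would be:

preg <- tibble::tribble(
  ~name,          ~treatmenta, ~treatmentb,
  "John Smith",   NA,          2,
  "Jane Doe",     16,          11,
  "Mary Johnson", 3,           1
)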


Example 2

tb <- as_tibble(read.csv("tb.csv", stringsAsFactors = FALSE))
tb2 <- tb %>% 
  gather(demo, n, -iso2, -year, na.rm = TRUE) # keep iso2 and year fixed; collapse the remaining columns into demo, with their values in n
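
tb.csv is not included here; judging from the separate() step later (sex is the first character of demo), its demographic columns look something like m04, m514, f04, … A hypothetical slice:

tb <- tibble::tribble(
  ~iso2, ~year, ~m04, ~m514, ~f04,
  "AD",  2000,  0,    0,     NA,
  "AE",  2000,  2,    4,     3
)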


2. spread(key, value)

The inverse of gather(): each unique value of key becomes its own column, filled with the corresponding value.
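
A minimal sketch with toy data:

library(tidyr)
df <- tibble::tibble(name = c("a", "a", "b", "b"),
                     key  = c("x", "y", "x", "y"),
                     val  = 1:4)
spread(df, key, val)
## # A tibble: 2 x 3
##   name      x     y
##   <chr> <int> <int>
## 1 a         1     2
## 2 b         3     4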

Example 1

library(gutenbergr)
bronte <- gutenberg_download(c(1260, 768, 969, 9182, 767)) # download the novels by Gutenberg book ID

frequency <- bind_rows(mutate(tidy_bronte, author = "Brontë Sisters"), # add an author column, all values "Brontë Sisters"
                       mutate(tidy_hgwells, author = "H.G. Wells"), 
                       mutate(tidy_books, author = "Jane Austen")) %>% 
  mutate(word = str_extract(word, "[a-z']+")) %>%   # keep only lowercase letters and apostrophes
  count(author, word) %>%             # count each author's use of each word
  group_by(author) %>%
  mutate(proportion = n / sum(n)) %>% # turn counts into proportions
  select(-n) %>%                      # drop the raw counts
  spread(author, proportion) %>%      # spread the three authors into columns Brontë Sisters, H.G. Wells, Jane Austen, filled with proportion
  gather(author, proportion, `Brontë Sisters`:`H.G. Wells`) # keep Jane Austen as its own column; gather Brontë Sisters and H.G. Wells back into author
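
tidy_bronte, tidy_hgwells, and tidy_books are assumed to be one-word-per-row tibbles prepared earlier (tokenized and stop-word-filtered); a sketch for the Brontë novels:

library(tidytext) # unnest_tokens, stop_words

tidy_bronte <- bronte %>%
  unnest_tokens(word, text) %>% # one word per row
  anti_join(stop_words)         # remove stop words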

(Figures, not shown: Raw Data, Proportion, Spread Data, Gather Data)

Example 2: Visualization

library(ggplot2)
library(scales)

# expect a warning about rows with missing values being removed
ggplot(frequency, aes(x = proportion, y = `Jane Austen`, color = abs(`Jane Austen` - proportion))) +
  geom_abline(color = "gray40", lty = 2) +
  geom_jitter(alpha = 0.1, size = 2.5, width = 0.3, height = 0.3) +
  geom_text(aes(label = word), check_overlap = TRUE, vjust = 1.5) +
  scale_x_log10(labels = percent_format()) +
  scale_y_log10(labels = percent_format()) +
  scale_color_gradient(limits = c(0, 0.001), low = "darkslategray4", high = "gray75") +
  facet_wrap(~author, ncol = 2) +
  theme(legend.position="none") +
  labs(y = "Jane Austen", x = NULL)

3. separate(data, col, into, sep)

  • sep
    • If character: as a regular expression.
    • If numeric: as positions to split at.
      • Positive positions count from the left
      • Negative positions count from the right

Example 1

tb3 <- tb2 %>% 
  separate(col=demo, into=c("sex", "age"), sep=1) # split after the first character: demo becomes two columns, sex and age
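
With a character sep, the split point is a regular expression rather than a position; a toy comparison:

tibble::tibble(demo = c("m04", "f1524")) %>%
  separate(demo, into = c("sex", "age"), sep = 1)   # by position: sex = "m"/"f", age = "04"/"1524"
tibble::tibble(x = c("a-b", "c-d")) %>%
  separate(x, into = c("left", "right"), sep = "-") # by pattern: split on "-"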

4. unite(data, col, ..., sep)

tidyr’s unite() function is the inverse of separate(), and lets us recombine the columns into one.
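
bigrams_filtered below is assumed to come from an earlier n-gram step; a sketch of how it might be built:

library(dplyr)
library(tidyr)
library(tidytext)
library(janeaustenr)

bigrams_filtered <- austen_books() %>%
  unnest_tokens(bigram, text, token = "ngrams", n = 2) %>% # two-word tokens
  separate(bigram, c("word1", "word2"), sep = " ") %>%     # split each bigram into two columns
  filter(!word1 %in% stop_words$word,
         !word2 %in% stop_words$word)                      # drop bigrams containing stop words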

Example 1

bigrams_filtered
## # A tibble: 44,784 x 3
##    book                word1       word2       
##    <fct>               <chr>       <chr>       
##  1 Sense & Sensibility jane        austen      
##  2 Sense & Sensibility austen      1811        
##  3 Sense & Sensibility 1811        chapter     
##  4 Sense & Sensibility chapter     1           
##  5 Sense & Sensibility norland     park        
##  6 Sense & Sensibility surrounding acquaintance
##  7 Sense & Sensibility late        owner       
##  8 Sense & Sensibility advanced    age         
##  9 Sense & Sensibility constant    companion   
## 10 Sense & Sensibility happened    ten         
## # … with 44,774 more rows
bigrams_united <- bigrams_filtered %>%
  unite(bigram, word1, word2, sep = " ")
## # A tibble: 44,784 x 2
##    book                bigram                  
##    <fct>               <chr>                   
##  1 Sense & Sensibility jane austen             
##  2 Sense & Sensibility austen 1811             
##  3 Sense & Sensibility 1811 chapter            
##  4 Sense & Sensibility chapter 1               
##  5 Sense & Sensibility norland park            
##  6 Sense & Sensibility surrounding acquaintance
##  7 Sense & Sensibility late owner              
##  8 Sense & Sensibility advanced age            
##  9 Sense & Sensibility constant companion      
## 10 Sense & Sensibility happened ten            
## # … with 44,774 more rows

Sentence Segmentation / Tokenization & Stop Words

tibble

  • tibble data frame
    • Cells are not restricted by type; a cell can even hold a list
    • Printing shows an extra row with each column's class
  • Sentence segmentation: one line of text per row
text <- c("Because I could not stop for Death -",
          "He kindly stopped for me -",
          "The Carriage held but just Ourselves -",
          "and Immortality")
text_df <- tibble(line = 1:4, text = text)
## # A tibble: 4 x 2
##    line text                                  
##   <int> <chr>                                 
## 1     1 Because I could not stop for Death -  
## 2     2 He kindly stopped for me -            
## 3     3 The Carriage held but just Ourselves -
## 4     4 and Immortality

unnest_tokens

  • Tokenizes text into words
  • Strips punctuation
  • Converts to lowercase
library(tidytext)

text_df %>%
  unnest_tokens(word, text, to_lower = TRUE) # tokenize the text column and store each token in word
## # A tibble: 20 x 2
##     line word   
##    <int> <chr>  
##  1     1 because
##  2     1 i      
##  3     1 could  
##  4     1 not    
##  5     1 stop   
##  6     1 for    
##  7     1 death  
##  8     2 he     
##  9     2 kindly 
## 10     2 stopped
## # … with 10 more rows

anti_join

  • Removes stop words from a tidy table
data(stop_words)
tidy_books <- tidy_books %>%
  anti_join(stop_words) # joins by "word"; rows whose word is a stop word are dropped

tidy

  • Turns non-tidy data into tidy form (one-token-per-document-per-row)
  • Similar to the melt() function from the reshape2 package for non-sparse matrices
library(tidytext)
library(topicmodels) # provides the AssociatedPress document-term matrix

data("AssociatedPress")
ap_td <- tidy(AssociatedPress)
ap_td

Reducing Words to Their Base Form (Lemmatization)

lemmatize_words (package: textstem)

library(textstem)

tidy_books$lemma <- lemmatize_words(tidy_books$word)
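
A quick check of lemmatize_words() behavior (expected output based on textstem's default lemma dictionary):

lemmatize_words(c("ran", "running", "studies")) # "run" "run" "study"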