tags: R, Data Processing, Data Preprocessing, Regex, Regular Expressions, Text Analysis, Text Mining

Text Mining (Preprocessing)

Reference: ebook (Text Mining with R)

Dataset: Project Gutenberg

Regex in R

| Match | Symbol | Example |
| --- | --- | --- |
| Starts with | `^` | `^第[0-9]章` |
| Ends with | `$` | `區$` |
| Any one character from a set | `[]` | `[鄉區]` |
| Numbers 1-19 | `[1-9]\|1[0-9]` | matches 1 through 19 |
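
A quick sanity check of these patterns in base R (hypothetical strings, not from the dataset):

x <- c("第1章", "信義區", "南投鄉")
grepl("^第[0-9]章", x)                        # TRUE FALSE FALSE
grepl("區$", x)                               # FALSE TRUE FALSE
grepl("^([1-9]|1[0-9])$", c("5", "12", "20")) # TRUE TRUE FALSE; anchored so only whole numbers 1-19 match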

Find more examples

String Manipulation (package: stringr)

str_detect(string, pattern)

  • Tests whether each string matches the pattern
  • Returns a logical vector (TRUE/FALSE)
library(dplyr)
library(stringr)
library(janeaustenr) # austen_books

original_books <- austen_books() %>%
  group_by(book) %>%
  mutate(linenumber = row_number(),
         chapter = cumsum(str_detect(text, regex("^chapter [\\divxlc]", ignore_case = TRUE)))) %>%
  ungroup()
## # A tibble: 73,422 x 4
##    text                    book                linenumber chapter
##    <chr>                   <fct>                    <int>   <int>
##  1 "SENSE AND SENSIBILITY" Sense & Sensibility          1       0
##  2 ""                      Sense & Sensibility          2       0
##  3 "by Jane Austen"        Sense & Sensibility          3       0
##  4 ""                      Sense & Sensibility          4       0
##  5 "(1811)"                Sense & Sensibility          5       0
##  6 ""                      Sense & Sensibility          6       0
##  7 ""                      Sense & Sensibility          7       0
##  8 ""                      Sense & Sensibility          8       0
##  9 ""                      Sense & Sensibility          9       0
## 10 "CHAPTER 1"             Sense & Sensibility         10       1
## # … with 73,412 more rows
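
On a toy vector, the logical return is easier to see:

str_detect(c("CHAPTER 1", "It was a dark night"), regex("^chapter", ignore_case = TRUE))
## [1]  TRUE FALSE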

str_count(string, pattern)

  • Counts how many times the pattern occurs in each string
str_count('aaa444sssddd', "a") # 3

str_split(string, pattern, n)

  • Splits a string
val <- "abc,123,234,iuuu"
s1 <- str_split(val, ",") # split at every comma
s2 <- str_split(val, ",", 2) # split into at most two pieces
## "abc"  "123"  "234"  "iuuu"
## "abc"  "123,234,iuuu"
strsplit(df$sentence, "[。!;?!?;]") # split into sentences on full- or half-width exclamation marks, question marks, and semicolons, plus the full-width period
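
For instance, on a toy sentence (df$sentence is assumed to hold raw text):

strsplit("今天天氣很好。你要出門嗎?我不想!", "[。!;?!?;]")
## [[1]]
## [1] "今天天氣很好" "你要出門嗎"   "我不想"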

str_extract(string, pattern)

  • Extracts the matched text
val <- c("abca4", 123, "cba2")
str_extract(val, "\\d") # first matching digit in each string
str_extract(val, "[a-z]+") # first run of lowercase letters (NA if none)
str_extract_all(val, "\\d") # all matching digits, as a list

[1] "4" "1" "2"
[1] "abca" NA "cba"

[[1]]
[1] "4"

[[2]]
[1] "1" "2" "3"

[[3]]
[1] "2"

substr(x, start, stop)

  • Extracts a substring by character position
substr(df$區域別, 1, 4) # characters 1 through 4 of each value

grep(pattern, x)

  • Finds which elements contain the pattern
  • Returns their indices
grep("鄉$", df$區域別) # elements whose last character is 鄉
##   [1] 163 164 165 166 167 168 169 170 175 176 177 178 179 180 181 182 183
##  [18] 191 192 193 194 195 196 197 198 199 200 201 210 211 212 213 214 215

gsub(pattern, replacement, x)

  • Replaces matched text
gsub("區$", "鄉", df$區域別) # find values ending in 區 and replace that 區 with 鄉

scan

  • Reads a file into a vector; here, a newline-separated lexicon file is read into a character vector
scan(file = "./dict/lexicon.txt", what = character(), 
     sep = '\n', encoding = 'UTF-8', fileEncoding = 'UTF-8')

Untidy Data to Tidy Data (package: tidyr)

1. gather(data, key, value, ...)

Collapses the selected columns into key-value pairs: the column names become values of the key column, and the cell values fill the value column.

Example 1

library(tidyr)
library(dplyr)

preg2 <- preg %>% 
  gather(treatment, n, treatmenta:treatmentb) %>%          # collapse treatmenta and treatmentb into treatment, values into n
  mutate(treatment = gsub("treatment", "", treatment)) %>% # strip the "treatment" prefix, keeping only a and b
  arrange(name, treatment)
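
preg is not defined in this note; a minimal version of the table from Wickham's Tidy Data paper would be:

preg <- tibble::tribble(
  ~name,          ~treatmenta, ~treatmentb,
  "John Smith",   NA,          2,
  "Jane Doe",     16,          11,
  "Mary Johnson", 3,           1
)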


Example 2

tb <- as_tibble(read.csv("tb.csv", stringsAsFactors = FALSE))
tb2 <- tb %>% 
  gather(demo, n, -iso2, -year, na.rm = TRUE) # keep iso2 and year fixed; collapse the remaining columns into demo, with their values in n
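
tb.csv is not included here; judging from the separate() step later (sex is the first character of demo), its demographic columns look something like m04, m514, f04, … A hypothetical slice:

tb <- tibble::tribble(
  ~iso2, ~year, ~m04, ~m514, ~f04,
  "AD",  2000,  0,    0,     NA,
  "AE",  2000,  2,    4,     3
)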


2. spread(key, value)

The inverse of gather(): each unique value of key becomes its own column, filled with the corresponding value.
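
A minimal sketch with toy data:

library(tidyr)
df <- tibble::tibble(name = c("a", "a", "b", "b"),
                     key  = c("x", "y", "x", "y"),
                     val  = 1:4)
spread(df, key, val)
## # A tibble: 2 x 3
##   name      x     y
##   <chr> <int> <int>
## 1 a         1     2
## 2 b         3     4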

Example 1

library(gutenbergr)
bronte <- gutenberg_download(c(1260, 768, 969, 9182, 767)) # download the novels by Gutenberg book ID

frequency <- bind_rows(mutate(tidy_bronte, author = "Brontë Sisters"), # add an author column, all values "Brontë Sisters"
                       mutate(tidy_hgwells, author = "H.G. Wells"), 
                       mutate(tidy_books, author = "Jane Austen")) %>% 
  mutate(word = str_extract(word, "[a-z']+")) %>%   # keep only lowercase letters and apostrophes
  count(author, word) %>%             # count each author's use of each word
  group_by(author) %>%
  mutate(proportion = n / sum(n)) %>% # turn counts into proportions
  select(-n) %>%                      # drop the raw counts
  spread(author, proportion) %>%      # spread the three authors into columns Brontë Sisters, H.G. Wells, Jane Austen, filled with proportion
  gather(author, proportion, `Brontë Sisters`:`H.G. Wells`) # keep Jane Austen as its own column; gather Brontë Sisters and H.G. Wells back into author
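
tidy_bronte, tidy_hgwells, and tidy_books are assumed to be one-word-per-row tibbles prepared earlier (tokenized and stop-word-filtered); a sketch for the Brontë novels:

library(tidytext) # unnest_tokens, stop_words

tidy_bronte <- bronte %>%
  unnest_tokens(word, text) %>% # one word per row
  anti_join(stop_words)         # remove stop words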

(Figures, not shown: Raw Data, Proportion, Spread Data, Gather Data)

Example 2: Visualization

library(ggplot2)
library(scales)

# expect a warning about rows with missing values being removed
ggplot(frequency, aes(x = proportion, y = `Jane Austen`, color = abs(`Jane Austen` - proportion))) +
  geom_abline(color = "gray40", lty = 2) +
  geom_jitter(alpha = 0.1, size = 2.5, width = 0.3, height = 0.3) +
  geom_text(aes(label = word), check_overlap = TRUE, vjust = 1.5) +
  scale_x_log10(labels = percent_format()) +
  scale_y_log10(labels = percent_format()) +
  scale_color_gradient(limits = c(0, 0.001), low = "darkslategray4", high = "gray75") +
  facet_wrap(~author, ncol = 2) +
  theme(legend.position="none") +
  labs(y = "Jane Austen", x = NULL)

3. separate(data, col, into, sep)

  • sep
    • If character: as a regular expression.
    • If numeric: as positions to split at.
      • Positive positions count from the left
      • Negative positions count from the right

Example 1

tb3 <- tb2 %>% 
  separate(col=demo, into=c("sex", "age"), sep=1) # split after the first character: demo becomes two columns, sex and age
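
With a character sep, the split point is a regular expression rather than a position; a toy comparison:

tibble::tibble(demo = c("m04", "f1524")) %>%
  separate(demo, into = c("sex", "age"), sep = 1)   # by position: sex = "m"/"f", age = "04"/"1524"
tibble::tibble(x = c("a-b", "c-d")) %>%
  separate(x, into = c("left", "right"), sep = "-") # by pattern: split on "-"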

4. unite(data, col, ..., sep)

tidyr’s unite() function is the inverse of separate(), and lets us recombine the columns into one.
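
bigrams_filtered below is assumed to come from an earlier n-gram step; a sketch of how it might be built:

library(dplyr)
library(tidyr)
library(tidytext)
library(janeaustenr)

bigrams_filtered <- austen_books() %>%
  unnest_tokens(bigram, text, token = "ngrams", n = 2) %>% # two-word tokens
  separate(bigram, c("word1", "word2"), sep = " ") %>%     # split each bigram into two columns
  filter(!word1 %in% stop_words$word,
         !word2 %in% stop_words$word)                      # drop bigrams containing stop words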

Example 1

bigrams_filtered
## # A tibble: 44,784 x 3
##    book                word1       word2       
##    <fct>               <chr>       <chr>       
##  1 Sense & Sensibility jane        austen      
##  2 Sense & Sensibility austen      1811        
##  3 Sense & Sensibility 1811        chapter     
##  4 Sense & Sensibility chapter     1           
##  5 Sense & Sensibility norland     park        
##  6 Sense & Sensibility surrounding acquaintance
##  7 Sense & Sensibility late        owner       
##  8 Sense & Sensibility advanced    age         
##  9 Sense & Sensibility constant    companion   
## 10 Sense & Sensibility happened    ten         
## # … with 44,774 more rows
bigrams_united <- bigrams_filtered %>%
  unite(bigram, word1, word2, sep = " ")
## # A tibble: 44,784 x 2
##    book                bigram                  
##    <fct>               <chr>                   
##  1 Sense & Sensibility jane austen             
##  2 Sense & Sensibility austen 1811             
##  3 Sense & Sensibility 1811 chapter            
##  4 Sense & Sensibility chapter 1               
##  5 Sense & Sensibility norland park            
##  6 Sense & Sensibility surrounding acquaintance
##  7 Sense & Sensibility late owner              
##  8 Sense & Sensibility advanced age            
##  9 Sense & Sensibility constant companion      
## 10 Sense & Sensibility happened ten            
## # … with 44,774 more rows

Sentence Segmentation / Tokenization & Stop Words

tibble

  • tibble data frame
    • Cells are not restricted by type; a cell can even hold a list
    • Printing shows an extra row with each column's class
  • Sentence segmentation: one line of text per row
text <- c("Because I could not stop for Death -",
          "He kindly stopped for me -",
          "The Carriage held but just Ourselves -",
          "and Immortality")
text_df <- tibble(line = 1:4, text = text)
## # A tibble: 4 x 2
##    line text                                  
##   <int> <chr>                                 
## 1     1 Because I could not stop for Death -  
## 2     2 He kindly stopped for me -            
## 3     3 The Carriage held but just Ourselves -
## 4     4 and Immortality

unnest_tokens

  • Tokenizes text into words
  • Strips punctuation
  • Converts to lowercase
library(tidytext)

text_df %>%
  unnest_tokens(word, text, to_lower = TRUE) # tokenize the text column and store each token in word
## # A tibble: 20 x 2
##     line word   
##    <int> <chr>  
##  1     1 because
##  2     1 i      
##  3     1 could  
##  4     1 not    
##  5     1 stop   
##  6     1 for    
##  7     1 death  
##  8     2 he     
##  9     2 kindly 
## 10     2 stopped
## # … with 10 more rows

anti_join

  • Removes stop words from a tidy table
data(stop_words)
tidy_books <- tidy_books %>%
  anti_join(stop_words) # joins by "word"; rows whose word is a stop word are dropped

tidy

  • Turns non-tidy data into tidy form (one-token-per-document-per-row)
  • Similar to the melt() function from the reshape2 package for non-sparse matrices
library(tidytext)
library(topicmodels) # provides the AssociatedPress document-term matrix

data("AssociatedPress")
ap_td <- tidy(AssociatedPress)
ap_td

Reducing Words to Their Base Form (Lemmatization)

lemmatize_words (package: textstem)

library(textstem)

tidy_books$lemma <- lemmatize_words(tidy_books$word)
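
A quick check of lemmatize_words() behavior (expected output based on textstem's default lemma dictionary):

lemmatize_words(c("ran", "running", "studies")) # "run" "run" "study"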