# Introduction to Econometrics: R Programming Notes
## Basic R
### Basic
```r=
# Arithmetic
5 + 5       # Addition
5 - 5       # Subtraction
3 * 5       # Multiplication
(5 + 5) / 2 # Division
2^5         # Exponentiation
28 %% 6     # Modulo
# Variable assignment
x <- 42 # Assign the value 42 to x
x       # Print the value of x
# Basic data types
nume <- 42          # numeric
chara <- "universe" # character
logi <- FALSE       # logical
```
### Vector
```r=
# Create vectors
n_vector <- c(1, 10, 49)         # numeric
c_vector <- c("a", "b", "c")     # character
b_vector <- c(TRUE, FALSE, TRUE) # logical
# Name a vector
# Poker winnings from Monday to Friday
poker <- c(140, -50, 20, -120, 240)
days <- c("Mon", "Tue", "Wed", "Thu", "Fri")
names(poker) <- days
# Calculating and comparing
sumpoker <- sum(poker)  # Total winnings
avgpoker <- mean(poker) # Average winnings
selection <- poker > 0           # Logical vector of profitable days
winning_days <- poker[selection] # Select the profitable days
# Vector selection
poker_wednesday <- poker[3]
poker_midweek <- poker[c(2, 3, 4)]
poker_start <- poker[1:3]
```
### Matrix
```r=
# Construct a matrix
vec <- c(1, 2, 3, 4, 5, 6)
rnames <- c("A", "B")
cnames <- c("X", "Y", "Z")
mt <- matrix(vec, nrow = 2, byrow = TRUE,
             dimnames = list(rnames, cnames)) # 2 x 3, filled by row
# Or set names afterwards with rownames(mt) and colnames(mt)
# Calculating
rowtotal <- rowSums(mt) # row sums
coltotal <- colSums(mt) # column sums
# Adding rows and columns
newrow <- c(7, 8, 9)
newcol <- c(10, 11)
mtnewrow <- rbind(mt, newrow)
mtnewcol <- cbind(mt, newcol)
```
### Factor
```r=
# Create a speed vector
speed <- c("medium", "slow", "slow", "medium", "fast")
# Convert it to an ordered factor
facspeed <- factor(speed, ordered = TRUE,
                   levels = c("slow", "medium", "fast"))
facspeed          # Print the factor
summary(facspeed) # Count each level
# Output:
# [1] medium slow   slow   medium fast
# Levels: slow < medium < fast
#   slow medium   fast
#      2      2      1
```
### Dataframe
```python=
head(dataset) # look the head
tail(dataset) # look the tail
str(dataset) # Investigate the structure
data.frame(v1,v2,...) # Create dataframe with vectors
dataframe[1,3] # Pick (1,3)
dataframe[,2:4] # Pick the 2 to 4 column
dataframe[1:2,"loc"] # Pick the first two row of loc column
dataframe$loc # Pick the loc column
subset(dataframe,subset = loc<1) # Pick where loc<1
a<- (1,2,3)
a[order(a)] # Sorting
positions <- order(planets_df$diameter) # Use order() to create positions
planets_df[positions,] # Use positions to sort planets_df
```
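A runnable sketch of the `order()` idiom above, with a made-up `planets_df` standing in for the exercise data:
```r=
# Hypothetical stand-in for the planets_df used above
planets_df <- data.frame(
  name = c("Mercury", "Earth", "Jupiter"),
  diameter = c(0.382, 1, 11.209)
)
positions <- order(planets_df$diameter, decreasing = TRUE)
planets_df[positions, ] # Largest diameter first
```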
### Lists
```r=
my_vector <- 1:10
my_matrix <- matrix(1:9, ncol = 3)
my_df <- mtcars[1:10,]
my_list <- list(vec= my_vector, mat= my_matrix, df= my_df)
```
## Statistics
### Exploring data
```r=
comics               # Print the dataset
levels(comics$align) # Print the levels of align
table(comics$id, comics$align)             # Joint counts of two categorical variables
prop.table(table(comics$id, comics$align)) # Joint proportions
prop.table(tab, 1) # Conditional proportions; margin 1 = by row, 2 = by column
# Remove an unused level
comics_filtered <- comics %>%
  filter(align != "Reformed Criminals") %>%
  droplevels()
# Facet plots
ggplot(cars, aes(x = city_mpg)) +
  geom_histogram() +
  facet_wrap(~ suv)      # Facet the plot by suv
facet_grid(ncyl ~ suv)   # Facet by ncyl (rows) and suv (columns)
```
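A runnable sketch of `prop.table()` margins on a made-up table:
```r=
# Toy contingency table (made-up data) to illustrate the margins
tab <- table(
  id = c("public", "public", "secret", "secret", "secret"),
  align = c("good", "bad", "good", "good", "bad")
)
prop.table(tab)    # All cells sum to 1
prop.table(tab, 1) # Each row sums to 1: P(align | id)
prop.table(tab, 2) # Each column sums to 1: P(id | align)
```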
### Measures of center
```r=
mean(dataset$col)   # mean
median(dataset$col) # median
dataset %>% count(col, sort = TRUE) # mode (most frequent value first)
# Histogram of col2 for one group
dataset %>%
  filter(col == "name") %>%
  ggplot(aes(col2)) + geom_histogram()
# Mean and median of col2 for the same group
dataset %>%
  filter(col == "name") %>%
  summarize(mean_col2 = mean(col2), median_col2 = median(col2))
```
### Measures of spread
```r=
var(dataset$col) # variance
sd(dataset$col)  # standard deviation
quantile(dataset$col, probs = seq(0, 1, 0.2)) # quantiles (sequence or c() vector)
q1 <- quantile(dataset$col, 0.25) # 1st quartile
q3 <- quantile(dataset$col, 0.75) # 3rd quartile
iqr <- q3 - q1                    # IQR
lower <- q1 - 1.5 * iqr
upper <- q3 + 1.5 * iqr
food_consumption %>%
  filter(col < lower | col > upper) # find outliers
```
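For a concrete check, the same fences on the built-in `mtcars` data:
```r=
# IQR outlier fences for horsepower in mtcars
q1 <- quantile(mtcars$hp, 0.25)
q3 <- quantile(mtcars$hp, 0.75)
iqr <- q3 - q1
lower <- q1 - 1.5 * iqr
upper <- q3 + 1.5 * iqr
mtcars[mtcars$hp < lower | mtcars$hp > upper, ] # only the Maserati Bora (335 hp)
```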
### Probability
```r=
set.seed(123)
# Sampling
dataset %>%
  sample_n(5, replace = TRUE) # Sample 5 rows with replacement
dataset %>%
  count(group_size) %>%
  mutate(probability = n / sum(n)) # Add a column of empirical probabilities
replicate(times, sample(vector, n, replace = TRUE)) # Sample multiple times
# Uniform
runif(number, min = 0, max = 1) # Generate random draws
punif(0.2, min = 0, max = 1, lower.tail = FALSE) # P(X > 0.2) for Unif(0, 1)
# Binomial
rbinom(number, n, p) # Generate random draws
dbinom(x, n, p)      # P(X = x)
pbinom(x, n, p)      # P(X <= x)
# Normal
rnorm(number, mean, sd)     # Generate random draws
pnorm(x, mean, sd)          # P(X < x)
qnorm(percentage, mean, sd) # x s.t. P(X < x) = percentage
# Poisson
rpois(number, lambda) # Generate random draws
dpois(x, lambda)      # P(X = x)
ppois(x, lambda)      # P(X <= x)
# Exponential
pexp(x, rate) # P(X < x)
# Correlation
ggplot(df, aes(x, y)) +
  geom_point() +                         # scatter plot
  geom_smooth(method = "lm", se = FALSE) # Add a trend line
cor(df$x, df$y, use = "pairwise.complete.obs") # Correlation coefficient
# If the relationship is not linear, transform first (e.g., log)
mutate(col = log(col))
```
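A quick sketch tying the `r`/`d`/`p` functions together: simulate Binomial(10, 0.3) draws and compare the empirical values with the exact ones.
```r=
set.seed(123)
draws <- rbinom(10000, size = 10, prob = 0.3)
mean(draws)        # close to the theoretical mean n * p = 3
mean(draws == 3)   # empirical P(X = 3)
dbinom(3, 10, 0.3) # exact P(X = 3), about 0.2668
mean(draws <= 3)   # empirical P(X <= 3)
pbinom(3, 10, 0.3) # exact P(X <= 3), about 0.6496
```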
## R Markdown
### Structure
```markdown
> ---
> title: "Investment Report"
> output:
> html_document:
> css: styles.css
> ---
>
> ```{r setup, include=FALSE}
> knitr::opts_chunk$set(fig.align='left', echo=TRUE) # Global settings
> ```
>
> ```{r data, include = FALSE}
> library(readr)
> library(dplyr)
> library(ggplot2)
> library(knitr)
>
> investment_annual_summary <- read_csv("https://assets.datacamp.com/production/repositories/5756/datasets/d0251f26117bbcf0ea96ac276555b9003f4f7372/investment_annual_summary.csv")
> investment_services_projects <- read_csv("https://assets.datacamp.com/production/repositories/5756/datasets/78b002735b6f620df7f2767e63b76aaca317bf8d/investment_services_projects.csv")
> ```
>
> ## Datasets
> ### Investment Annual Summary
> The `Investment Annual Summary` dataset provides a summary of the dollars in millions provided to each region for each fiscal year, from 2012 to 2018.
> ```{r}
> investment_annual_summary
> ```
```
### YAML Header
```markdown
---
title: title
author: author
date: "Last edited `r format(Sys.time(), '%d %B %Y')`"
output:
  html_document:          # or pdf_document
    toc: true             # Add a table of contents
    toc_float: [float settings]
    toc_depth: 2          # Depth of the toc (how many #s are listed)
    number_sections: true # Number the sections
params:
  [parameter name]: [default value]
---
*** Date format codes
%A or %a: weekday
%B or %b: month
%d: day of the month (decimal)
%m: month (decimal)
%Y or %y: year
*** Float settings
true: the toc stays visible on the left at all times
collapsed: false (the full toc remains visible)
smooth_scroll: false (jump between sections without animation)
*** Parameters
Use params$[parameter name] inside R chunks
Use `r params$[parameter name]` inline in the text
```
### Style
```html
<style>
#TOC {
  color: [color name or hex code];
  font-family: [font name];
  font-size: 16px;
  background-color: [color name or hex code];
  border-color: [color name or hex code];
}
#header {
  color: [color name or hex code];
  font-family: [font name];
  opacity: 0.6;
  font-size: 16px;
  background-color: [color name or hex code];
}
/* Or target h1.title, h4.author, h4.date */
body, pre {
  color: [color name or hex code];
  font-family: [font name];
  background-color: [color name or hex code];
}
</style>
```
The style block can be placed between the YAML header and the first R chunk, or saved in a separate styles.css file that is linked in the YAML header.
### R code chunk
```markdown
> ```{r chunk name, include= TRUE, echo= TRUE, eval= TRUE,
>       collapse= FALSE, warning= TRUE, message= TRUE, error= FALSE}
> code
> ```
# Default: TRUE
include FALSE: run / hide code / hide results
echo FALSE: run / hide code / show results
eval FALSE: not run / show code / no results
message FALSE: run / show code / hide messages
warning FALSE: do not show warnings
# Default: FALSE
collapse TRUE: code and output appear in the same block
error TRUE: keep knitting when the chunk errors and show the error
```
### Text
```markdown
Headers
# Biggest header
## Big header
### Normal header
Text
*text* (italics)
**text** (bold)
~~text~~ (strikethrough)
Links
[DataCamp](https://learn.datacamp.com/)
Images
 for images
```
### Plots
```markdown
> ```{r figurename, [dimension control], fig.cap= "cap name"}
> ggplot(name, aes(x= xdata, y= ydata, color= colordata))+
>   geom_line()+
>   labs(title= "title", x= "xlabel", y= "ylabel")
> ```
Dimension control
fig.width=5, fig.height=3 # Or fig.dim=c(5,3)
out.width='50%'
out.height='30%'
fig.align= 'left/right/center'
# Or set these globally in the r setup chunk
```
### Table
```markdown
> ```{r tablename}
> kable(name,
>       col.names= c("col1", "col2"),
>       align= "cc", # one letter (l/c/r) per column
>       caption= "caption")
> ```
```
## Importing Data
### CSV (Flat files)
```r=
path <- file.path("~", "datasets", "data.csv")
# utils package (installed by default)
read.csv(path)   # Comma-separated file (sep = ",")
read.delim(path) # Tab-delimited file (sep = "\t")
read.table(path, header = TRUE, sep = "/") # "/"-separated file
read.csv2(path)  # sep = ";"
# readr package
library(readr)
read_csv(path) # Comma-separated file
read_tsv(path) # Tab-delimited file
read_delim(path, delim = "/", col_names = FALSE,
           col_types = "cdil_", # c: character, d: double, i: integer, l: logical, _: skip
           skip = [skiprows], n_max = [readrows])
# data.table's fread
install.packages("data.table")
library(data.table)
fread(path, drop = [columns to drop], select = [columns to keep])
```
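A sketch of an explicit `readr` import; `grades.csv` and its columns are made up for illustration:
```r=
library(readr)
# grades.csv is a hypothetical file with columns: name, midterm, final
grades <- read_csv(
  "grades.csv",
  col_types = cols(
    name = col_character(),
    midterm = col_double(),
    final = col_double()
  )
)
```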
### Excel
```r=
# readxl
install.packages("readxl")
library(readxl)
excel_sheets("data.xlsx") # List the sheets
read_excel("data.xlsx", sheet = [sheet number or name],
           col_names = TRUE, col_types = [types], skip = [skiprows])
pop_list <- lapply(excel_sheets("data.xlsx"),
                   read_excel, path = "data.xlsx") # Read every sheet in data.xlsx
# col_types
"text"    text
"numeric" numbers
"date"    dates
"blank"   ignore the column
# XLConnect
install.packages("XLConnect")
library(XLConnect)
book <- loadWorkbook("data.xlsx")
getSheets(book) # List the sheets
readWorksheet(book, sheet = "name", header = FALSE,
              startRow = 2, endRow = 4, startCol = 3, endCol = 5)
createSheet(book, name = "name")
writeWorksheet(book, data, sheet = "name")
saveWorkbook(book, file = "newname")
renameSheet(book, "oldname", "newname")
removeSheet(book, sheet = "name")
```
## Data Visualization
### Grammar of graphics

### Aesthetics
```r=
ggplot(dataset, aes([aesthetic] = [col])) +
  geom_point() +
  labs(x = "xlabs", y = "ylabs") +
  scale_[aes]_[discrete/continuous]()
# Note: aesthetics set inside ggplot() are inherited by the geoms below
# Aesthetics:
x, y     axis column
fill     fill color
color    color of points, outlines of other geoms
size     radius of points, thickness of lines
alpha    transparency
linetype line dash pattern
label    text on the plot or axes
# Scale functions
scale_x_continuous("xlabname", limits = [xlim], breaks = [xpassing],
                   expand = [expand between breaks], labels = [type name])
```
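A runnable example on the built-in `mtcars` data, mapping color and size inside `aes()`:
```r=
library(ggplot2)
ggplot(mtcars, aes(x = wt, y = mpg, color = factor(cyl), size = hp)) +
  geom_point(alpha = 0.7) +
  labs(x = "Weight (1000 lbs)", y = "Miles per gallon", color = "Cylinders")
```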
### Geometries
#### Scatter plot
```r=
geom_point(data = newdata, [aes] = [type], position = "[position]")
# data: draw this geom with a different dataset than the one in ggplot()
# Aesthetics
x, y  essential aes (usually set in ggplot())
alpha/color/fill/shape/size/stroke  optional
# Position:
identity/dodge/stack/fill/jitter/jitterdodge/nudge
position_jitter(0.1) # control the amount of randomness
# geom_point(position = "jitter") = geom_jitter(): adds random noise
```
#### Point chart
```r=
# Reorder to control the plotting order
ggplot(who_subset, aes(x = log10(cases_1992), y = reorder(country, cases_1992))) +
  geom_point() +
  geom_vline(xintercept = 0) + # add a visual anchor at x = 0
  facet_grid(region ~ ., scales = 'free_y') # one panel per region (rows), free y scales
```
#### Histogram
```r=
geom_histogram(binwidth = 0.1, center = 0.05, [aes] = [type], position = "[position]")
geom_density() # draw a density curve instead of bars
geom_rug()     # add a rug of the data points
# binwidth, center: width of the bins and the center of a bin
# Aesthetics
x  essential aes (usually set in ggplot())
alpha/color/fill/shape/size/stroke  optional
# Position
stack  bars stack on top of each other
dodge  bars shown side by side per type
fill   normalize each bar to 1, showing proportions
```
#### Bar chart
```r=
geom_bar() # bar height = count of cases
geom_col() # bar height = actual value
coord_flip() # flip x and y so long labels fit
# Plotting distributions
ggplot(dataset, aes(x = col)) +
  geom_col() +
  geom_errorbar(aes(ymin = avg - stdev, ymax = avg + stdev), width = 0.1)
# Stacked bars
disease_counts <- who_disease %>%
  filter(year >= 1999) %>% # Filter to 1999 and later
  mutate(disease = ifelse(disease %in% c('measles', 'mumps'), disease, 'other')) %>%
  group_by(disease, region) %>% # Add region to the grouping
  summarise(total_cases = sum(cases))
ggplot(disease_counts, aes(x = region, y = total_cases, fill = disease)) +
  geom_col(position = 'fill') # 'fill' makes the bars full height (proportions)
```
#### Line plots
```r=
geom_line()
geom_area()
# Draw a shaded band between ymin and ymax
geom_ribbon(aes(ymax = capture, ymin = 0), alpha = 0.3)
```
#### Pie chart
```r=
ggplot(disease_counts, aes(x = 1, y = total_cases, fill = disease)) +
  geom_col() +               # Use a column geometry
  coord_polar(theta = "y") + # Polar coordinates with theta mapped to y
  theme_void() +             # Clean up the background
  ggtitle('Proportion of diseases')
```
#### Waffle Plot
```r=
library(waffle)
disease_counts <- who_disease %>%
  group_by(disease) %>%
  summarise(total_cases = sum(cases)) %>%
  mutate(percent = round(total_cases / sum(total_cases) * 100))
# Create a named vector of rounded percentages for the diseases
case_counts <- disease_counts$percent
names(case_counts) <- disease_counts$disease
waffle(case_counts) # Pass the named vector to waffle() to plot
```
#### Box plot
```r=
geom_boxplot()
facet_wrap(~ vehicle_color) # draw one panel per color
# Beeswarm plot
library(ggbeeswarm)
md_speeding %>%
  filter(vehicle_color == 'RED') %>%
  ggplot(aes(x = gender, y = speed)) +
  geom_beeswarm(cex = 0.5, alpha = 0.8) + # point size 0.5, alpha 0.8
  geom_boxplot(alpha = 0) # transparent boxplot on top of the points
# Violin plot
md_speeding %>%
  filter(vehicle_color == 'RED') %>%
  ggplot(aes(x = gender, y = speed)) +
  geom_violin(bw = 2.5) + # kernel bandwidth 2.5
  geom_point(alpha = 0.3, size = 0.5) # individual points on top
```
#### Ridge line plot
```r=
library(ggridges)
md_speeding %>%
  mutate(day_of_week = factor(day_of_week,
         levels = c("Mon", "Tues", "Wed", "Thu", "Fri", "Sat", "Sun"))) %>%
  ggplot(aes(x = percentage_over_limit, y = day_of_week)) +
  geom_density_ridges(bandwidth = 3.5) +   # kernel bandwidth 3.5
  scale_x_continuous(limits = c(0, 150)) + # limit x to 0-150
  labs(subtitle = 'Gaussian kernel SD = 3.5') # note the bandwidth in the subtitle
```
### Theme
```r=
theme([element] = element_text/line/rect/blank())
theme([element] = unit(3, "cm"))
theme([element].margin = margin(20, 30, 40, 50, "pt"))
theme_classic()         # traditional
theme_tufte()           # needs library(ggthemes)
theme_gray()            # default
theme_bw()              # works well with transparency
theme_void()            # removes everything but the data
theme_fivethirtyeight() # 538.com
theme_wsj()             # Wall Street Journal
theme_minimal()         # lighten the chart up
original <- theme_update() # theme_update() returns the previous default theme
theme_set(original)        # restore it as the default
* Type: text
text
axis.title
axis.title.x
axis.title.x.top
axis.title.x.bottom
axis.title.y
axis.title.y.left
axis.title.y.right
title
legend.title
legend.position
plot.title
plot.subtitle
plot.caption
plot.tag
axis.text
axis.text.x
axis.text.x.top
axis.text.x.bottom
axis.text.y
axis.text.y.left
axis.text.y.right
legend.text
strip.text
strip.text.x
strip.text.y
* Type: line
line
axis.ticks
axis.ticks.x
axis.ticks.x.top
axis.ticks.x.bottom
axis.ticks.y
axis.ticks.y.left
axis.ticks.y.right
axis.line
axis.line.x
axis.line.x.top
axis.line.x.bottom
axis.line.y
axis.line.y.left
axis.line.y.right
panel.grid
panel.grid.major
panel.grid.major.x
panel.grid.major.y
panel.grid.minor
panel.grid.minor.x
panel.grid.minor.y
* Type: rectangle
rect
legend.background
legend.key
legend.box.background
panel.background
panel.border
plot.background
strip.background
strip.background.x
strip.background.y
```
##### Example
```r=
library(RColorBrewer)
# Set the color scale
palette <- brewer.pal(5, "RdYlBu")[-(2:4)]
# Base plot with a title and caption (saved for the later steps)
plt_country_vs_lifeExp <- ggplot(gm2007, aes(x = lifeExp, y = country, color = lifeExp)) +
  geom_point(size = 4) +
  geom_segment(aes(xend = 30, yend = country), size = 2) +
  geom_text(aes(label = round(lifeExp, 1)), color = "white", size = 1.5) +
  scale_x_continuous("", expand = c(0, 0), limits = c(30, 90), position = "top") +
  scale_color_gradientn(colors = palette) +
  labs(title = "Highest and lowest life expectancies, 2007", caption = "Source: gapminder")
# Define the theme
plt_country_vs_lifeExp +
  theme_classic() +
  theme(axis.line.y = element_blank(),
        axis.ticks.y = element_blank(),
        axis.text = element_text(color = "black"),
        axis.title = element_blank(),
        legend.position = "none")
# Add a vertical line (step_1_themes holds the theme defined above)
plt_country_vs_lifeExp +
  step_1_themes +
  geom_vline(xintercept = global_mean, color = "grey40", linetype = 3)
# Add text
plt_country_vs_lifeExp +
  step_1_themes +
  geom_vline(xintercept = global_mean, color = "grey40", linetype = 3) +
  annotate(
    "text",
    x = x_start, y = y_start,
    label = "The\nglobal\naverage",
    vjust = 1, size = 3, color = "grey40"
  )
# Add a curve (step_3_annotation holds the annotation defined above)
plt_country_vs_lifeExp +
  step_1_themes +
  geom_vline(xintercept = global_mean, color = "grey40", linetype = 3) +
  step_3_annotation +
  annotate(
    "curve",
    x = x_start, y = y_start,
    xend = x_end, yend = y_end,
    arrow = arrow(length = unit(0.2, "cm"), type = "closed"),
    color = "grey40"
  )
```
## dplyr
### Basic functions
```r=
data <- dataset %>%
  select(col1, col2, ...) %>% # Select some columns
  arrange(col1) %>%           # Sort by col1 (ascending)
  arrange(desc(col1)) %>%     # Sort by col1 (descending)
  filter(col1 == "type1", col3 < 100) %>% # Filter rows
  mutate(newcol = col1 * col2 / 100) %>%  # Add a new column
  rename(colnewname = col1)   # Rename a column
dataset %>%
  count() %>%                             # Count all rows
  count(col1, wt = col2, sort = TRUE) %>% # Count by col1, weighted by col2, sorted
  group_by(col1, col2) %>%                # Group by col1 and col2
  slice_min(col3, n = 3) %>%              # Keep the 3 smallest col3 values per group
  ungroup() %>%
  relocate(col1, .after = last_col()) %>% # Move col1 to the end
  relocate(col2, .before = col5)          # Move col2 before col5
dataset %>%
  arrange(name, year) %>%
  group_by(name) %>%
  mutate(ratio = fraction / lag(fraction)) # Ratio of fraction between consecutive years
# Selection helpers
contains(" ")
starts_with(" ")
ends_with(" ")
last_col()
matches(" ")
```
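A runnable sketch of several of these verbs on the built-in `mtcars` data:
```r=
library(dplyr)
mtcars %>%
  group_by(cyl) %>%         # one group per cylinder count
  slice_min(mpg, n = 2) %>% # the 2 lowest-mpg cars in each group
  ungroup() %>%
  select(cyl, mpg, hp) %>%
  arrange(cyl, mpg)
```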
### Joining Parts
```r=
part1 %>%
  inner_join(part2, by = c("newcolname" = "col"), # Join part2 to part1, matching newcolname to col
             suffix = c("_part1", "_part2")) # Replaces .x and .y when both tables share column names
part1 %>%
  left_join(part2, by = c("col1", "col2")) %>%  # Keep all rows of the left table
  right_join(part3, by = c("col3", "col4")) %>% # Keep all rows of the right table
  full_join(part4, by = c("col5", "col6")) %>%  # Keep all rows of both tables
  replace_na(list(n = 0)) # Replace missing values in the n column
part1 %>%
  semi_join(part2, by = c("col1", "col2")) %>% # Rows of part1 that have a match in part2
  anti_join(part3, by = c("col3", "col4"))     # Rows of part1 with no match in part3
# Join themes to itself to find grandchild relationships
part %>%
  inner_join(part, by = c("id" = "parent_id"),
             suffix = c("_parent", "_child")) %>%
  inner_join(themes, by = c("id_child" = "parent_id"),
             suffix = c("_parent", "_grandchild"))
# Stack two tables row-wise
part <- bind_rows(part1 %>% mutate(type = "type1"),
                  part2 %>% mutate(type = "type2"))
```
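A runnable sketch using the small `band_members` / `band_instruments` tables that ship with dplyr:
```r=
library(dplyr)
band_members %>%
  inner_join(band_instruments, by = "name") # only members with a known instrument
band_members %>%
  left_join(band_instruments, by = "name")  # all members; plays is NA when unknown
band_members %>%
  anti_join(band_instruments, by = "name")  # members with no instrument row (Mick)
```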
## tidyr
### Dealing with data
```r=
dataset %>%
  # Separate or unite columns
  separate(col1, into = c("value", "unit"),
           sep = " ", convert = TRUE) %>%  # convert = TRUE casts to integer
  unite("colname", col1, col2, sep = " ") %>%
  separate_rows(col, sep = " ") %>% # Split rows holding multiple values
  # Missing values
  replace_na(list(col = "replace value")) %>% # Replace NA in col (use 0L for integer 0)
  fill(col, .direction = "down") %>% # Fill NA in col downward
  drop_na() %>% # Drop all rows containing NA
  drop_na(col)  # Drop rows with NA in col
```
### Wide and long data
```r=
dataset %>%
  # From wide to long
  pivot_longer(-country) %>% # pivot every column except country
  pivot_longer(`1945`:`1951`,
               names_to = c("name1", "name2"),
               names_prefix = "ignore_name_part",
               names_sep = "sep",
               names_transform = list(name1 = as.integer, name2 = as.integer),
               values_to = "value",
               values_drop_na = FALSE) %>%
  uncount(col, .id = "id") %>% # One row per count in col, plus an id
  # From long to wide
  pivot_wider(names_from = col, values_from = value)
```
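A minimal round trip with a made-up wide table:
```r=
library(tidyr)
scores <- tibble::tibble(
  country = c("A", "B"),
  `2020` = c(1, 3),
  `2021` = c(2, 4)
)
long <- scores %>%
  pivot_longer(-country, names_to = "year", values_to = "score",
               names_transform = list(year = as.integer))
long %>%
  pivot_wider(names_from = year, values_from = score) # back to the wide shape
```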
### Expanding data
```r=
full_df <- expand_grid( # Build a 9-row grid (3 x 3 combinations)
  col1 = c(type1, type2, type3),
  col2 = c(type1, type2, type3))
full_df <- dataset %>%
  complete(col1, col2, fill = list(col3 = 0L)) # complete() also creates the grid
years <- full_seq(c(2020, 2030), period = 1)
decades <- full_seq(c(1980, 2030), period = 10)
# Generate the dates for all days in 1980
outer_dates <- c(as.Date("1980-01-01"), as.Date("1980-12-31"))
full_seq(outer_dates, period = 1)
# Fill in a time series
dataset %>%
  complete(time = seq(min(time), max(time), by = "20 min"),
           fill = list(enter = 0L, exit = 0L))
```
### Non-rectangular data
```r=
# Create a tibble with a movie column
tibble(movie = movie_planets_list) %>%
  unnest_wider(movie) %>% # Unnest the movie column into columns
  unnest_longer(planets)  # Unnest the planets column into rows
movie_df %>%
  hoist(
    movie,
    title = "Title",
    year = "Year",
    rating = list("Ratings", "Rotten Tomatoes")
  )
ansur_df %>%
  group_by(sex) %>% # Group the data by sex
  nest() %>%        # Nest the remaining columns
  mutate(
    fit = map(data, function(df) lm(weight_kg ~ waist_circum_m + stature_m, data = df)),
    glanced = map(fit, glance)) %>%
  unnest(glanced) # Unnest the glanced column
```
## Cleaning Data
### Basic problems
```r=
glimpse(dataset) # See every column's data type
# Data type problems
col = as.[datatype](col) # factor, character, numeric, Date, logical
col = str_remove(col, "sep") # Remove an unwanted separator
# Range problems
library(dplyr)
library(assertive)
assert_all_are_in_past(dataset$datecol) # Check that all dates are in the past
dataset %>%
  mutate(col = replace(col, condition, replacement)) # Replace out-of-range rows with NA or the boundary
# Uniqueness constraints
## Full duplicates
duplicated(bike_share_rides) # Logical vector of duplicated rows
distinct(bike_share_rides)   # Remove duplicates
## Partial duplicates
dataset %>%
  count(col1, col2) %>% # Count rows by col1 and col2
  filter(n > 1)         # Keep the duplicated combinations
dataset_unique <- dataset %>%
  distinct(col1, col2, .keep_all = TRUE) # Remove partial duplicates
dataset_avgdupli <- dataset %>% # Or aggregate before dropping duplicates
  group_by(col1, col2) %>%
  mutate(colvalue_mean = mean(colvalue)) %>%
  distinct(col1, col2, .keep_all = TRUE) %>%
  select(-colvalue)
```
### Categorical Data
```r=
dataset %>%
  semi_join(checklist, by = "list") # Drop factor values not on the checklist
dataset %>%
  mutate(col_trim = str_trim(col),         # Trim whitespace
         col_low = str_to_lower(col_trim)) # Lowercase
# Collapse categories of the same kind
collapse_type = c("type1", "type2", "type3")
dataset %>%
  mutate(col_collapsed = fct_collapse(col, newname = collapse_type))
```
### Text Data
```r=
# str_detect
dataset %>%
  filter(str_detect(col, "-")) %>%
  filter(str_detect(col, fixed("(")) | str_detect(col, fixed(")")))
# str_remove
col_clean <- dataset$phone %>%
  str_remove_all(fixed("(")) %>%
  str_remove_all(fixed(")"))
# str_replace
dataset %>%
  mutate(col_clean = col_clean,
         col_clean_new = str_replace_all(col_clean, "-", " "))
# str_length
sfo_survey %>%
  filter(str_length(phone) == 12)
```
### Date Data
```r=
library(lubridate)
# Date parsing
formats <- c("%Y-%m-%d", "%B %d, %Y") # The date formats to try
dataset %>%
  mutate(date_clean = parse_date_time(date, orders = formats))
# Find invalid ages
dataset %>%
  mutate(theoretical_age = floor(as.numeric(date %--% today(), "years"))) %>% # age implied by date
  filter(age != theoretical_age)
```
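A runnable sketch of `parse_date_time()` on made-up mixed-format strings:
```r=
library(lubridate)
dates <- c("2020-03-15", "March 1, 2019")
parse_date_time(dates, orders = c("ymd", "mdy"))
# [1] "2020-03-15 UTC" "2019-03-01 UTC"
```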
### Missing Values
```r=
library(visdat)
# Visualize missingness
vis_miss(dataset) # Missing values by column
dataset %>%
  mutate(missing_inv = is.na(inv_amount)) # Flag missing values
# Visualize after sorting
dataset %>%
  arrange(age) %>%
  vis_miss()
# Dealing with NA
dataset_clean <- dataset %>%
  filter(!is.na(cust_id)) %>% # Drop rows with a missing cust_id
  mutate(col_filled = ifelse(is.na(col), inv_amount * 5, col)) # Impute col
assert_all_are_not_na(dataset_clean$col_filled) # Assert col_filled has no missing values
```
### Linkage
```r=
# String distance
library(stringdist)
stringdist("string1", "string2", method = "[method]") # dl, lcs, jaccard
library(fuzzyjoin) # provides stringdist_left_join
dataset %>%
  stringdist_left_join(cities, by = c("city" = "city_actual")) # Fuzzy join on city
# Record linkage
library(reclin)
pair_blocking(df_1, df_2, blocking_var = "col") %>% # Generate pairs over col
  compare_pairs(by = "name", default_comparator = lcs()) # Compare pairs by name using lcs()
# Scoring and linking
pair_blocking(df_1, df_2, blocking_var = "col") %>%
  compare_pairs(by = c("name", "addr"),
                default_comparator = jaro_winkler()) %>% # Compare pairs
  score_problink() %>% # Score pairs
  select_n_to_m() %>%  # Select pairs
  link()               # Link the data
```
## Functions
### Basic
```r=
args(func_name) # Look up a function's arguments
func_name <- function(arg1, arg2) { # Define
  # Body
}
func_name(arg1, arg2) # Call
func_name <- function(dataarg, detailarg = default)
# dataarg: the data the function operates on
# detailarg: detail settings (can have defaults)
```
### Default setting example - na.rm vs. ...
```r=
calc_harmonic_mean <- function(x, na.rm = FALSE) {
  x %>%
    get_reciprocal() %>%
    mean(na.rm = na.rm) %>%
    get_reciprocal()
}
calc_harmonic_mean <- function(x, ...) {
  x %>%
    get_reciprocal() %>%
    mean(...) %>%
    get_reciprocal()
}
```
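`get_reciprocal()` is not defined in these notes; a minimal sketch with an assumed definition (1/x) and a usage example:
```r=
library(magrittr) # provides %>%
get_reciprocal <- function(x) 1 / x # assumed definition
calc_harmonic_mean <- function(x, na.rm = FALSE) {
  x %>%
    get_reciprocal() %>%
    mean(na.rm = na.rm) %>%
    get_reciprocal()
}
calc_harmonic_mean(c(1, 2, 4))                   # 12/7, about 1.714
calc_harmonic_mean(c(1, 2, 4, NA), na.rm = TRUE) # same, ignoring the NA
```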
### Return
```r=
# Return multiple values as a list
groom_model <- function(model) {
  list(
    model = glance(model),
    coefficients = tidy(model),
    observations = augment(model)
  )
}
# Returning metadata
pipeable_plot <- function(data, formula) {
  plot(formula, data)
  # Add a "formula" attribute to data
  attr(data, "formula") <- formula
  invisible(data)
}
```
### Environment
```r=
# Compare the contents of the global environment and rsa_env
ls.str(globalenv())
ls.str(rsa_env)
# Does population exist in rsa_env?
exists("population", envir = rsa_env)
# Does population exist in rsa_env, ignoring inheritance?
exists("population", envir = rsa_env, inherits = FALSE)
```
### Generalized Additive Model
```r=
library(mgcv)
gam(yield_kg_per_ha ~ s(year) + census_region, data = corn)
```
## Writing Efficient R Code
```r=
# Print the R version details
version
# How long does it take to read movies from CSV?
system.time(read.csv("movies.csv"))
# Compare two ways of reading the file
library(microbenchmark)
compare <- microbenchmark(read.csv("movies.csv"), readRDS("movies.rds"), times = 10)
compare
# Benchmark this machine's hardware
library("benchmarkme")
ram <- get_ram() # Amount of RAM
cpu <- get_cpu() # CPU specs
res <- benchmark_io(runs = 1, size = 5)
ram
cpu
plot(res)
# Use <- inside system.time() to keep the result as res_grow
system.time(res_grow <- growing(n))
```
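`growing(n)` above comes from the course exercises; a sketch of what such a function might look like, next to the preallocated version that beats it:
```r=
# Growing a vector inside a loop: slow, reallocates on every iteration
growing <- function(n) {
  x <- NULL
  for (i in 1:n) x <- c(x, rnorm(1))
  x
}
# Preallocating: fast, writes into memory reserved up front
pre_allocated <- function(n) {
  x <- numeric(n)
  for (i in 1:n) x[i] <- rnorm(1)
  x
}
n <- 30000
system.time(res_grow <- growing(n))
system.time(res_pre <- pre_allocated(n))
```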
### Profiling
* Building a matrix is faster than building a data frame
* Built-in functions are faster than ones you write yourself
* `&&` stops evaluating as soon as the result is known, so prefer it to `&` for scalar conditions
```r=
# Load the profvis package
library(profvis)
# Profile the following code
profvis({
  # Load and select data
  comedies <- movies[movies$Comedy == 1, ]
  # Plot data of interest
  plot(comedies$year, comedies$rating)
  # Loess regression line
  model <- loess(rating ~ year, data = comedies)
  j <- order(comedies$year)
  # Add fitted line to the plot
  lines(comedies$year[j], model$fitted[j], col = "red")
})
```
### Parallel Running
```r=
library("parallel")
detectCores() # Number of available cores
cl <- makeCluster(2) # Create a cluster
parApply(cl, dd, 2, median) # Parallel apply over columns
stopCluster(cl) # Stop the cluster
# Set the number of games to play
no_of_games <- 1e5
# Time the serial version
system.time(serial <- sapply(1:no_of_games, function(i) play()))
# Time the parallel version
cl <- makeCluster(4) # Set up the cluster
clusterExport(cl, "play") # Make play() available on the workers
system.time(par <- parSapply(cl, 1:no_of_games, function(i) play()))
stopCluster(cl) # Stop the cluster
```
## Shell
### Basic
```bash=
cat file.txt # Print a file's contents
less file1.txt file2.txt # View files page by page (:n next file, :q quit)
head file.txt # View the start of a file
```
### Flags
```bash=
# ls
ls -R -F /home/dir1 # -R: list recursively; -F: add "/" after dirs, "*" after programs
# head & tail: select rows
head -n 10 file.txt # View the first 10 lines
tail -n +7 file.txt # View everything from line 7 onward
# cut: select columns
cut -f 2-5,8 -d , file.csv # -f: select fields 2-5 and 8; -d: delimiter ","
# grep: select lines according to what they contain
-c: print a count of matching lines rather than the lines themselves
-h: do not print the names of files when searching multiple files
-i: ignore case (e.g., treat "Regression" and "regression" as matches)
-l: print the names of files that contain matches, not the matches
-n: print line numbers for matching lines
-v: invert the match, i.e., only show lines that don't match
# wc: word count
-c: characters
-w: words
-l: lines
# sort
-n: sort numerically
-r: reverse the order
-b: ignore leading blanks
-f: fold case
# uniq: collapse adjacent duplicate lines
-c: display unique lines with a count of how often each occurs
```
### Combining
```bash=
head -n 5 file.csv > new.csv # Store the first 5 lines in new.csv
cut -d , -f 2 file.csv | grep -v Tooth # Chain commands with a pipe
head -n 5 files/* # Select many files at once
```
### Shell storage
```bash=
echo $var # Print a variable's value
for filetype in gif jpg png; do echo $filetype; done # Repeat a command over a list
```