# 計量經濟學導論 R程式筆記 ## Basic R ### Basic ```r= *** Arithmetic *** 5 + 5 # An addition 5 - 5 # A subtraction 3 * 5 # A multiplication (5 + 5) / 2 # A division 2^5 # Exponentiation 28%%6 # Modulo *** Variable assignment *** x <- 42 # Assign the value 42 to x x # Print out the value of the variable x *** Basic Datatype *** nume <- 42 #numeric chara <- "universe" #character logi <- FALSE #logical ``` ### Vector ```r= *** Create Vector *** n_vector <- c(1, 10, 49) #numeric c_vector <- c("a", "b", "c") #character b_vector <- c(TRUE, FALSE, TRUE) #boolean *** Naming Vector *** # Poker winnings from Monday to Friday poker <- c(140, -50, 20, -120, 240) days <- c("Mon", "Tue", "Wed", "Thu", "Fri") names(poker) <- days *** Calculating and Comparing *** # Total winnings with poker sumpoker <- sum(poker) avgpoker <- mean(poker) # Select from poker_vector these days selection <- poker > 0 winning_days <- poker[selection] *** Vector Selection *** poker_wednesday <- poker[3] poker_midweek <- poker[c(2,3,4)] poker_start <- poker[c(1:3)] ``` ### Matrix ```r= *** Construct Matrix *** vec <- c(1,2,3,4,5,6) rownames <- c("A", "B") colnames <- c("X", "Y", "Z") mt <- matrix(vec, nrow = 2, byrow = TRUE, dimnames = list(rownames, colnames)) # nrow = 2 so the 2 rownames and 3 colnames match the 2x3 matrix *** Calculating *** rowtotal <- rowSums(mt) #rowSums coltotal <- colSums(mt) #colSums *** Adding column and rows *** mtnewrow <- rbind(mt, newrow) mtnewcol <- cbind(mt, newcol) ``` ### Factor ```r= # Create speed_vector speed <- c("medium", "slow", "slow", "medium", "fast") # Convert speed_vector to ordered factor vector facspeed <- factor(speed, ordered=TRUE, levels=c("slow", "medium", "fast")) # Print factor_speed_vector facspeed summary(facspeed) [1] medium slow slow medium fast Levels: slow < medium < fast slow medium fast 2 2 1 ``` ### Dataframe ```r= head(dataset) # look the head tail(dataset) # look the tail str(dataset) # Investigate the structure data.frame(v1,v2,...) 
# Create dataframe with vectors dataframe[1,3] # Pick (1,3) dataframe[,2:4] # Pick the 2 to 4 column dataframe[1:2,"loc"] # Pick the first two row of loc column dataframe$loc # Pick the loc column subset(dataframe,subset = loc<1) # Pick where loc<1 a <- c(1,2,3) a[order(a)] # Sorting positions <- order(planets_df$diameter) # Use order() to create positions planets_df[positions,] # Use positions to sort planets_df ``` ### Lists ```r= my_vector <- 1:10 my_matrix <- matrix(1:9, ncol = 3) my_df <- mtcars[1:10,] my_list <- list(vec= my_vector, mat= my_matrix, df= my_df) ``` ## Statistics ### Exploring data ```r= comics # Print out dataset levels(comics$align) # Print out the types of align in dataset table(comics$id, comics$align) # Print out the joint number table of two categories prop.table(table(comics$id, comics$align)) # Print out their probability prop.table(table, [1/2]) # Conditional probability, 1 for row and 2 for column # Remove align level comics_filtered <- comics %>% filter(align != "Reformed Criminals") %>% droplevels() # Facet data ggplot(cars, aes(x = city_mpg)) + geom_histogram() + facet_wrap(~ suv) # Facet the plot by suv facet_grid(ncyl ~ suv) # Facet plot by ncyl and suv ``` ### Center measurement ```r= mean(dataset$col) # mean median(dataset$col) # median dataset %>% count(col, sort=TRUE) # mode dataset %>% # Filter name rice col filter(col == "name") %>% # Create histogram of col2 ggplot(aes(col2)) + geom_histogram() # Summarize the mean_col2 and median_col2 summarize(mean_col2 = mean(col2), median_col2 = median(col2)) ``` ### Spread measurement ```r= var(dataset$col) # variance sd(dataset$col) # standard deviation quantile(dataset$col, probs = seq(0,1,0.2)) # quantiles (by sequence or by c-vector) q1 <- quantile(dataset$col, 0.25) # 1st quartiles q3 <- quantile(dataset$col, 0.75) # 3rd quartiles iqr <- q3 - q1 # IQR lower <- q1 - 1.5*iqr upper <- q3 + 1.5*iqr food_consumption %>% filter(col<lower|col>upper) # finding outliers ``` 
### Probability ```r= set.seed(123) *** Sampling *** dataset %>% sample_n(5, replace=TRUE) # Sampling with replacement count(group_size) %>% mutate(probability = n / sum(n)) # Add a new col of prob replicate(times, sample(vector, n, replace=TRUE)) # Sampling multiple times *** Uniform *** runif(number, min=0, max=1) # Generate multiple outcomes punif(0.2, min=0, max=1, lower.tail=FALSE) # P(x>0.2) in unif(0,1) *** Binomial *** rbinom(number, n, p) # Generate multiple outcomes dbinom(x, n, p) # P(X=x) pbinom(x, n, p) # P(X<x) *** Normal *** rnorm(number, mean, sd) # Generate multiple outcomes pnorm(x, mean, sd) # P(X<x) qnorm(percentage, mean, sd) # x s.t. P(X<x)=percentage *** Poisson *** rpois(number, lambda) # Generate multiple outcomes dpois(x, lambda) # P(X=x) ppois(x, lambda) # P(X<x) *** Exponential *** pexp(x, beta) # P(X<x) - - - - - - - - - - - - - - - - - - - - - - - - - - - *** Correlation *** ggplot(df, aes(x,y))+geom_point() # x-y point plot +geom_smooth(method = "lm", se = FALSE) # Add trendline cor(df$x, df$y, use = "pairwise.complete.obs") # Correlation coefficient # If the data is not quite linear, we use transformation (such as log) mutate(col = log(col)) ``` ## R Markdown ### Structure ```python > --- > title: "Investment Report" > output: > html_document: > css: styles.css > --- > > ```{r setup, include=FALSE} > knitr::opts_chunk$set(fig.align='left', echo=TRUE) #Globally setting > ``` > > ```{r data, include = FALSE} > library(readr) > library(dplyr) > library(ggplot2) > library(knitr) > > investment_annual_summary <- read_csv("https://assets.datacamp.com/production/repositories/5756/datasets/d0251f26117bbcf0ea96ac276555b9003f4f7372/investment_annual_summary.csv") > investment_services_projects <- read_csv("https://assets.datacamp.com/production/repositories/5756/datasets/78b002735b6f620df7f2767e63b76aaca317bf8d/investment_services_projects.csv") > ``` > > ## Datasets > ### Investment Annual Summary > The `Investment Annual Summary` dataset 
provides a summary of the dollars in millions provided to each region for each fiscal year, from 2012 to 2018. > ```{r} > investment_annual_summary > ``` ``` ### YAML Header ```python --- title: title author: author date: "Last edited`r format(Sys.time(), '%d %B %Y')`" output: html/pdf_document: # Type of document toc: true # Add table of contents toc_float: [float setting] toc_depth: 2 # Depth listing of toc (how many #s) number_sections: true # Add number of sections params: [params type]: [parameter name] --- *** Day control %A or %a: weekday %B or %b: month %d: decimal day %m: decimal month %Y or %y: year *** Float Setting true toc shows at the left all time collapsed: false full toc remain visible smooth_scroll: false click to change part without animation *** Parameter Use [type] to replace [name] in the r chunk Use `r params$[type]` to replace [name] in the text ``` ### Style ```python <style> #TOC{ color: [color or 6 number code]; font-family: [font name]; font-size: 16px; background-color: [color or 6 number code]; border-color: [color or 6 number code]; } #header{ color: [color or 6 number code]; font-family: [font name]; opacity: 0.6; font-size: 16px; background-color: [color or 6 number code]; } (Or use h1.title, h4.author, h4.date) body/pre{ color: [color or 6 number code]; font-family: [font name]; background-color: [color or 6 number code]; } </style> Style text can be put between the YAML header and the first r chunk. Or you can make another file styles.css and link this file in the YAML header. 
``` ### R code chunk ```python > ```{r chunk name, include= TRUE, echo= TRUE, eval= TRUE, collapse= FALSE, warning= TRUE, message= TRUE, error= FALSE} > code > ``` #Default: TRUE include FALSE: Run / Not show code / Not show result echo FALSE: Run / Not show code / Show result eval FALSE: Not run / Show code / Not show result message FALSE: Run / Show code / Not show result warning FALSE: Do not show warning #Default: FALSE collapse TRUE: code and text shows in the same block error TRUE: run code and show error chunk ``` ### Text ```python Header # Biggest header ## Big header ### Normal header Text *text* **text** ~text~ Links [DataCamp](https://learn.datacamp.com/) Image ![](datacamp.png) for images ``` ### Plots ```python > ```{r figurename, [dimension control], fig.cap= "cap name"} > ggplot(name, aes(x= xdata, y= ydata, color= colordata))+ > geom_line()+ > labs(title= "title", x= "xlabel", y= "ylabel") > ``` Dimension control fig.width=5, fig.height=3 # Or fig.dim=c(5,3) out.width='50%' out.height='30%' fig.align= 'left/right/center' # Or just globally setting in the r setup chunk ``` ### Table ```python > ```{r tablename} > kable(name, > col.names= c("col1", "col2"), > align= "ccc", > caption= "caption") > ``` ``` ## Importing Data ### CSV (Flat files) ```r= path <- file.path("~", "datasets", "data.csv") *** utils package (installed by default) read.csv(path) # Comma separated file (sep= ",") read.delim(path) # Tab delimited file (sep= "\t") read.table(path, header= TRUE, sep= "/") # "/" separated file read.csv2(path) # sep= ";" *** readr package library(readr) read_csv(path) # Comma separated file (sep= ",") read_tsv(path) # Tab delimited file (sep= "\t") read_delim(path, delim= "/", col_names= FALSE, col_types= "c/d/i/l/_", skip= [skiprows], n_max= [readrows]) *** Data table fread install.packages("data.table") library(data.table) fread(path, drop=[dropcols], select=[selectcols]) ``` ### Excel ```r= *** Readxl install.packages("readxl") library("readxl") 
excel_sheets("data.xlsx") # Show sheets read_excel("data.xlsx", sheet= [sheetnumber or sheetname], col_names= TRUE, col_types= [types], skip= [skiprows]) pop_list <- lapply(excel_sheets("data.xlsx"), read_excel, path= "data.xlsx") # Read all files in data.xlsx * col_types "text" text "numeric" numbers "date" date "blank" ignore column *** XLconnect install.packages("XLConnect") library("XLConnect") book <- loadWorkbook("data.xlsx") getSheets(book) # Show sheets readWorksheet(book, sheet= "name", header= FALSE, startRow=2, endRow=4, startCol=3, endCol=5) createSheet(book, name= "name") writeWorksheet(book, data, sheet= "name") saveWorkbook(book, file= "newname") renameSheet(book, "oldname", "newname") removeSheet(book, sheet= "name") ``` ## Data Visualization ### Grammer of graphic ![截圖 2025-03-03 下午2.01.19](https://hackmd.io/_uploads/BJ1kLazoJx.png) ### Aesthetics ```python= ggplot(dataset, aes([aesthetics], [col]))+ geom_point()+ labs(x= "xlabs", y= "ylabs")+ scale_[aes]_[discrete/continuous]() # Note: aes如果寫在ggplot的話底下的geom會繼承aes *** aesthetics: x, y axis column fill fill color color color of point, outlines of other geoms size radius of point, thickness of lines alpha transparency linetype line dash label text on the plot or axes *** scale functions scale_x_continuous("xlabname", limits= [xlim], breaks= [xpassing], expand= [expand between breaks], labels= [type name]) ``` ### Geometries #### Scatter plot ```python= geom_point(data= "newdataname", [aes]= [types], position= "[position]") *** data: drawing with a new dataset (different then in ggplot) *** aesthetics x,y essential aes (written in ggplot) alpha/color/fill/shape/size/stroke optional *** position: identity/dodge/stack/fill/jitter/jitterdodge/nudge Setting position_jitter(0.1) control random argument # geom_point(position= "jitter") = geom_jitter() 添加隨機噪聲 ``` #### Point chart ```python= # Reorder to adjust the order of plotting ggplot(who_subset, aes(x = log10(cases_1992), y = reorder(country, 
cases_1992))) + geom_point()+ # add a visual anchor at x = 0 geom_vline(xintercept = 0)+ # add facet_grid arranged in the column direction by region and free_y scales facet_grid(region~., scales = 'free_y') ``` #### Histogram ```python= geom_histogram(binwidth= 0.1, center= 0.05, [aes]= [types], position= "position") geom_density() # change bar into density geom_rug() # add rug to data *** bindwidth, center: width of bins and the center width *** aesthetics x essential aes (written in ggplot) alpha/color/fill/shape/size/stroke optional *** position stack bins stack dodge bins show by types fill normalize and fill to 1 by proportion ``` #### Bar chart ```python= geom_bar() # count number of cases geom_col() # plot actual value coord_flip() # flip x and y such that label can be shown perfectly # Plotting distributions ggplot(dataset, aes(x= col))+ geom_col()+ geom_errorbar(aes(ymin= avg-stdev, ymax= avg+stdev), width= 0.1) *** Stacked Bar *** disease_counts <- who_disease %>% # Filter to on or later than 1999 filter(year >= 1999) %>% mutate(disease = ifelse(disease %in% c('measles', 'mumps'), disease, 'other')) %>% group_by(disease, region) %>% # Add region to grouping summarise(total_cases = sum(cases)) ggplot(disease_counts, aes(x = year, y = total_cases, fill = disease)) + # Change the position argument to make bars full height geom_col(position = 'fill') ``` #### Line plots ```python= geom_line() geom_area() # Draw stack area plot geom_ribbon(aes(ymax= capture, ymin= 0), alpha= 0.3) ``` #### Pie chart ```python= ggplot(disease_counts, aes(x = 1, y = total_cases, fill = disease)) + # Use a column geometry. geom_col() + # Change coordinate system to polar and set theta to 'y'. coord_polar(theta = "y")+ # Clean up the background with theme_void and give it a proper title with ggtitle. 
theme_void() + ggtitle('Proportion of diseases') ``` #### Waffle Plot ```python= disease_counts <- who_disease %>% group_by(disease) %>% summarise(total_cases = sum(cases)) %>% mutate(percent = round(total_cases/sum(total_cases)*100)) # Create an array of rounded percentages for diseases. case_counts <- disease_counts$percent # Name the percentage array names(case_counts) <- disease_counts$disease # Pass case_counts vector to the waffle function to plot waffle(case_counts) ``` #### Box plot ```python= geom_boxplot() facet_wrap(~vehicle_color) # show lots of plot *** Beeswarm plot library(ggbeeswarm) md_speeding %>% filter(vehicle_color == 'RED') %>% ggplot(aes(x = gender, y = speed)) + # change point size to 0.5 and alpha to 0.8 geom_beeswarm(cex = 0.5, alpha = 0.8) + # add a transparent boxplot on top of points geom_boxplot(alpha = 0) *** Violin plot md_speeding %>% filter(vehicle_color == 'RED') %>% ggplot(aes(x = gender, y = speed)) + # Replace beeswarm geometry with a violin geometry with kernel width of 2.5 geom_violin(bw = 2.5) + # add individual points on top of violins and set their alpha to 0.3 and size to 0.5 geom_point(alpha = 0.3, size = 0.5) ``` #### Ridge line plot ```python= library(ggridges) md_speeding %>% mutate(day_of_week = factor(day_of_week, levels = c("Mon","Tues","Wed","Thu","Fri","Sat","Sun") )) %>% ggplot(aes( x = percentage_over_limit, y = day_of_week)) + # Set bandwidth to 3.5 geom_density_ridges(bandwidth = 3.5) + # add limits of 0 to 150 to x-scale scale_x_continuous(limits = c(0,150)) + # provide subtitle with bandwidth labs(subtitle = 'Gaussian kernel SD = 3.5') ``` ### Theme ```python= theme([type]= element_text/line/rect/blank()) theme([type]= unit(3,"cm")) theme([type].margin= margin(20,30,40,50, "pt")) theme_classic() # traditional theme_tufte() # need library(ggthemes) theme_gray() # default theme_bw() # use transparency. 
theme_void() # removes everything but the data theme_fivethirtyeight() # 538.com theme_wsj() # Wall Street Journal theme_minimal() # lighten the chart up original <- theme_update() theme_set(original) # set as default * Type: text text axis.title axis.title.x axis.title.x.top axis.title.x.bottom axis.title.y axis.title.y.left axis.title.y.right title legend.title legend.position plot.title plot.subtitle plot.caption plot.tag axis.text axis.text.x axis.text.x.top axis.text.x.bottom axis.text.y axis.text.y.left axis.text.y.right legend.text strip.text strip.text.x strip.text.y * Type: line line axis.ticks axis.ticks.x axis.ticks.x.top axis.ticks.x.bottom axis.ticks.y axis.ticks.y.left axis.ticks.y.right axis.line axis.line.x axis.line.x.top axis.line.x.bottom axis.line.y axis.line.y.left axis.line.y.right panel.grid panel.grid.major panel.grid.major.x panel.grid.major.y panel.grid.minor panel.grid.minor.x panel.grid.minor.y * Type: rectangle rect legend.background legend.key legend.box.background panel.background panel.border plot.background strip.background strip.background.x strip.background.y ``` ##### Example ```r= # Set the color scale palette <- brewer.pal(5, "RdYlBu")[-(2:4)] # Add a title and caption ggplot(gm2007, aes(x = lifeExp, y = country, color = lifeExp)) + geom_point(size = 4) + geom_segment(aes(xend = 30, yend = country), size = 2) + geom_text(aes(label = round(lifeExp,1)), color = "white", size = 1.5) + scale_x_continuous("", expand = c(0,0), limits = c(30,90), position = "top") + scale_color_gradientn(colors = palette) + labs(title = "Highest and lowest life expectancies, 2007", caption = "Source: gapminder") # Define the theme plt_country_vs_lifeExp + theme_classic() + theme(axis.line.y = element_blank(), axis.ticks.y = element_blank(), axis.text = element_text(color = "black"), axis.title = element_blank(), legend.position = "none") # Add a vertical line plt_country_vs_lifeExp + step_1_themes + geom_vline(xintercept= global_mean, 
color="grey40", linetype=3) # Add text plt_country_vs_lifeExp + step_1_themes + geom_vline(xintercept = global_mean, color = "grey40", linetype = 3) + annotate( "text", x = x_start, y = y_start, label = "The\nglobal\naverage", vjust = 1, size = 3, color = "grey40" ) # Add a curve plt_country_vs_lifeExp + step_1_themes + geom_vline(xintercept = global_mean, color = "grey40", linetype = 3) + step_3_annotation + annotate( "curve", x = x_start, y = y_start, xend = x_end, yend = y_end, arrow = arrow(length = unit(0.2, "cm"), type = "closed"), color = "grey40" ) ``` ## dplyr ### Basic functions ```python= data <- dataset %>% select(col1, col2, ...) %>% # Select some columns arrange(col1) %>% # Arrange by col1 (from the least) arrange(desc(col1)) %>% # Arrange by col1 (from the max) filter(col1="type1", col3<100) %>% # Filter data mutate(newcol = col1*col2/100) %>% # Add new column rename(colnewname = col1) # Rename a column dataset %>% count() %>% # Count total number count(col1, wt=col2, sort=TRUE) %>% # Count number by col1, weighted by col2, sort group_by(col1, col2) %>% # Group dataset by col1 and col2 slice_min(col3, n=3) %>% # Slice the min 3 obs. 
in col3 for each group after grouping ungroup() relocate(col1, .after = last_col()) %>% # Move col1 to the end relocate(col2, .before = col5) # Move col2 before col5 dataset %>% arrange(name, year) %>% group_by(name) %>% mutate(ratio = fraction / lag(fraction)) # Add a ratio column that contains the ratio of fraction between each year *** Helpers to select contains(" ") starts_with(" ") ends_with(" ") last_col(" ") matches(" ") ``` ### Joining Parts ```r= part1 %>% inner_join(part2, by = c("newcolname" = "col"), # Inner join part2 to part 1 by col suffix = c("_part1", "_part2")) # Replace .x and .y if two parts have same colname part1 %>% left_join(part2, by = c("col1", "col2")) %>% # Join and keep all rows of the first part right_join(part3, by = c("col3", "col4")) %>% # Join and keep all rows of the second part full_join(part4, by = c("col5", "col6")) %>% # Full join, keep all rows of both parts replace_na(list(n=0)) # Replace missing value in the n column part1 %>% semi_join(part2, by = c("col1", "col2")) %>% # part1 裡有哪些也在 part2 中 anti_join(part3, by = c("col3", "col4")) # part1 裡有哪些不在 part3 中 # Join themes to itself to find the grandchild relationships part %>% inner_join(part, by = c("id" = "parent_id"), suffix = c("_parent", "_child")) %>% inner_join(themes, by = c("id_child" = "parent_id"), suffix = c("_parent", "_grandchild")) # Join two tables with rows part <- bind_rows(part1 %>% mutate(type = "type1"), part2 %>% mutate(type = "type2")) ``` ## tidyr ### Dealing with data ```r= dataset %>% # Separate or unite data separate(col1, into = c("value", "unit"), sep=" ", convert = TRUE) %>% # Convert to integer unite("colname", col1, col2, sep=" ") %>% separate_rows(col, sep=" ") %>% # Multiple data in one row # Missing value replace_na(list(col = "replace value")) %>% # Replace NA in col with value, 0L for integer 0 fill(col, .direction = "down") %>% # Fill NA in col with down direction drop_na() %>% # Drop all rows with NA drop_na(col) # 
Drop rows with NA in col ``` ### Wide and long data ```r= dataset %>% # From wide to long data pivot_longer(-country) %>% # pivot to long data except country pivot_longer(`1945`:`1951`, names_to = c("name1", "name2"), names_prefix = "ignore_name_part", names_sep = "sep", names_transform = list(name1 = as.integer, name2 = as.integer), values_to = "value", values_drop_na = FALSE) %>% uncount(col, .id = "id") %>% # Create one row for each col and add the id # From long to wide data pivot_wider(names_from = col, values_from = value) ``` ### Expanding data ```r= full_df <- expand_grid( # Make a 9 rows grid data col1 = c(type1, type2, type3), col2 = c(type1, type2, type3)) full_df <- complete(col1, col2, fill = list(col3 = 0L)) # Complete create the grid data years <- full_seq(c(2020, 2030), period = 1) decades <- full_seq(c(1980, 2030), period = 10) # Generate the dates for all days in 1980 outer_dates <- c(as.Date("1980-01-01"), as.Date("1980-12-31")) full_seq(outer_dates, period = 1) # Fill up time series data complete(time = seq(min(time), max(time), by = "20 min"), fill = list(enter = 0L, exit = 0L)) ``` ### Non-rectangular data ```r= # Create a tibble with a movie column tibble(movie = movie_planets_list) %>% unnest_wider(movie) %>% # Unnest the movie column unnest_longer(planets) # Unnest the planets column movie_df %>% hoist( movie, title = "Title", year = "Year", rating = list("Ratings", "Rotten Tomatoes") ) ansur_df %>% group_by(sex) %>% # Group the data by sex nest() %>% # Nest the data mutate( fit = map(data, function(df) lm(weight_kg ~ waist_circum_m + stature_m, data = df)), glanced = map(fit, glance)) %>% unnest(glanced) # Unnest the glanced column ``` ## Cleaning Data ### Basic problem ```r= glimpse(dataset) # See all datatype # Datatype problem col = as.[datatype](col) # factor, character, numeric, Date, logical col = str_remove(col, "sep") # Remove unused sep # Range problem library(dplyr) library(assertive) 
assert_all_are_in_past(dataset$datecol) # Check that all dates in the column are in the past dataset %>% replace(col, condition, replacement) # Replace rows out of range by NA or border # Uniqueness constraints ## Full Duplicate duplicated(bike_share_rides) # Duplicated vector distinct(bike_share_rides) # Remove duplicate ## Partial Duplicate dataset %>% count(col1, col2) %>% # Count rows duplicate in col1 and col2 filter(n>1) %>% # filter out these rows dataset_unique <- dataset %>% distinct(col1, col2, .keep_all = TRUE) # Remove partial duplicate dataset_avgdupli %>% # Use mean value to remove duplicate group_by(col1, col2) %>% mutate(colvalue_min = mean(colvalue)) %>% distinct(col1, col2, .keep_all = TRUE) %>% select(-colvalue) ``` ### Categorical Data ```r= dataset %>% semi_join(checklist, by = "list") # Wrong factor problem mutate(col_trim = str_trim(col), # Cleaning white space col_low = str_to_lower(col_trim)) # Convert to lower case # Collapse same kind of type collapse_type <- c("type1", "type2", "type3") dataset %>% mutate(col_collapsed = fct_collapse(col, newname = collapse_type)) ``` ### Text Data ```r= # str_detect dataset %>% filter(str_detect(col, "-")) %>% filter(str_detect(col, fixed("(")) | str_detect(col, fixed(")"))) # str_remove col_clean <- dataset$phone %>% str_remove_all(fixed("(")) %>% str_remove_all(fixed(")")) # str_replace dataset %>% mutate(col_clean = col_clean, col_clean_new = str_replace_all(col_clean, "-", " ")) # str_length sfo_survey %>% filter(str_length(phone) == 12) ``` ### Date Data ```r= # Date cleaning formats <- c("%Y-%m-%d", "%B %d, %Y") # Define the date formats dataset %>% mutate(date_clean = parse_date_time(date, orders = formats)) # Find invalid age dataset %>% # theoretical_age: age based on date mutate(theoretical_age = floor(as.numeric(date %--% today(), "years"))) %>% filter(age != theoretical_age) ``` ### Missing Value ```r= library(visdat) # Original visualize vis_miss(dataset) # Visualize missing value 
by column dataset %>% mutate(missing_inv = is.na(inv_amount)) # Sorted visualize dataset %>% arrange(age) %>% vis_miss() # Dealing with NA dataset_clean <- dataset %>% filter(!is.na(cust_id)) %>% mutate(col_filled = ifelse(is.na(col), inv_amount * 5, col)) assert_all_are_not_na(dataset_clean$col_filled) # Assert that col_filled has no missing vals ``` ### Linkage ```r= # String distance library(stringdist) stringdist("string1", "string2", method = "[method]") # dl, lcs, jaccard stringdist_left_join(cities, by = c("city" = "city_actual")) # Repair linkage library(reclin) pair_blocking(df_1, df_2, blocking_var="col") %>% # Generate pairs over col compare_pairs(by = "name", default_comparator = lcs()) # Compare pairs by name using lcs() # Scoring and linking pair_blocking(df_1, df_2, blocking_var = "col") %>% compare_pairs(by = c("name", "addr"), default_comparator = jaro_winkler()) %>% # Compare pairs score_problink() %>% # Score pairs select_n_to_m() %>% # Select pairs link() # Link data ``` ## Functions ### Basic ```r= args(func_name) # Look at the arguments of the function func_name <- function(arg1, arg2) { # Write # Codes } func_name(arg1, arg2) # Call func_name <- function(dataarg, detailarg = default) # dataarg: data to run in the function # detailarg: some detail setting in the function (can have a default) ``` ### Default setting example - na.rm vs. ... ```r= calc_harmonic_mean <- function(x, na.rm = FALSE) { x %>% get_reciprocal() %>% mean(na.rm = na.rm) %>% get_reciprocal() } calc_harmonic_mean <- function(x, ...) { x %>% get_reciprocal() %>% mean(...) 
%>% get_reciprocal() } ``` ### Return ```python= # Return multiple groom_model <- function(model) { list( model = glance(model), coefficients = tidy(model), observations = augment(model) ) } # Returning metadata pipeable_plot <- function(data, formula) { plot(formula, data) # Add a "formula" attribute to data attr(data, "formula") <- formula invisible(data) } ``` ### Environment ```py= # Compare the contents of the global environment and rsa_env ls.str(globalenv()) ls.str(rsa_env) # Does population exist in rsa_env? exists("population", envir = rsa_env) # Does population exist in rsa_env, ignoring inheritance? exists("population", envir = rsa_env, inherits = FALSE) ``` ### Generalized Addictivity Model ```py= library(mgcv) gam(yield_kg_per_ha ~ s(year) + census_region, data = corn) ``` ## Writing Efficient R Code ```python= # Print the R version details using version version # How long does it take to read movies from CSV? system.time(read.csv("movies.csv")) # Compare the two functions library(microbenchmark) compare <- microbenchmark(read.csv("movies.csv"), readRDS("movies.rds"), times = 10) compare # Assign the variable ram to the amount of RAM on this machine # Assign the variable cpu to the cpu specs library("benchmarkme") ram <- get_ram() cpu <- get_cpu() res <- benchmark_io(runs = 1, size = 5) ram cpu plot(res) # Use <- with system.time() to store the result as res_grow system.time(res_grow <- growing(n)) ``` ### Profiling * Build matrix is faster than build dataframe * Use build-in function is faster than use random function * && is useful than & ```python= # Load the profvis package library(profvis) # Profile the following code profvis({ # Load and select data comedies <- movies[movies$Comedy == 1, ] # Plot data of interest plot(comedies$year, comedies$rating) # Loess regression line model <- loess(rating ~ year, data = comedies) j <- order(comedies$year) # Add fitted line to the plot lines(comedies$year[j], model$fitted[j], col = "red") }) ``` ### Parallel 
Running ```r= library("parallel") detectCores() # Determine the number of available cores. cl <- makeCluster(2) # Create a cluster via makeCluster parApply(cl, dd, 2, median) # Parallelize this code stopCluster(cl) # Stop the cluster no_of_games <- 1e5 # Set the number of games to play # Time serial version system.time(serial <- sapply(1:no_of_games, function(i) play())) cl <- makeCluster(4) # Set up cluster clusterExport(cl, "play") # Time parallel version system.time(par <- parSapply(cl, 1:no_of_games, function(i) play())) stopCluster(cl) # Stop cluster ``` ## Shell ### Basic ```bash= cat file.txt # View a file's contents less file1.txt file2.txt # View file piece by piece (:n for next file, :q to quit) head file.txt # View the start of a file ``` ### Flags ```bash= # ls ls -R -F /home/dir1 # -R: List all ; -F: Add "/" after dir, "*" after program # head & tail: Select rows head -n [10] file.txt # View the first 10 lines tail -n [+7] file.txt # View from line 7 to the end # cut: Select columns cut -f [2-5,8] -d [,] file.csv # -f: Select col 2-5 and 8 ; -d: Delimiter as "," # grep: Selects lines according to what they contain -c: print a count of matching lines rather than the lines themselves -h: do not print the names of files when searching multiple files -i: ignore case (e.g., treat "Regression" and "regression" as matches) -l: print the names of files that contain matches, not the matches -n: print line numbers for matching lines -v: invert the match, i.e., only show lines that don't match # wc: Word count -c: Characters -w: Words -l: Lines # sort -n: numerically -r: reverse -b: ignore leading blanks -f: fold case # uniq -c: display unique lines with a count of how often each occurs ``` ### Combining ```bash= head -n 5 file.csv > new.csv # Store head 5 lines to new.csv cut -d , -f 2 file.csv | grep -v Tooth # Combine commands head -n 5 files/* # Select many files ``` ### Shell storage ```bash= echo $var # echo back variable value for [filetype] in [gif jpg png]; do [echo $filetype]; done # repeat 
command ```