---
tags: Master
---

# Data viz bibbia

## Setup R markdown

```yaml
title: "Italian taste preprocessing"
output:
  html_document:
    toc: yes
    df_print: paged
  html_notebook:
    toc: yes
params:
  echo: yes
  message: no
  warning: no
```

```r
# chunk header: {r setup, include=FALSE}
# Drive the default chunk options from the document `params` declared above.
knitr::opts_chunk$set(
  echo = params$echo,
  message = params$message,
  warning = params$warning
)
options(reticulate.repl.quiet = TRUE)
```

## R libraries

```r
library(reticulate) # for accessing Python environment use `py$var_name`
library(tidyverse)  # has ggplot + stuff
library(broom)      # tidy functions
library(geojsonio)  # read geojson
library(umap)       # UMAP
library(smacof)     # MDS
library(Rtsne)      # TSNE
...
```

## Python libraries

```python
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.manifold import TSNE, MDS
from sklearn.decomposition import PCA
from umap import UMAP
```

## Fast Excel read

```python
import openpyxl
import pandas as pd

# read_only=True streams the sheet lazily; data_only=True returns cached
# cell values instead of formulas.
wb = openpyxl.load_workbook("Database generale 2015 2016 2017 TN.xlsx", read_only=True, data_only=True)
sheet = wb["dati"]
rows = sheet.iter_rows(max_row=257, max_col=896, values_only=True)
first_row = next(rows)  # first row holds the column names
data = pd.DataFrame(rows, columns=first_row)
wb.close()  # read-only workbooks keep the file open until closed explicitly
```

## Read geojson

```r
geo_data <- geojson_read("data/countries.geojson", what = "sp")
geo_data <- tidy(geo_data, region = "ADMIN")
```

## Maps

https://cengel.github.io/R-spatial/mapping.html#choropleth-mapping-with-ggplot2

## Plot in pandas

```py
data.plot.scatter(x='TSNE1', y='TSNE2')
plt.show()
```

## Seaborn

```python
import seaborn as sns
...
ax = sns.barplot()
# use matplotlib settings
ax.set(xlabel='common xlabel', ylabel='common ylabel', title='some title')
# `ax.title` is a Text attribute, not a method: use set_title() to pass font options
ax.set_title("Population Pyramid of the Marketing Funnel", fontsize=22)
```

## Theme + legend in R

http://www.cookbook-r.com/Graphs/Legends_(ggplot2)

https://ggplot2.tidyverse.org/reference/theme.html

http://zevross.com/blog/2014/08/04/beautiful-plotting-in-r-a-ggplot2-cheatsheet-3/

```r
ggplot(data) +
  # hide aesthetic from legend
  geom_text(data = data, aes(x, y, label = round(temp), size = 4), show.legend = FALSE) +
  # modify scales
  scale_y_continuous(breaks = seq(0, 1, by = 0.1), limits = c(0.2, 1)) + # range and tick intervals
  # NOTE: a plot keeps only one x scale; pick the continuous OR the date variant
  scale_x_continuous(labels = function(x) {
    return(paste("My value is", x, "degrees"))
  }) + # rename ticks labels
  scale_x_date(date_breaks = "3 day", expand = c(0, 0)) + # time ticks and remove empty offset in the axis
  coord_equal() + # if you want same scaling
  # modify theme
  theme_minimal() + # full style (eg.: theme_void(), theme_tufte(),...)
  theme( # overwrite theme settings
    # text
    text = element_text(family = "Verdana"), # all text
    axis.title = element_text(size = 20, family = "Bauhaus 93", face = "bold", margin = margin(10, 0, 10, 0)),
    axis.line.x = element_line(size = 0.3),
    axis.ticks.x = element_line(),
    # background grid
    panel.grid.major.y = element_line(size = 0.3, linetype = "solid", colour = "black"),
    panel.grid.major.x = element_blank(),
    # legend
    legend.position = "top",
    legend.title = element_text(size = 12, colour = "chocolate")
  ) +
  # modify text
  labs(
    title = "",
    subtitle = "",
    caption = "",
    tag = "",
    x = "",
    y = "",
    colour = c("Region" = ""), # rename legend for color
    ... # and more
  )
```

## Manual shapes/colors

### Manual color

```r
library(tidyverse)
library(ggplot2)
library(Cairo)
library(plotly)

data <- read_csv(file = "economist_data.csv")

# one colour per value of `Region`
cols <- c(
  "Asia Pacific" = "#461313",
  "Americas" = "#279e27",
  "East EU Cemt Asia" = "blue",
  "SSA" = "brown",
  "EU W. Europe" = "yellow",
  "MENA" = "grey"
)

p1 <- ggplot(data, aes(x = CPI, y = HDI, colour = Region)) +
  geom_point(shape = 21, stroke = 1.5, size = 1.5) +
  # colour = NULL drops the per-Region grouping so a single line is fitted
  geom_smooth(
    method = lm,
    aes(
      colour = NULL,
      lty = "diocan"
    ),
    colour = "red"
  ) +
  scale_color_manual(values = cols) +
  scale_fill_discrete(guide = "none") +
  theme_classic() +
  theme(
    panel.grid.major.y = element_line(size = 2),
    panel.grid.major = element_line(size = 0.5, color = "black"),
    panel.grid.minor = element_blank()
  ) +
  labs(
    x = "X",
    y = "Y"
  )
p1
```

### Manual shape

```r
library(ggplot2)
library(tidyverse)
library(ggExtra)

data <- read_csv("iris.csv")

# classic plot :
p1 <- ggplot(data, aes(x = `sepal length`, y = `sepal width`, color = `class`, shape = `class`)) +
  geom_point() +
  scale_shape_manual(values = c("Iris-setosa" = 21, "Iris-versicolor" = 22, "Iris-virginica" = 23)) +
  theme(legend.position = "none")

# marginal density
p2 <- ggMarginal(p1, type = "density", groupColour = TRUE, groupFill = TRUE)
p2
```

### Manual fill

```r
library(tidyverse)
library(ggplot2)
library(Cairo)
library(plotly)
library(gridExtra)

data <- read_tsv(file = "barplot_data.csv")

# sign as character ("-1" / "1") so it can be mapped to a discrete fill
data$SEQC_SIGN <- as.character(sign(data$FoldChange_SEQC))
data$NANO_SIGN <- as.character(sign(data$FoldChange_Nanostring))

p1 <- ggplot(data, aes(x = abs(FoldChange_SEQC), y = Gene)) +
  geom_bar(stat = "identity", aes(fill = SEQC_SIGN), show.legend = FALSE) +
  # one "*" per 8 units of -log(p-value)
  geom_text(aes(label = strrep("*", floor(-log(Pvalue_SEQC) / 8)))) +
  scale_fill_manual(values = c("-1" = "red", "1" = "green")) +
  theme_minimal() +
  theme(axis.text.y = element_text(hjust = 0.5))

p2 <- ggplot(data, aes(x = -abs(FoldChange_Nanostring), y = Gene)) +
  geom_bar(stat = "identity", aes(fill = NANO_SIGN), show.legend = FALSE) +
  # NOTE(review): this panel plots Nanostring fold changes but labels them with
  # Pvalue_SEQC; confirm whether a Nanostring p-value column was intended
  geom_text(aes(label = strrep("*", floor(-log(Pvalue_SEQC) / 8)))) +
  scale_fill_manual(values = c("-1" = "red", "1" = "green")) +
  # bars grow to the left: negative positions, labelled with absolute values
  scale_x_continuous(
    labels = function(x) signif(abs(x), 3),
    breaks = c(0, -2, -4, -6, -8),
    limits = c(-9, 0)
  ) +
  theme_minimal()

grid.arrange(p2, p1, ncol = 2)
```

## Data transform in r

```r
# set new column as "-1" or "1" (need categorical, not numeric)
data$SEQC_SIGN <- as.character(sign(data$FoldChange_SEQC))
data$NANO_SIGN <- as.character(sign(data$FoldChange_Nanostring))

# unique
countries <- unique(data$Country)
```

## Faceting in R

```r
# side-by-side panels built with gridExtra::grid.arrange (not ggplot2 facets)
p1 <- ggplot(data, aes(x = abs(FoldChange_SEQC), y = Gene)) +
  geom_bar(stat = "identity", aes(fill = SEQC_SIGN), show.legend = FALSE) +
  geom_text(aes(label = strrep("*", floor(-log(Pvalue_SEQC) / 8)))) +
  scale_fill_manual(values = c("-1" = "red", "1" = "green")) +
  theme_minimal() +
  theme(axis.text.y = element_text(hjust = 0.5))

p2 <- ggplot(data, aes(x = -abs(FoldChange_Nanostring), y = Gene)) +
  geom_bar(stat = "identity", aes(fill = NANO_SIGN), show.legend = FALSE) +
  # NOTE(review): labels use Pvalue_SEQC on the Nanostring panel; confirm intended column
  geom_text(aes(label = strrep("*", floor(-log(Pvalue_SEQC) / 8)))) +
  scale_fill_manual(values = c("-1" = "red", "1" = "green")) +
  scale_x_continuous(
    labels = function(x) signif(abs(x), 3),
    breaks = c(0, -2, -4, -6, -8),
    limits = c(-9, 0)
  ) +
  theme_minimal()

grid.arrange(p2, p1, ncol = 2)

# logical flag column: TRUE where HDI is above 0.4
europe <- mutate(europe, gud = (HDI > 0.4))
```