---
tags: Master
---
# Data viz bibbia
## Setup R markdown
```yaml
# R Markdown document header (place between `---` delimiters at the top of the .Rmd)
title: "Italian taste preprocessing"
# One entry per output format; options are nested under the format they belong to
output:
  html_document:
    toc: yes
    df_print: paged
  html_notebook:
    toc: yes
# Document parameters, read in R chunks as params$echo, params$message, params$warning
params:
  echo: yes
  message: no
  warning: no
```
```r
{r setup, include=FALSE}
# Apply the document parameters as default chunk options in a single call.
knitr::opts_chunk$set(
  echo = params$echo,
  message = params$message,
  warning = params$warning
)
# Set the reticulate REPL quiet option
options(reticulate.repl.quiet = TRUE)
```
## R libraries
```r
library(reticulate) # for accessing Python environment use `py$var_name`
library(tidyverse) # has ggplot + stuff
library(broom) # tidy functions
library(geojsonio) # read geojson
library(umap) # UMAP
library(smacof) # MDS
library(Rtsne) # TSNE
...
```
## Python libraries
```python
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.manifold import TSNE, MDS
from sklearn.decomposition import PCA
from umap import UMAP
```
## Fast Excel read
```python
import openpyxl
import pandas as pd

# read_only=True streams the sheet instead of loading it whole;
# data_only=True returns the cached values of formula cells rather than the formulas.
wb = openpyxl.load_workbook("Database generale 2015 2016 2017 TN.xlsx", read_only=True, data_only=True)
try:
    sheet = wb["dati"]
    # Lazy generator of value tuples, limited to the first 257 rows and 896 columns
    rows = sheet.iter_rows(max_row=257, max_col=896, values_only=True)
    # The first row holds the column names; the remaining rows become the data
    first_row = next(rows)
    data = pd.DataFrame(rows, columns=first_row)
finally:
    # A read-only workbook keeps the file open until it is closed explicitly.
    # Close only after the DataFrame is built, because `rows` is consumed lazily.
    wb.close()
```
## Read geojson
```r
# Read the GeoJSON file as an sp object, then flatten it into a data frame
# with one region per value of the "ADMIN" property.
geo_data <- geojson_read("data/countries.geojson", what = "sp") %>%
  tidy(region = "ADMIN")
```
## Maps
https://cengel.github.io/R-spatial/mapping.html#choropleth-mapping-with-ggplot2
## Plot in pandas
```py
# Scatter plot of two DataFrame columns through the generic plot() entry point
data.plot(kind='scatter', x='TSNE1', y='TSNE2')
# Render the current matplotlib figure
plt.show()
```
## Seaborn
```python
import seaborn as sns
...
# seaborn plotting functions return the matplotlib Axes they drew on
ax = sns.barplot()
# use matplotlib settings: Axes.set() sets several properties in one call
ax.set(xlabel='common xlabel', ylabel='common ylabel', title='some title')
# Axes.title is a Text object, not a method; set_title() sets the title and
# accepts text properties such as fontsize (it replaces the title set above)
ax.set_title("Population Pyramid of the Marketing Funnel", fontsize=22)
```
## Theme + legend in R
http://www.cookbook-r.com/Graphs/Legends_(ggplot2)
https://ggplot2.tidyverse.org/reference/theme.html
http://zevross.com/blog/2014/08/04/beautiful-plotting-in-r-a-ggplot2-cheatsheet-3/
```r
# Template of common ggplot2 customisations. `data`, `x`, `y` and `temp` are
# placeholders; keep only the layers you need.
ggplot(data) +
  # hide aesthetic from legend; the data is inherited from ggplot(), so the
  # first argument of the layer is the mapping
  geom_text(aes(x, y, label = round(temp), size = 4), show.legend = FALSE) +
  # modify scales (only one scale per axis is kept: a later x scale replaces
  # an earlier one, so pick the continuous OR the date variant)
  scale_y_continuous(breaks = seq(0, 1, by = 0.1), limits = c(0.2, 1)) + # range and tick intervals
  scale_x_continuous(labels = function(x) paste("My value is", x, "degrees")) + # rename ticks labels
  scale_x_date(date_breaks = "3 day", expand = c(0, 0)) + # time ticks and remove empty offset in the axis
  coord_equal() + # if you want same scaling
  # modify theme
  theme_minimal() + # full style (eg.: theme_void(), theme_tufte(),...)
  theme( # overwrite theme settings
    # text
    text = element_text(family = "Verdana"), # all text
    axis.title = element_text(
      size = 20,
      family = "Bauhaus 93",
      face = "bold",
      margin = margin(10, 0, 10, 0)
    ),
    axis.line.x = element_line(size = 0.3),
    axis.ticks.x = element_line(),
    # background grid
    panel.grid.major.y = element_line(size = 0.3, linetype = "solid", colour = "black"),
    panel.grid.major.x = element_blank(),
    # legend
    legend.position = "top",
    legend.title = element_text(size = 12, colour = "chocolate")
  ) +
  # modify text
  labs(
    title = "",
    subtitle = "",
    caption = "",
    tag = "",
    x = "",
    y = "",
    colour = "" # legend title for the colour aesthetic
    # ...and more: any other mapped aesthetic can be titled the same way
  )
```
## Manual shapes/colors
### Manual color
```r
library(tidyverse)
library(ggplot2)
library(Cairo)
library(plotly)
# Scatter plot of HDI against CPI, coloured by Region with a hand-picked
# palette, plus a single linear fit drawn over all regions.
data <- read_csv(file = "economist_data.csv")

# Named palette: each name is looked up among the values of `Region`
cols <- c(
  "Asia Pacific" = "#461313",
  "Americas" = "#279e27",
  "East EU Cemt Asia" = "blue",
  "SSA" = "brown",
  "EU W. Europe" = "yellow",
  "MENA" = "grey"
)

p1 <- ggplot(data, aes(x = CPI, y = HDI, colour = Region)) +
  geom_point(shape = 21, stroke = 1.5, size = 1.5) +
  geom_smooth(
    method = lm,
    aes(
      # drop the inherited colour mapping so one line is fitted to all points
      colour = NULL,
      # a constant inside aes() gives the line its own linetype legend entry,
      # labelled with this string
      lty = "diocan"
    ),
    colour = "red"
  ) +
  scale_color_manual(values = cols) +
  # "none" suppresses the fill legend (guide = FALSE is deprecated)
  scale_fill_discrete(guide = "none") +
  theme_classic() +
  theme(
    panel.grid.major.y = element_line(size = 2),
    panel.grid.major = element_line(size = 0.5, color = "black"),
    panel.grid.minor = element_blank()
  ) +
  labs(
    x = "X",
    y = "Y"
  )
p1
```
### Manual shape
```r
library(ggplot2)
library(tidyverse)
library(ggExtra)
# Column names in this file contain spaces, hence the backticks in aes()
data <- read_csv(file = "iris.csv")

# classic plot: one colour and one manually chosen point shape per class
p1 <- ggplot(
  data = data,
  mapping = aes(
    x = `sepal length`,
    y = `sepal width`,
    colour = `class`,
    shape = `class`
  )
) +
  geom_point() +
  scale_shape_manual(
    values = c(
      "Iris-setosa" = 21,
      "Iris-versicolor" = 22,
      "Iris-virginica" = 23
    )
  ) +
  theme(legend.position = "none")

# marginal density curves, coloured and filled by the same grouping
p2 <- ggMarginal(
  p1,
  type = "density",
  groupColour = TRUE,
  groupFill = TRUE
)
p2
```
### Manual fill
```r
library(tidyverse)
library(ggplot2)
library(Cairo)
library(plotly)
library(gridExtra)
# Mirrored bar plot: Nanostring fold changes on the left, SEQC on the right,
# bars filled by the sign of the fold change.
data <- read_tsv(file = "barplot_data.csv")

# Sign stored as text ("-1", "0", "1") so it acts as a discrete fill variable
data[["SEQC_SIGN"]] <- as.character(sign(data[["FoldChange_SEQC"]]))
data[["NANO_SIGN"]] <- as.character(sign(data[["FoldChange_Nanostring"]]))

# Right panel: absolute SEQC fold change per gene
p1 <- ggplot(data, aes(x = abs(FoldChange_SEQC), y = Gene)) +
  geom_col(aes(fill = SEQC_SIGN), show.legend = FALSE) +
  # one asterisk for every 8 units of -log(p) (natural log)
  geom_text(
    aes(label = strrep("*", floor(-log(Pvalue_SEQC) / 8)))
  ) +
  scale_fill_manual(values = c("-1" = "red", "1" = "green")) +
  theme_minimal() +
  # centre the gene labels horizontally
  theme(axis.text.y = element_text(hjust = 0.5))

# Left panel: same bars drawn towards negative x, relabelled as positive values
p2 <- ggplot(data, aes(x = -abs(FoldChange_Nanostring), y = Gene)) +
  geom_col(aes(fill = NANO_SIGN), show.legend = FALSE) +
  # NOTE(review): this panel also labels with Pvalue_SEQC - confirm intended
  geom_text(
    aes(label = strrep("*", floor(-log(Pvalue_SEQC) / 8)))
  ) +
  scale_fill_manual(values = c("-1" = "red", "1" = "green")) +
  scale_x_continuous(
    labels = function(x) signif(abs(x), 3),
    breaks = seq(0, -8, by = -2),
    limits = c(-9, 0)
  ) +
  theme_minimal()

# Side by side, Nanostring panel first
grid.arrange(p2, p1, ncol = 2)
```
## Data transform in R
```r
# Store the sign of each fold change as text ("-1" or "1"): the plots need a
# categorical variable, not a numeric one
data[["SEQC_SIGN"]] <- as.character(sign(data[["FoldChange_SEQC"]]))
data[["NANO_SIGN"]] <- as.character(sign(data[["FoldChange_Nanostring"]]))

# Distinct values of the Country column, in order of first appearance
countries <- unique(data[["Country"]])
```
## Faceting in R
```r
# Two panels arranged side by side with grid.arrange(); expects `data` to
# already contain the SEQC_SIGN and NANO_SIGN columns.

# Right panel: absolute SEQC fold change per gene, filled by sign
p1 <- ggplot(data, aes(x = abs(FoldChange_SEQC), y = Gene)) +
  geom_col(aes(fill = SEQC_SIGN), show.legend = FALSE) +
  # one asterisk for every 8 units of -log(p) (natural log)
  geom_text(
    aes(label = strrep("*", floor(-log(Pvalue_SEQC) / 8)))
  ) +
  scale_fill_manual(values = c("-1" = "red", "1" = "green")) +
  theme_minimal() +
  # centre the gene labels horizontally
  theme(axis.text.y = element_text(hjust = 0.5))

# Left panel: bars drawn towards negative x, tick labels shown as positive
p2 <- ggplot(data, aes(x = -abs(FoldChange_Nanostring), y = Gene)) +
  geom_col(aes(fill = NANO_SIGN), show.legend = FALSE) +
  # NOTE(review): this panel also labels with Pvalue_SEQC - confirm intended
  geom_text(
    aes(label = strrep("*", floor(-log(Pvalue_SEQC) / 8)))
  ) +
  scale_fill_manual(values = c("-1" = "red", "1" = "green")) +
  scale_x_continuous(
    labels = function(x) signif(abs(x), 3),
    breaks = seq(0, -8, by = -2),
    limits = c(-9, 0)
  ) +
  theme_minimal()

grid.arrange(p2, p1, ncol = 2)

# Add a logical column flagging rows whose HDI exceeds 0.4
europe <- europe %>%
  mutate(gud = HDI > 0.4)
```