Getting familiar with ggplot2
Version March 5th 2020
knitr::opts_chunk$set(echo = TRUE)
library(tidyverse)
## ── Attaching packages ───────────────────────────────────────────────────────────── tidyverse 1.3.0 ──
## ✓ ggplot2 3.2.1 ✓ purrr 0.3.3
## ✓ tibble 2.1.3 ✓ dplyr 0.8.4
## ✓ tidyr 1.0.2 ✓ stringr 1.4.0
## ✓ readr 1.3.1 ✓ forcats 0.4.0
## ── Conflicts ──────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(dplyr)
library(ggplot2)
library(datasets)
head(trees)
## Girth Height Volume
## 1 8.3 70 10.3
## 2 8.6 65 10.3
## 3 8.8 63 10.2
## 4 10.5 72 16.4
## 5 10.7 81 18.8
## 6 10.8 83 19.7
str(trees)
## 'data.frame': 31 obs. of 3 variables:
## $ Girth : num 8.3 8.6 8.8 10.5 10.7 10.8 11 11 11.1 11.2 ...
## $ Height: num 70 65 63 72 81 83 66 75 80 75 ...
## $ Volume: num 10.3 10.3 10.2 16.4 18.8 19.7 15.6 18.2 22.6 19.9 ...
First step: specify your dataset using one of the two options
# Option 1: hand the data frame to ggplot() through its `data` argument.
ggplot(data = trees)
# Option 2: pipe the data frame into ggplot().
trees %>%
  ggplot()
Second step: specify the aesthetics of your plot (x-axis and y-axis)
# Map Height to the x-axis and Volume to the y-axis; no geom is added yet.
trees %>%
  ggplot(aes(x = Height, y = Volume))
Third step: specify the geom. For example, here I use geom_point
# Scatter plot of Volume against Height.
trees %>%
  ggplot(aes(x = Height, y = Volume)) +
  geom_point()
After these 3 steps, you can start customizing your plot. yoohoo
How to add a linear regression? use geom_smooth
# Scatter plot with a fitted linear-model line.
trees %>%
  ggplot(aes(x = Height, y = Volume)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE)  # se = FALSE hides the confidence band
By adding se=FALSE, the confidence interval around the smoothed line is not displayed.
Note that by default, se=TRUE.
Modify the appearance of the points (size and color)
# Fixed point size and colour are set outside aes(), so they apply to all points.
trees %>%
  ggplot(aes(x = Height, y = Volume)) +
  geom_point(size = 3, color = "orange") +
  geom_smooth(method = "lm")
Change the display of your plot
theme_bw() is a popular choice.
# theme_bw() draws a white panel background, so the points use a non-white
# colour to remain visible.
trees %>%
  ggplot(aes(x = Height, y = Volume)) +
  geom_point(size = 3, color = "orange") +
  geom_smooth(method = "lm", color = "pink") +
  theme_bw()
Another example with theme_dark().
# Same plot on the dark theme, with white points and a pink regression line.
trees %>%
  ggplot(aes(x = Height, y = Volume)) +
  geom_point(size = 3, color = "white") +
  geom_smooth(method = "lm", color = "pink") +
  theme_dark()
Different themes can be found here
Specify the labels and titles
# Set both axis titles and the plot title in a single labs() call.
trees %>%
  ggplot(aes(x = Height, y = Volume)) +
  geom_point(size = 3) +
  geom_smooth(method = "lm") +
  theme_bw() +
  labs(
    x = "my x-axis title",
    y = "my y-axis title",
    title = "My title"
  )
Increase the font size
# theme() is added after theme_bw() so its font sizes override the theme's.
trees %>%
  ggplot(aes(x = Height, y = Volume)) +
  geom_point(size = 3) +
  geom_smooth(method = "lm") +
  theme_bw() +
  labs(
    x = "x-axis title",
    y = "y-axis title",
    title = "Volume as a function of height"
  ) +
  theme(
    axis.text = element_text(size = 13),  # font size of the axis tick labels
    axis.title = element_text(size = 15)  # font size of the axis titles
  )
It’s time to discover other popular geoms. This nice cheat sheet describes many available geoms.
geom_boxplot
head(iris)
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1 5.1 3.5 1.4 0.2 setosa
## 2 4.9 3.0 1.4 0.2 setosa
## 3 4.7 3.2 1.3 0.2 setosa
## 4 4.6 3.1 1.5 0.2 setosa
## 5 5.0 3.6 1.4 0.2 setosa
## 6 5.4 3.9 1.7 0.4 setosa
dim(iris)
## [1] 150 5
str(iris)
## 'data.frame': 150 obs. of 5 variables:
## $ Sepal.Length: num 5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
## $ Sepal.Width : num 3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
## $ Petal.Length: num 1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
## $ Petal.Width : num 0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
## $ Species : Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 1 1 1 1 1 1 ...
# One box of Sepal.Length per species.
iris %>%
  ggplot(aes(x = Species, y = Sepal.Length)) +
  geom_boxplot() +
  theme_bw()
Use coord_flip() to flip the x-axis and the y-axis, and change the color by group (species) by using color= in aes().
# Box outlines are coloured by species; coord_flip() swaps the two axes.
iris %>%
  ggplot(aes(x = Species, y = Sepal.Length, color = Species)) +
  geom_boxplot() +
  coord_flip() +
  theme_bw()
Use fill= to fill the boxes with color.
# Boxes are filled by species; coord_flip() swaps the two axes.
iris %>%
  ggplot(aes(x = Species, y = Sepal.Length, fill = Species)) +
  geom_boxplot() +
  coord_flip() +
  theme_bw()
geom_histogram
library(gapminder)
head(gapminder)
## # A tibble: 6 x 6
## country continent year lifeExp pop gdpPercap
## <fct> <fct> <int> <dbl> <int> <dbl>
## 1 Afghanistan Asia 1952 28.8 8425333 779.
## 2 Afghanistan Asia 1957 30.3 9240934 821.
## 3 Afghanistan Asia 1962 32.0 10267083 853.
## 4 Afghanistan Asia 1967 34.0 11537966 836.
## 5 Afghanistan Asia 1972 36.1 13079460 740.
## 6 Afghanistan Asia 1977 38.4 14880372 786.
dim(gapminder)
## [1] 1704 6
str(gapminder)
## Classes 'tbl_df', 'tbl' and 'data.frame': 1704 obs. of 6 variables:
## $ country : Factor w/ 142 levels "Afghanistan",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ continent: Factor w/ 5 levels "Africa","Americas",..: 3 3 3 3 3 3 3 3 3 3 ...
## $ year : int 1952 1957 1962 1967 1972 1977 1982 1987 1992 1997 ...
## $ lifeExp : num 28.8 30.3 32 34 36.1 ...
## $ pop : int 8425333 9240934 10267083 11537966 13079460 14880372 12881816 13867957 16317921 22227415 ...
## $ gdpPercap: num 779 821 853 836 740 ...
- lifeExp = life expectancy at birth
- pop = total population
- gdpPercap = per-capita GDP (Gross domestic product)
# binwidth is expressed in the units of the x variable, so choose it
# deliberately. Narrow bins (width 1):
gapminder %>%
  ggplot(aes(x = lifeExp)) +
  geom_histogram(binwidth = 1)
# Wide bins (width 20) on the same variable, for comparison:
gapminder %>%
  ggplot(aes(x = lifeExp)) +
  geom_histogram(binwidth = 20)
Differentiate by continent and change the theme
# Fill the histogram bars by continent.
gapminder %>%
  ggplot(aes(x = lifeExp, fill = continent)) +
  geom_histogram(binwidth = 1) +
  theme_bw()
geom_line
# One line per country (group = country) for Europe and the Americas.
gapminder %>%
  filter(continent %in% c("Europe", "Americas")) %>%
  ggplot(aes(x = year, y = lifeExp, group = country)) +
  geom_line() +
  theme_bw()
By plotting all the countries using group=country, you get a spaghetti plot that does not look great. A way to improve your plot is to use facet_wrap (faceting). You can also highlight one specific line. Learn more by watching this nice R talk.
**Faceting**
# One panel per country, all panels sharing the same axes.
gapminder %>%
  filter(continent %in% c("Europe", "Americas")) %>%
  ggplot(aes(x = year, y = lifeExp, group = country)) +
  geom_line() +
  theme_bw() +
  facet_wrap(~country)

# One panel per country, each with its own y-axis range.
# The argument is spelled `scales` in full rather than relying on partial
# matching of `scale`.
gapminder %>%
  filter(continent %in% c("Europe", "Americas")) %>%
  ggplot(aes(x = year, y = lifeExp, group = country)) +
  geom_line() +
  theme_bw() +
  facet_wrap(~country, scales = "free_y")

# All countries, one panel per continent, each with its own y-axis range.
gapminder %>%
  ggplot(aes(x = year, y = lifeExp, group = country)) +
  geom_line() +
  theme_bw() +
  facet_wrap(~continent, scales = "free_y")
highlight one specific country
# Rows for the country to highlight.
France <- gapminder %>%
  filter(country == "France")

# Grey background lines for every country, then France drawn on top in red.
# Both layers inherit x and y from the ggplot() call.
gapminder %>%
  filter(continent %in% c("Europe", "Americas")) %>%
  ggplot(aes(x = year, y = lifeExp)) +
  geom_line(aes(group = country), colour = "grey") +
  geom_line(data = France, colour = "red") +
  theme_bw()
Bar chart
# Life expectancy of European countries in 2002, one bar per country.
gapminder %>%
  filter(year == 2002, continent == "Europe") %>%
  ggplot(aes(x = country, y = lifeExp)) +
  geom_bar(
    stat = "identity",      # statistical transformation to use on the data
    position = "identity",  # position adjustment
    fill = "grey"
  )
If stat="identity", the heights of the bars represent values in the data.
If stat="count" (the default for geom_bar), the height of each bar equals the number of cases in each group, and it is incompatible with mapping values to the y aesthetic.
We can improve this graph by:
* modifying the position of x-axis labels
* adapting the y-axis scale
* adding axis title
* ordering the countries by increasing life expectancy
# Improved bar chart: countries ordered by life expectancy, zoomed y-axis,
# vertical x-axis labels, and explicit titles for both axes.
gapminder %>%
  filter(year == 2002, continent == "Europe") %>%
  ggplot(aes(x = reorder(country, lifeExp), y = lifeExp)) +
  geom_bar(stat = "identity", position = "identity", fill = "steelblue") +
  coord_cartesian(ylim = c(70, 85)) +  # zoom in on the y-axis
  theme_bw() +
  theme(
    # vertical x-axis labels; vjust = 0.5 centres each label on its tick
    axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5),
    axis.text = element_text(size = 12),   # font size of the axis tick labels
    axis.title = element_text(size = 12)   # font size of the axis titles
  ) +
  labs(
    title = "Life expectancy in 2002",
    subtitle = "Case study of Europe",
    x = "Country",  # otherwise the title would read "reorder(country, lifeExp)"
    y = "Life expectancy"
  )
group bar chart
# Peru and Argentina in 1967 and 2007; the two years are drawn side by side.
gapminder %>%
  filter(
    year %in% c(1967, 2007),
    country %in% c("Peru", "Argentina")
  ) %>%
  ggplot(aes(x = country, y = lifeExp, fill = as.factor(year))) +
  geom_bar(stat = "identity", position = "dodge") +
  ylab("Life expectancy") +
  theme_bw() +
  scale_fill_manual(values = c("#1380A1", "#FAAB18")) +
  theme(
    axis.text = element_text(size = 13),  # font size of the axis tick labels
    axis.title = element_text(size = 15)  # font size of the axis titles
  )
position="dodge" adjusts the position of the bars by dodging overlaps to the side.
Change the legend
# Same grouped bar chart with a titled legend placed horizontally at the bottom.
gapminder %>%
  filter(
    year %in% c(1967, 2007),
    country %in% c("Peru", "Argentina")
  ) %>%
  ggplot(aes(x = country, y = lifeExp, fill = as.factor(year))) +
  geom_bar(stat = "identity", position = "dodge") +
  scale_fill_manual(
    values = c("#1380A1", "#FAAB18"),
    name = "year"  # title of the legend
  ) +
  ylab("Life expectancy") +
  xlab("") +
  theme_bw() +
  theme(
    axis.text = element_text(size = 13),   # font size of the axis tick labels
    axis.title = element_text(size = 15),  # font size of the axis titles
    legend.text = element_text(size = 14),
    axis.title.x = element_blank(),
    legend.position = "bottom",            # legend position
    legend.direction = "horizontal"        # legend direction
  )
# To remove the legend title as well, add `legend.title = element_blank()`
# to the theme() call above.
Label the outliers using geom_text
We can spot some decline in life expectancy in Africa and Asia around the 1990s and 1970s, respectively.
# Life expectancy over time, one panel per continent with its own y-axis range.
# The argument is spelled `scales` in full rather than relying on partial
# matching of `scale`.
gapminder %>%
  ggplot(aes(x = year, y = lifeExp, group = country)) +
  geom_line() +
  facet_wrap(~continent, scales = "free_y") +
  theme_bw()

# Same plot, labelling every observation whose life expectancy is below 40.
gapminder %>%
  ggplot(aes(x = year, y = lifeExp, group = country)) +
  geom_line() +
  facet_wrap(~continent, scales = "free_y") +
  theme_bw() +
  geom_text(
    aes(x = year, y = lifeExp, label = country),
    data = gapminder %>% filter(lifeExp < 40)
  )
When you use geom_text, the filter is really important
# Label only observations with life expectancy below 32 between 1970 and 1995
# (both bounds exclusive), so that just a few labels are drawn.
# The argument is spelled `scales` in full rather than relying on partial
# matching of `scale`.
gapminder %>%
  ggplot(aes(x = year, y = lifeExp, group = country)) +
  geom_line() +
  facet_wrap(~continent, scales = "free_y") +
  theme_bw() +
  geom_text(
    aes(x = year, y = lifeExp, label = country),
    data = gapminder %>% filter(lifeExp < 32 & year > 1970 & year < 1995)
  )
Visualizing time: time series
Example with the data set nasa (part of the GGally package). It consists of atmospheric measurements across a grid of locations in Middle America.
data(nasa, package="GGally")
head(nasa)
## time y x lat long date cloudhigh cloudlow cloudmid ozone
## 1 1 1 1 -21.2 -113.8000 1995-01-01 0.5 31.0 2.0 260
## 2 1 1 2 -21.2 -111.2957 1995-01-01 1.5 31.5 2.5 260
## 3 1 1 3 -21.2 -108.7913 1995-01-01 1.5 32.5 3.5 260
## 4 1 1 4 -21.2 -106.2870 1995-01-01 1.0 39.0 4.0 258
## 5 1 1 5 -21.2 -103.7826 1995-01-01 0.5 48.0 4.5 258
## 6 1 1 6 -21.2 -101.2783 1995-01-01 0.0 50.0 2.5 258
## pressure surftemp temperature id day month year
## 1 1000 297.4 296.9 1-1 0 1 1995
## 2 1000 297.4 296.5 2-1 0 1 1995
## 3 1000 297.4 296.0 3-1 0 1 1995
## 4 1000 296.9 296.5 4-1 0 1 1995
## 5 1000 296.5 295.5 5-1 0 1 1995
## 6 1000 296.5 295.0 6-1 0 1 1995
For each observational unit, we have multiple measurements
# Temperature measurements over time for the grid cell x = 1, y = 1.
nasa %>%
  filter(x == 1, y == 1) %>%
  ggplot(aes(x = time, y = temperature)) +
  geom_point() +
  theme_bw()
We connect the multiple measurements by a line
# Same grid cell, with the measurements joined by a line.
nasa %>%
  filter(x == 1, y == 1) %>%
  ggplot(aes(x = time, y = temperature)) +
  geom_line() +
  theme_bw()
Each observational unit forms a group; we only connect points within a group by a line.
# Look at the first six rows for the two grid cells (x = 1, y = 1 or 10).
nasa %>%
  filter(x == 1, y %in% c(1, 10)) %>%
  head(n = 6)
## time y x lat long date cloudhigh cloudlow cloudmid ozone
## 1 1 1 1 -21.20000 -113.8 1995-01-01 0.5 31.0 2.0 260
## 2 1 10 1 1.26087 -113.8 1995-01-01 0.5 43.5 4.0 248
## 3 2 1 1 -21.20000 -113.8 1995-02-01 1.0 33.5 3.0 254
## 4 2 10 1 1.26087 -113.8 1995-02-01 1.0 28.5 5.5 246
## 5 3 1 1 -21.20000 -113.8 1995-03-01 2.0 25.5 4.0 254
## 6 3 10 1 1.26087 -113.8 1995-03-01 1.5 12.5 3.5 254
## pressure surftemp temperature id day month year
## 1 1000 297.4 296.9 1-1 0 1 1995
## 2 1000 297.8 298.3 1-10 0 1 1995
## 3 1000 298.7 297.8 1-1 31 2 1995
## 4 1000 298.7 300.1 1-10 31 2 1995
## 5 1000 298.3 297.8 1-1 59 3 1995
## 6 1000 297.4 299.2 1-10 59 3 1995
# group = id gives each grid cell its own line.
nasa %>%
  filter(x == 1, y %in% c(1, 10)) %>%
  ggplot(aes(x = time, y = temperature, group = id)) +
  geom_line() +
  theme_bw()
Connect points with a line
# Points and connecting lines together, one line per grid cell.
nasa %>%
  filter(x == 1, y %in% c(1, 10)) %>%
  ggplot(aes(x = time, y = temperature, group = id)) +
  geom_point() +
  geom_line() +
  theme_bw()
Customise color, size and shape
Use scale_color_manual, scale_size_manual and scale_shape_manual
# Points coloured by species with the default colour scale.
iris %>%
  ggplot(aes(x = Sepal.Width, y = Sepal.Length, color = Species)) +
  geom_point() +
  theme_bw()
scale_color_manual to change point colors
useful website: https://www.color-hex.com/
# Points coloured by species with three hand-picked hex colours.
iris %>%
  ggplot(aes(x = Sepal.Width, y = Sepal.Length, color = Species)) +
  geom_point() +
  theme_bw() +
  scale_color_manual(values = c("#666547", "#fb2e01", "#6fcb9f"))
scale_size_manual to change point sizes
# Point size mapped to species, with one manually chosen size per species.
iris %>%
  ggplot(aes(x = Sepal.Width, y = Sepal.Length, size = Species)) +
  geom_point() +
  theme_bw() +
  scale_size_manual(values = c(2, 2, 5))

# Point size mapped to the numeric variable Petal.Length (default size scale).
iris %>%
  ggplot(aes(x = Sepal.Width, y = Sepal.Length, size = Petal.Length)) +
  geom_point() +
  theme_bw()
scale_shape_manual to change point shapes
# Point shape mapped to species, with one manually chosen shape code each.
iris %>%
  ggplot(aes(x = Sepal.Width, y = Sepal.Length, shape = Species)) +
  geom_point() +
  theme_bw() +
  scale_shape_manual(values = c(19, 17, 15))
combine different sets of mapping
# Shape and colour encode the species; size encodes Petal.Width.
iris %>%
  ggplot(
    aes(
      x = Sepal.Width,
      y = Sepal.Length,
      shape = Species,
      color = Species,
      size = Petal.Width
    )
  ) +
  geom_point() +
  theme_bw() +
  scale_shape_manual(values = c(19, 17, 15)) +
  scale_color_manual(values = c("#e37d78", "#8bc6c4", "#17a37f"))
Change your axis scale
# Set the x-axis limits explicitly to 0-5.
iris %>%
  ggplot(
    aes(
      x = Sepal.Width,
      y = Sepal.Length,
      shape = Species,
      color = Species,
      size = Petal.Width
    )
  ) +
  geom_point() +
  theme_bw() +
  scale_shape_manual(values = c(19, 17, 15)) +
  scale_color_manual(values = c("#e37d78", "#8bc6c4", "#17a37f")) +
  scale_x_continuous(limits = c(0, 5))

# Choose the x-axis tick positions: every 0.5 from 2 to 5.
iris %>%
  ggplot(
    aes(
      x = Sepal.Width,
      y = Sepal.Length,
      shape = Species,
      color = Species,
      size = Petal.Width
    )
  ) +
  geom_point() +
  theme_bw() +
  scale_shape_manual(values = c(19, 17, 15)) +
  scale_color_manual(values = c("#e37d78", "#8bc6c4", "#17a37f")) +
  scale_x_continuous(breaks = seq(2, 5, by = 0.5))

# The break positions passed to scale_x_continuous() above.
seq(from = 2, to = 5, by = 0.5)
## [1] 2.0 2.5 3.0 3.5 4.0 4.5 5.0
arrange multiple plots
you need the package ggpubr
# Scatter plot with a linear-model fit.
p1 <- trees %>%
  ggplot(aes(x = Height, y = Volume)) +
  geom_point(size = 2, color = "orange") +
  # se = FALSE: no confidence interval displayed
  geom_smooth(method = "lm", se = FALSE, color = "black") +
  ggtitle("linear regression")

# Same scatter plot with a loess fit.
p2 <- trees %>%
  ggplot(aes(x = Height, y = Volume)) +
  geom_point(size = 2, color = "orange") +
  # se = FALSE: no confidence interval displayed
  geom_smooth(method = "loess", se = FALSE, color = "black") +
  ggtitle("loess regression")
library(ggpubr)
## Loading required package: magrittr
##
## Attaching package: 'magrittr'
## The following object is masked from 'package:purrr':
##
## set_names
## The following object is masked from 'package:tidyr':
##
## extract
# Two rows, one column: p1 above p2.
ggarrange(p1, p2, ncol = 1, nrow = 2)
# One row, two columns: p1 next to p2.
ggarrange(p1, p2, ncol = 2, nrow = 1)
save your graphics
# Directory in which the figures are written; adapt it to your own machine.
fig_dir <- "~/Documents/Dossier1/Figures"

# ggsave() saves the last plot displayed unless a plot is passed explicitly.
# The target directory is given through `path`, so the working directory is
# left unchanged (no setwd() needed).
ggsave("nameofyourgraphic.pdf", path = fig_dir)
ggsave("nameofyourgraphic.pdf", path = fig_dir, width = 6, height = 7) # adjust the size