Visualization Best Practices
Published:
This lesson is based on the DataCamp course Visualization Best Practices in R.
library(dplyr)
library(ggplot2)
# Load the WHO disease data set used throughout this lesson.
# The CSV is addressed by its full path instead of calling setwd(), so the
# session's working directory is left untouched.
# stringsAsFactors = FALSE keeps text columns as character on every R version
# (R < 4.0 would convert them to factors, which breaks the ifelse() recoding
# used further down).
who_disease <- read.csv(
  file.path("~/Downloads/teaching/R", "dc", "who_disease.csv"),
  stringsAsFactors = FALSE
)

# View() opens a GUI data viewer, so only call it from an interactive session.
if (interactive()) {
  View(who_disease)
}
# Bar chart: number of observations (rows) in who_disease per region.
# Only x is mapped; geom_bar() counts the rows for each region itself.
ggplot(data = who_disease, mapping = aes(x = region)) +
  geom_bar()
# Scatter plot of cases against year, restricted to the AMR region.
# alpha = 0.5 makes overlapping points visible.
ggplot(
  data = filter(who_disease, region == "AMR"),
  mapping = aes(x = year, y = cases)
) +
  geom_point(alpha = 0.5)

# Same AMR scatter plot, with each point coloured by its disease so the
# number of observed cases can be compared across diseases.
ggplot(
  data = filter(who_disease, region == "AMR"),
  mapping = aes(x = year, y = cases, color = disease)
) +
  geom_point(alpha = 0.5)
Proportion Data
Parts making up a whole
- Often used to understand population
- Pie charts
great for quick visualizations of proportion data for a single group
suffer from accuracy problems due to encoding data in angles but are intuitive and compact
- Waffle Charts
- Use a waffle chart if you need more precision in your representations than a pie chart can offer, want to compare more than three classes, or simply have the space
- Stacked bar chart
- If you want to compare different wholes to each other you should use a stacked bar chart, as they allow the viewer to easily compare across groups due to all proportions sharing the same y-axis
Pie chart
- Advantages
- intuitive
- convey the concept of slices of a whole very well
- Limitations
- not very precise
- data encoded in angles
- a class with 25% of the data is encoded as an angle of 90 degrees
- we are better at comparing lengths and sizes than angles
- lots of classes
- The first slice has a meaningful anchoring point (usually the vertical line at the top of the pie), but every later slice starts at an arbitrary point
- after three slices it becomes hard to compare
# Toy data: four groups (G1..G4) and the value attached to each.
df <- data.frame(
  value = c(10, 23, 15, 18),
  group = paste0("G", 1:4)
)
print(df)

# A pie chart is a stacked bar chart that has been 'wrapped' around a
# central axis. First the plain stacked bar: a single x position ("") with
# geom_col() taking the bar heights from the value column.
ggplot(data = df, mapping = aes(x = "", y = value, fill = group)) +
  geom_col()

# Pie chart: the same stacked bar, with y mapped to the angle by
# coord_polar() and all axes/background removed by theme_void().
ggplot(data = df, mapping = aes(x = "", y = value, fill = group)) +
  geom_col() +
  coord_polar(theta = "y") +
  theme_void()
# ifelse() is vectorised: the test is evaluated per element and the matching
# label is returned for each one.
a <- c(5, 7, 2, 9)
ifelse(a %% 2 != 0, "odd", "even")
# [1] "odd"  "odd"  "even" "odd"
# Pie chart of observation counts for EUR, AFR and all remaining regions
# lumped together as "Other". Per the lesson, this compact view is enough to
# see that about half of the observations come from AFR.
who_disease %>%
  mutate(
    # as.character() guards against `region` being a factor (read.csv() on
    # R < 4.0): ifelse() drops factor attributes and would otherwise return
    # the integer level codes instead of the region labels.
    region = ifelse(
      region %in% c("EUR", "AFR"),
      as.character(region),
      "Other"
    )
  ) %>%
  ggplot(aes(x = "", fill = region)) +
  geom_bar(color = "white") +  # white outlines separate the observations
  coord_polar(theta = "y") +
  theme_void()
Waffle Chart
More precise than Pie chart
Encode data in area and not angles
library(dplyr)
library(ggplot2)
library(waffle)

# Read the data by full path rather than via setwd(), so the working
# directory is not changed; keep text columns as character on all R versions.
who_disease <- read.csv(
  file.path("~/Downloads/teaching/R", "dc", "who_disease.csv"),
  stringsAsFactors = FALSE
)

# Share of observations per region, rounded to whole-number percentages.
obs_by_region <- who_disease %>%
  group_by(region) %>%
  summarize(num_obs = n()) %>%  # n() counts the rows in each group
  mutate(percent = round(num_obs / sum(num_obs) * 100))

# Build a named numeric vector (region -> percent) to pass to waffle().
percent <- obs_by_region$percent
print(percent)
names(percent) <- obs_by_region$region
print(percent)

title <- "Proportion of Observations by Region"
waffle::waffle(percent, rows = 5, title = title)
Pie Chart
- measles, mumps, other disease count
library(dplyr)
library(ggplot2)
library(waffle)

# Read the data by full path rather than via setwd(), so the working
# directory is not changed. stringsAsFactors = FALSE keeps `disease` as
# character, which the ifelse() recoding below relies on.
who_disease <- read.csv(
  file.path("~/Downloads/teaching/R", "dc", "who_disease.csv"),
  stringsAsFactors = FALSE
)

# measles, mumps, other disease count:
# every disease other than measles or mumps is pooled into "other",
# then the cases are summed per category.
disease_counts <- who_disease %>%
  mutate(
    disease = ifelse(
      disease %in% c("measles", "mumps"),
      disease,
      "other"
    )
  ) %>%
  group_by(disease) %>%
  summarise(total_cases = sum(cases))
print(disease_counts)

# Pie chart: one stacked bar at x = 1, wrapped around the centre.
ggplot(disease_counts, aes(x = 1, y = total_cases, fill = disease)) +
  geom_col() +  # heights of bars represent the values
  coord_polar(theta = "y") +
  theme_void() +
  ggtitle("Proportion of diseases")
Waffle Chart
library(dplyr)
library(ggplot2)
library(waffle)

# Read the data by full path rather than via setwd(), so the working
# directory is not changed; keep text columns as character on all R versions.
who_disease <- read.csv(
  file.path("~/Downloads/teaching/R", "dc", "who_disease.csv"),
  stringsAsFactors = FALSE
)

# Total cases per disease and each disease's share of all cases, rounded to
# whole-number percentages.
disease_counts <- who_disease %>%
  group_by(disease) %>%
  summarise(total_cases = sum(cases)) %>%
  mutate(percent = round(total_cases / sum(total_cases) * 100))
print(disease_counts)

# Build a named numeric vector (disease -> percent) to pass to waffle().
case_counts <- disease_counts$percent
names(case_counts) <- disease_counts$disease
print(case_counts)

waffle(case_counts, rows = 5)
Stacked Bar Chart
library(dplyr)
library(ggplot2)
library(waffle)
# Load the WHO disease data for the stacked bar chart examples.
# The CSV is addressed by its full path instead of calling setwd(), so the
# session's working directory is left untouched.
# stringsAsFactors = FALSE keeps text columns as character on every R version
# (R < 4.0 would convert them to factors, which breaks the ifelse() recoding
# used further down).
who_disease <- read.csv(
  file.path("~/Downloads/teaching/R", "dc", "who_disease.csv"),
  stringsAsFactors = FALSE
)

# View() opens a GUI data viewer, so only call it from an interactive session.
if (interactive()) {
  View(who_disease)
}
# Bar chart of cases per country code within the SEAR region.
# geom_col() stacks all rows of a country into a single bar.
ggplot(
  data = filter(who_disease, region == "SEAR"),
  mapping = aes(x = countryCode, y = cases)
) +
  geom_col()

# Same chart, with each bar's segments filled by disease.
ggplot(
  data = filter(who_disease, region == "SEAR"),
  mapping = aes(x = countryCode, y = cases, fill = disease)
) +
  geom_col()

# position = "fill" stacks the segments and stretches every bar to the same
# constant height, so the bars show proportions rather than totals.
ggplot(
  data = filter(who_disease, region == "SEAR"),
  mapping = aes(x = countryCode, y = cases, fill = disease)
) +
  geom_col(position = "fill")
# Stacked bars of measles, mumps and "other" for each year.
# Every disease that is not measles or mumps is pooled into "other", then the
# cases are summed per (disease, year).
disease_counts <- who_disease %>%
  mutate(
    # as.character() guards against `disease` being a factor (read.csv() on
    # R < 4.0): ifelse() drops factor attributes and would otherwise return
    # the integer level codes instead of the disease names.
    disease = ifelse(
      disease %in% c("measles", "mumps"),
      as.character(disease),
      "other"
    )
  ) %>%
  group_by(disease, year) %>%  # year added to the grouping
  summarise(total_cases = sum(cases)) %>%
  # summarise() removes only the last grouping level (year); ungroup() so the
  # result does not stay silently grouped by disease.
  ungroup()
print(disease_counts)

ggplot(disease_counts, aes(x = year, y = total_cases, fill = disease)) +
  geom_col(position = "fill")  # bars scaled to full height
# Same per-year summary, with `disease` turned into a factor whose levels are
# rearranged to "measles", "other", "mumps"; the factor level order is what
# controls the order of the stacked segments and the legend.
disease_counts <- who_disease %>%
  mutate(
    # as.character() guards against `disease` already being a factor
    # (read.csv() on R < 4.0): ifelse() would otherwise return integer level
    # codes, none of which match the levels requested below.
    disease = ifelse(
      disease %in% c("measles", "mumps"),
      as.character(disease),
      "other"
    ) %>%
      factor(levels = c("measles", "other", "mumps"))  # desired ordering
  ) %>%
  group_by(disease, year) %>%
  summarise(total_cases = sum(cases)) %>%
  # summarise() removes only the last grouping level (year); ungroup() so the
  # result does not stay silently grouped by disease.
  ungroup()

# plot
ggplot(disease_counts, aes(x = year, y = total_cases, fill = disease)) +
  geom_col(position = "fill")
# Keep observations from 1999 onwards (1999 itself included), pool every
# disease other than measles and mumps into "other", and sum the cases per
# (disease, region).
disease_counts <- who_disease %>%
  filter(year >= 1999) %>%
  mutate(
    # as.character() guards against `disease` being a factor (read.csv() on
    # R < 4.0): ifelse() drops factor attributes and would otherwise return
    # the integer level codes instead of the disease names.
    disease = ifelse(
      disease %in% c("measles", "mumps"),
      as.character(disease),
      "other"
    )
  ) %>%
  group_by(disease, region) %>%  # region added to the grouping
  summarise(total_cases = sum(cases)) %>%
  # summarise() removes only the last grouping level (region); ungroup() so
  # the result does not stay silently grouped by disease.
  ungroup()

# disease is the stacking variable, region is the x-axis and the summed
# counts are the y; position = "fill" scales every bar to full height.
ggplot(disease_counts, aes(x = region, y = total_cases, fill = disease)) +
  geom_col(position = "fill")
- Accuracy degrades after 3 classes
- Worse in isolation than pie or waffle chart
Point Data
- Single number associated with each category of data
- Count, average, rate
Bar Chart
# Bar Chart
library(dplyr)
library(ggplot2)
# Load the WHO disease data for the bar chart examples.
# The CSV is addressed by its full path instead of calling setwd(), so the
# session's working directory is left untouched; text columns are kept as
# character on every R version.
who_disease <- read.csv(
  file.path("~/Downloads/teaching/R", "dc", "who_disease.csv"),
  stringsAsFactors = FALSE
)
# Total cases for each disease: geom_col() stacks the cases of every row
# that shares a disease into one bar.
ggplot(data = who_disease) +
  geom_col(mapping = aes(x = disease, y = cases))
# Bars should encode a meaningful quantity, i.e. something that can be
# stacked up -- e.g. money spent on the different activities of a project.
# Use geom_col() when the bar height is already a column in the data.

# Number of cases per disease in India in 1980
# (both conditions combined explicitly with `&`).
ggplot(
  data = filter(who_disease, (year == 1980) & (country == "India")),
  mapping = aes(x = disease, y = cases)
) +
  geom_col()

# Same plot; the conditions are passed to filter() as separate arguments,
# which filter() combines with a logical AND.
ggplot(
  data = filter(who_disease, year == 1980, country == "India"),
  mapping = aes(x = disease, y = cases)
) +
  geom_col()
# geom_bar() needs only an x aesthetic: it counts the rows itself.
# Here: how many observations with more than 1,000 cases each region has.
ggplot(
  # keep only the observations with more than 1,000 cases
  data = filter(who_disease, cases > 1000),
  # region goes on the x-axis
  mapping = aes(x = region)
) +
  # bar height = number of remaining observations per region
  geom_bar()
- A bar chart is not ideal when:
- the value is not a quantity, e.g. a percentile, a ratio, or a sensor reading such as temperature
- the value has gone through a non-linear transformation, e.g. log, square root, or exponentiation
- This is because depending upon where they fall on the transformation curve, one transformed unit equals a different number of un-transformed units
- Benefits of Point Chart
- High Precision
- Efficient Representation
- Simple
- more classes can be easily represented
# Single number associated with each category of data
# Count, average, rate
# Point Chart
#library(dplyr)
library(tidyverse)
library(ggplot2)
library(forcats)
# Load the WHO disease data for the point chart examples.
# The CSV is addressed by its full path instead of calling setwd(), so the
# session's working directory is left untouched; text columns are kept as
# character on every R version.
who_disease <- read.csv(
  file.path("~/Downloads/teaching/R", "dc", "who_disease.csv"),
  stringsAsFactors = FALSE
)

# Country codes picked out for the comparison.
interesting_countries <- c(
  "NGA", "SDN", "FRA", "NPL", "MYS",
  "TZA", "YEM", "UKR", "BGD", "VNM"
)

# Measles observations for the selected countries in 2006 and 2016, reshaped
# so that each year becomes its own column (cases_2006, cases_2016).
who_subset <- who_disease %>%
  filter(
    countryCode %in% interesting_countries,
    disease == "measles",
    year %in% c(2006, 2016)
  ) %>%
  # prefix the year so the new column names are cases_<year>
  mutate(year = paste0("cases_", year)) %>%
  # pivot_wider() (tidyr, loaded by tidyverse) replaces the superseded
  # spread(): values of `year` become column names, `cases` fills them.
  pivot_wider(names_from = year, values_from = cases)

# View() opens a GUI data viewer, so only call it from an interactive session.
if (interactive()) {
  View(who_subset)
}
# Point chart: log10 of the 2016 case count, one point per country.
ggplot(
  data = who_subset,
  mapping = aes(y = country, x = log10(cases_2016))
) +
  geom_point()

# Point chart of the log2 fold change in cases from 2006 to 2016, with the
# countries ordered along the y-axis by that fold change.
ggplot(
  data = mutate(who_subset, logFoldChange = log2(cases_2016 / cases_2006)),
  mapping = aes(
    x = logFoldChange,
    y = reorder(country, logFoldChange)
  )
) +
  geom_point()