Visualization Best Practices

Published:

These notes follow the DataCamp course "Visualization Best Practices in R".

library(dplyr)
library(ggplot2)
# Open who_disease in the interactive data viewer
View(who_disease)

# Bar chart: number of observations (rows) in each region
ggplot(data = who_disease, mapping = aes(x = region)) +
  geom_bar()

# Scatter plot of cases against year, restricted to the AMR region
ggplot(
  data = filter(who_disease, region == "AMR"),
  mapping = aes(x = year, y = cases)
) +
  geom_point(alpha = 0.5)

# Same AMR scatter plot, with each disease drawn in its own colour
# so the observed case counts can be told apart by disease
ggplot(
  data = filter(who_disease, region == "AMR"),
  mapping = aes(x = year, y = cases, color = disease)
) +
  geom_point(alpha = 0.5)


Proportion Data

• Parts making up a whole

• Often used to understand population
• Pie charts
• great for quick visualizations of proportion data for a single group

• suffer from accuracy problems due to encoding data in angles but are intuitive and compact

• Waffle Charts
• Use a waffle chart if you need more precision than a pie chart can offer, want to compare more than three classes, or simply have the space
• Stacked bar chart
• If you want to compare different wholes to each other you should use a stacked bar chart, as they allow the viewer to easily compare across groups due to all proportions sharing the same y-axis

Pie chart

• intuitive
• convey the concept of slices of a whole very well
• Limitations
• not very precise
• data encoded in angles
• a class with 25% of the data is encoded as an angle of 90 degrees
• we are better at comparing lengths and sizes than angles
• hard to read when there are a lot of classes
• The first slice has a meaningful anchoring point, usually the vertical line at the top of the pie, but every later slice starts at an arbitrary point
• after three slices it becomes hard to compare
# Toy data: one value for each of the four groups G1..G4
df <- data.frame(
  value = c(10, 23, 15, 18),
  group = paste0("G", 1:4)
)
print(df)

# A pie chart is a stacked bar chart that has been 'wrapped' around a
# central axis, so start with a single stacked bar ...
ggplot(data = df, mapping = aes(x = "", y = value, fill = group)) +
  geom_col() # bar heights are taken directly from `value`

# ... then draw the same bar in polar coordinates to get the pie
ggplot(data = df, mapping = aes(x = "", y = value, fill = group)) +
  geom_col() + # bar heights are taken directly from `value`
  coord_polar(theta = "y") + # map the y values onto the angle
  theme_void() # drop axes, grid and background

# ifelse() is vectorised: the test is evaluated for every element of `a`
# and the matching label is picked element by element
a <- c(5, 7, 2, 9)
ifelse(test = a %% 2 == 0, yes = "even", no = "odd")
# [1] "odd"  "odd"  "even" "odd"

# Pie chart of observations per region, keeping EUR and AFR as their own
# slices and lumping every other region into "Other". The lesson uses this
# compact view to show how large the AFR share is.
# NOTE(review): ifelse() returns `region` unchanged only if it is a
# character column; a factor would come back as integer codes - confirm.
who_disease %>%
  mutate(region = ifelse(region %in% c("EUR", "AFR"), region, "Other")) %>%
  ggplot(mapping = aes(x = "", fill = region)) +
  geom_bar(color = "white") + # white outline around the stacked pieces
  coord_polar(theta = "y") +
  theme_void()


Waffle Chart

• More precise than Pie chart

• Encodes data in area rather than angles

  library(dplyr)
library(ggplot2)
library(waffle)

# Share of observations contributed by each region, as whole percentages.
# Rounded percentages are not guaranteed to add up to exactly 100.
obs_by_region <- who_disease %>%
  group_by(region) %>%
  summarize(num_obs = n()) %>% # n() counts the rows in each group
  mutate(percent = round(num_obs / sum(num_obs) * 100))

# Build the named vector passed to waffle(): values are the percentages,
# names are the regions. Each statement sits on its own line; the original
# ran three statements together on one line, which does not parse.
percent <- obs_by_region$percent
print(percent) # plain numeric vector, no names yet
names(percent) <- obs_by_region$region
print(percent) # same values, now labelled by region

# Title spelling corrected ("Observations")
title <- "Proportion of Observations by Region"
waffle::waffle(percent, rows = 5, title = title)

• Pie Chart

• measles, mumps, other disease count
  library(dplyr)
library(ggplot2)
library(waffle)

# Total cases for measles, for mumps, and for every other disease lumped
# together under the label "other"
disease_counts <- who_disease %>%
  mutate(
    disease = ifelse(disease %in% c("measles", "mumps"), disease, "other")
  ) %>%
  group_by(disease) %>%
  summarise(total_cases = sum(cases))

print(disease_counts)

# Pie chart of the three totals: one stacked bar at x = 1, drawn in
# polar coordinates
ggplot(
  data = disease_counts,
  mapping = aes(x = 1, y = total_cases, fill = disease)
) +
  geom_col() + # bar heights are taken directly from total_cases
  coord_polar(theta = "y") +
  theme_void() +
  ggtitle("Proportion of diseases")

• Waffle Chart

  library(dplyr)
library(ggplot2)
library(waffle)

# Total cases per disease and each disease's share of all cases,
# rounded to a whole percentage
disease_counts <- who_disease %>%
  group_by(disease) %>%
  summarise(total_cases = sum(cases)) %>%
  mutate(percent = round(total_cases / sum(total_cases) * 100))

print(disease_counts)

# Named vector for waffle(): values are the percentages, names are the
# diseases. The original ran both assignments together on one line,
# which does not parse.
case_counts <- disease_counts$percent
names(case_counts) <- disease_counts$disease
print(case_counts)

waffle(case_counts, rows = 5)


Stacked Bar Chart

library(dplyr)
library(ggplot2)
library(waffle)

# Open who_disease in the interactive data viewer
View(who_disease)

# Bar chart of cases per country code within the SEAR region
ggplot(
  data = filter(who_disease, region == "SEAR"),
  mapping = aes(x = countryCode, y = cases)
) +
  geom_col()

# Same bars, with each bar split by disease through the fill colour
ggplot(
  data = filter(who_disease, region == "SEAR"),
  mapping = aes(x = countryCode, y = cases, fill = disease)
) +
  geom_col()

# position = "fill" stacks the segments and stretches every bar to the
# same constant height, so the segments read as proportions
ggplot(
  data = filter(who_disease, region == "SEAR"),
  mapping = aes(x = countryCode, y = cases, fill = disease)
) +
  geom_col(position = "fill")

# Stacked bars of measles / mumps / other for every year.
# All diseases except measles and mumps are relabelled "other" first.
disease_counts <- who_disease %>%
  mutate(
    disease = ifelse(disease %in% c("measles", "mumps"), disease, "other")
  ) %>%
  group_by(disease, year) %>% # year is part of the grouping
  summarise(total_cases = sum(cases))

print(disease_counts)

ggplot(
  data = disease_counts,
  mapping = aes(x = year, y = total_cases, fill = disease)
) +
  geom_col(position = "fill") # every bar drawn at full height

# Turn disease into a factor with the levels listed explicitly in the
# order "measles", "other", "mumps"
disease_counts <- who_disease %>%
  mutate(
    disease = factor(
      ifelse(disease %in% c("measles", "mumps"), disease, "other"),
      levels = c("measles", "other", "mumps") # desired ordering
    )
  ) %>%
  group_by(disease, year) %>%
  summarise(total_cases = sum(cases))

# Plot the yearly totals as full-height stacked bars
ggplot(
  data = disease_counts,
  mapping = aes(x = year, y = total_cases, fill = disease)
) +
  geom_col(position = "fill")

# Keep observations from 1999 onwards (1999 itself is included), relabel
# everything except measles and mumps as "other", and total the cases per
# disease and region
disease_counts <- who_disease %>%
  filter(year >= 1999) %>%
  mutate(
    disease = ifelse(disease %in% c("measles", "mumps"), disease, "other")
  ) %>%
  group_by(disease, region) %>% # region is part of the grouping
  summarise(total_cases = sum(cases))

# disease is the stacking (fill) variable, region is on the x-axis and
# the case totals are on the y-axis
ggplot(
  data = disease_counts,
  mapping = aes(x = region, y = total_cases, fill = disease)
) +
  geom_col(position = "fill")


• A stacked bar chart is worse in isolation than a pie or waffle chart

Point Data

• Single number associated with each category of data
• Count, average, rate

Bar Chart

# Bar Chart
library(dplyr)
library(ggplot2)

# Total cases for each disease: every row contributes its `cases` value
# to the bar of its disease
ggplot(data = who_disease) +
  geom_col(mapping = aes(x = disease, y = cases))

# A bar should represent a meaningful quantity, i.e. something that can
# be stacked up,
# e.g. money spent on different activities in a project

# Use geom_col() when the bar height is already a column in the data

# Number of cases for each disease in India in 1980;
# the two conditions are combined explicitly with `&`
ggplot(
  data = filter(who_disease, (year == 1980) & (country == "India")),
  mapping = aes(x = disease, y = cases)
) +
  geom_col()

# Same plot; conditions passed to filter() as separate arguments are
# combined with AND as well
ggplot(
  data = filter(who_disease, year == 1980, country == "India"),
  mapping = aes(x = disease, y = cases)
) +
  geom_col()

# geom_bar() only needs an x aesthetic; it counts the rows itself.
# Number of who_disease observations with more than 1,000 cases, by region
ggplot(
  # keep only the observations with more than 1,000 cases
  data = filter(who_disease, cases > 1000),
  # region goes on the x-axis
  mapping = aes(x = region)
) +
  geom_bar()

• A bar chart is not ideal when:
• the value is not a quantity, e.g. a percentile, a ratio, or a sensor reading such as temperature
• the data has been through a non-linear transformation, e.g. log, square root, or exponentiation
• This is because, depending on where they fall on the transformation curve, one transformed unit equals a different number of untransformed units
• Benefits of Point Chart
• High Precision
• Efficient Representation
• Simple
• more classes can be easily represented
# Single number associated with each category of data
# Count, average, rate

# Point Chart
#library(dplyr)
library(tidyverse)
library(ggplot2)
library(forcats)

# Country codes to compare in the point charts
interesting_countries <- c(
  "NGA", "SDN", "FRA", "NPL", "MYS",
  "TZA", "YEM", "UKR", "BGD", "VNM"
)

# Measles observations for the selected countries in 2006 and 2016,
# reshaped to one row per country with cases_2006 and cases_2016 columns.
# The original ended the pipeline with a dangling %>%, which piped the
# result into View(who_subset) before who_subset existed, and it never
# created the cases_2006 / cases_2016 columns that the plots rely on.
who_subset <- who_disease %>%
  filter(
    countryCode %in% interesting_countries,
    disease == "measles",
    year %in% c(2006, 2016)
  ) %>%
  mutate(year = paste0("cases_", year)) %>%
  # spread the year labels into columns holding the matching case counts
  tidyr::pivot_wider(names_from = year, values_from = cases)

View(who_subset)

# Point chart: log10 of the 2016 case count for each country.
# Reads the cases_2016 column of who_subset.
ggplot(
  data = who_subset,
  mapping = aes(y = country, x = log10(cases_2016))
) +
  geom_point()

# Point chart of the log2 fold change in cases from 2006 to 2016, with
# the countries ordered along the y-axis by that fold change
who_subset %>%
  mutate(logFoldChange = log2(cases_2016 / cases_2006)) %>%
  ggplot(
    mapping = aes(x = logFoldChange, y = reorder(country, logFoldChange))
  ) +
  geom_point()



Tags: