Visualization Best Practices

6 minute read

Published:

This lesson is based on DataCamp's "Visualization Best Practices in R" course.

library(dplyr)
library(ggplot2)
# Switch to the course folder so the data path below can stay relative.
setwd("~/Downloads/teaching/R")

# Read the who_disease CSV into a data frame and open it in the data viewer.
who_disease <- read.csv(file = "dc/who_disease.csv")
View(who_disease)

# Bar chart counting the observations (rows) in each region.
# geom_bar() does the counting itself, so only x is mapped.
ggplot(data = who_disease, mapping = aes(x = region)) +
  geom_bar()

# Scatter plot of cases against year, restricted to the "AMR" region.
# alpha = 0.5 makes overlapping points visible.
ggplot(
  data = filter(who_disease, region == "AMR"),
  mapping = aes(x = year, y = cases)
) +
  geom_point(alpha = 0.5)


# Number of cases observed per year in the "AMR" region,
# with one point colour per disease.
ggplot(
  data = filter(who_disease, region == "AMR"),
  mapping = aes(x = year, y = cases, color = disease)
) +
  geom_point(alpha = 0.5)

Proportion Data

  • Parts making up a whole

  • Often used to understand population
  • Pie charts
    • great for quick visualizations of proportion data for a single group

    • suffer from accuracy problems due to encoding data in angles but are intuitive and compact

  • Waffle Charts
    • Use a waffle chart if you need more precision than a pie chart can offer, want to compare more than three classes, or simply have the space for it
  • Stacked bar chart
    • If you want to compare different wholes to each other you should use a stacked bar chart, as they allow the viewer to easily compare across groups due to all proportions sharing the same y-axis

Pie chart

  • Advantages
    • intuitive
    • convey the concept of slices of a whole very well
  • Limitations
    • not very precise
    • data encoded in angles
    • a class with 25% of the data is encoded as a 90-degree angle
      • we are better at comparing lengths and sizes than angles
    • lot of classes
      • The first slice has a meaningful anchor point (usually the vertical line at the top of the pie), but every later slice starts at an arbitrary point
      • after three slices it becomes hard to compare
# Toy data: four groups (G1..G4), each with a single numeric value.
df <- data.frame(
  value = c(10, 23, 15, 18),
  group = paste0("G", 1:4)
)
print(df)

# A pie chart is a stacked bar chart that has been "wrapped" around a
# central axis. Step 1: a single stacked bar (x is a constant, so every
# group lands in the same bar; geom_col() uses `value` as segment height).
ggplot(data = df, mapping = aes(x = "", y = value, fill = group)) +
  geom_col()

# Step 2: the same stacked bar drawn in polar coordinates, with y mapped
# to the angle, gives the pie chart. theme_void() strips axes and grid.
ggplot(data = df, mapping = aes(x = "", y = value, fill = group)) +
  geom_col() +
  coord_polar(theta = "y") +
  theme_void()
# ifelse() is vectorised: it picks "even" or "odd" element by element.
a <- c(5, 7, 2, 9)
ifelse(test = a %% 2 == 0, yes = "even", no = "odd")
# Result: "odd"  "odd"  "even" "odd"
# Pie chart of the number of observations (rows) per region. EUR and AFR
# keep their own slices; every other region is lumped into "Other".
# The original note says this compact view shows about half of the
# observations coming from AFR (not verified here).
#
# as.character() guards against `region` being a factor (the read.csv()
# default before R 4.0): ifelse() would otherwise return the factor's
# integer codes instead of the region labels.
who_disease %>%
  mutate(
    region = ifelse(
      region %in% c("EUR", "AFR"),
      as.character(region),
      "Other"
    )
  ) %>%
  ggplot(aes(x = "", fill = region)) +
  geom_bar(color = "white") + # white borders separate the slices
  coord_polar(theta = "y") +
  theme_void()

Waffle Chart

  • More precise than Pie chart

  • Encode data in area and not angles

      library(dplyr)
      library(ggplot2)
      library(waffle)
        
      setwd("~/Downloads/teaching/R")
      who_disease <- read.csv("dc/who_disease.csv")

      # Share of observations (rows) per region as whole-number percentages.
      # Each value is rounded on its own, so the total is not guaranteed
      # to be exactly 100.
      obs_by_region <- who_disease %>%
        group_by(region) %>%
        summarize(num_obs = n()) %>% # n() counts the rows in each group
        mutate(percent = round(num_obs / sum(num_obs) * 100))

      # waffle() takes a named numeric vector: the names become the legend
      # labels. Print before and after naming to show the difference.
      percent <- obs_by_region$percent
      print(percent)
      names(percent) <- obs_by_region$region
      print(percent)

      # Title typo fixed ("Obeservations" -> "Observations").
      title <- "Proportion of Observations by Region"
      waffle::waffle(percent, rows = 5, title = title)
    
  • Pie Chart

    • measles, mumps, other disease count
      library(dplyr)
      library(ggplot2)
      library(waffle)
        
      setwd("~/Downloads/teaching/R")
      who_disease <- read.csv("dc/who_disease.csv")

      # Total cases for measles, mumps, and all remaining diseases combined.
      # as.character() keeps the disease labels intact if `disease` was
      # read as a factor (read.csv() default before R 4.0); without it
      # ifelse() would return the factor's integer codes.
      disease_counts <- who_disease %>%
        mutate(disease = ifelse(
          disease %in% c("measles", "mumps"),
          as.character(disease),
          "other"
        )) %>%
        group_by(disease) %>%
        summarise(total_cases = sum(cases))

      print(disease_counts)

      # Pie chart: one stacked bar (constant x) wrapped into polar coordinates.
      ggplot(disease_counts,
             aes(x = 1,
                 y = total_cases,
                 fill = disease)) +
        geom_col() + # bar heights come straight from total_cases
        coord_polar(theta = "y") +
        theme_void() +
        ggtitle("Proportion of diseases")
    
  • Waffle Chart

      library(dplyr)
      library(ggplot2)
      library(waffle)
        
      setwd("~/Downloads/teaching/R")
      who_disease <- read.csv(file = "dc/who_disease.csv")

      # Total cases per disease, plus each disease's share of all cases
      # rounded to a whole-number percentage.
      disease_counts <- who_disease %>%
        group_by(disease) %>%
        summarise(total_cases = sum(cases)) %>%
        mutate(percent = round(total_cases / sum(total_cases) * 100))

      print(disease_counts)

      # Named numeric vector for waffle(): values are the percentages,
      # names are the disease labels.
      case_counts <- setNames(
        disease_counts[["percent"]],
        disease_counts[["disease"]]
      )
      print(case_counts)

      waffle(case_counts, rows = 5)
    

Stacked Bar Chart

library(dplyr)
library(ggplot2)
library(waffle)

# Work from the course folder, reload the who_disease data,
# and open it in the data viewer.
setwd("~/Downloads/teaching/R")
who_disease <- read.csv(file = "dc/who_disease.csv")
View(who_disease)

# Bar chart of cases per country code for the "SEAR" region.
ggplot(
  data = filter(who_disease, region == "SEAR"),
  mapping = aes(x = countryCode, y = cases)
) +
  geom_col()

# Same chart, with each bar split into coloured segments by disease.
ggplot(
  data = filter(who_disease, region == "SEAR"),
  mapping = aes(x = countryCode, y = cases, fill = disease)
) +
  geom_col()

# position = "fill" stacks the segments and stretches every bar to the
# same height, so the segments show proportions rather than counts.
ggplot(
  data = filter(who_disease, region == "SEAR"),
  mapping = aes(x = countryCode, y = cases, fill = disease)
) +
  geom_col(position = "fill")

# Stacked bars of measles / mumps / other, one bar per year.
# as.character() protects against `disease` being a factor (read.csv()
# default before R 4.0), in which case ifelse() would return the factor's
# integer codes instead of the disease names.
disease_counts <- who_disease %>%
  mutate(disease = ifelse(
    disease %in% c("measles", "mumps"),
    as.character(disease),
    "other"
  )) %>%
  group_by(disease, year) %>% # year added to the grouping
  summarise(total_cases = sum(cases))

print(disease_counts)

ggplot(disease_counts,
       aes(x = year,
           y = total_cases,
           fill = disease)) +
  geom_col(position = "fill") # every bar drawn at full height


# Turn the collapsed disease labels into a factor whose levels are ordered
# "measles", "other", "mumps"; ggplot stacks the segments in level order.
# as.character() matters here: if `disease` was read as a factor
# (read.csv() default before R 4.0), ifelse() would return integer codes,
# which match none of the levels below and would become NA.
disease_counts <- who_disease %>%
  mutate(
    disease = ifelse(
      disease %in% c("measles", "mumps"),
      as.character(disease),
      "other"
    ) %>%
      factor(levels = c("measles", "other", "mumps")) # desired stacking order
  ) %>%
  group_by(disease, year) %>%
  summarise(total_cases = sum(cases))

# Plot: proportion of each disease class per year.
ggplot(disease_counts, aes(x = year, y = total_cases, fill = disease)) +
  geom_col(position = "fill")


# Keep observations from 1999 onward (1999 itself is included), collapse
# the diseases to measles / mumps / other, and total the cases per
# disease and region.
# as.character() protects against `disease` being a factor (read.csv()
# default before R 4.0), where ifelse() would return integer codes.
disease_counts <- who_disease %>%
  filter(year >= 1999) %>%
  mutate(disease = ifelse(
    disease %in% c("measles", "mumps"),
    as.character(disease),
    "other"
  )) %>%
  group_by(disease, region) %>% # region added to the grouping
  summarise(total_cases = sum(cases))

# Disease is the stacking (fill) variable, region is on the x-axis,
# and total cases are on the y-axis, shown as proportions.
ggplot(disease_counts, aes(x = region,
                           y = total_cases,
                           fill = disease)) +
  geom_col(position = "fill")

  • Accuracy degrades after 3 classes
  • Worse in isolation than pie or waffle chart

Point Data

  • Single number associated with each category of data
  • Count, average, rate

Bar Chart

# Bar Chart
library(dplyr)
library(ggplot2)
setwd("~/Downloads/teaching/R")
who_disease <- read.csv(file = "dc/who_disease.csv")

# Total cases for each disease: geom_col() stacks the `cases` value of
# every row onto the bar of its disease.
ggplot(data = who_disease) +
  geom_col(mapping = aes(x = disease, y = cases))

# Bar charts should represent a meaningful quantity,
# i.e. something that makes sense to stack,
# e.g. money spent on the different activities of a project.

# Use geom_col() when the bar height is already a column in the data.

# Number of cases per disease in India in 1980.
# Variant 1: the two conditions combined explicitly with `&`.
ggplot(
  data = filter(who_disease, year == 1980 & country == "India"),
  mapping = aes(x = disease, y = cases)
) +
  geom_col()

# Variant 2: conditions passed as separate filter() arguments,
# which filter() combines with "and".
ggplot(
  data = filter(who_disease, year == 1980, country == "India"),
  mapping = aes(x = disease, y = cases)
) +
  geom_col()

# geom_bar() needs only an x aesthetic because it counts the rows itself.
# Here: number of observations with more than 1,000 cases, per region.
ggplot(
  data = filter(who_disease, cases > 1000),
  mapping = aes(x = region)
) +
  geom_bar()
  • A bar chart is not ideal when
    • the value is not a quantity, e.g. a percentile, a ratio, or a sensor reading such as temperature
    • the data has undergone a non-linear transformation, e.g. log, square root, or exponentiation
      • This is because depending upon where they fall on the transformation curve, one transformed unit equals a different number of un-transformed units
  • Benefits of Point Chart
    • High Precision
    • Efficient Representation
    • Simple
    • more classes can be easily represented
# Single number associated with each category of data
# Count, average, rate

# Point Chart
#library(dplyr)
library(tidyverse)
library(ggplot2)
library(forcats)
setwd("~/Downloads/teaching/R")
who_disease <- read.csv(file = "dc/who_disease.csv")

# Country codes to compare in the point charts below.
interesting_countries <- c(
  "NGA", "SDN", "FRA", "NPL", "MYS",
  "TZA", "YEM", "UKR", "BGD", "VNM"
)

# Measles observations for the selected countries in 2006 and 2016,
# reshaped so each year becomes its own column (cases_2006, cases_2016).
who_subset <- who_disease %>%
  filter(
    countryCode %in% interesting_countries &
      disease == "measles" &
      year %in% c(2006, 2016)
  ) %>%
  mutate(year = paste0("cases_", year)) %>%
  # spread() comes from tidyr (attached by library(tidyverse)): it turns
  # the key column (year) and value column (cases) into one column per key.
  spread(key = year, value = cases)

View(who_subset)

# Point chart: log10 of the 2016 case count for each country.
ggplot(
  data = who_subset,
  mapping = aes(x = log10(cases_2016), y = country)
) +
  geom_point()

# Point chart of the log2 fold change in cases from 2006 to 2016,
# with countries ordered by that fold change.
who_subset %>%
  mutate(logFoldChange = log2(cases_2016 / cases_2006)) %>%
  ggplot(
    mapping = aes(
      x = logFoldChange,
      y = reorder(country, logFoldChange)
    )
  ) +
  geom_point()

Tags: