dplyr

6 minute read

Published:

This lesson is from DataCamp's Introduction to the Tidyverse course.

Data wrangling with dplyr

library(gapminder)
library(dplyr)

# Show the gapminder data frame, followed by its dimensions.
print(gapminder)
print(dim(gapminder))

# The filter verb keeps the rows that satisfy every condition given;
# the pipe %>% passes the data on its left into the verb on its right.
max(gapminder[["year"]]) # largest value in the year column
gapminder %>%
  filter(year == 2007)
gapminder %>%
  filter(country == "United States")
gapminder %>%
  filter(year == 2007, country == "United States")

# The arrange verb sorts rows by a column.
gapminder %>%
  arrange(gdpPercap) # ascending order

gapminder %>%
  arrange(desc(gdpPercap)) # descending order via desc()

# Filter to a single year first, then sort that subset in descending order.
gapminder %>%
  filter(year == 2007) %>%
  arrange(desc(gdpPercap))

gapminder %>%
  filter(year == 1957) %>%
  arrange(desc(pop))

# The mutate verb changes an existing column or adds a new one.
gapminder %>%
  mutate(pop = pop / 1000000) # replace pop with pop divided by 1,000,000

gapminder %>%
  mutate(gdp = gdpPercap * pop) # add a total gdp column from gdpPercap and pop

# Highest gdp for 2007: add the column, keep 2007, sort in descending order.
gapminder %>%
  mutate(gdp = gdpPercap * pop) %>%
  filter(year == 2007) %>%
  arrange(desc(gdp))

Data Visualization with ggplot2

library(gapminder)
library(dplyr)
library(ggplot2)

# Keep only the 2007 rows; the plots below use this subset.
gapminder_2007 <- gapminder %>%
  filter(year == 2007)

# View() needs a data viewer, which is only available in an interactive
# session. Guard the call so that running this file non-interactively
# (e.g. with Rscript) does not stop here.
if (interactive()) {
  View(gapminder_2007)
}

# Scatter plot of lifeExp against gdpPercap.
ggplot(gapminder_2007, aes(x = gdpPercap, y = lifeExp)) +
  geom_point()

# lifeExp is higher when gdpPercap is higher, but a lot of countries are
# crowded on the left, so put the x axis on a log10 scale.
ggplot(gapminder_2007, aes(x = gdpPercap, y = lifeExp)) +
  geom_point() +
  scale_x_log10()

# Subset to 1952 and plot lifeExp against pop with a log10 x axis.
gapminder_1952 <- gapminder %>% filter(year == 1952)
ggplot(data = gapminder_1952, mapping = aes(x = pop, y = lifeExp)) +
  geom_point() +
  scale_x_log10()

# Same 1952 subset, rebuilt so this example can be run on its own.
gapminder_1952 <- gapminder %>% filter(year == 1952)

# gdpPercap against pop, with both axes on a log10 scale.
ggplot(data = gapminder_1952, mapping = aes(x = pop, y = gdpPercap)) +
  geom_point() +
  scale_x_log10() +
  scale_y_log10()

# Additional aesthetics: map further columns to color and size.

# Color the points by continent.
gapminder_2007 <- gapminder %>%
  filter(year == 2007)
ggplot(
  data = gapminder_2007,
  mapping = aes(x = gdpPercap, y = lifeExp, color = continent)
) +
  geom_point() +
  scale_x_log10()

# Color by continent and size the points by pop.
gapminder_2007 <- gapminder %>%
  filter(year == 2007)
ggplot(
  data = gapminder_2007,
  mapping = aes(x = gdpPercap, y = lifeExp, color = continent, size = pop)
) +
  geom_point() +
  scale_x_log10()

# Faceting: one panel per continent ("~" reads as "by").
ggplot(data = gapminder_2007, mapping = aes(x = gdpPercap, y = lifeExp)) +
  geom_point() +
  scale_x_log10() +
  facet_wrap(~ continent)

# Scatter plot of lifeExp against gdpPercap on the full data set:
# color shows continent, size shows pop, and there is one panel per year.
ggplot(
  data = gapminder,
  mapping = aes(x = gdpPercap, y = lifeExp, color = continent, size = pop)
) +
  geom_point() +
  scale_x_log10() +
  facet_wrap(~ year)

Grouping and summarizing

library(gapminder)
library(dplyr)
library(ggplot2)

# The summarize verb, used with functions such as mean, sum, median, min, max.
gapminder %>%
  summarize(meanLifeExp = mean(lifeExp))

# Mean life expectancy for 2007 only.
gapminder %>%
  filter(year == 2007) %>%
  summarize(meanLifeExp = mean(lifeExp))

# Several summaries can be computed in a single summarize() call.
gapminder %>%
  filter(year == 2007) %>%
  summarize(
    meanLifeExp = mean(lifeExp),
    totalPop = sum(pop)
  )

# Median life expectancy over all rows.
gapminder %>%
  summarize(medianLifeExp = median(lifeExp))

# Median life expectancy in 1957.
gapminder %>%
  filter(year == 1957) %>%
  summarize(medianLifeExp = median(lifeExp))

# Median life expectancy and maximum GDP per capita in 1957.
gapminder %>%
  filter(year == 1957) %>%
  summarize(
    medianLifeExp = median(lifeExp),
    maxGdpPercap = max(gdpPercap)
  )

# The group_by verb: summarize() then produces one row per group.
gapminder %>%
  group_by(year) %>%
  summarize(
    medianLifeExp = median(lifeExp),
    totalPop = sum(pop)
  )

# Per-continent summaries for 2007.
gapminder %>%
  filter(year == 2007) %>%
  group_by(continent) %>%
  summarize(
    meanLifeExp = mean(lifeExp),
    totalPop = sum(pop)
  )

# Grouping by two columns gives one row per year/continent pair.
gapminder %>%
  group_by(year, continent) %>%
  summarize(
    totalPop = sum(pop),
    meanLifeExp = mean(lifeExp)
  )

# Median life expectancy and maximum GDP per capita in each year.
gapminder %>%
  group_by(year) %>%
  summarize(
    medianLifeExp = median(lifeExp),
    maxGdpPercap = max(gdpPercap)
  )

# Median life expectancy and maximum GDP per capita per continent in 1957.
gapminder %>%
  filter(year == 1957) %>%
  group_by(continent) %>%
  summarize(
    medianLifeExp = median(lifeExp),
    maxGdpPercap = max(gdpPercap)
  )

# Median life expectancy and maximum GDP per capita for each
# continent/year combination.
gapminder %>%
  group_by(continent, year) %>%
  summarize(
    medianLifeExp = median(lifeExp),
    maxGdpPercap = max(gdpPercap)
  )


# Total population and mean life expectancy for each year.
# The second column is computed with mean(), so it is named meanLifeExp
# (it was previously labelled medianLifeExp, which did not match the value).
by_year <- gapminder %>%
  group_by(year) %>%
  summarize(
    totalPop = sum(pop),
    meanLifeExp = mean(lifeExp)
  )
print(by_year)

# Total population over time; expand_limits(y = 0) makes the y axis include 0.
ggplot(by_year, aes(x = year, y = totalPop)) +
  geom_point() +
  expand_limits(y = 0)


# Total population and mean life expectancy per year and continent.
by_year_continent <- gapminder %>%
  group_by(year, continent) %>%
  summarize(
    totalPop = sum(pop),
    meanLifeExp = mean(lifeExp)
  )
print(by_year_continent)

# Total population over time, one color per continent, y axis including 0.
ggplot(
  data = by_year_continent,
  mapping = aes(x = year, y = totalPop, color = continent)
) +
  geom_point() +
  expand_limits(y = 0)


# Per-year median life expectancy and maximum GDP per capita.
by_year <- gapminder %>%
  group_by(year) %>%
  summarize(
    medianLifeExp = median(lifeExp),
    maxGdpPercap = max(gdpPercap)
  )
print(by_year)

# Scatter plot of the change in medianLifeExp over time, y axis including 0.
ggplot(data = by_year, mapping = aes(x = year, y = medianLifeExp)) +
  geom_point() +
  expand_limits(y = 0)


# Median gdpPercap within each continent within each year,
# saved as by_year_continent.
by_year_continent <- gapminder %>%
  group_by(continent, year) %>%
  summarize(medianGdpPercap = median(gdpPercap))

# Change in medianGdpPercap over time, one color per continent.
ggplot(
  data = by_year_continent,
  mapping = aes(x = year, y = medianGdpPercap, color = continent)
) +
  geom_point() +
  expand_limits(y = 0)


# Median GDP per capita and median life expectancy per continent in 2007.
by_continent_2007 <- gapminder %>%
  filter(year == 2007) %>%
  group_by(continent) %>%
  summarize(
    medianGdpPercap = median(gdpPercap),
    medianLifeExp = median(lifeExp)
  )

# Scatter plot comparing the two medians, one colored point per continent.
ggplot(
  data = by_continent_2007,
  mapping = aes(x = medianGdpPercap, y = medianLifeExp, color = continent)
) +
  geom_point()


# Line plots

# Median gdpPercap by year, saved as by_year.
by_year <- gapminder %>%
  group_by(year) %>%
  summarize(medianGdpPercap = median(gdpPercap))
print(by_year)

# Line plot of the change in medianGdpPercap over time.
ggplot(data = by_year, mapping = aes(x = year, y = medianGdpPercap)) +
  geom_line() +
  expand_limits(y = 0)

# Median gdpPercap by year and continent, saved as by_year_continent.
by_year_continent <- gapminder %>%
  group_by(year, continent) %>%
  summarize(medianGdpPercap = median(gdpPercap))

# Line plot of the change in medianGdpPercap over time, one line per continent.
ggplot(
  data = by_year_continent,
  mapping = aes(x = year, y = medianGdpPercap, color = continent)
) +
  geom_line() +
  expand_limits(y = 0)

# Bar charts

# Mean life expectancy per continent in 2007.
by_continent <- gapminder %>%
  filter(year == 2007) %>%
  group_by(continent) %>%
  summarize(meanLifeExp = mean(lifeExp))
print(by_continent)
ggplot(data = by_continent, mapping = aes(x = continent, y = meanLifeExp)) +
  geom_col()


# Median gdpPercap per continent in 1952.
by_continent <- gapminder %>%
  filter(year == 1952) %>%
  group_by(continent) %>%
  summarize(medianGdpPercap = median(gdpPercap))
print(by_continent)

# Bar plot of medianGdpPercap by continent.
ggplot(
  data = by_continent,
  mapping = aes(x = continent, y = medianGdpPercap)
) +
  geom_col()


# Observations for the Oceania continent in 1952.
oceania_1952 <- gapminder %>%
  filter(year == 1952, continent == "Oceania")
print(oceania_1952)

# Bar plot of gdpPercap by country.
ggplot(data = oceania_1952, mapping = aes(x = country, y = gdpPercap)) +
  geom_col()


# Histograms

# Histogram of lifeExp in 2007 with the default binning.
gapminder_2007 <- gapminder %>%
  filter(year == 2007)
ggplot(data = gapminder_2007, mapping = aes(x = lifeExp)) +
  geom_histogram()

# binwidth: set the width of each bin explicitly.
gapminder_2007 <- gapminder %>%
  filter(year == 2007)
ggplot(data = gapminder_2007, mapping = aes(x = lifeExp)) +
  geom_histogram(binwidth = 5)

# bins: set the number of bins instead.
# Histogram of the 1952 population in millions (pop_by_mil).
gapminder_1952 <- gapminder %>%
  filter(year == 1952) %>%
  mutate(pop_by_mil = pop / 1000000)
ggplot(data = gapminder_1952, mapping = aes(x = pop_by_mil)) +
  geom_histogram(bins = 50)


# Histogram of the 1952 population (pop), with x on a log scale.
gapminder_1952 <- gapminder %>%
  filter(year == 1952)
ggplot(data = gapminder_1952, mapping = aes(x = pop)) +
  geom_histogram() +
  scale_x_log10()

# Box plots

# Distribution of lifeExp within each continent in 2007.
gapminder_2007 <- gapminder %>%
  filter(year == 2007)
ggplot(data = gapminder_2007, mapping = aes(x = continent, y = lifeExp)) +
  geom_boxplot()

# Boxplot comparing gdpPercap among continents in 1952, y on a log10 scale.
gapminder_1952 <- gapminder %>%
  filter(year == 1952)
ggplot(data = gapminder_1952, mapping = aes(x = continent, y = gdpPercap)) +
  geom_boxplot() +
  scale_y_log10()

# The same boxplot with a title added via ggtitle().
ggplot(data = gapminder_1952, mapping = aes(x = continent, y = gdpPercap)) +
  geom_boxplot() +
  scale_y_log10() +
  ggtitle("Comparing GDP per capita across continents")