dplyr
Published:
This lesson is from DataCamp's "Introduction to the Tidyverse" course.
Data wrangling with dplyr
library(gapminder)
library(dplyr)
# Show the gapminder data set, then report its size as (rows, columns).
print(x = gapminder)
print(c(nrow(gapminder), ncol(gapminder)))
# filter(): keep only the rows that satisfy the given condition(s).
# The pipe (%>%) passes the left-hand side as the first argument of the
# call on its right.

# Largest value of the year column.
max(gapminder[["year"]])

# Rows for one year.
gapminder %>%
  filter(year == 2007)

# Rows for one country.
gapminder %>%
  filter(country == "United States")

# Rows that match both the year and the country.
gapminder %>%
  filter(
    year == 2007,
    country == "United States"
  )
# arrange(): sort rows by a column.

# Ascending GDP per capita (the default direction).
gapminder %>%
  arrange(gdpPercap)

# Descending GDP per capita.
gapminder %>%
  arrange(desc(gdpPercap))

# 2007 only, highest GDP per capita first.
gapminder %>%
  filter(year == 2007) %>%
  arrange(desc(gdpPercap))

# 1957 only, largest population first.
gapminder %>%
  filter(year == 1957) %>%
  arrange(desc(pop))
# mutate(): overwrite an existing column or add a new one.

# Overwrite pop with the population expressed in millions.
gapminder %>%
  mutate(pop = pop / 1e6)

# Add a gdp column: GDP per capita multiplied by population.
gapminder %>%
  mutate(gdp = gdpPercap * pop)

# 2007 observations ordered by total GDP, largest first.
gapminder %>%
  filter(year == 2007) %>%
  mutate(gdp = gdpPercap * pop) %>%
  arrange(desc(gdp))
Data Visualization with ggplot2
library(gapminder)
library(dplyr)
library(ggplot2)
# Keep only the 2007 observations; the plots below reuse this subset.
gapminder_2007 <- gapminder %>%
  filter(year == 2007)

# View() opens a data viewer, which is only meaningful in an interactive
# session and can fail when the script is run non-interactively (for example
# through Rscript), so it is guarded.
if (interactive()) {
  View(gapminder_2007)
}

# Scatter plot of life expectancy against GDP per capita.
ggplot(gapminder_2007, aes(x = gdpPercap, y = lifeExp)) +
  geom_point()
# Life expectancy rises with GDP per capita, but many countries are crowded
# together at the low end of the x axis. A log10 x scale spreads them out.
ggplot(data = gapminder_2007, mapping = aes(x = gdpPercap, y = lifeExp)) +
  geom_point() +
  scale_x_log10()
# Keep only the 1952 observations; both plots below use this subset.
gapminder_1952 <- filter(gapminder, year == 1952)

# Life expectancy against population, with population on a log10 scale.
ggplot(data = gapminder_1952, mapping = aes(x = pop, y = lifeExp)) +
  geom_point() +
  scale_x_log10()

# GDP per capita against population, with both axes on a log10 scale.
ggplot(data = gapminder_1952, mapping = aes(x = pop, y = gdpPercap)) +
  geom_point() +
  scale_x_log10() +
  scale_y_log10()
# Additional aesthetics: map further variables to color and point size.
gapminder_2007 <- filter(gapminder, year == 2007)

# Color the points by continent.
ggplot(
  data = gapminder_2007,
  mapping = aes(x = gdpPercap, y = lifeExp, color = continent)
) +
  geom_point() +
  scale_x_log10()

# Color by continent and scale the point size by population.
ggplot(
  data = gapminder_2007,
  mapping = aes(x = gdpPercap, y = lifeExp, color = continent, size = pop)
) +
  geom_point() +
  scale_x_log10()
# Faceting: draw one sub-plot per level of a variable.
# In facet_wrap(), "~ continent" reads as "by continent".
ggplot(data = gapminder_2007, mapping = aes(x = gdpPercap, y = lifeExp)) +
  geom_point() +
  scale_x_log10() +
  facet_wrap(~ continent)

# All years: GDP per capita against life expectancy, colored by continent,
# sized by population, with one panel per year.
ggplot(
  data = gapminder,
  mapping = aes(x = gdpPercap, y = lifeExp, color = continent, size = pop)
) +
  geom_point() +
  scale_x_log10() +
  facet_wrap(~ year)
Grouping and summarizing
library(gapminder)
library(dplyr)
library(ggplot2)
# summarize(): collapse many rows into a single summary row, using
# functions such as mean, sum, median, min and max.

# Mean life expectancy over the whole data set.
gapminder %>%
  summarize(meanLifeExp = mean(lifeExp))

# Mean life expectancy in 2007.
gapminder %>%
  filter(year == 2007) %>%
  summarize(meanLifeExp = mean(lifeExp))

# Mean life expectancy and total population in 2007.
# gapminder stores pop as integer, and sum() on integers returns NA (with a
# warning) once the total exceeds .Machine$integer.max (about 2.1 billion).
# The world total is larger than that, so convert to double before summing.
gapminder %>%
  filter(year == 2007) %>%
  summarize(
    meanLifeExp = mean(lifeExp),
    totalPop = sum(as.numeric(pop))
  )
# Median life expectancy over the whole data set.
gapminder %>%
  summarize(medianLifeExp = median(lifeExp))

# Median life expectancy in 1957.
gapminder %>%
  filter(year == 1957) %>%
  summarize(medianLifeExp = median(lifeExp))

# Median life expectancy and maximum GDP per capita in 1957.
gapminder %>%
  filter(year == 1957) %>%
  summarize(
    medianLifeExp = median(lifeExp),
    maxGdpPercap = max(gdpPercap)
  )
# group_by(): make summarize() produce one row per group instead of one
# row overall.
#
# gapminder stores pop as integer, and sum() on integers returns NA (with a
# warning) once a total exceeds .Machine$integer.max (about 2.1 billion).
# Yearly world totals and large continent totals are above that limit, so
# pop is converted to double before summing.

# Median life expectancy and total population for each year.
gapminder %>%
  group_by(year) %>%
  summarize(
    medianLifeExp = median(lifeExp),
    totalPop = sum(as.numeric(pop))
  )

# Mean life expectancy and total population for each continent in 2007.
gapminder %>%
  filter(year == 2007) %>%
  group_by(continent) %>%
  summarize(
    meanLifeExp = mean(lifeExp),
    totalPop = sum(as.numeric(pop))
  )

# Total population and mean life expectancy for each year/continent pair.
gapminder %>%
  group_by(year, continent) %>%
  summarize(
    totalPop = sum(as.numeric(pop)),
    meanLifeExp = mean(lifeExp)
  )
# Median life expectancy and maximum GDP per capita for each year.
gapminder %>%
  group_by(year) %>%
  summarize(
    medianLifeExp = median(lifeExp),
    maxGdpPercap = max(gdpPercap)
  )

# The same two summaries for each continent, restricted to 1957.
gapminder %>%
  filter(year == 1957) %>%
  group_by(continent) %>%
  summarize(
    medianLifeExp = median(lifeExp),
    maxGdpPercap = max(gdpPercap)
  )

# The same two summaries for each continent/year combination.
gapminder %>%
  group_by(continent, year) %>%
  summarize(
    medianLifeExp = median(lifeExp),
    maxGdpPercap = max(gdpPercap)
  )
# Per-year summary: total population and mean life expectancy.
#
# Two fixes compared with the earlier version of this block:
# - gapminder stores pop as integer, and sum() on integers returns NA (with
#   a warning) once the total exceeds .Machine$integer.max. Yearly world
#   totals are above that limit, which left totalPop as NA and the plot
#   empty, so pop is converted to double before summing.
# - The life-expectancy column is computed with mean(), so it is named
#   meanLifeExp rather than medianLifeExp.
by_year <- gapminder %>%
  group_by(year) %>%
  summarize(
    totalPop = sum(as.numeric(pop)),
    meanLifeExp = mean(lifeExp)
  )
print(by_year)

# Total population over time; expand_limits(y = 0) makes the y axis
# include zero.
ggplot(by_year, aes(x = year, y = totalPop)) +
  geom_point() +
  expand_limits(y = 0)
# Summary per year/continent pair: total population and mean life
# expectancy.
#
# gapminder stores pop as integer, and sum() on integers returns NA (with a
# warning) once a total exceeds .Machine$integer.max (about 2.1 billion).
# Large continent totals can pass that limit, so pop is converted to double
# before summing.
by_year_continent <- gapminder %>%
  group_by(year, continent) %>%
  summarize(
    totalPop = sum(as.numeric(pop)),
    meanLifeExp = mean(lifeExp)
  )
print(by_year_continent)

# Total population over time, one color per continent, with the y axis
# including zero.
ggplot(by_year_continent, aes(x = year, y = totalPop, color = continent)) +
  geom_point() +
  expand_limits(y = 0)
# Per-year median life expectancy and maximum GDP per capita.
by_year <- gapminder %>%
  group_by(year) %>%
  summarize(
    medianLifeExp = median(lifeExp),
    maxGdpPercap = max(gdpPercap)
  )
print(by_year)

# Scatter plot of median life expectancy over time, with the y axis
# including zero.
ggplot(data = by_year, mapping = aes(x = year, y = medianLifeExp)) +
  geom_point() +
  expand_limits(y = 0)
# Median GDP per capita for each continent within each year, saved as
# by_year_continent.
by_year_continent <- gapminder %>%
  group_by(continent, year) %>%
  summarize(medianGdpPercap = median(gdpPercap))

# Median GDP per capita over time, one color per continent, with the y axis
# including zero.
ggplot(
  data = by_year_continent,
  mapping = aes(x = year, y = medianGdpPercap, color = continent)
) +
  geom_point() +
  expand_limits(y = 0)
# Median GDP per capita and median life expectancy per continent in 2007.
by_continent_2007 <- gapminder %>%
  filter(year == 2007) %>%
  group_by(continent) %>%
  summarize(
    medianGdpPercap = median(gdpPercap),
    medianLifeExp = median(lifeExp)
  )

# One point per continent: median GDP per capita against median life
# expectancy.
ggplot(
  data = by_continent_2007,
  mapping = aes(x = medianGdpPercap, y = medianLifeExp, color = continent)
) +
  geom_point()
# Line plots: geom_line() joins the observations instead of drawing points.

# Median GDP per capita for each year, saved as by_year.
by_year <- gapminder %>%
  group_by(year) %>%
  summarize(medianGdpPercap = median(gdpPercap))
print(by_year)

# Median GDP per capita over time, with the y axis including zero.
ggplot(data = by_year, mapping = aes(x = year, y = medianGdpPercap)) +
  geom_line() +
  expand_limits(y = 0)

# Median GDP per capita for each year/continent pair, saved as
# by_year_continent.
by_year_continent <- gapminder %>%
  group_by(year, continent) %>%
  summarize(medianGdpPercap = median(gdpPercap))

# One line per continent showing median GDP per capita over time.
ggplot(
  data = by_year_continent,
  mapping = aes(x = year, y = medianGdpPercap, color = continent)
) +
  geom_line() +
  expand_limits(y = 0)
# Bar charts: geom_col() draws one bar per x value, with height taken
# from y.

# Mean life expectancy per continent in 2007.
by_continent <- gapminder %>%
  filter(year == 2007) %>%
  group_by(continent) %>%
  summarize(meanLifeExp = mean(lifeExp))
print(by_continent)

ggplot(data = by_continent, mapping = aes(x = continent, y = meanLifeExp)) +
  geom_col()

# Median GDP per capita per continent in 1952 (replaces by_continent).
by_continent <- gapminder %>%
  filter(year == 1952) %>%
  group_by(continent) %>%
  summarize(medianGdpPercap = median(gdpPercap))
print(by_continent)

# Bar plot of median GDP per capita by continent.
ggplot(
  data = by_continent,
  mapping = aes(x = continent, y = medianGdpPercap)
) +
  geom_col()

# Observations for the Oceania continent in 1952.
oceania_1952 <- gapminder %>%
  filter(
    year == 1952,
    continent == "Oceania"
  )
print(oceania_1952)

# Bar plot of GDP per capita by country.
ggplot(data = oceania_1952, mapping = aes(x = country, y = gdpPercap)) +
  geom_col()
# Histograms: distribution of a single variable.
gapminder_2007 <- filter(gapminder, year == 2007)

# Life expectancy in 2007 with the default binning.
ggplot(data = gapminder_2007, mapping = aes(x = lifeExp)) +
  geom_histogram()

# binwidth: the same histogram with each bin 5 units wide.
ggplot(data = gapminder_2007, mapping = aes(x = lifeExp)) +
  geom_histogram(binwidth = 5)

# bins: fix the number of bins instead of their width.
# 1952 observations with population expressed in millions (pop_by_mil).
gapminder_1952 <- gapminder %>%
  filter(year == 1952) %>%
  mutate(pop_by_mil = pop / 1e6)

ggplot(data = gapminder_1952, mapping = aes(x = pop_by_mil)) +
  geom_histogram(bins = 50)

# Histogram of raw population (pop) with x on a log10 scale.
# gapminder_1952 is rebuilt here without the pop_by_mil column.
gapminder_1952 <- filter(gapminder, year == 1952)

ggplot(data = gapminder_1952, mapping = aes(x = pop)) +
  geom_histogram() +
  scale_x_log10()
# Box plots: compare the distribution of a variable across groups.
gapminder_2007 <- filter(gapminder, year == 2007)

# Life expectancy by continent in 2007.
ggplot(data = gapminder_2007, mapping = aes(x = continent, y = lifeExp)) +
  geom_boxplot()

# GDP per capita by continent in 1952, with y on a log10 scale.
gapminder_1952 <- filter(gapminder, year == 1952)

ggplot(data = gapminder_1952, mapping = aes(x = continent, y = gdpPercap)) +
  geom_boxplot() +
  scale_y_log10()

# The same box plot with a title added.
ggplot(data = gapminder_1952, mapping = aes(x = continent, y = gdpPercap)) +
  geom_boxplot() +
  scale_y_log10() +
  ggtitle("Comparing GDP per capita across continents")