Univariate Visualization

2 minute read

Published:

This post covers Univariate Data Visualization.

Data Visualisation

  • Exploratory analysis to look for relationships in the data
    • Gather -> Clean -> Explore -> Analyse -> Share
  • Matplotlib, Seaborn, Pandas
  • Univariate Visualisation
    • Qualitative Variables - Bar chart
    • Quantitative Variables - Histogram

Univariate Visualisation

%matplotlib inline

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
# Load the Titanic training set and preview the first two rows.
file_path = './data/titanic/train.csv'
df = pd.read_csv(file_path)
display(df.head(2))

# Clean Gender values: collapse the alternative spellings onto the two
# canonical labels 'male' and 'female'.
# The result is assigned back to the column instead of calling
# replace(..., inplace=True) on the selected column: an in-place edit of a
# selected column is not guaranteed to reach the parent DataFrame (pandas
# warns about it, and it has no effect when Copy-on-Write is enabled).
df_clean = df.copy()

gender_aliases = {'M': 'male', 'Male': 'male', 'F': 'female', 'Female': 'female'}
df_clean['Gender'] = df_clean['Gender'].replace(gender_aliases)

# Frequency of each cleaned Gender value, most common first.
df_gender = df_clean.Gender.value_counts()
df_gender

Bar Chart - Qualitative Variables

# Gender bar plot with seaborn's default colouring
sns.countplot(x='Gender', data=df_clean);

# Gender bar plot drawn in a single colour: the first colour of the
# current seaborn palette
base_color = sns.color_palette()[0]
sns.countplot(x='Gender', data=df_clean, color=base_color);

# Gender bar plot in a single colour with an explicit bar order
sns.countplot(x='Gender', data=df_clean, order=['male', 'female'], color=base_color);

# Gender bar plot in a single colour, sorted by frequency: value_counts()
# returns the most common value first, so its index is used as the bar order
order = df_clean['Gender'].value_counts().index
sns.countplot(x='Gender', data=df_clean, order=order, color=base_color);

# Gender bar plot with colour, sort, and rotated x tick labels
sns.countplot(x='Gender', data=df_clean, order=order, color=base_color);
plt.xticks(rotation=90);

# Horizontal Gender bar plot: labels on the y-axis, counts on the x-axis
sns.countplot(y='Gender', data=df_clean, order=order, color=base_color);
plt.xticks(rotation=90);
# SibSp bar plot: horizontal bars in a single colour, ordered by how often
# each SibSp value occurs (most frequent first)
sibsp_counts = df_clean['SibSp'].value_counts()
color, order = sns.color_palette()[0], sibsp_counts.index

sns.countplot(y='SibSp', data=df_clean, order=order, color=color);
# Embarked bar plot with default colours and ordering
# (uses df_clean, like every other plot of the Titanic data in this post)
sns.countplot(data=df_clean, y='Embarked');

# Embarked bar plot: single colour, sorted by frequency, readable labels
base_color = sns.color_palette()[0]

order = df_clean.Embarked.value_counts().index

b = sns.countplot(data=df_clean, y='Embarked', color=base_color, order=order);

# Build the tick labels from the codes in plotting order, so each bar keeps
# the right port name whatever the frequency ranking turns out to be.
# Codes without an entry in the mapping are shown unchanged.
port_names = {'S': 'Southampton', 'C': 'Cherbourg', 'Q': 'Queenstown'}
ports = [port_names.get(code, code) for code in order]

plt.yticks(np.arange(len(ports)), ports);

# Larger fonts for the tick labels, title and axis labels.
b.tick_params(labelsize=16)
b.set_title("Embarked", fontsize=20)
b.set_xlabel("Count of People", fontsize=20)
b.set_ylabel("Ports", fontsize=20);
# Embarked Bar plot - compute ticks
# countplot draws absolute counts, so proportion ticks are produced by
# choosing proportions (0.0, 0.1, ...) and converting them back to counts
# to get the positions on the count axis.
embarked_counts = df_clean.Embarked.value_counts()

n_ = df_clean.shape[0]
# .iloc[0] picks the first (largest) share by position. Plain [0] on this
# Series, whose index holds the port codes, relies on a deprecated
# positional fallback.
max_ = df_clean.Embarked.value_counts(normalize=True).iloc[0]
tick_props = np.arange(0, max_, 0.1)
tick_labels = [f'{tp:.1f}' for tp in tick_props]
tick_props_abs = tick_props * n_

# Port names keyed by code, so labels follow the plotting order instead of
# being assigned by position. Unknown codes are shown unchanged.
port_names = {'S': 'Southampton', 'C': 'Cherbourg', 'Q': 'Queenstown'}

# Embarked Bar plot - Proportion

color = sns.color_palette()[0]

order = embarked_counts.index

b = sns.countplot(data=df_clean, x='Embarked', color=color, order=order);

ports = [port_names.get(code, code) for code in order]
plt.xticks(np.arange(len(ports)), ports);

b.tick_params(labelsize=16)
b.set_xlabel('Port', fontsize=16);
# The y ticks are relabelled as proportions below, so the axis label says so.
b.set_ylabel('Proportion of Passengers', fontsize=16);
b.set_title('Embarkation Analysis - Titanic dataset', fontsize=14);

plt.yticks(tick_props_abs, tick_labels);

# Embarked Bar plot - Proportion with labels

base_color = sns.color_palette()[0]

order = embarked_counts.index

b = sns.countplot(data=df_clean, x='Embarked', color=base_color, order=order);

ports = [port_names.get(code, code) for code in order]
plt.xticks(np.arange(len(ports)), ports);

plt.yticks(tick_props_abs, tick_labels);

# Fixed upper limit: leaves room for the text labels drawn above the bars.
plt.ylim((0, 720))

b.tick_params(labelsize=16)
b.set_title("Embarked", fontsize=20)
b.set_ylabel("Proportion of People", fontsize=20)
b.set_xlabel("Ports", fontsize=20);

# Write each bar's share of all rows just above the bar. The counts are in
# the same order as the bars because `order` is the index of the same Series.
for i, count in enumerate(embarked_counts):
    plt.text(i-.2, count+30, f'{count/n_:.2f}', fontsize=14)

df_clean.Embarked.value_counts(normalize=True)

Pie Chart

# Share of rows per embarkation code, largest share first.
embarked_share = df_clean.Embarked.value_counts(normalize=True)
cats = embarked_share.index
vals = embarked_share.values
print(cats)
print(vals)

# Label each slice from its own code rather than by position, so labels and
# values cannot get out of step. Unknown codes are shown unchanged.
port_names = {'S': 'Southampton', 'C': 'Cherbourg', 'Q': 'Queenstown'}
ports = [port_names.get(code, code) for code in cats]

# Pull the last (smallest) slice slightly out of the pie.
explode = [0] * len(vals)
if explode:
    explode[-1] = 0.1

plt.pie(vals, labels=ports, startangle=90, explode=explode);

Histogram - Quantitative Variables

# Age histogram with matplotlib's default number of bins
plt.hist(data=df_clean, x='Age');

# Age histogram with 20 bins
plt.hist(data=df_clean, x='Age', bins=20); # default bins is 10

# customize bins: edges every 5 years, starting at 0 and extending past
# the maximum age so the oldest passengers fall inside the last bin
bins = np.arange(0, df_clean.Age.max()+5, 5)

plt.hist(data=df_clean, x='Age', bins=bins);

# seaborn versions. sns.distplot is deprecated, so histplot is used instead.
# Density-scaled histogram with a KDE curve (what distplot drew by default):
sns.histplot(df_clean.Age, kde=True, stat='density', kde_kws={'cut': 3});

# Histogram of counts only, without the KDE curve:
sns.histplot(df_clean.Age);

# Histogram of counts with the custom 5-year bins:
sns.histplot(df_clean.Age, bins=bins);

Case Study: Diabetes Dataset

  • https://www.kaggle.com/uciml/pima-indians-diabetes-database
# Load the diabetes dataset (this rebinds `df`) and summarise it.
df = pd.read_csv('./data/diabetes/diabetes.csv')
print(df.shape)

# DataFrame.info() prints its report itself and returns None, so it is
# called directly; wrapping it in print() adds a stray "None" line.
df.info()

display(df.describe().T)

# Overlaid Glucose distributions for the two Outcome groups, drawn as
# density-scaled histograms with a KDE curve. sns.distplot is deprecated,
# so histplot is used instead.
sns.histplot(df[df.Outcome==0].Glucose, kde=True, stat='density',
             kde_kws={'cut': 3}, label='No Diabetes');
sns.histplot(df[df.Outcome==1].Glucose, kde=True, stat='density',
             kde_kws={'cut': 3}, label='Diabetes');
plt.legend();

# Box plot of Glucose for each Outcome value.
sns.boxplot(data=df, y='Glucose', x='Outcome');