Univariate Visualization

2 minute read

Published:

This post covers Univariate Data Visualization.

Data Visualisation

  • Exploratory analysis to look for relationships in the data
    • Gather -> Clean -> Explore -> Analyse -> Share
  • Matplotlib, Seaborn, Pandas
  • Univariate Visualisation
    • Qualitative Variables - Bar chart
    • Quantitative Variables - Histogram

Univariate Visualisation

%matplotlib inline

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
# Load the Titanic training CSV and take a first look at it.
# The path is a plain string literal: there is nothing to interpolate, so no
# f-string prefix is needed.
file_path = './data/titanic/train.csv'
df = pd.read_csv(file_path)
print(df.shape)  # (number of rows, number of columns)
# `display` is not imported here; it is expected to come from the notebook
# environment. Shows the first two rows as a rendered table.
display(df.head(2))

# Gender bar plot on the raw labels, before any cleaning.
sns.countplot(data=df, x='Gender');
# Clean Gender values
# df_clean is another name for the same DataFrame object (no copy is made),
# so the cleaned labels are visible through both `df_clean` and `df`.
df_clean = df

# Collapse every spelling variant onto one lowercase label per category.
# The result is assigned back to the column: calling
# `df_clean.Gender.replace(..., inplace=True)` acts on an intermediate Series,
# which pandas deprecates and which does not update the frame once
# Copy-on-Write is enabled.
df_clean['Gender'] = df_clean['Gender'].replace(
    {'M': 'male', 'Male': 'male', 'F': 'female', 'Female': 'female'}
)

# Both prints show the same counts because both names refer to one object.
print(df_clean.Gender.value_counts())
print(df.Gender.value_counts())

Bar Chart - Qualitative Variables

# Gender counts as vertical bars (categories along the x-axis).
sns.countplot(x='Gender', data=df);
# Render the vertical plot before drawing the horizontal variant.
plt.show()
# Same counts as horizontal bars (categories along the y-axis).
sns.countplot(y='Gender', data=df);
# Gender bar plot in a single colour: use the first colour of the current
# seaborn palette for every bar.
base_color = sns.color_palette()[0]
sns.countplot(x='Gender', color=base_color, data=df);

# Gender bar plot, single colour, with the bar order given explicitly.
base_color = sns.color_palette()[0]
sns.countplot(
    x='Gender',
    order=['male','female'],
    color=base_color,
    data=df,
);

# Gender bar plot, single colour, with bars ordered by the index of
# value_counts() (which lists the most frequent label first).
base_color = sns.color_palette()[0]
order = df['Gender'].value_counts().index
sns.countplot(x='Gender', order=order, color=base_color, data=df);
# Gender bar plot: single colour, bars in value_counts() order, and the
# x-axis tick labels rotated by 90 degrees.
base_color = sns.color_palette()[0]
order = df['Gender'].value_counts().index
sns.countplot(x='Gender', order=order, color=base_color, data=df);
plt.xticks(rotation=90);

# Horizontal variant: Gender labels on the y-axis, counts on the x-axis.
# Here the 90-degree rotation applies to the count ticks on the x-axis.
base_color = sns.color_palette()[0]
order = df['Gender'].value_counts().index
sns.countplot(y='Gender', order=order, color=base_color, data=df);
plt.xticks(rotation=90);
# SibSp as horizontal bars in one colour, ordered by the index of
# value_counts() (most frequent value first).
color = sns.color_palette()[0]
order = df['SibSp'].value_counts().index
sns.countplot(y='SibSp', order=order, color=color, data=df);

# Embarked as horizontal bars with seaborn's default colours and ordering.
sns.countplot(y='Embarked', data=df);
# Embarked bar plot with readable port names, axis labels and a title.

base_color = sns.color_palette()[0]

# Bar order: index of value_counts(), i.e. most frequent port code first.
order = df.Embarked.value_counts().index

ax = sns.countplot(data=df, x='Embarked', color=base_color, order=order);

# Build each tick label from the bar's own port code instead of a list that is
# hard-coded by position, so the names stay attached to the right bars
# whatever order the counts produce. Unknown codes are shown unchanged.
port_names = {'S': 'Southampton', 'C': 'Cherbourg', 'Q': 'Queenstown'}
ports = [port_names.get(code, code) for code in order]

# One tick per bar, at positions 0..len(ports)-1.
plt.xticks(np.arange(len(ports)), ports);

ax.tick_params(labelsize=16)

ax.set_ylabel("Count of People",fontsize=20)
ax.set_xlabel("Port",fontsize=20);

ax.set_title('Embarkation Analysis - Titanic dataset', fontsize=14);
# Embarked bar plot - relabel the count axis as proportions.
# NOTE(review): n_ counts every row, while value_counts() leaves out rows with
# a missing Embarked value - confirm which denominator is intended.
n_ = df.shape[0]
# Largest share among the ports. `.iloc[0]` selects by position; the index
# holds port codes, so a plain `[0]` would be an integer key on a
# string-labelled Series (deprecated positional fallback).
max_ = df.Embarked.value_counts(normalize=True).iloc[0]
# Tick positions in proportion units, stepping by 0.1 below the largest share.
tick_props = np.arange(0, max_, 0.1)
tick_labels = [f'{tp:.1f}' for tp in tick_props]
# Convert the proportions back to counts, the unit the axis is drawn in.
tick_props_abs = tick_props * n_

plt.yticks(tick_props_abs, tick_labels);
# Embarked bar plot - write each bar's proportion above it.
# Presumably the fixed upper limit leaves room for the text - TODO confirm.
plt.ylim((0, 730))
# Count the ports once, then walk the bars in that (most frequent first) order.
embarked_counts = df.Embarked.value_counts()
for i, count in enumerate(embarked_counts):
    plt.text(i-.2, count+30, f'{count/n_:.2f}', fontsize=14)
# Bare expression: the notebook displays the proportions table.
df.Embarked.value_counts(normalize=True)

Pie Chart

# Share of rows per embarkation port code, most frequent first. Computed once
# so the categories and the values are guaranteed to line up.
embarked_props = df.Embarked.value_counts(normalize=True)
cats = embarked_props.index
vals = embarked_props.values
print(cats)
print(vals)

# Name each slice from its own port code rather than by position, so the
# labels follow the data. Unknown codes are shown unchanged.
port_names = {'S': 'Southampton', 'C': 'Cherbourg', 'Q': 'Queenstown'}
ports = [port_names.get(code, code) for code in cats]
# Pull only the last (smallest) slice out of the pie; sized from the data so
# it always matches the number of slices.
explode = [0] * len(vals)
if explode:
    explode[-1] = 0.1

plt.pie(vals, labels=ports, startangle=90, explode=explode);

Histogram - Quantitative Variables

# Age histogram with matplotlib's default binning.
plt.hist(x='Age', data=df);
# Same histogram with 20 bins instead of the default 10.
plt.hist(x='Age', bins=20, data=df);
# Custom bins: edges every 5 units from 0 up to just past the largest Age.
bins = np.arange(0, df['Age'].max() + 5, 5)
plt.hist(x='Age', bins=bins, data=df);
# seaborn version of the Age histogram.
sns.histplot(df['Age']);
# seaborn histogram with a Gaussian kernel density estimate drawn on top.
sns.histplot(df['Age'], kde=True);
# seaborn histogram on the same width-5 bin edges, without the density curve.
bins = np.arange(0, df['Age'].max() + 5, 5)
sns.histplot(df['Age'], bins=bins, kde=False);

Case Study: Diabetes Dataset

  • https://www.kaggle.com/uciml/pima-indians-diabetes-database
# Load the diabetes CSV; from here on `df` refers to this dataset.
df = pd.read_csv('./data/diabetes/diabetes.csv')
print(df.shape)  # (number of rows, number of columns)

# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in print() would add a stray "None" line.
df.info()

# Summary statistics, transposed so that each column becomes a row.
display(df.describe().T)
# Overlay the Glucose histograms of the two Outcome groups on one set of axes:
# Outcome == 0 is labelled 'No Diabetes', Outcome == 1 is labelled 'Diabetes'.
sns.histplot(df.loc[df['Outcome'] == 0, 'Glucose'], label='No Diabetes');
sns.histplot(df.loc[df['Outcome'] == 1, 'Glucose'], label='Diabetes');
plt.legend();
# Box plot of Glucose for each Outcome value.
sns.boxplot(x='Outcome', y='Glucose', data=df);