Univariate Visualization
Published:
This post covers Univariate Data Visualization.
Data Visualisation
- Exploratory analysis to look for relationships in the data
- Gather -> Clean -> Explore -> Analyse -> Share
- Matplotlib, Seaborn, Pandas
- Univariate Visualisation
- Qualitative Variables - Bar chart
- Quantitative Variables - Histogram
Univariate Visualisation
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# Load the Titanic training set and take a first look at its size and rows.
file_path = './data/titanic/train.csv'  # plain literal: nothing to interpolate
df = pd.read_csv(file_path)
print(df.shape)
display(df.head(2))
# Bar plot of the Gender column exactly as loaded
sns.countplot(x='Gender', data=df);
# Clean Gender values: fold the 'M'/'Male' and 'F'/'Female' spellings into
# 'male' and 'female'.
# NOTE: df_clean is a second name for df, not a copy, so the clean-up is
# visible through both names and the two printouts below are identical.
df_clean = df
# Assign the result back to the column rather than calling
# replace(inplace=True) on df_clean.Gender: that in-place call acts on an
# intermediate Series and does not reach the frame under pandas Copy-on-Write.
gender_aliases = {'M': 'male', 'Male': 'male', 'F': 'female', 'Female': 'female'}
df_clean['Gender'] = df_clean['Gender'].replace(gender_aliases)
print(df_clean.Gender.value_counts())
print(df.Gender.value_counts())
Bar Chart - Qualitative Variables
# Gender counts as vertical bars
sns.countplot(x='Gender', data=df);
plt.show()
# Same counts as horizontal bars (category on the y-axis)
sns.countplot(y='Gender', data=df);
# Gender bar plot in one colour: the first entry of the current palette
base_color = sns.color_palette()[0]
sns.countplot(x='Gender', data=df, color=base_color);
# Gender bar plot, one colour, bars in an explicitly listed order
base_color = sns.color_palette()[0]
sns.countplot(x='Gender', data=df, order=['male','female'], color=base_color);
# Gender bar plot, one colour, bars ordered by how often each value occurs
base_color = sns.color_palette()[0]
order = df['Gender'].value_counts().index
sns.countplot(x='Gender', data=df, order=order, color=base_color);
# Gender bar plot: one colour, frequency order, x tick labels rotated 90 degrees
base_color = sns.color_palette()[0]
order = df['Gender'].value_counts().index
sns.countplot(x='Gender', data=df, order=order, color=base_color);
plt.xticks(rotation=90);
# Gender bar plot with the labels on the y-axis and the counts on the x-axis
base_color = sns.color_palette()[0]
order = df['Gender'].value_counts().index
sns.countplot(y='Gender', data=df, order=order, color=base_color);
plt.xticks(rotation=90);
# SibSp bar plot: horizontal bars in value_counts() order, single colour
color = sns.color_palette()[0]
order = df['SibSp'].value_counts().index
sns.countplot(y='SibSp', data=df, order=order, color=color);
# Embarked bar plot: horizontal bars, no explicit colour or order
sns.countplot(y='Embarked', data=df);
# Embarked bar plot: frequency-ordered bars, port names on the x-axis,
# proportion ticks on the y-axis and a proportion label above each bar.
base_color = sns.color_palette()[0]
embarked_counts = df.Embarked.value_counts()  # computed once, reused below
order = embarked_counts.index
ax = sns.countplot(data=df, x='Embarked', color=base_color, order=order);
# Look each label up by its code so the names always follow `order` instead
# of relying on the bars coming out in one particular sequence.
# NOTE(review): assumes the usual Titanic codes S/C/Q - confirm against the
# data; any other value is shown unchanged.
port_names = {'S': 'Southampton', 'C': 'Cherbourg', 'Q': 'Queenstown'}
ports = [port_names.get(code, code) for code in order]
plt.xticks(np.arange(len(ports)), ports);
ax.tick_params(labelsize=16)
ax.set_ylabel("Count of People",fontsize=20)
ax.set_xlabel("Port",fontsize=20);
ax.set_title('Embarkation Analysis - Titanic dataset', fontsize=14);
# Embarked bar plot - compute ticks
# The axis still measures counts: each tick sits at proportion * n_ and is
# labelled with the proportion itself.
n_ = df.shape[0]
# .iloc makes the "first (largest) entry" lookup explicitly positional; a bare
# [0] on this label-indexed Series relies on a deprecated fallback.
max_ = df.Embarked.value_counts(normalize=True).iloc[0]
tick_props = np.arange(0, max_, 0.1)
tick_labels = [f'{tp:.1f}' for tp in tick_props]
tick_props_abs = tick_props * n_
plt.yticks(tick_props_abs, tick_labels);
# Embarked bar plot - Proportion with labels
plt.ylim((0, 730))
# embarked_counts is in the same order as the bars, so position i is bar i.
for i, count in enumerate(embarked_counts):
    plt.text(i-.2, count+30, f'{count/n_:.2f}', fontsize=14)
df.Embarked.value_counts(normalize=True)
Pie Chart
# Pie chart of the share of rows per Embarked value.
embarked_share = df.Embarked.value_counts(normalize=True)  # computed once
cats = embarked_share.index
vals = embarked_share.values
print(cats)
print(vals)
# Derive each wedge label from its code so labels always match `cats`.
# NOTE(review): assumes the usual Titanic codes S/C/Q - confirm against the
# data; any other value is shown unchanged.
port_names = {'S': 'Southampton', 'C': 'Cherbourg', 'Q': 'Queenstown'}
ports = [port_names.get(code, code) for code in cats]
# Pull only the last wedge out of the pie; sized from the data so the list
# always has one entry per wedge.
explode = [0] * len(vals)
if explode:
    explode[-1] = 0.1
plt.pie(vals, labels=ports, startangle=90, explode=explode);
Histogram - Quantitative Variables
# Age histogram with matplotlib's default binning
plt.hist(x='Age', data=df);
# Same histogram with 20 bins (the default is 10)
plt.hist(x='Age', data=df, bins=20);
# Custom bins: edges every 5 units, from 0 up to the first edge past the maximum age
bins = np.arange(0, df['Age'].max() + 5, 5)
plt.hist(x='Age', data=df, bins=bins);
# seaborn versions of the same histogram
sns.histplot(df['Age']);
sns.histplot(df['Age'], kde=True);  # also draw a gaussian kernel density estimate
bins = np.arange(0, df['Age'].max() + 5, 5)
sns.histplot(df['Age'], bins=bins, kde=False);
Case Study: Diabetes Dataset
- https://www.kaggle.com/uciml/pima-indians-diabetes-database
# Load the diabetes data set; from here on `df` refers to this frame.
df = pd.read_csv('./data/diabetes/diabetes.csv')
print(df.shape)
# info() prints its own report and returns None, so wrapping it in print()
# would add a stray "None" line to the output.
df.info()
display(df.describe().T)
# Glucose histograms for Outcome == 0 and Outcome == 1, with a legend
sns.histplot(df[df.Outcome==0].Glucose, label='No Diabetes');
sns.histplot(df[df.Outcome==1].Glucose, label='Diabetes');
plt.legend();
# Glucose box plot, one box per Outcome value
sns.boxplot(data=df, y='Glucose', x='Outcome');