Multivariate Visualization

1 minute read

Published:

This post covers Multivariate Visualization.

%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sb
import pandas as pd
import numpy as np
import random
# Load the diabetes dataset and take a first look at its size and columns.
df = pd.read_csv('./data/diabetes/diabetes.csv')
print(df.shape)
# DataFrame.info() prints its summary itself and returns None, so wrapping it
# in print() would only add a stray "None" line to the output.
df.info()
# Recode the numeric Outcome flag into readable labels. The result is assigned
# back to the column instead of calling replace(inplace=True) on df.Outcome:
# the in-place call acts on an intermediate Series and does not reliably
# update df (it stops doing so under pandas Copy-on-Write).
df['Outcome'] = df['Outcome'].replace({0: 'nodiabetes', 1: 'diabetes'})
df.Outcome.unique()
genders = ['female', 'male']

# Synthetic gender column: one random draw per row, used below as a second
# categorical variable for faceting. No seed is set, so values differ per run.
df['gender'] = random.choices(genders, k=df.shape[0])
df.head(2)
# Plain Glucose vs BloodPressure scatter: no regression line, slight x-jitter
# and translucent markers so overlapping points remain visible.
sb.regplot(
    x='Glucose',
    y='BloodPressure',
    data=df,
    fit_reg=False,
    x_jitter=0.1,
    scatter_kws=dict(alpha=0.4),
);
# Glucose vs BloodPressure for categorical Diabetes and Non-Diabetes (Marker and Color encoding)

# Each entry pairs an Outcome label with the marker used to draw that group.
markers = [['nodiabetes', '*'], ['diabetes', '^']]

for outcome, marker in markers:

    # Rows belonging to the current Outcome group only.
    df_plot = df.loc[df.Outcome == outcome]

    # One regplot call per group draws onto the same axes; each call picks up
    # the next colour of the cycle. The label feeds the legend drawn below.
    sb.regplot(data=df_plot, x='Glucose', y='BloodPressure', fit_reg=False, x_jitter=0.1,
               scatter_kws={'alpha': .4}, marker=marker, label=outcome);

# Without a legend the marker/colour encoding cannot be decoded by the reader.
plt.legend(title='Outcome');
# Glucose vs BloodPressure with a third numeric variable, Insulin, encoded as
# marker size.

# Marker size passed to the scatter is a quarter of the Insulin value.
marker_areas = df['Insulin'] / 4
sb.regplot(
    x='Glucose',
    y='BloodPressure',
    data=df,
    fit_reg=False,
    x_jitter=0.1,
    scatter_kws={'s': marker_areas},
);
# Glucose vs BloodPressure for numeric Insulin (size encoding with legend)

# Marker size is Insulin / 4; the legend handles below use the same scaling.
sb.regplot(data=df, x='Glucose', y='BloodPressure', fit_reg=False, x_jitter=0.1, scatter_kws={'s': df.Insulin/4});

# Reference Insulin values to show in the size legend.
sizes = [100, 300, 500]

# seaborn is imported as `sb` in this file (`sns` is undefined here). The first
# palette colour is used so the legend markers are meant to match the points.
color = sb.color_palette()[0]

# Empty scatters draw nothing on the axes but provide correctly sized handles.
legends = [plt.scatter([], [], s=s / 4, color=color) for s in sizes]

# The size channel encodes Insulin, so the legend is titled accordingly.
plt.legend(legends, sizes, title='Insulin');
# Glucose vs BloodPressure, one column of the grid per Outcome value
# (Diabetes / Non-Diabetes), with the y axis shared between the panels.

g = sb.FacetGrid(df, sharey=True, col='Outcome')

# Draw the same scatter (regression fit disabled) in every panel.
xy_columns = ('Glucose', 'BloodPressure')
g.map(sb.regplot, *xy_columns, fit_reg=False);
# Glucose vs BloodPressure in a single panel, with Outcome
# (Diabetes / Non-Diabetes) encoded as hue.

facet_options = dict(
    hue='Outcome',
    hue_order=['diabetes', 'nodiabetes'],
    height=4,
    aspect=1.5,
)
g = sb.FacetGrid(df, **facet_options)

# One scatter layer per hue level, without regression line, lightly jittered in x.
g.map(sb.regplot, 'Glucose', 'BloodPressure', x_jitter=0.03, fit_reg=False);

# The legend explains which colour belongs to which Outcome.
g.add_legend();
# Glucose vs BloodPressure for categorical Diabetes/Non-Diabetes and gender (col and row)

# Outcome is laid out across columns and gender across rows. No hue variable
# is mapped, so hue_order is not passed (it would be ignored, and Outcome
# holds string labels rather than 0/1 at this point anyway).
g = sb.FacetGrid(data=df, col='Outcome', row='gender', height=4, aspect=1.5, sharey=True, sharex=True)

# No add_legend() call: without a hue mapping there are no entries to show,
# and the facet titles already identify every panel.
g.map(plt.scatter, 'Glucose', 'BloodPressure');
# Glucose vs BloodPressure for categorical Diabetes/Non-Diabetes and gender (row and col)

# Outcome is laid out across rows and gender across columns. No hue variable
# is mapped, so hue_order is not passed (it would be ignored, and Outcome
# holds string labels rather than 0/1 at this point anyway).
g = sb.FacetGrid(data=df, row='Outcome', col='gender', height=4, aspect=1.5, sharey=True, sharex=True)

g.map(plt.scatter, 'Glucose', 'BloodPressure');

# Several numeric variables at once: a grid with one scatter plot for every
# pair of the selected columns.

vars = 'Glucose BloodPressure BMI Insulin'.split()

g = sb.PairGrid(df, vars=vars)

# The same plotting function is applied to every cell of the grid.
g.map(plt.scatter);
# Several numeric variables at once: scatter plots in the off-diagonal cells
# and a histogram of each single variable on the diagonal.

vars = 'Glucose BloodPressure BMI Insulin'.split()

g = sb.PairGrid(vars=vars, data=df)

# Off-diagonal cells: pairwise scatter plots.
g.map_offdiag(plt.scatter);

# Diagonal cells: histogram of each variable.
g.map_diag(plt.hist);
# Heatmap of the pairwise correlations between the columns listed in `vars`,
# with each cell annotated with its coefficient.
correlations = df[vars].corr()
sb.heatmap(correlations, annot=True, cmap='rocket_r');