Visualization
Published:
This post covers Multivariate Visualization.
Multivariate
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
import random
# Load the diabetes dataset and take a first look at its size and columns.
df = pd.read_csv('./data/diabetes/diabetes.csv')
print(df.shape)
# DataFrame.info() prints its report itself and returns None, so it is not
# wrapped in print() (that would emit a stray "None" line).
df.info()
# Relabel the numeric Outcome codes with readable category names. The result
# is assigned back to the column instead of calling replace(inplace=True) on
# `df.Outcome`: the in-place form on an accessor result is flagged by newer
# pandas versions and does not update `df` under copy-on-write.
df['Outcome'] = df['Outcome'].replace({0: 'nodiabetes', 1: 'diabetes'})
df.Outcome.unique()
# Attach a randomly assigned gender label to every row so that later plots
# have a second categorical variable to facet on (the values are synthetic).
genders = ['female', 'male']
df['gender'] = random.choices(genders, k=df.shape[0])
df.head(2)
# Glucose vs BloodPressure: plain scatter (regression fit switched off) with
# x jitter and semi-transparent points.
sns.regplot(
    data=df,
    x='Glucose',
    y='BloodPressure',
    fit_reg=False,
    x_jitter=0.1,
    scatter_kws=dict(alpha=.4),
);
# Glucose vs BloodPressure for categorical Diabetes and Non-Diabetes
# (marker and color encoding): each Outcome class is drawn by its own
# regplot call with a dedicated marker and a legend label.
markers = [['nodiabetes', '*'], ['diabetes', '^']]
for ttype, marker in markers:
    # Restrict the data to the rows of the current Outcome class.
    df_plot = df.loc[df.Outcome == ttype]
    sns.regplot(data=df_plot, x='Glucose', y='BloodPressure', fit_reg=False, x_jitter=0.1, scatter_kws={'alpha': .4}, marker=marker, label=ttype);
# Build the legend once, after every class has been drawn.
plt.legend();
# Glucose vs BloodPressure with the numeric Insulin column as a third
# variable, encoded through marker size (Insulin / 4 is passed as `s`).
sns.regplot(
    data=df,
    x='Glucose',
    y='BloodPressure',
    fit_reg=False,
    x_jitter=0.1,
    scatter_kws=dict(s=df['Insulin'] / 4),
);
# Glucose vs BloodPressure for numeric Insulin (size encoding with legend)
sns.regplot(data=df, x='Glucose', y='BloodPressure', fit_reg=False, x_jitter=0.1, scatter_kws={'s': df.Insulin/4});
# Reference Insulin values shown in the legend.
sizes = [100, 300, 500]
color = sns.color_palette()[0]
legends = []
for s in sizes:
    # Empty scatter: draws nothing, but yields a handle whose marker uses the
    # same value / 4 scaling as the data points above.
    legend = plt.scatter([], [], s=s/4, color=color)
    legends.append(legend)
# Marker size encodes Insulin, so the legend is titled accordingly.
plt.legend(legends, sizes, title='Insulin');
# Glucose vs BloodPressure, one panel per Outcome value laid out as columns
# with a shared y axis; each panel is a scatter without a regression fit.
g = sns.FacetGrid(
    data=df,
    col='Outcome',
    sharey=True,
)
g.map(
    sns.regplot,
    'Glucose',
    'BloodPressure',
    fit_reg=False,
);
# Glucose vs BloodPressure in a single panel, with the Outcome classes
# separated by hue (order fixed to diabetes, then nodiabetes) and a legend.
g = sns.FacetGrid(
    data=df,
    hue='Outcome',
    hue_order=['diabetes', 'nodiabetes'],
    height=4,
    aspect=1.5,
)
g.map(
    sns.regplot,
    'Glucose',
    'BloodPressure',
    fit_reg=False,
    x_jitter=.03,
);
g.add_legend();
# Glucose vs BloodPressure for categorical Diabetes/Non-Diabetes and gender:
# Outcome values form the grid rows and gender values the grid columns, with
# both axes shared across panels.
# seaborn is imported as `sns` in this file (the previous `sb` alias was
# undefined). `hue_order` is omitted because no `hue` variable is used here.
g = sns.FacetGrid(data=df, row='Outcome', col='gender', height=4, aspect=1.5, sharey=True, sharex=True)
g.map(plt.scatter, 'Glucose', 'BloodPressure');
# Multiple variables at once: a grid of pairwise scatter plots over four
# numeric columns.
# NOTE: `vars` shadows the builtin of the same name; the name is kept because
# later statements in this file read it.
vars = [
    'Glucose',
    'BloodPressure',
    'BMI',
    'Insulin',
]
g = sns.PairGrid(df, vars=vars)
g.map(plt.scatter);
# Multiple variables: the same pairwise grid, but with a histogram of each
# variable on the diagonal and scatter plots in the off-diagonal cells.
# NOTE: `vars` shadows the builtin of the same name; the name is kept because
# the heatmap statement further down reads it.
vars = [
    'Glucose',
    'BloodPressure',
    'BMI',
    'Insulin',
]
g = sns.PairGrid(df, vars=vars);
g.map_offdiag(plt.scatter);
g.map_diag(plt.hist);
# Heatmap of the pairwise correlations between the columns listed in `vars`,
# drawn with the reversed 'rocket' colormap and each cell annotated with its
# value.
sns.heatmap(
    df[vars].corr(),
    annot=True,
    cmap='rocket_r',
);