Sampling Distributions Example

4 minute read

Published:

This post walks through a worked example of sampling distributions.

Download the Dataset

  • Compute Population Mean and Standard Deviation

  • What are the mean and standard deviation of the sampling distribution for sample size $ 35 $?

  • Consider a sample with mean $ 40 $ and sample size $ 35 $. Where does this sample mean lie on the sampling distribution, i.e. how many standard deviations is it from the mean?

  • What is the probability of randomly selecting a sample $ (n=35) $ with a mean of at least $ 40 $?

  • What are the mean and standard deviation of the sampling distribution for sample size $ 250 $?

  • Consider a sample with mean $40$ and sample size $250$. Where does this sample mean lie on the sampling distribution, i.e. how many standard deviations is it from the mean?

  • What is the probability of randomly selecting a sample $(n=250)$ with a mean of at least $40$?

L01 - Sampling Distribution

%matplotlib inline

import numpy as np
import pandas as pd
import scipy
import itertools
import math
import matplotlib.pyplot as plt
import seaborn as sns
import random
import os
from IPython.core.display import display, HTML
def get_data(score_file='./data/Klout/score.csv'):
    """Load the population of scores from a one-value-per-line text file.

    Args:
        score_file: Path to a text file holding one numeric score per line.
            Defaults to the Klout score dataset used in this post.

    Returns:
        list[float]: The scores in file order. Blank lines are skipped.

    Raises:
        OSError: If the file cannot be opened.
        ValueError: If a non-blank line cannot be parsed as a number.
    """
    with open(score_file, 'r') as fp:
        # Iterate the file directly instead of materialising readlines().
        # Blank lines (e.g. a trailing newline at EOF) are skipped because
        # float('') would raise; float() itself tolerates surrounding
        # whitespace, so no explicit strip is needed on the value.
        return [float(line) for line in fp if line.strip()]
# Load the full population of scores once; later cells read this module-level list.
data = get_data()

Population Parameters

  • $ \mu = 37.72 $ and $ \sigma = 16.04 $
# Population parameters. np.std uses ddof=0 by default, i.e. the population
# (not sample) standard deviation, which is what is wanted here.
MEAN, STD = np.mean(data), np.std(data)

msg = (
    '<h3>Population Parameters: '
    f'Mean {MEAN:.2f} and Standard Deviation {STD:.2f}</h3>'
)
display(HTML(msg))
# Histogram of data - Bimodal
# The red vertical line marks the population mean (MEAN).
plt.hist(data);
plt.axvline(x=MEAN, color='red');

Sampling Distribution - Distribution of Sample Means

  • n = 35
def statistic(n):
    """Return the parameters of the sampling distribution of the mean.

    Args:
        n: Sample size.

    Returns:
        tuple: ``(mean, std_error)`` where ``mean`` is the population mean
        ``MEAN`` and ``std_error`` is ``STD / sqrt(n)``.
    """
    standard_error = STD / math.sqrt(n)
    return MEAN, standard_error
# Parameters of the sampling distribution of the mean for samples of size 35.
n = 35
mean, std_error = statistic(n)

msg = (
    f'<h3>Sampling distribution with sample size {n}: '
    f'Mean {mean:.2f} and Standard Error {std_error:.2f}</h3>'
)
display(HTML(msg))
def plot_samp_dist(mean, std_error, ax=None):
    """Plot a KDE of a simulated sampling distribution of the mean.

    Draws 1000 values from a normal distribution with the given centre and
    spread, then plots their kernel density estimate with seaborn.

    Args:
        mean: Centre of the sampling distribution.
        std_error: Standard error (standard deviation of the sample mean).
        ax: Matplotlib axes to draw on. When ``None``, seaborn falls back to
            the current axes.
    """
    # 1000 values using normal distribution
    values = np.random.normal(loc=mean, scale=std_error, size=1000)
    # Forward ``ax`` unconditionally so a caller-supplied axes is honoured;
    # ``ax=None`` lets seaborn pick the current axes itself.
    sns.kdeplot(values, ax=ax)
# Plot the simulated sampling distribution for n = 35 on the current axes.
n = 35
mean, std_error = statistic(n)
plot_samp_dist(mean, std_error)

One Sample

  • n = 35
  • $ \bar{x} = 40 $
def z_score(mean, se, obs):
    """Return how many standard errors ``obs`` lies from ``mean``.

    Args:
        mean: Centre of the sampling distribution.
        se: Standard error of the sampling distribution.
        obs: Observed sample mean.

    Returns:
        The signed z-score ``(obs - mean) / se``.
    """
    deviation = obs - mean
    return deviation / se
# Where does the observed sample mean lie on the sampling distribution for
# n = 35, i.e. how many standard errors is it from the centre?

n, obs = 35, 40
mean, std_error = statistic(n)
z = z_score(mean, std_error, obs)

ax = plt.subplot()
plot_samp_dist(mean, std_error, ax)

# Red marks the centre of the sampling distribution, green the observed mean.
ax.axvline(mean, color='red', label=f'{mean:.2f}')
ax.axvline(obs, color='green', label=obs)

plt.title(f'z={z:.2f}')
ax.legend()

msg = f'<h3>Sample mean {obs} is {z:.2f} standard deviation far from mean  when n = {n}</h3>'
display(HTML(msg))
# What is the probability that a sample drawn from this population has a mean of at least 40?
# scipy.stats.norm.cdf() is the cumulative distribution function, P(Z <= z).
# scipy.stats.norm.sf() is the survival function, P(Z > z) = 1 - cdf(z).
# scipy.stats.norm.ppf(.95) is the percent point function (the inverse of the
# cdf), not the probability density function (that is scipy.stats.norm.pdf()).

area = scipy.stats.norm.cdf(z)  # lower-tail area, P(Z <= z)

# Upper-tail probability. sf() computes it directly and avoids the loss of
# precision that 1 - cdf(z) suffers when z is large.
prob = scipy.stats.norm.sf(z)

msg = f'<h3>Prob is {prob:.2f}</h3>'
display(HTML(msg))

Another Sampling Distribution

  • n = 250
  • $ \bar{x} = 40 $
# Parameters of the sampling distribution of the mean for samples of size 250.
n = 250
mean, std_error = statistic(n)

msg = (
    f'<h3>Sampling distribution with sample size {n}: '
    f'Mean {mean:.2f} and Standard Error {std_error:.2f}</h3>'
)
display(HTML(msg))
# Overlay the simulated sampling distributions for n = 35 and n = 250 on the
# same axes. The standard error is STD / sqrt(n), so it shrinks as n grows.
ax = plt.subplot()

for sample_size in (35, 250):
    mean, se = statistic(sample_size)
    plot_samp_dist(mean, se, ax=ax)
# Where does the observed sample mean lie on the sampling distribution for
# n = 250, i.e. how many standard errors is it from the centre?

n, obs = 250, 40
mean, std_error = statistic(n)
z = z_score(mean, std_error, obs)

ax = plt.subplot()
plot_samp_dist(mean, std_error, ax)

# Red marks the centre of the sampling distribution, green the observed mean.
ax.axvline(mean, color='red', label=f'{mean:.2f}')
ax.axvline(obs, color='green', label=obs)

plt.title(f'z={z:.2f}')
ax.legend()

msg = f'<h3>Sample mean {obs} is {z:.2f} standard deviation far from mean when n = {n}</h3>'
display(HTML(msg))
# What is the probability of randomly selecting a sample (n=250) with a mean of at least 40?
# scipy.stats.norm.cdf() is the cumulative distribution function, P(Z <= z).
# scipy.stats.norm.sf() is the survival function, P(Z > z) = 1 - cdf(z).
# scipy.stats.norm.ppf(.95) is the percent point function (the inverse of the
# cdf), not the probability density function (that is scipy.stats.norm.pdf()).

area = scipy.stats.norm.cdf(z)  # lower-tail area, P(Z <= z)

# Upper-tail probability. sf() computes it directly and avoids the loss of
# precision that 1 - cdf(z) suffers when z is large.
prob = scipy.stats.norm.sf(z)

msg = f'<h3>Prob is {prob:.2f}</h3>'
display(HTML(msg))
# Compare the probability of observing a sample mean of at least 40 under the
# two sampling distributions (n = 35 and n = 250), drawn on shared axes.
ax = plt.subplot()

obs = 40
for n in (35, 250):
    mean, std_error = statistic(n)
    plot_samp_dist(mean, std_error, ax=ax)
    z = z_score(mean, std_error, obs)
    area = scipy.stats.norm.cdf(z)  # lower-tail area, P(Z <= z)
    # Upper-tail probability; sf() avoids the cancellation in 1 - cdf(z).
    prob = scipy.stats.norm.sf(z)
    print(f'n={n} obs={obs} prob is {prob:.2f}')

# Mark the observed sample mean against both curves.
ax.axvline(obs, color='red');
def sample_means(k, n=1000):
    """Simulate the sampling distribution of the mean by resampling ``data``.

    Args:
        k: Size of each sample, drawn with replacement from ``data``.
        n: Number of samples to draw.

    Returns:
        list: The mean of each of the ``n`` samples, in draw order.
    """
    means = []
    for _ in range(n):
        draw = random.choices(data, k=k)
        means.append(np.mean(draw))
    return means
# Histograms of 1000 simulated sample means (the default n of sample_means)
# for increasing sample sizes k = 1, 5, 10, 35 and 250.
plt.hist(sample_means(1));
plt.hist(sample_means(5));
plt.hist(sample_means(10));
plt.hist(sample_means(35));
plt.hist(sample_means(250));