Sampling Distributions Example

5 minute read

Published:

This post covers Sampling Distributions Example.

Steps

  • Download the dataset Klout Score

  • Compute Population Mean and Standard Deviation

  • What is the mean and standard deviation of sampling distribution with sample size $ 35 $?

  • Consider a sample with mean $ 40 $ and sample size $ 35 $. Where does this sample mean lie on the sampling distribution, i.e. how many standard deviations is it from the mean?

  • What is the probability of randomly selecting a sample $ (n=35) $ with a mean of at least $ 40 $?

  • What is the mean and standard deviation of sampling distribution with sample size $ 250 $?

  • Consider a sample with mean $40$ and sample size $250$. Where does this sample mean lie on the sampling distribution, i.e. how many standard deviations is it from the mean?

  • What is the probability of randomly selecting a sample $(n=250)$ with a mean of at least $40$?

L01 - Sampling Distribution

%matplotlib inline

import numpy as np
import pandas as pd
import scipy
import itertools
import math
import matplotlib.pyplot as plt
import seaborn as sns
import random
import os
from IPython.core.display import display, HTML
def get_data(score_file='./data/Klout/score.csv'):
    """Load the Klout scores as a list of floats.

    Args:
        score_file: Path to a text file holding one numeric score per line.
            Defaults to the location used by this notebook.

    Returns:
        list[float]: The scores in file order. Blank or whitespace-only
        lines are skipped.

    Raises:
        OSError: If the file cannot be opened.
        ValueError: If a non-blank line cannot be parsed as a number.
    """
    with open(score_file, 'r') as fp:
        # float() tolerates surrounding whitespace/newlines, but float('')
        # raises, so blank lines are filtered out before conversion.
        return [float(line) for line in fp if line.strip()]
# Load the scores once into a module-level list; later cells read `data`.
data = get_data()
# Number of scores that were read.
print(len(data))

Population Parameters

  • Compute Population Mean and Standard Deviation

  • $ \mu = 37.72 $ and $ \sigma = 16.04 $

# Population parameters of the scores in `data`.
# np.std uses ddof=0 by default, i.e. the population standard deviation.
MEAN, STD = np.mean(data), np.std(data)

msg = (
    f"Population Parameters:\n\t Mean {MEAN:.2f}"
    f"\n\tStandard Deviation {STD:.2f}"
)
print(msg)
# Histogram of the Klout data - bimodal (two peaks),
# with the population mean marked by a red vertical line.
plt.hist(data);
plt.axvline(x=MEAN, color='red'); # mean
plt.title("Bimodal");

Sampling Distribution - Distribution of Sample Means

def sample_means(k, n=1000):
    """Return the means of ``n`` random samples of size ``k`` taken from ``data``.

    Each sample is drawn with replacement (``random.choices``) from the
    module-level ``data`` list, and its mean is computed with ``np.mean``.

    Args:
        k: Number of observations in each sample.
        n: Number of samples to draw.

    Returns:
        list: One sample mean per drawn sample.
    """
    return [np.mean(random.choices(data, k=k)) for _ in range(n)]
# Histogram the sample means for increasing sample sizes, one call per size.
for size in (1, 5, 10, 35, 250):
    plt.hist(sample_means(size));
  • n = 35
  • Let’s take all possible samples of size 35 and calculate mean of each of these and graph distribution of sample means
    • This should be approximately normal (Central Limit Theorem), even though the population itself is bimodal
    • What will be mean of this distribution
      • eq to population mean, 37.72
    • What will be Standard Deviation of this distribution
      • $\frac{\sigma}{\sqrt{n}} = \frac{16.04}{\sqrt{35}} = 2.71$
n = 35
# The sampling distribution is centred on the population mean; its spread
# (the standard error) is the population standard deviation over sqrt(n).
mean = MEAN
std_error = STD / math.sqrt(n)

msg = (
    f"Sampling distribution: sample size {n}"
    f"\n\tMean {mean:.2f}"  # 37.71
    f"\n\tStandard Error {std_error:.2f}"  # 2.71
)
print(msg)
def statistic(n):
    """Return ``(mean, standard_error)`` of the sampling distribution for size ``n``.

    The mean is the module-level ``MEAN``; the standard error is the
    module-level ``STD`` divided by ``sqrt(n)``.
    """
    return MEAN, STD / math.sqrt(n)
  • Kernel Density Estimate (KDE)

    • used to visualize the Probability Density of a continuous variable

    • It depicts the probability density at different values in a continuous variable.

    • Def

      • Let $(x_1, x_2, \dots, x_n)$ be independent and identically distributed samples drawn from some univariate distribution with an unknown density $f$ at any given point $x$.

      • We are interested in estimating the shape of this function $f$.

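      • Its kernel density estimator is $\hat{f}_h(x) = \frac{1}{n}\sum_{i=1}^{n} K_h(x - x_i) = \frac{1}{nh}\sum_{i=1}^{n} K\left(\frac{x - x_i}{h}\right)$, where $K$ is the kernel, $h>0$ is a smoothing parameter called the bandwidth, and $K_h(x) = \frac{1}{h}K\left(\frac{x}{h}\right)$ is the scaled kernel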

          # Draw 100 points from a standard normal and plot their estimated density.
          values = np.random.normal(loc=0, scale=1, size=100)
          print(values.shape)
          sns.kdeplot(values);
        
def plot_samp_dist(mean, std_error, ax=None):
    """Plot a KDE of 1000 draws from a normal sampling distribution.

    Args:
        mean: Centre of the sampling distribution.
        std_error: Standard error (spread) of the sampling distribution.
        ax: Matplotlib axes to draw on. When ``None``, seaborn draws on the
            current axes.
    """
    # 1000 values drawn from Normal(mean, std_error)
    values = np.random.normal(loc=mean, scale=std_error, size=1000)
    # Always forward `ax`: the previous branch was inverted, so a supplied
    # axes object was ignored and `ax=ax` was only passed when it was None.
    sns.kdeplot(values, ax=ax)
# Plot the sampling distribution for samples of size 35.
n = 35
mean, std_error = MEAN, STD/math.sqrt(n)
plot_samp_dist(mean, std_error)

One Sample

  • Consider a sample with mean $ 40 $ and sample size $ 35 $. Where does this sample mean lie on the sampling distribution, i.e. how many standard deviations is it from the mean?
    • $n$ = Sample Size = 35
    • obs = $ \bar{x} = 40 $ (the observed sample mean)
    • How far is 40 from the mean of the sampling distribution, i.e. 37.72?
# Where does this sample mean lie on sampling distribution - how many standard deviation

def z_score(mean, se, obs):
    """Return how many standard errors ``obs`` lies from ``mean``.

    Args:
        mean: Mean of the sampling distribution.
        se: Standard error of the sampling distribution.
        obs: Observed sample mean.

    Returns:
        ``(obs - mean) / se``; positive when ``obs`` is above ``mean``.
    """
    return (obs - mean) / se

n = 35
obs = 40
mean, std_error = MEAN, STD/math.sqrt(n)

ax = plt.subplot()

# Approximate the sampling distribution with 1000 normal draws and draw its
# density explicitly on the axes created above, not on pyplot's implicit
# "current" axes.
values = np.random.normal(loc=mean, scale=std_error, size=1000)
sns.kdeplot(values, ax=ax);

# Mark the centre of the sampling distribution (red) and the observed
# sample mean (green).
ax.axvline(mean, color='red', label=f'{mean:.2f}');
ax.axvline(obs, color='green', label=obs);

# Use the shared helper so this cell and the n=250 cell compute z identically.
z = z_score(mean, std_error, obs)

ax.set_title(f'z={z:.2f}');
ax.legend();

msg = f'Sample mean {obs} is {z:.2f} standard deviation far from mean  when n = {n}'
print(msg)
  • What is the probability of randomly selecting a sample $ (n=35) $ with a mean of at least $ 40 $?
    • $z = \frac{40 - 37.72}{2.71} = 0.84$
    • The area to the right of $z = 0.84$ under the standard normal curve is $1 - \Phi(0.84) \approx 0.20$
# What is the probability that a sample drawn from this population has a mean of at least 40?
# scipy.stats.norm.cdf(z) is the cumulative distribution function: the area to the left of z.
# scipy.stats.norm.ppf(q) is the percent point function (inverse of cdf), e.g. ppf(.95);
# the probability density function is scipy.stats.norm.pdf.
# NOTE(review): only `import scipy` is visible in this file; confirm that `scipy.stats`
# resolves in the installed SciPy version, or add `import scipy.stats`.

area = scipy.stats.norm.cdf(z) # left side

# Right-tail probability: P(Z >= z) = 1 - P(Z <= z).
prob =  1 - area

msg = f'<h3>Prob is {prob:.2f}</h3>'
display(HTML(msg))

Another Sampling Distribution

  • n = 250
  • $ \bar{x} = 40 $ (the observed sample mean)
# Mean and standard error of the sampling distribution for samples of size 250.
n = 250
mean, std_error = statistic(n)

msg = f'<h3>Sampling distribution with sample size {n}: Mean {mean:.2f} and Standard Error {std_error:.2f}</h3>'
display(HTML(msg))
# Plot the n=35 and n=250 sampling distributions; both calls are handed the same `ax`.
ax = plt.subplot()

mean, se = statistic(35)
plot_samp_dist(mean, se, ax=ax)

# Note: `mean` and `se` are rebound here to the n=250 values.
mean, se = statistic(250)
plot_samp_dist(mean, se, ax=ax)
# Where does the sample mean 40 lie on the sampling distribution for n=250,
# i.e. how many standard errors is it from the mean?

n = 250
obs = 40
mean, std_error = statistic(n)

ax = plt.subplot()

plot_samp_dist(mean, std_error, ax)

# Mark the centre of the sampling distribution (red) and the observed
# sample mean (green).
ax.axvline(mean, color='red', label=f'{mean:.2f}');
ax.axvline(obs, color='green', label=obs);

z = z_score(mean, std_error, obs)

plt.title(f'z={z:.2f}');
ax.legend();

msg = f'<h3>Sample mean {obs} is {z:.2f} standard deviation far from mean when n = {n}</h3>'
display(HTML(msg))
# What is the probability of randomly selecting a sample (n=250) with a mean of at least 40?
# scipy.stats.norm.cdf() is the cumulative distribution function (area to the left);
# scipy.stats.norm.ppf(.95) is the percent point function (inverse of cdf), not the
# probability density function (that is scipy.stats.norm.pdf).

# `z` is the value computed for n=250 in the preceding cell.
area = scipy.stats.norm.cdf(z) 

# Right-tail probability: P(Z >= z) = 1 - P(Z <= z).
prob =  1 - area

msg = f'<h3>Prob is {prob:.2f}</h3>'
display(HTML(msg))
# Compare the probability of a sample mean of at least 40 under the n=35 and
# n=250 sampling distributions, plotting each distribution in turn.
ax = plt.subplot()

obs = 40
for n in (35, 250):
    mean, std_error = statistic(n)
    plot_samp_dist(mean, std_error, ax=ax)
    # Standard errors between obs and the mean, then the right-tail area.
    z = z_score(mean, std_error, obs)
    area = scipy.stats.norm.cdf(z)
    prob = 1 - area
    print(f'n={n} obs={obs} prob is {prob:.2f}')

# Mark the observed sample mean.
ax.axvline(obs, color='red');