Data Wrangling
Published:
This post covers Data Wrangling.
Data Wrangling
Data Gathering, Assessing, and Cleaning
Common issues:
- Missing Data
- Duplicate Data
- Typo error
- Different Format
- Inaccuracy in the data
Data Gathering
Gather data using Facebook, Twitter API
Download Titanic Dataset
Unzip file if needed
import zipfile with zipfile.ZipFile('./data/titanic.zip', 'r') as fp: fp.extractall(path='data')
import pandas as pd csv_file = './data/titanic/train.csv' df = pd.read_csv(csv_file) display(df.head(2))
Data Assessing
Identify issues related to Quality and Tidiness using Visual and Programmatic tools
import pandas as pd csv_file = './data/titanic/train.csv' df = pd.read_csv(csv_file) display(df.info())
Data Cleaning
Define cleaning plan - Code to clean - Test if cleaned
# Rename Columns cols = {'SibSp':'SiblingSpouseAboard', 'Parch':'ParentsChildrenAboard'} df_clean.rename(columns=cols, inplace=True) df_clean.head(2)
# Replace values csv_file = './data/titanic/train.csv' df = pd.read_csv(csv_file) df_clean = df.copy() display(df_clean.Gender.value_counts()) male_list = ['M', 'Male'] female_list = ['F', 'Female'] for text in male_list: df_clean.Gender.replace(to_replace=text, value='male', inplace=True) for text in female_list: df_clean.Gender.replace(to_replace=text, value='female', inplace=True) print() display(df_clean.Gender.value_counts())
# Replace values csv_file = './data/titanic/train.csv' df = pd.read_csv(csv_file) df_clean = df.copy() display(df_clean.Gender.value_counts()) male_list = ['M', 'Male'] female_list = ['F', 'Female'] for text in male_list: df_clean.loc[df_clean.Gender==text, 'Gender'] = 'male' for text in female_list: df_clean.loc[df_clean.Gender==text, 'Gender'] = 'female' print() display(df_clean.Gender.value_counts())
# Test for text in male_list: assert text not in df_clean.Gender.values for text in female_list: assert text not in df_clean.Gender.values
def clean_gender(df): male_list = ["M", "Male"] # replaced with male female_list = ["F", "Female"] # replaced with female for text in male_list: df.Gender.replace(to_replace=text, value="male", inplace=True) for text in female_list: df.Gender.replace(to_replace=text, value="female", inplace=True)
Data Analysis and Visualisation
# Percentage of Passenger Survived display(df_clean.Survived.value_counts()) print() survived = df_clean.Survived.value_counts()[1] total = df_clean.Survived.count() print(f'Survived {survived} from Total {total}: {survived/total:.2f}%')
# Plots %matplotlib inline df_clean.Survived.value_counts().plot(kind='pie', legend=True);
df_clean.Survived.value_counts().plot(kind='bar', legend=True);
df_clean.Survived.value_counts().plot(kind='barh', legend=True);
df_titanic = pd.read_csv("data/titanic.csv") clean_gender(df_titanic) plt.figure(figsize=(3,3)) labels = ["Not Survived", "Survived"] ax = df_titanic.Survived.value_counts().plot(kind='pie', labels=labels); ax.yaxis.set_visible(False) plt.legend(loc='upper left', bbox_to_anchor=(1.05, 1)); # upper right, upper left, lower left, lower right, right, # center left, center right, lower center, upper center, center
df_titanic = pd.read_csv("data/titanic.csv") clean_gender(df_titanic) plt.figure(figsize=(5,3)) ax = df_titanic.Survived.value_counts().plot(kind="bar") labels = ["Not Survived", "Survived"] ax.set_xticklabels(labels, rotation=45, fontsize=14); plt.title("Titanic Survival"); ax.yaxis.set_visible(False)
df_titanic = pd.read_csv("data/titanic.csv") clean_gender(df_titanic) plt.figure(figsize=(5,3)) ax = df_titanic.Survived.value_counts().plot(kind="barh") labels = ["Not Survived", "Survived"] ax.set_yticklabels(labels, rotation=0, fontsize=14); plt.title("Titanic Survival"); ax.xaxis.set_visible(False)