Data Assessing and Cleaning

1 minute read

Published:

This post covers Data Assessing and Cleaning.

Data Assessing and Cleaning

Missing Values

import warnings

# Silence all warnings for the rest of the script; the invalid NumPy
# operations below would otherwise emit RuntimeWarnings.
warnings.filterwarnings('ignore')

import math
import numpy as np

# A well-defined division prints an ordinary float.
print(np.divide(8, 4))

# Operations without a real-valued result: NumPy returns nan instead of raising.
for undefined in (np.divide(0, 0), np.log(-1), np.sqrt(-2)):
    print(undefined)

# Indeterminate forms involving infinity also evaluate to nan.
for indeterminate in (math.inf * 0, math.inf / math.inf):
    print(indeterminate)
import pandas as pd
# Four sample columns. np.sqrt(-12) evaluates to NaN and pandas stores each
# None in C as a missing value, so columns A and C contain gaps.
A = [10, 100, 120, np.sqrt(-12)]
B = [20, 40, 128, 90]
C = [20, None, 128, None]
D = [15, 0, 0, -10]

# Map each column label to its values and build the frame from that mapping.
data = dict(A=A, B=B, C=C, D=D)
df = pd.DataFrame(data=data)
display(df)
# Pandas - isna / notna / isnull / notnull
# isna and isnull flag missing cells; notna and notnull flag present cells.
# For each boolean mask, in that order: show the mask itself, the count of
# True cells per column, and finally the count over the whole frame.
for mask in (df.isna(), df.notna(), df.isnull(), df.notnull()):
    per_column = mask.sum()
    display(mask)
    display(per_column)
    print(per_column.sum())
# numpy - np.isnan applied to the whole frame yields an element-wise mask
nan_mask = np.isnan(df)
nan_per_column = nan_mask.sum()

display(nan_mask)            # True where a cell is NaN
display(nan_per_column)      # NaN count per column
print(nan_per_column.sum())  # NaN count over the whole frame
# dropna / fillna / ffill / bfill return a NEW DataFrame and leave df
# untouched, so the returned frame is what must be displayed to see the effect.

# Drop every row that contains at least one missing value
display(df.dropna(axis=0))
# Drop every column that contains at least one missing value
display(df.dropna(axis=1))

# Drop rows with fewer than three non-NaN values
display(df.dropna(axis=0, thresh=3))

# Drop rows with fewer than four non-NaN values
display(df.dropna(axis=0, thresh=4))

# fillna(method=...) is deprecated in recent pandas releases; ffill/bfill are
# the direct replacements.
# Forward fill: propagate the last valid value down each column
display(df.ffill(axis=0))
# Forward fill: propagate the last valid value left-to-right along each row
display(df.ffill(axis=1))
# Backward fill: pull the next valid value up each column
display(df.bfill(axis=0))
# Backward fill: pull the next valid value right-to-left along each row
display(df.bfill(axis=1))
# Replace every missing value with the constant 10
display(df.fillna(value=10))

Duplicated Records

import pandas as pd
# Six sample records (named A-F) of four fields each. E repeats A exactly,
# while F matches A in every field except the last one.
A = ['a', 100, 120, 10]
B = ['b', 40, 128, 90]
C = ['c', None, 128, None]
D = ['d', 10, 0, -10]
E = ['a', 100, 120, 10]
F = ['a', 100, 120, 0]
data = dict(A=A, B=B, C=C, D=D, E=E, F=F)

# One row per record with the four fields labelled 'A'-'D'; the record names
# are then discarded in favour of a default integer index.
df = pd.DataFrame.from_dict(data, orient='index', columns=['A', 'B', 'C', 'D'])
df = df.reset_index(drop=True)
display(df)
# A bare expression is only rendered when it is the last line of a notebook
# cell; display() makes every result visible regardless of how this runs.

# duplicated() marks a row as True when an earlier row has the same values
# (the first occurrence is not marked). Show the rows flagged as duplicates:
display(df[df.duplicated()])                 # compared on all columns
display(df[df.duplicated(['A', 'B', 'C'])])  # compared on A, B and C only
display(df[df.duplicated(['A', 'B', 'D'])])  # compared on A, B and D only

# drop_duplicates() returns a new frame without those flagged rows; df itself
# is not modified.
display(df.drop_duplicates())
display(df.drop_duplicates(['A', 'B', 'C']))
display(df.drop_duplicates(['A', 'B', 'D']))