# Data Assessing and Cleaning

Published:

This post covers Data Assessing and Cleaning.

# Data Assessing and Cleaning

## Missing Values

import warnings
# Silence the RuntimeWarnings that the invalid operations below would emit.
warnings.filterwarnings('ignore')

import math
import numpy as np

# Arithmetic whose result is (mostly) undefined: everything except the first
# entry evaluates to nan.
nan_demos = (
    np.divide(8, 4),       # ordinary division
    np.divide(0, 0),       # zero divided by zero
    np.log(-1),            # logarithm of a negative number
    np.sqrt(-2),           # square root of a negative number
    math.inf * 0,          # infinity times zero
    math.inf / math.inf,   # infinity divided by infinity
)
for result in nan_demos:
    print(result)

import pandas as pd
# Four sample columns. Missing entries come from two sources:
#   * A: the square root of a negative number evaluates to NaN
#   * C: explicit None placeholders
A = [10, 100, 120, np.sqrt(-12)]
B = [20, 40, 128, 90]
C = [20, None, 128, None]
D = [15, 0, 0, -10]

# Assemble the columns into one frame, keyed by column label.
data = dict(A=A, B=B, C=C, D=D)
df = pd.DataFrame(data=data)
display(df)

# pandas - isna: boolean mask of missing cells, count per column, grand total
missing_mask = df.isna()
missing_per_column = missing_mask.sum()
display(missing_mask)
display(missing_per_column)
print(missing_per_column.sum())

# pandas - notna: the inverse mask (cells that hold a value)
present_mask = df.notna()
present_per_column = present_mask.sum()
display(present_mask)
display(present_per_column)
print(present_per_column.sum())

# pandas - isnull: same calls as the isna section, using the isnull spelling
null_mask = df.isnull()
null_per_column = null_mask.sum()
display(null_mask)
display(null_per_column)
print(null_per_column.sum())

# pandas - notnull: same calls as the notna section, using the notnull spelling
notnull_mask = df.notnull()
notnull_per_column = notnull_mask.sum()
display(notnull_mask)
display(notnull_per_column)
print(notnull_per_column.sum())

# numpy: np.isnan applied to the whole frame
nan_mask = np.isnan(df)
nan_per_column = nan_mask.sum()
display(nan_mask)
display(nan_per_column)
print(nan_per_column.sum())

# None of the calls below assign their result back, so `df` keeps its missing
# values throughout; the display(df) calls in between show the unchanged frame.

# Drop every row that contains at least one NaN.
df.dropna(axis=0)

# Drop every column that contains at least one NaN.
df.dropna(axis=1)

display(df)

# Keep only the rows that have at least three non-NaN values.
df.dropna(axis=0, thresh=3)

display(df)

# Keep only the rows that have at least four non-NaN values.
df.dropna(axis=0, thresh=4)

display(df)
# Forward fill down each column: a NaN takes the value from the row above.
# DataFrame.ffill / DataFrame.bfill are used instead of fillna(method=...),
# which is deprecated since pandas 2.1.
df.ffill(axis=0)

display(df)
# Forward fill along each row: a NaN takes the value from the column to its left.
df.ffill(axis=1)

display(df)
# Backward fill up each column: a NaN takes the value from the row below.
df.bfill(axis=0)

display(df)
# Backward fill along each row: a NaN takes the value from the column to its right.
df.bfill(axis=1)

# Replace every NaN with the constant 10.
df.fillna(value=10)


## Duplicate Records

import pandas as pd
# Each list below is one *record* (row) of the frame, not a column.
# E repeats A exactly; F matches A in every field except the last.
A = ['a', 100, 120, 10]
B = ['b', 40, 128, 90]
C = ['c', None, 128, None]
D = ['d', 10, 0, -10]
E = ['a', 100, 120, 10]
F = ['a', 100, 120, 0]
data = dict(zip('ABCDEF', (A, B, C, D, E, F)))

# orient='index' turns every dict entry into a row; the dict keys are then
# discarded in favour of a 0..n-1 index and the positional columns 0..3 are
# relabelled A..D.
df = (
    pd.DataFrame.from_dict(data, orient='index')
    .reset_index(drop=True)
    .rename(columns=dict(enumerate('ABCD')))
)
display(df)

# Column subsets used to decide whether two records count as duplicates.
abc_columns = ['A', 'B', 'C']
abd_columns = ['A', 'B', 'D']

# Select the rows flagged as duplicates when every column is compared.
df.loc[df.duplicated()]

# Select the rows flagged as duplicates when only A, B and C are compared.
df.loc[df.duplicated(subset=abc_columns)]

# Select the rows flagged as duplicates when only A, B and D are compared.
df.loc[df.duplicated(subset=abd_columns)]

# Remove duplicate rows, comparing every column.
df.drop_duplicates()

# Remove duplicate rows, comparing only A, B and C.
df.drop_duplicates(subset=abc_columns)

# Remove duplicate rows, comparing only A, B and D.
df.drop_duplicates(subset=abd_columns)


Tags: