Load a Pandas dataframe with a selected dataset. Identify and count the missing values in the dataframe. Clean the data by removing noise as follows: a. Drop duplicate rows. b. Detect the outliers and remove the rows containing outliers. c. Identify the most positively correlated attributes and the most negatively correlated attributes.

CODE

import pandas as pd

# Read the raw dataset from disk into a dataframe.
df = pd.read_csv('your_dataset.csv')

# Per-column tally of missing entries, taken before any rows are removed.
missing_values = df.isna().sum()
print("Missing values:", missing_values, sep="\n")

# Keep only the first occurrence of each fully identical row.
df = df.drop_duplicates(keep="first")

# Detect outliers and remove rows with outliers

def remove_outliers(df, column, whisker=1.5):
    """Return the rows of *df* whose *column* value lies inside the IQR fences.

    The fences are ``Q1 - whisker * IQR`` and ``Q3 + whisker * IQR``, where
    Q1 and Q3 are the 25th and 75th percentiles of ``df[column]`` and
    ``IQR = Q3 - Q1``. Both fences are inclusive.

    Args:
        df: Dataframe to filter. It is not modified.
        column: Label of the column to screen for outliers.
        whisker: Multiplier applied to the IQR when placing the fences.
            The default of 1.5 reproduces the original hard-coded value.

    Returns:
        A filtered dataframe containing only the rows within the fences.
    """
    q1 = df[column].quantile(0.25)
    q3 = df[column].quantile(0.75)
    iqr = q3 - q1

    lower_bound = q1 - whisker * iqr
    upper_bound = q3 + whisker * iqr

    # Comparisons against NaN evaluate to False, so rows with a missing
    # value in `column` are dropped together with the outliers.
    in_range = (df[column] >= lower_bound) & (df[column] <= upper_bound)
    return df[in_range]

# Columns to screen for outliers.
# NOTE(review): these look like placeholder names; replace them with the
# numeric columns of the actual dataset.
columns_to_check = ['column1', 'column2']

# Filter one column at a time. Because df is reassigned on every pass, each
# column's quartiles are computed on the rows that survived the previous
# columns' filtering.
for column in columns_to_check:
    df = remove_outliers(df, column)

# Identify the most correlated attributes.
# Restrict to numeric columns first: correlating a frame that still holds
# text columns raises in recent pandas versions.
correlation_matrix = df.select_dtypes(include="number").corr()

# Collect each unordered pair of distinct attributes exactly once (cells
# above the diagonal). The diagonal is always 1.0 and the matrix is
# symmetric, so listing every cell would bury the real pairs under
# self-correlations and duplicates.
_corr_columns = list(correlation_matrix.columns)
_pair_corr = pd.Series(
    {
        (left, right): correlation_matrix.iloc[i, j]
        for i, left in enumerate(_corr_columns)
        for j, right in enumerate(_corr_columns)
        if i < j
    },
    dtype=float,
).dropna()  # constant columns yield NaN correlations; they carry no signal

# Find the most positively correlated attributes (strongest first).
positive_corr = _pair_corr.sort_values(ascending=False)
print("Most positively correlated attributes:")
print(positive_corr)

# Find the most negatively correlated attributes (most negative first).
negative_corr = _pair_corr.sort_values(ascending=True)
print("Most negatively correlated attributes:")
print(negative_corr)

Info hub