Download any dataset and do the following: (a) count the number of categorical and numeric features; (b) remove one correlated attribute (if any); (c) display the five-number summary of each attribute and show it visually.
Download any dataset and do the following:
a. Count the number of categorical and numeric features.
b. Remove one correlated attribute (if any).
c. Display the five-number summary of each attribute and show it visually.
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
# Absolute Pearson correlation above which two attributes are treated as
# redundant. 0.9 is a chosen cutoff, not a universal rule; tune it as needed.
CORRELATION_THRESHOLD = 0.9


def count_feature_types(df: pd.DataFrame):
    """Split the columns of *df* into categorical and numeric features.

    Every column with a numeric dtype (any integer or float width) is
    numeric; every remaining column is treated as categorical.

    Returns:
        A ``(categorical_columns, numeric_columns)`` pair of column indexes.
    """
    numeric_columns = df.select_dtypes(include=['number']).columns
    # Selecting by exclusion keeps object, string, category and bool columns
    # without having to enumerate each non-numeric dtype by name.
    categorical_columns = df.select_dtypes(exclude=['number']).columns
    return categorical_columns, numeric_columns


def find_correlated_attribute(correlation_matrix: pd.DataFrame,
                              threshold: float = CORRELATION_THRESHOLD):
    """Pick one attribute to drop from the most strongly correlated pair.

    Args:
        correlation_matrix: Square correlation matrix, e.g. ``df.corr()``.
        threshold: A pair counts as correlated only when its absolute
            correlation strictly exceeds this value.

    Returns:
        The column label to remove, or ``None`` when no pair of attributes
        exceeds *threshold* (so nothing should be removed).
    """
    strength = correlation_matrix.abs()
    labels = list(strength.columns)

    best_pair = None
    best_value = threshold
    # Only the upper triangle is scanned: the diagonal is a column's
    # correlation with itself and the matrix is symmetric.
    for i in range(len(labels)):
        for j in range(i + 1, len(labels)):
            value = strength.iat[i, j]
            # NaN (e.g. from a constant column) compares False and is skipped.
            if value > best_value:
                best_pair, best_value = (i, j), value

    if best_pair is None:
        return None

    # Of the two, drop the one more correlated with the attributes overall,
    # since it carries the least independent information.
    totals = strength.sum()
    i, j = best_pair
    return labels[i] if totals.iat[i] >= totals.iat[j] else labels[j]


def five_number_summary(df: pd.DataFrame) -> pd.DataFrame:
    """Return min, Q1, median, Q3 and max for each numeric column of *df*.

    Rows are labelled ``min``, ``Q1``, ``median``, ``Q3``, ``max``;
    non-numeric columns are left out.
    """
    numeric = df.select_dtypes(include=['number'])
    summary = numeric.quantile([0.0, 0.25, 0.5, 0.75, 1.0])
    summary.index = ['min', 'Q1', 'median', 'Q3', 'max']
    return summary


if __name__ == "__main__":
    # Load the Iris dataset into a pandas DataFrame
    iris_df = pd.read_csv(
        'iris.data',
        header=None,
        names=['sepal_length', 'sepal_width', 'petal_length', 'petal_width', 'class'],
    )

    # a. Count the number of categorical and numeric features
    categorical_features, numeric_features = count_feature_types(iris_df)
    print(f"Number of categorical features: {len(categorical_features)}")
    print(f"Number of numeric features: {len(numeric_features)}")

    # b. Remove one correlated attribute, but only if a strongly correlated
    #    pair actually exists
    correlation_matrix = iris_df[numeric_features].corr()
    most_correlated_attribute = find_correlated_attribute(correlation_matrix)
    if most_correlated_attribute is not None:
        iris_df = iris_df.drop(columns=most_correlated_attribute)
        print(f"Removed correlated attribute: {most_correlated_attribute}")
    else:
        print("No strongly correlated attributes found; nothing removed")

    # c. Compute the five-number summary of each remaining numeric attribute
    summary_statistics = five_number_summary(iris_df)

    # Show it visually: a box plot draws exactly these five numbers. Only the
    # numeric columns are plotted; the class label has no distribution to box.
    plt.figure(figsize=(10, 6))
    sns.boxplot(data=iris_df[summary_statistics.columns], orient='h')
    plt.title("Five-Number Summary of Iris Dataset")
    plt.xlabel("Value")
    plt.show()

    # Display the five-number summary of each attribute
    print(summary_statistics)
Comments
Post a Comment