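# 1. Exploratory analysis of the California Housing dataset: histograms, box plots, and IQR outlier counts.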
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.datasets import fetch_california_housing
# --- Load the data ------------------------------------------------------------
# The object returned by fetch_california_housing() supplies the feature matrix
# (.data), the column labels (.feature_names), the target values (.target) and
# a text description (.DESCR); features and target are combined into one frame
# with 'target' as the last column.
california_housing = fetch_california_housing()
df = pd.DataFrame(
    data=california_housing.data,
    columns=california_housing.feature_names,
).assign(target=california_housing.target)

# --- Textual overview -----------------------------------------------------------
print(california_housing.DESCR)
print("First 5 rows of the dataset:", df.head(), sep="\n")

# --- Histogram of every column in df (features and target) ----------------------
df.hist(figsize=(12, 10), bins=30)
plt.suptitle("Histograms of Numerical Features", fontsize=16)
plt.show()
# Draw one box plot per feature on a single figure so the spread and the
# extreme values of each column can be compared at a glance.
plt.figure(figsize=(12, 10))
for i, feature in enumerate(df.columns[:-1]):  # Exclude 'target' column (it is last)
    # Subplot positions are 1-based, hence i + 1; the 3 x 4 grid has 12 slots.
    plt.subplot(3, 4, i + 1)  # 3 rows, 4 columns
    sns.boxplot(df[feature])
    plt.title(f'Box Plot of {feature}')
# Adjust spacing once, after every panel has been drawn, then render the figure.
plt.tight_layout()
plt.show()
# Report, for every float64/int64 column of df, how many rows fall outside the
# IQR fences: below Q1 - 1.5 * IQR or above Q3 + 1.5 * IQR.
print("Outliers Detection:")
for feature in df.select_dtypes(include=['float64', 'int64']).columns:
    # quantile() with a list returns the two quartiles in the order requested.
    Q1, Q3 = df[feature].quantile([0.25, 0.75])
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    # Keep only the rows whose value lies strictly outside either fence.
    outliers = df[(df[feature] < lower_bound) | (df[feature] > upper_bound)]
    print(f"{feature}: {len(outliers)} outliers")
# (Web-page footer captured with the listing: "Comments" / "Post a Comment" — not part of the program.)