import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.metrics import accuracy_score, mean_squared_error
from typing import Tuple, Optional
class AdvancedAnalysis:
    """
    A class providing advanced data analysis methods for DataLib.

    Includes regression, classification, clustering, and dimensionality
    reduction techniques. Every public method is a static method that fits
    a scikit-learn estimator and returns its results as a plain ``dict``.
    """

    @staticmethod
    def _check_inputs(X, y=None) -> None:
        """
        Fail fast on empty or misaligned input.

        Args:
            X: Input features (anything exposing ``shape`` or ``len``).
            y: Optional target values to compare against ``X``.

        Raises:
            ValueError: If ``X`` has no samples, or if ``y`` is given and its
                number of samples differs from that of ``X``.
        """
        def n_samples(data) -> int:
            # Prefer ``shape`` so containers that do not define ``len``
            # are still accepted; fall back to ``len`` for plain sequences.
            shape = getattr(data, "shape", None)
            return shape[0] if shape is not None else len(data)

        n_rows = n_samples(X)
        if n_rows == 0:
            raise ValueError("X must contain at least one sample.")
        if y is not None and n_samples(y) != n_rows:
            raise ValueError(
                "X and y must have the same number of samples "
                "(got {} and {}).".format(n_rows, n_samples(y))
            )

    @staticmethod
    def linear_regression(X: pd.DataFrame,
                          y: pd.Series,
                          test_size: float = 0.2,
                          random_state: Optional[int] = 42) -> dict:
        """
        Perform linear regression analysis.

        Args:
            X (pd.DataFrame): Input features.
            y (pd.Series): Target variable.
            test_size (float, optional): Proportion of data for testing.
                Defaults to 0.2.
            random_state (int, optional): Seed for the train/test split.
                Defaults to 42.

        Returns:
            dict: Results with keys ``'model'`` (fitted estimator),
            ``'coefficients'`` (column name -> coefficient),
            ``'intercept'``, ``'mean_squared_error'`` and ``'r_squared'``;
            both metrics are computed on the held-out test split.

        Raises:
            ValueError: If ``X`` is empty or ``X`` and ``y`` differ in length.
        """
        AdvancedAnalysis._check_inputs(X, y)

        # Split data into training and testing sets
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=test_size, random_state=random_state
        )

        # Fit linear regression model
        model = LinearRegression()
        model.fit(X_train, y_train)

        # Predictions and evaluation
        y_pred = model.predict(X_test)
        mse = mean_squared_error(y_test, y_pred)

        return {
            'model': model,
            'coefficients': dict(zip(X.columns, model.coef_)),
            'intercept': model.intercept_,
            'mean_squared_error': mse,
            'r_squared': model.score(X_test, y_test),
        }

    @staticmethod
    def polynomial_regression(X: pd.DataFrame,
                              y: pd.Series,
                              degree: int = 2,
                              test_size: float = 0.2,
                              random_state: Optional[int] = 42) -> dict:
        """
        Perform polynomial regression analysis.

        Args:
            X (pd.DataFrame): Input features.
            y (pd.Series): Target variable.
            degree (int, optional): Polynomial degree. Defaults to 2.
            test_size (float, optional): Proportion of data for testing.
                Defaults to 0.2.
            random_state (int, optional): Seed for the train/test split.
                Defaults to 42.

        Returns:
            dict: Results with keys ``'model'`` (linear model fitted on the
            expanded features), ``'poly_features'`` (the fitted
            ``PolynomialFeatures`` transformer), ``'mean_squared_error'``
            and ``'r_squared'``; both metrics use the held-out test split.

        Raises:
            ValueError: If ``X`` is empty or ``X`` and ``y`` differ in length.
        """
        AdvancedAnalysis._check_inputs(X, y)

        # Create polynomial features
        poly = PolynomialFeatures(degree=degree)
        X_poly = poly.fit_transform(X)

        # Split data
        X_train, X_test, y_train, y_test = train_test_split(
            X_poly, y, test_size=test_size, random_state=random_state
        )

        # Fit polynomial regression
        model = LinearRegression()
        model.fit(X_train, y_train)

        # Predictions and evaluation
        y_pred = model.predict(X_test)
        mse = mean_squared_error(y_test, y_pred)

        return {
            'model': model,
            # The model was trained on expanded features, so callers need
            # this transformer to prepare new data for model.predict().
            'poly_features': poly,
            'mean_squared_error': mse,
            'r_squared': model.score(X_test, y_test),
        }

    @staticmethod
    def knn_classification(X: pd.DataFrame,
                           y: pd.Series,
                           n_neighbors: int = 5,
                           test_size: float = 0.2,
                           random_state: Optional[int] = 42) -> dict:
        """
        Perform K-Nearest Neighbors classification.

        Args:
            X (pd.DataFrame): Input features.
            y (pd.Series): Target variable.
            n_neighbors (int, optional): Number of neighbors. Defaults to 5.
            test_size (float, optional): Proportion of data for testing.
                Defaults to 0.2.
            random_state (int, optional): Seed for the train/test split.
                Defaults to 42.

        Returns:
            dict: Results with keys ``'model'`` (fitted classifier),
            ``'accuracy'`` (on the test split) and ``'predictions'``
            (predicted labels for the test split).

        Raises:
            ValueError: If ``X`` is empty or ``X`` and ``y`` differ in length.
        """
        AdvancedAnalysis._check_inputs(X, y)

        # Split data
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=test_size, random_state=random_state
        )

        # Fit KNN classifier
        model = KNeighborsClassifier(n_neighbors=n_neighbors)
        model.fit(X_train, y_train)

        # Predictions and evaluation
        y_pred = model.predict(X_test)
        accuracy = accuracy_score(y_test, y_pred)

        return {
            'model': model,
            'accuracy': accuracy,
            'predictions': y_pred,
        }

    @staticmethod
    def decision_tree_classification(X: pd.DataFrame,
                                     y: pd.Series,
                                     max_depth: Optional[int] = None,
                                     test_size: float = 0.2,
                                     random_state: Optional[int] = 42) -> dict:
        """
        Perform Decision Tree classification.

        Args:
            X (pd.DataFrame): Input features.
            y (pd.Series): Target variable.
            max_depth (int, optional): Maximum tree depth. Defaults to None.
            test_size (float, optional): Proportion of data for testing.
                Defaults to 0.2.
            random_state (int, optional): Seed used for both the train/test
                split and the tree itself. Defaults to 42.

        Returns:
            dict: Results with keys ``'model'`` (fitted classifier),
            ``'accuracy'`` (on the test split), ``'feature_importance'``
            (column name -> importance) and ``'predictions'`` (predicted
            labels for the test split).

        Raises:
            ValueError: If ``X`` is empty or ``X`` and ``y`` differ in length.
        """
        AdvancedAnalysis._check_inputs(X, y)

        # Split data
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=test_size, random_state=random_state
        )

        # Fit Decision Tree classifier
        model = DecisionTreeClassifier(max_depth=max_depth,
                                       random_state=random_state)
        model.fit(X_train, y_train)

        # Predictions and evaluation
        y_pred = model.predict(X_test)
        accuracy = accuracy_score(y_test, y_pred)

        return {
            'model': model,
            'accuracy': accuracy,
            'feature_importance': dict(zip(X.columns, model.feature_importances_)),
            'predictions': y_pred,
        }

    @staticmethod
    def kmeans_clustering(X: pd.DataFrame,
                          n_clusters: int = 3,
                          random_state: int = 42) -> dict:
        """
        Perform K-means clustering.

        Args:
            X (pd.DataFrame): Input features.
            n_clusters (int, optional): Number of clusters. Defaults to 3.
            random_state (int, optional): Random seed for reproducibility.
                Defaults to 42.

        Returns:
            dict: Results with keys ``'model'`` (fitted estimator),
            ``'cluster_labels'`` (label assigned to each row of ``X``) and
            ``'centroids'`` (cluster centers).

        Raises:
            ValueError: If ``X`` is empty.
        """
        AdvancedAnalysis._check_inputs(X)

        # Fit K-means model
        model = KMeans(n_clusters=n_clusters, random_state=random_state)
        model.fit(X)

        return {
            'model': model,
            'cluster_labels': model.labels_,
            'centroids': model.cluster_centers_,
        }

    @staticmethod
    def principal_component_analysis(X: pd.DataFrame,
                                     n_components: Optional[int] = None) -> dict:
        """
        Perform Principal Component Analysis (PCA).

        Args:
            X (pd.DataFrame): Input features.
            n_components (int, optional): Number of components to keep.
                Defaults to None (min of features or samples).

        Returns:
            dict: Results with keys ``'model'`` (fitted PCA object),
            ``'transformed_data'``, ``'explained_variance_ratio'``,
            ``'cumulative_variance_ratio'`` (running sum of the explained
            variance ratios) and ``'components'``.

        Raises:
            ValueError: If ``X`` is empty.
        """
        AdvancedAnalysis._check_inputs(X)

        # Fit PCA model
        pca = PCA(n_components=n_components)
        X_transformed = pca.fit_transform(X)

        return {
            # Returned like every other method's estimator, so callers can
            # transform new data with the same projection.
            'model': pca,
            'transformed_data': X_transformed,
            'explained_variance_ratio': pca.explained_variance_ratio_,
            'cumulative_variance_ratio': np.cumsum(pca.explained_variance_ratio_),
            'components': pca.components_,
        }