### Project 1 - Predicting Exoplanets  

#### David Kinney - DSS 680 - Spring 2021 - Professor Catherine Williams

In [2]:
%pip install seaborn

Note: you may need to restart the kernel to use updated packages.


In [3]:
# %% Import libraries
from datetime import datetime

import matplotlib.pyplot as plt
from matplotlib import rcParams
import numpy as np
import seaborn as sns
import pandas as pd
from pandas_profiling import ProfileReport
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split, \
                                    cross_val_score, \
                                    cross_val_predict
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.metrics import confusion_matrix, \
                                    accuracy_score, \
                                    precision_score, \
                                    recall_score, \
                                    classification_report
from tpot import TPOTClassifier

ModuleNotFoundError: No module named 'seaborn'

In [2]:
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

In [3]:
# function definitions

def pca(df):
    
    # standardize the features matrix
    features = StandardScaler().fit_transform(df)
    
    # Create a PCA that retains 99% of the variance
    pca = PCA(n_components = 0.95)
    features_pca = pca.fit_transform(features)
    
    return features, features_pca


def plot_cm(cm):
    # plot confusion matrix
    fig, ax = plt.subplots(figsize = (10,8))
    
    sns.heatmap(conf_matrix_rf/np.sum(conf_matrix_rf), annot=True, 
                fmt='.2%', cmap='Blues', annot_kws={'size':15})
    
    ax.set_title('Random Forest Confusion Matrix', fontsize = 18, loc='left')
    
    ax.set_xticklabels(ax.get_xmajorticklabels(), fontsize = 12)
    ax.set_yticklabels(ax.get_ymajorticklabels(), fontsize = 12)
    
    plt.show()

In [4]:
# %% read df_final
# Read the Kepler Objects of Interest (KOI) df_final and look at one observation
df_koi = pd.read_csv('../data/cumulative_2021.03.16_17.10.21.csv')
print(df_koi.shape)
print(df_koi[1:2].T)

(9564, 141)
                                                                    1
rowid                                                               2
kepid                                                        10797460
kepoi_name                                                  K00752.02
kepler_name                                              Kepler-227 c
koi_disposition                                             CONFIRMED
koi_vet_stat                                                     Done
koi_vet_date                                                8/16/2018
koi_pdisposition                                            CANDIDATE
koi_score                                                       0.969
koi_fpflag_nt                                                       0
koi_fpflag_ss                                                       0
koi_fpflag_co                                                       0
koi_fpflag_ec                                                       0
koi_disp

#### clean data  

In [5]:
# Remove variables with no data
df_koi_cleaned = df_koi.dropna(axis=1, how='all')
# Remove variables with no data
df_koi_cleaned = df_koi_cleaned.loc[:, (df_koi_cleaned != 0).any(axis=0)]
# Remove the err columns
df_koi_cleaned = df_koi_cleaned[df_koi_cleaned.columns.drop(
    list(df_koi_cleaned.filter(regex='_err')))]
# Still some variables that are all 0.0; will just drop them manually...
cols = ['koi_eccen','koi_ldm_coeff4','koi_ldm_coeff3']
df_koi_cleaned = df_koi_cleaned.drop(cols,axis=1)

In [6]:
df_koi_cleaned.shape
# %% describe
df_describe = pd.DataFrame(df_koi_cleaned.describe())
print(df_describe)

             rowid         kepid    koi_score  koi_fpflag_nt  koi_fpflag_ss  \
count  9564.000000  9.564000e+03  8054.000000    9564.000000    9564.000000   
mean   4782.500000  7.690628e+06     0.480829       0.208595       0.232748   
std    2761.033321  2.653459e+06     0.476928       4.767290       0.422605   
min       1.000000  7.574500e+05     0.000000       0.000000       0.000000   
25%    2391.750000  5.556034e+06     0.000000       0.000000       0.000000   
50%    4782.500000  7.906892e+06     0.334000       0.000000       0.000000   
75%    7173.250000  9.873066e+06     0.998000       0.000000       0.000000   
max    9564.000000  1.293514e+07     1.000000     465.000000       1.000000   

       koi_fpflag_co  koi_fpflag_ec     koi_period  koi_time0bk     koi_time0  \
count    9564.000000    9564.000000    9564.000000  9564.000000  9.564000e+03   
mean        0.197512       0.120033      75.671358   166.183251  2.454999e+06   
std         0.398142       0.325018    1334.7

#### prepare data

In [7]:
"""
Remove all descriptive variables to further simplify the df_final
In the interest of time, remove all categorical variables
"""
# remove_cols = ['rowid', 'kepid', 'kepoi_name', 'kepler_name', 'koi_vet_stat',
#                'koi_vet_date', 'koi_pdisposition', 'koi_fpflag_nt',
#                'koi_fpflag_ss', 'koi_fpflag_co', 'koi_fpflag_ec', 'koi_disp_prov',
#                'koi_comment', 'koi_limbdark_mod', 'koi_parm_prov', 'koi_tce_delivname',
#                'koi_trans_mod', 'koi_trans_mod', 'koi_datalink_dvr', 'koi_datalink_dvs',
#                'koi_sparprov', 'koi_fittype']
remove_cols = ['rowid', 'kepid', 'kepoi_name', 'kepler_name', 'koi_vet_stat',
               'koi_vet_date', 'koi_fpflag_nt',
               'koi_fpflag_ss', 'koi_fpflag_co', 'koi_fpflag_ec', 'koi_disp_prov',
               'koi_comment', 'koi_limbdark_mod', 'koi_parm_prov', 'koi_tce_delivname',
               'koi_trans_mod', 'koi_trans_mod', 'koi_datalink_dvr', 'koi_datalink_dvs',
               'koi_sparprov', 'koi_fittype']
df_final = df_koi_cleaned.drop(remove_cols, axis=1)

In [None]:
# Separate labels from features
labels = df_final['koi_disposition']
df_features = df_final.drop(['koi_disposition'], axis=1)

# Separate labels from features
imputer = SimpleImputer(strategy="median")
imputer.fit(df_features)
X = imputer.transform(df_features)
df_final = pd.DataFrame(X, columns=df_features.columns, index=df_features.index)

#### Dimensionality Reduction

In [None]:
# %% correlation matrix before dimentionality reduction
rcParams['figure.figsize'] = 20, 14
plt.matshow(df_final.corr())
plt.yticks(np.arange(df_final.shape[1]), df_final.columns)
plt.xticks(np.arange(df_final.shape[1]), df_final.columns, rotation='vertical')
plt.colorbar()

In [None]:
features, features_pca = pca(df_final)
print('Original number of features: {}'.format(features.shape[1]))
print('Reduced number of features: {}'.format(features_pca.shape[1]))
#df_final = pd.DataFrame(features_pca, columns=df_final.columns, index=df_final.index)

# Uncomment as needed to train on features with or without PCA
# df_features = pd.DataFrame(features_pca)
df_features = pd.DataFrame(features)

In [None]:
# %% correlation matrix after dimentionaloity reduction
rcParams['figure.figsize'] = 20, 14
plt.matshow(df_features.corr())
plt.yticks(np.arange(df_features.shape[1]), df_features.columns)
plt.xticks(np.arange(df_features.shape[1]), df_features.columns, rotation='vertical')
plt.colorbar()

In [None]:
# %% train and test sets
# labels = np.array(labels)
train_features, test_features, train_labels, test_labels = train_test_split(
    df_features, labels, test_size = 0.25, random_state = 42)

In [None]:
print('Training Features Shape:', train_features.shape)
print('Training Labels Shape:', train_labels.shape)
print('Testing Features Shape:', test_features.shape)
print('Testing Labels Shape:', test_labels.shape)
print('Training distribution: ',train_labels.value_counts(normalize=True))
print('Test distribution: ',test_labels.value_counts(normalize=True))

## Pardon the interruption...

#### pycaret

In [8]:
from pycaret.classification import setup
from pycaret.classification import compare_models
# setup the dataset

grid = setup(data=df_final, target='koi_disposition', html=False, silent=True, verbose=False)
# evaluate models and compare models
best = compare_models()
# report the best model
print(best)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
lr,Logistic Regression,0.2435,0.5,0.3333,0.0593,0.0954,0.0,0.0,0.302


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
knn,K Neighbors Classifier,0.6195,0.786,0.5871,0.6304,0.6233,0.3978,0.399,0.041
lr,Logistic Regression,0.2435,0.5,0.3333,0.0593,0.0954,0.0,0.0,0.302


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
knn,K Neighbors Classifier,0.6195,0.786,0.5871,0.6304,0.6233,0.3978,0.399,0.041
lr,Logistic Regression,0.2435,0.5,0.3333,0.0593,0.0954,0.0,0.0,0.302
nb,Naive Bayes,0.2435,0.0,0.3333,0.0593,0.0954,0.0,0.0,0.02


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
dt,Decision Tree Classifier,0.8935,0.9292,0.856,0.8937,0.8934,0.8285,0.8286,0.045
knn,K Neighbors Classifier,0.6195,0.786,0.5871,0.6304,0.6233,0.3978,0.399,0.041
lr,Logistic Regression,0.2435,0.5,0.3333,0.0593,0.0954,0.0,0.0,0.302
nb,Naive Bayes,0.2435,0.0,0.3333,0.0593,0.0954,0.0,0.0,0.02


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
dt,Decision Tree Classifier,0.8935,0.9292,0.856,0.8937,0.8934,0.8285,0.8286,0.045
knn,K Neighbors Classifier,0.6195,0.786,0.5871,0.6304,0.6233,0.3978,0.399,0.041
svm,SVM - Linear Kernel,0.3775,0.0,0.3333,0.1594,0.2199,0.0,0.0,0.089
lr,Logistic Regression,0.2435,0.5,0.3333,0.0593,0.0954,0.0,0.0,0.302
nb,Naive Bayes,0.2435,0.0,0.3333,0.0593,0.0954,0.0,0.0,0.02


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
dt,Decision Tree Classifier,0.8935,0.9292,0.856,0.8937,0.8934,0.8285,0.8286,0.045
knn,K Neighbors Classifier,0.6195,0.786,0.5871,0.6304,0.6233,0.3978,0.399,0.041
svm,SVM - Linear Kernel,0.3775,0.0,0.3333,0.1594,0.2199,0.0,0.0,0.089
lr,Logistic Regression,0.2435,0.5,0.3333,0.0593,0.0954,0.0,0.0,0.302
nb,Naive Bayes,0.2435,0.0,0.3333,0.0593,0.0954,0.0,0.0,0.02
ridge,Ridge Classifier,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.015


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
rf,Random Forest Classifier,0.9253,0.9869,0.8988,0.9256,0.9252,0.8797,0.8799,0.22
dt,Decision Tree Classifier,0.8935,0.9292,0.856,0.8937,0.8934,0.8285,0.8286,0.045
knn,K Neighbors Classifier,0.6195,0.786,0.5871,0.6304,0.6233,0.3978,0.399,0.041
svm,SVM - Linear Kernel,0.3775,0.0,0.3333,0.1594,0.2199,0.0,0.0,0.089
lr,Logistic Regression,0.2435,0.5,0.3333,0.0593,0.0954,0.0,0.0,0.302
nb,Naive Bayes,0.2435,0.0,0.3333,0.0593,0.0954,0.0,0.0,0.02
ridge,Ridge Classifier,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.015


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
rf,Random Forest Classifier,0.9253,0.9869,0.8988,0.9256,0.9252,0.8797,0.8799,0.22
dt,Decision Tree Classifier,0.8935,0.9292,0.856,0.8937,0.8934,0.8285,0.8286,0.045
knn,K Neighbors Classifier,0.6195,0.786,0.5871,0.6304,0.6233,0.3978,0.399,0.041
svm,SVM - Linear Kernel,0.3775,0.0,0.3333,0.1594,0.2199,0.0,0.0,0.089
lr,Logistic Regression,0.2435,0.5,0.3333,0.0593,0.0954,0.0,0.0,0.302
nb,Naive Bayes,0.2435,0.0,0.3333,0.0593,0.0954,0.0,0.0,0.02
qda,Quadratic Discriminant Analysis,0.2435,0.0,0.3333,0.0593,0.0954,0.0,0.0,0.025
ridge,Ridge Classifier,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.015


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
rf,Random Forest Classifier,0.9253,0.9869,0.8988,0.9256,0.9252,0.8797,0.8799,0.22
dt,Decision Tree Classifier,0.8935,0.9292,0.856,0.8937,0.8934,0.8285,0.8286,0.045
ada,Ada Boost Classifier,0.8851,0.9626,0.8436,0.8918,0.8834,0.8149,0.8189,0.218
knn,K Neighbors Classifier,0.6195,0.786,0.5871,0.6304,0.6233,0.3978,0.399,0.041
svm,SVM - Linear Kernel,0.3775,0.0,0.3333,0.1594,0.2199,0.0,0.0,0.089
lr,Logistic Regression,0.2435,0.5,0.3333,0.0593,0.0954,0.0,0.0,0.302
nb,Naive Bayes,0.2435,0.0,0.3333,0.0593,0.0954,0.0,0.0,0.02
qda,Quadratic Discriminant Analysis,0.2435,0.0,0.3333,0.0593,0.0954,0.0,0.0,0.025
ridge,Ridge Classifier,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.015


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
rf,Random Forest Classifier,0.9253,0.9869,0.8988,0.9256,0.9252,0.8797,0.8799,0.22
gbc,Gradient Boosting Classifier,0.9238,0.988,0.8967,0.9242,0.9237,0.8773,0.8775,3.043
dt,Decision Tree Classifier,0.8935,0.9292,0.856,0.8937,0.8934,0.8285,0.8286,0.045
ada,Ada Boost Classifier,0.8851,0.9626,0.8436,0.8918,0.8834,0.8149,0.8189,0.218
knn,K Neighbors Classifier,0.6195,0.786,0.5871,0.6304,0.6233,0.3978,0.399,0.041
svm,SVM - Linear Kernel,0.3775,0.0,0.3333,0.1594,0.2199,0.0,0.0,0.089
lr,Logistic Regression,0.2435,0.5,0.3333,0.0593,0.0954,0.0,0.0,0.302
nb,Naive Bayes,0.2435,0.0,0.3333,0.0593,0.0954,0.0,0.0,0.02
qda,Quadratic Discriminant Analysis,0.2435,0.0,0.3333,0.0593,0.0954,0.0,0.0,0.025
ridge,Ridge Classifier,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.015


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
rf,Random Forest Classifier,0.9253,0.9869,0.8988,0.9256,0.9252,0.8797,0.8799,0.22
gbc,Gradient Boosting Classifier,0.9238,0.988,0.8967,0.9242,0.9237,0.8773,0.8775,3.043
dt,Decision Tree Classifier,0.8935,0.9292,0.856,0.8937,0.8934,0.8285,0.8286,0.045
ada,Ada Boost Classifier,0.8851,0.9626,0.8436,0.8918,0.8834,0.8149,0.8189,0.218
lda,Linear Discriminant Analysis,0.8727,0.9701,0.8272,0.8744,0.872,0.7949,0.7962,0.035
knn,K Neighbors Classifier,0.6195,0.786,0.5871,0.6304,0.6233,0.3978,0.399,0.041
svm,SVM - Linear Kernel,0.3775,0.0,0.3333,0.1594,0.2199,0.0,0.0,0.089
lr,Logistic Regression,0.2435,0.5,0.3333,0.0593,0.0954,0.0,0.0,0.302
nb,Naive Bayes,0.2435,0.0,0.3333,0.0593,0.0954,0.0,0.0,0.02
qda,Quadratic Discriminant Analysis,0.2435,0.0,0.3333,0.0593,0.0954,0.0,0.0,0.025


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
rf,Random Forest Classifier,0.9253,0.9869,0.8988,0.9256,0.9252,0.8797,0.8799,0.22
gbc,Gradient Boosting Classifier,0.9238,0.988,0.8967,0.9242,0.9237,0.8773,0.8775,3.043
et,Extra Trees Classifier,0.9184,0.9858,0.8894,0.9187,0.9183,0.8686,0.8688,0.102
dt,Decision Tree Classifier,0.8935,0.9292,0.856,0.8937,0.8934,0.8285,0.8286,0.045
ada,Ada Boost Classifier,0.8851,0.9626,0.8436,0.8918,0.8834,0.8149,0.8189,0.218
lda,Linear Discriminant Analysis,0.8727,0.9701,0.8272,0.8744,0.872,0.7949,0.7962,0.035
knn,K Neighbors Classifier,0.6195,0.786,0.5871,0.6304,0.6233,0.3978,0.399,0.041
svm,SVM - Linear Kernel,0.3775,0.0,0.3333,0.1594,0.2199,0.0,0.0,0.089
lr,Logistic Regression,0.2435,0.5,0.3333,0.0593,0.0954,0.0,0.0,0.302
nb,Naive Bayes,0.2435,0.0,0.3333,0.0593,0.0954,0.0,0.0,0.02


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
rf,Random Forest Classifier,0.9253,0.9869,0.8988,0.9256,0.9252,0.8797,0.8799,0.22
xgboost,Extreme Gradient Boosting,0.9243,0.9879,0.8973,0.9246,0.9241,0.878,0.8783,1.354
gbc,Gradient Boosting Classifier,0.9238,0.988,0.8967,0.9242,0.9237,0.8773,0.8775,3.043
et,Extra Trees Classifier,0.9184,0.9858,0.8894,0.9187,0.9183,0.8686,0.8688,0.102
dt,Decision Tree Classifier,0.8935,0.9292,0.856,0.8937,0.8934,0.8285,0.8286,0.045
ada,Ada Boost Classifier,0.8851,0.9626,0.8436,0.8918,0.8834,0.8149,0.8189,0.218
lda,Linear Discriminant Analysis,0.8727,0.9701,0.8272,0.8744,0.872,0.7949,0.7962,0.035
knn,K Neighbors Classifier,0.6195,0.786,0.5871,0.6304,0.6233,0.3978,0.399,0.041
svm,SVM - Linear Kernel,0.3775,0.0,0.3333,0.1594,0.2199,0.0,0.0,0.089
lr,Logistic Regression,0.2435,0.5,0.3333,0.0593,0.0954,0.0,0.0,0.302


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
lightgbm,Light Gradient Boosting Machine,0.9304,0.9889,0.9056,0.9305,0.9303,0.8878,0.888,0.408
rf,Random Forest Classifier,0.9253,0.9869,0.8988,0.9256,0.9252,0.8797,0.8799,0.22
xgboost,Extreme Gradient Boosting,0.9243,0.9879,0.8973,0.9246,0.9241,0.878,0.8783,1.354
gbc,Gradient Boosting Classifier,0.9238,0.988,0.8967,0.9242,0.9237,0.8773,0.8775,3.043
et,Extra Trees Classifier,0.9184,0.9858,0.8894,0.9187,0.9183,0.8686,0.8688,0.102
dt,Decision Tree Classifier,0.8935,0.9292,0.856,0.8937,0.8934,0.8285,0.8286,0.045
ada,Ada Boost Classifier,0.8851,0.9626,0.8436,0.8918,0.8834,0.8149,0.8189,0.218
lda,Linear Discriminant Analysis,0.8727,0.9701,0.8272,0.8744,0.872,0.7949,0.7962,0.035
knn,K Neighbors Classifier,0.6195,0.786,0.5871,0.6304,0.6233,0.3978,0.399,0.041
svm,SVM - Linear Kernel,0.3775,0.0,0.3333,0.1594,0.2199,0.0,0.0,0.089


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
lightgbm,Light Gradient Boosting Machine,0.9304,0.9889,0.9056,0.9305,0.9303,0.8878,0.888,0.408
rf,Random Forest Classifier,0.9253,0.9869,0.8988,0.9256,0.9252,0.8797,0.8799,0.22
xgboost,Extreme Gradient Boosting,0.9243,0.9879,0.8973,0.9246,0.9241,0.878,0.8783,1.354
gbc,Gradient Boosting Classifier,0.9238,0.988,0.8967,0.9242,0.9237,0.8773,0.8775,3.043
et,Extra Trees Classifier,0.9184,0.9858,0.8894,0.9187,0.9183,0.8686,0.8688,0.102
dt,Decision Tree Classifier,0.8935,0.9292,0.856,0.8937,0.8934,0.8285,0.8286,0.045
ada,Ada Boost Classifier,0.8851,0.9626,0.8436,0.8918,0.8834,0.8149,0.8189,0.218
lda,Linear Discriminant Analysis,0.8727,0.9701,0.8272,0.8744,0.872,0.7949,0.7962,0.035
knn,K Neighbors Classifier,0.6195,0.786,0.5871,0.6304,0.6233,0.3978,0.399,0.041
svm,SVM - Linear Kernel,0.3775,0.0,0.3333,0.1594,0.2199,0.0,0.0,0.089


LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
               importance_type='split', learning_rate=0.1, max_depth=-1,
               min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
               n_estimators=100, n_jobs=-1, num_leaves=31, objective=None,
               random_state=2332, reg_alpha=0.0, reg_lambda=0.0, silent=True,
               subsample=1.0, subsample_for_bin=200000, subsample_freq=0)


In [9]:
# tune model hyperparameters on the sonar classification dataset
# from sklearn.ensemble import XGBClassifier
from sklearn.ensemble import ExtraTreesClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from pycaret.classification import setup
from pycaret.classification import tune_model

# grid = setup(data=train_features, target=train_labels, html=False, silent=True, verbose=False)
# tune model hyperparameters
best = tune_model(LGBMClassifier(), n_iter=200, choose_better=True)
# report the best model
print(best)

Fitting 10 folds for each of 200 candidates, totalling 2000 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    4.6s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:   28.8s
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done 784 tasks      | elapsed:  2.6min
[Parallel(n_jobs=-1)]: Done 1234 tasks      | elapsed:  4.3min
[Parallel(n_jobs=-1)]: Done 1784 tasks      | elapsed:  6.4min
[Parallel(n_jobs=-1)]: Done 2000 out of 2000 | elapsed:  7.1min finished


Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,0.9373,0.991,0.9152,0.937,0.9371,0.899,0.899
1,0.9313,0.9881,0.9068,0.9322,0.9311,0.8894,0.8899
2,0.9328,0.9895,0.9088,0.9335,0.9327,0.8918,0.8922
3,0.9075,0.9856,0.8748,0.9076,0.9075,0.851,0.8511
4,0.9253,0.9886,0.8985,0.9259,0.9252,0.8796,0.8799
5,0.9238,0.9858,0.8964,0.9247,0.9235,0.8771,0.8777
6,0.9223,0.9871,0.8945,0.9224,0.9223,0.8748,0.8748
7,0.9387,0.9914,0.9171,0.9388,0.9385,0.9012,0.9014
8,0.9268,0.9897,0.9006,0.927,0.9267,0.882,0.8822
9,0.9537,0.9946,0.9374,0.9537,0.9537,0.9254,0.9255


LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
               importance_type='split', learning_rate=0.1, max_depth=-1,
               min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
               n_estimators=100, n_jobs=-1, num_leaves=31, objective=None,
               random_state=None, reg_alpha=0.0, reg_lambda=0.0, silent=True,
               subsample=1.0, subsample_for_bin=200000, subsample_freq=0)


#### train baseline model

In [10]:
plot_model(best)

NameError: name 'plot_model' is not defined

In [None]:
plot_model(best,plot = 'error')

In [None]:
plot_model(best,plot = 'feature')

In [None]:
evaluate_model(best)

In [None]:
interpret_model(best)

### AutoML

In [None]:
automl_model = automl(optimize = 'MAE')
pred_holdouts = predict_model(automl_model)
pred_holdouts.head()

In [None]:
new_data = features.copy()
new_data.drop(['koi_disposition'], axis=1, inplace=True)
predictions = predict_model(automl_model, data=new_data)
predictions.head()

In [None]:
!mlflow ui

In [None]:
from pycaret.regression import *
reg_experiment = setup(data=df_final, 
                      target='koi_disposition',
                      log_experiment=True,
                      experiment_name='exoplanets')

In [None]:
best_model = compare_models(fold=5)

In [None]:
# Instantiate model with 1000 decision trees
rf = RandomForestClassifier(n_estimators = 1000, random_state = 42)
rf.fit(train_features, train_labels)

In [None]:
predictions = rf.predict(test_features)
print("Accuracy score: ", accuracy_score(test_labels, predictions))
print("Recall score: ", recall_score(test_labels, predictions, average=None))
cv_score = cross_val_score(rf, train_features, train_labels, cv=3, scoring='accuracy')
print("Cross validation score: ", cv_score)
print(classification_report(test_labels,predictions))

#### confusion matrix

In [None]:
train_pred = cross_val_predict(rf, train_features,train_labels, cv=3)
conf_matrix_rf = pd.DataFrame(confusion_matrix(train_labels, 
                                               train_pred, 
                                               labels=['CANDIDATE','CONFIRMED','FALSE POSITIVE']), 
                              index = ['Actual Candidate', 'Actual Confirmed', 'Actual FP'], 
                              columns = ['Predicted Candidate', 'Predicted Confirmed', 'Predicted FP']
                             )
print(conf_matrix_rf)
plot_cm(conf_matrix_rf)

#### randomized search

In [None]:
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 100, stop = 2000, num = 10)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]

# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

rf_2 = RandomForestClassifier(random_state = 42)

Random search of parameters, using 3 fold cross validation, search across 100 different combinations, and use all available cores

In [None]:
rf_random = RandomizedSearchCV(estimator = rf_2, 
                               param_distributions = random_grid, 
                               n_iter = 100, 
                               cv = 3, 
                               verbose=2, 
                               random_state=42, 
                               n_jobs = -1)
# Fit the random search model
rf_random.fit(train_features, train_labels)

In [None]:
# %% best params - use these params for the next model
print(rf_random.best_params_)

#### Model with Random Search CV Params

In [None]:
rf_rs = RandomForestClassifier(n_estimators = 311,
                               min_samples_split = 5,
                               min_samples_leaf = 1,
                               max_features = 'auto',
                               max_depth = 90,
                               bootstrap = False)

In [None]:
rf_rs.fit(train_features, train_labels)

In [None]:
print(rf_rs.score(train_features, train_labels))
y_pred = rf_rs.predict(test_features)
print(accuracy_score(test_labels, y_pred))
print(classification_report(test_labels, y_pred))

In [None]:
# confusion matrix & accuracy
rs_pred = cross_val_predict(rf_rs, test_features,test_labels, cv=3)
conf_matrix_rf = confusion_matrix(test_labels, rs_pred)
conf_matrix_rf = pd.DataFrame(confusion_matrix(test_labels, 
                                               rs_pred, 
                                               labels=['CANDIDATE','CONFIRMED','FALSE POSITIVE']), 
                              index = ['Actual Candidate', 'Actual Confirmed', 'Actual FP'], 
                              columns = ['Predicted Candidate', 'Predicted Confirmed', 'Predicted FP']
                             )
print(conf_matrix_rf)
plot_cm(conf_matrix_rf)

In [None]:
print("Accuracy score: ", accuracy_score(test_labels, rs_pred))
print("Recall score: ", recall_score(test_labels, rs_pred, average=None))
print("Precision score: ", precision_score(test_labels, rs_pred, average=None))

TPOT is an open-source library for performing AutoML in Python. It makes use of the popular Scikit-Learn machine learning library for data transforms and machine learning algorithms and uses a Genetic Programming stochastic global search procedure to efficiently discover a top-performing model pipeline for a given dataset. [1](https://machinelearningmastery.com/tpot-for-automated-machine-learning-in-python/)

In [None]:
# TPOT

# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt','log2']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 1000,10)]
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10,14]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4,6,8]
# Create the random grid
param = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
              'criterion':['entropy','gini']}

In [None]:
tpot_classifier = TPOTClassifier(generations= 5, 
                                 population_size= 24, 
                                 offspring_size= 12,
                                 verbosity= 2, 
                                 early_stop= 12,
                                 config_dict={'sklearn.ensemble.RandomForestClassifier': param}, 
                                 cv = 4, 
                                 scoring = 'accuracy')
tpot_classifier.fit(train_features, train_labels)
accuracy = tpot_classifier.score(test_features, test_labels)
print(accuracy)

#### AdaBoost Classifier

In [None]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier

ada_clf = AdaBoostClassifier(
    DecisionTreeClassifier(max_depth=1), n_estimators=200,
    algorithm='SAMME.R', learning_rate=0.5)

ada_clf.fit(train_features, train_labels)
ada_pred = ada_clf.predict(test_features)

In [None]:
print(classification_report(test_labels, ada_pred))

#### SVM

In [None]:
from sklearn.svm import SVC
svm_clf = SVC()
svm_clf.fit(train_features, train_labels)
svm_preds = svm_clf.predict(test_features)

In [None]:
print(classification_report(test_labels, ada_pred))

#### XGBoost

In [None]:
import xgboost as xgb

le = LabelEncoder()
train_labels_encoded = le.fit_transform(list(train_labels))
test_labels_encoded = le.fit_transform(list(test_labels))
                              
D_train = xgb.DMatrix(train_features, label=train_labels_encoded)
D_test = xgb.DMatrix(test_features, label=test_labels_encoded)

param = {
    'eta': 0.3, 
    'max_depth': 3,  
    'objective': 'multi:softprob',  
    'num_class': 3} 

steps = 20

model = xgb.train(param, D_train, steps)

In [None]:
preds = model.predict(D_test)
best_preds = np.asarray([np.argmax(line) for line in preds])

print("Precision = {}".format(precision_score(test_labels_encoded, best_preds, average='macro')))
print("Recall = {}".format(recall_score(test_labels_encoded, best_preds, average='macro')))
print("Accuracy = {}".format(accuracy_score(test_labels_encoded, best_preds)))

In [None]:
clf = xgb.XGBClassifier()
parameters = {
     "eta"    : [0.05, 0.10, 0.15, 0.20, 0.25, 0.30 ] ,
     "max_depth"        : [ 3, 4, 5, 6, 8, 10, 12, 15],
     "min_child_weight" : [ 1, 3, 5, 7 ],
     "gamma"            : [ 0.0, 0.1, 0.2 , 0.3, 0.4 ],
     "colsample_bytree" : [ 0.3, 0.4, 0.5 , 0.7 ]
     }

grid = GridSearchCV(clf,
                    parameters, n_jobs=4,
                    scoring="neg_log_loss",
                    cv=3)

grid.fit(train_features, train_labels_encoded)

In [None]:
preds = model.predict(D_test)
best_preds = np.asarray([np.argmax(line) for line in preds])

print("Precision = {}".format(precision_score(test_labels_encoded, best_preds, average='macro')))
print("Recall = {}".format(recall_score(test_labels_encoded, best_preds, average='macro')))
print("Accuracy = {}".format(accuracy_score(test_labels_encoded, best_preds)))

### Multi-classification using Keras  

<mark>**I did not get this to run, but is definitely something I'd like to revisit once the semester is over.**</mark>

In [None]:
# Vectorize the labels
from keras.utils.np_utils import to_categorical

one_hot_train_labels = to_categorical(train_labels)
one_hot_test_labels = to_categorical(test_labels)

In [None]:
# Build and compile a model
from keras import models
from keras import layers

model = models.Sequential()
model.add(layers.Dense(64, activation='relu', input_shape=(53,)))
model.add(layers.Dense(64, activation='relu'))
model.add(layers.Dense(46, activation='softmax'))

model.compile(optimizer='rmsprop',
              loss='categorical_crossentropy',
              metrics=['accuracy'])

In [None]:
# Train the model
history = model.fit(train_features,
                    train_labels,
                    epochs=20,
                    batch_size=512,
                    validation_data=(test_features, test_labels))

In [None]:
results = history.evaluate(test_features, one_hot_test_labels)

In [None]:
# Plot loss and accuracy curves
import matplotlib.pyplot as plt

loss = history.history['loss']
val_loss = history.history['val_loss']

epochs = range(1, len(loss) + 1)

plt.plot(epochs, loss, 'bo', label='Training loss')
plt.plot(epochs, val_loss, 'b', label='Validation loss')
plt.title('Training and validation loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()

plt.show()

In [None]:
plt.clf()   # clear figure

acc = history.history['accuracy']
val_acc = history.history['val_accuracy']

plt.plot(epochs, acc, 'bo', label='Training acc')
plt.plot(epochs, val_acc, 'b', label='Validation acc')
plt.title('Training and validation accuracy')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()

plt.show()