# %% Import libraries
from datetime import datetime
import matplotlib.pyplot as plt
from matplotlib import rcParams
import numpy as np
import seaborn as sns
import pandas as pd
from pandas_profiling import ProfileReport
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split, \
    cross_val_score, \
    cross_val_predict
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.metrics import confusion_matrix, \
    accuracy_score, \
    precision_score, \
    recall_score, \
    classification_report
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
# function definitions
def pca(df):
    # standardize the features matrix
    features = StandardScaler().fit_transform(df)
    # Create a PCA that retains 95% of the variance
    pca = PCA(n_components = 0.95)
    features_pca = pca.fit_transform(features)
    return features, features_pca

def plot_cm(cm):
    # plot the confusion matrix as percentages of the grand total
    fig, ax = plt.subplots(figsize = (10,8))
    sns.heatmap(cm/np.asarray(cm).sum(), annot=True,
                fmt='.2%', cmap='Blues', annot_kws={'size':15})
    ax.set_title('Random Forest Confusion Matrix', fontsize = 18, loc='left')
    ax.set_xticklabels(ax.get_xmajorticklabels(), fontsize = 12)
    ax.set_yticklabels(ax.get_ymajorticklabels(), fontsize = 12)
    plt.show()
# %% read df_final
# Read the Kepler Objects of Interest (KOI) dataset and look at one observation
df_koi = pd.read_csv('../data/cumulative_2021.03.16_17.10.21.csv')
print(df_koi.shape)
print(df_koi[1:2].T)
(9564, 141) 1 rowid 2 kepid 10797460 kepoi_name K00752.02 kepler_name Kepler-227 c koi_disposition CONFIRMED koi_vet_stat Done koi_vet_date 8/16/2018 koi_pdisposition CANDIDATE koi_score 0.969 koi_fpflag_nt 0 koi_fpflag_ss 0 koi_fpflag_co 0 koi_fpflag_ec 0 koi_disp_prov q1_q17_dr25_sup_koi koi_comment NO_COMMENT koi_period 54.4184 koi_period_err1 0.000248 koi_period_err2 -0.000248 koi_time0bk 162.514 koi_time0bk_err1 0.00352 koi_time0bk_err2 -0.00352 koi_time0 2.455e+06 koi_time0_err1 0.00352 koi_time0_err2 -0.00352 koi_eccen 0 koi_eccen_err1 NaN koi_eccen_err2 NaN koi_longp NaN koi_longp_err1 NaN koi_longp_err2 NaN koi_impact 0.586 koi_impact_err1 0.059 koi_impact_err2 -0.443 koi_duration 4.507 koi_duration_err1 0.116 koi_duration_err2 -0.116 koi_ingress NaN koi_ingress_err1 NaN koi_ingress_err2 NaN koi_depth 875 koi_depth_err1 35.5 koi_depth_err2 -35.5 koi_ror 0.027954 koi_ror_err1 0.00908 koi_ror_err2 -0.00135 koi_srho 3.02368 koi_srho_err1 2.20489 koi_srho_err2 -2.49638 koi_fittype LS+MCMC koi_prad 2.83 koi_prad_err1 0.32 koi_prad_err2 -0.19 koi_sma 0.2734 koi_sma_err1 NaN koi_sma_err2 NaN koi_incl 89.57 koi_incl_err1 NaN koi_incl_err2 NaN koi_teq 443 koi_teq_err1 NaN koi_teq_err2 NaN koi_insol 9.11 koi_insol_err1 2.87 koi_insol_err2 -1.62 koi_dor 77.9 koi_dor_err1 28.4 koi_dor_err2 -28.4 koi_limbdark_mod Claret (2011 A&A 529 75) ATLAS LS koi_ldm_coeff4 0 koi_ldm_coeff3 0 koi_ldm_coeff2 0.2291 koi_ldm_coeff1 0.4603 koi_parm_prov q1_q17_dr25_koi koi_max_sngle_ev 7.02767 koi_max_mult_ev 20.1095 koi_model_snr 25.8 koi_count 2 koi_num_transits 25 koi_tce_plnt_num 2 koi_tce_delivname q1_q17_dr25_tce koi_quarters 1.11111e+31 koi_bin_oedp_sig 0.0023 koi_trans_mod Mandel and Agol (2002 ApJ 580 171) koi_model_dof NaN koi_model_chisq NaN koi_datalink_dvr 010/010797/010797460/dv/kplr010797460-20160209... koi_datalink_dvs 010/010797/010797460/dv/kplr010797460-002-2016... koi_steff 5455 koi_steff_err1 81 koi_steff_err2 -81 koi_slogg 4.467 koi_slogg_err1 0.064 koi_slogg_err2 -0.096 koi_smet 0.14 koi_smet_err1 0.15 koi_smet_err2 -0.15 koi_srad 0.927 koi_srad_err1 0.105 koi_srad_err2 -0.061 koi_smass 0.919 koi_smass_err1 0.052 koi_smass_err2 -0.046 koi_sage NaN koi_sage_err1 NaN koi_sage_err2 NaN koi_sparprov q1_q17_dr25_stellar ra 291.934 dec 48.1417 koi_kepmag 15.347 koi_gmag 15.89 koi_rmag 15.27 koi_imag 15.114 koi_zmag 15.006 koi_jmag 14.082 koi_hmag 13.751 koi_kmag 13.648 koi_fwm_stat_sig 0.003 koi_fwm_sra 19.4623 koi_fwm_sra_err 2e-05 koi_fwm_sdec 48.142 koi_fwm_sdec_err 0.00019 koi_fwm_srao -0.63 koi_fwm_srao_err 0.72 koi_fwm_sdeco 1.23 koi_fwm_sdeco_err 0.68 koi_fwm_prao 0.00066 koi_fwm_prao_err 0.00065 koi_fwm_pdeco -0.00105 koi_fwm_pdeco_err 0.00063 koi_dicco_mra 0.39 koi_dicco_mra_err 0.36 koi_dicco_mdec 0 koi_dicco_mdec_err 0.48 koi_dicco_msky 0.39 koi_dicco_msky_err 0.36 koi_dikco_mra 0.49 koi_dikco_mra_err 0.34 koi_dikco_mdec 0.12 koi_dikco_mdec_err 0.73 koi_dikco_msky 0.5 koi_dikco_msky_err 0.45
# Remove variables with no data
df_koi_cleaned = df_koi.dropna(axis=1, how='all')
# Remove variables that contain only zeros
df_koi_cleaned = df_koi_cleaned.loc[:, (df_koi_cleaned != 0).any(axis=0)]
# Remove the err columns
df_koi_cleaned = df_koi_cleaned[df_koi_cleaned.columns.drop(
list(df_koi_cleaned.filter(regex='_err')))]
# Still some variables that are all 0.0; will just drop them manually...
cols = ['koi_eccen','koi_ldm_coeff4','koi_ldm_coeff3']
df_koi_cleaned = df_koi_cleaned.drop(cols,axis=1)
df_koi_cleaned.shape
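# %% sanity check on constant columns
# Rather than listing the remaining all-zero columns by hand as above, a quick
# programmatic check can confirm nothing constant is left after the drops
# (a sketch; `constant_cols` is just an illustrative name):
constant_cols = [col for col in df_koi_cleaned.columns
                 if df_koi_cleaned[col].nunique(dropna=True) <= 1]
print('Remaining constant columns:', constant_cols)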
# %% describe
df_describe = pd.DataFrame(df_koi_cleaned.describe())
print(df_describe)
rowid kepid koi_score koi_fpflag_nt koi_fpflag_ss \ count 9564.000000 9.564000e+03 8054.000000 9564.000000 9564.000000 mean 4782.500000 7.690628e+06 0.480829 0.208595 0.232748 std 2761.033321 2.653459e+06 0.476928 4.767290 0.422605 min 1.000000 7.574500e+05 0.000000 0.000000 0.000000 25% 2391.750000 5.556034e+06 0.000000 0.000000 0.000000 50% 4782.500000 7.906892e+06 0.334000 0.000000 0.000000 75% 7173.250000 9.873066e+06 0.998000 0.000000 0.000000 max 9564.000000 1.293514e+07 1.000000 465.000000 1.000000 koi_fpflag_co koi_fpflag_ec koi_period koi_time0bk koi_time0 \ count 9564.000000 9564.000000 9564.000000 9564.000000 9.564000e+03 mean 0.197512 0.120033 75.671358 166.183251 2.454999e+06 std 0.398142 0.325018 1334.744046 67.918960 6.791896e+01 min 0.000000 0.000000 0.241843 120.515914 2.454954e+06 25% 0.000000 0.000000 2.733684 132.761718 2.454966e+06 50% 0.000000 0.000000 9.752831 137.224595 2.454970e+06 75% 0.000000 0.000000 40.715178 170.694603 2.455004e+06 max 1.000000 1.000000 129995.778400 1472.522306 2.456306e+06 koi_impact koi_duration koi_depth koi_ror koi_srho \ count 9201.000000 9564.000000 9.201000e+03 9201.000000 9243.000000 mean 0.735105 5.621606 2.379245e+04 0.283646 9.164414 std 3.348832 6.471554 8.224316e+04 3.306558 53.807967 min 0.000000 0.052000 0.000000e+00 0.001289 0.000040 25% 0.197000 2.437750 1.600000e+02 0.012341 0.229250 50% 0.537000 3.792600 4.210000e+02 0.021076 0.956720 75% 0.889000 6.276500 1.470000e+03 0.095348 2.897175 max 100.806000 138.540000 1.540000e+06 99.870651 980.854190 koi_prad koi_sma koi_incl koi_teq koi_insol \ count 9201.000000 9201.000000 9200.000000 9201.000000 9.243000e+03 mean 102.891778 0.223989 82.469147 1085.385828 7.745737e+03 std 3077.639126 0.566359 15.223627 856.351161 1.592047e+05 min 0.080000 0.005900 2.290000 25.000000 0.000000e+00 25% 1.400000 0.037700 83.920000 539.000000 2.015000e+01 50% 2.390000 0.085100 88.500000 878.000000 1.416000e+02 75% 14.930000 0.214400 89.770000 1379.000000 8.702900e+02 max 200346.000000 44.989200 90.000000 14667.000000 1.094755e+07 koi_dor koi_ldm_coeff2 koi_ldm_coeff1 koi_max_sngle_ev \ count 9201.000000 9201.000000 9201.000000 8422.000000 mean 76.736333 0.254439 0.407617 176.846052 std 845.274598 0.064806 0.106076 770.902357 min 0.373000 -0.120600 0.125400 2.417437 25% 5.358000 0.228600 0.326800 3.997856 50% 15.460000 0.271100 0.392000 5.589751 75% 45.370000 0.299800 0.464100 16.947631 max 79614.000000 0.482200 0.948600 22982.162000 koi_max_mult_ev koi_model_snr koi_count koi_num_transits \ count 8422.000000 9201.000000 9564.000000 8422.000000 mean 1025.664672 259.895001 1.406315 385.006768 std 4154.121620 795.806615 0.873289 545.756200 min 7.105086 0.000000 1.000000 0.000000 25% 10.733030 12.000000 1.000000 41.000000 50% 19.254412 23.000000 1.000000 143.000000 75% 71.998003 78.000000 1.000000 469.000000 max 120049.680000 9054.700000 7.000000 2664.000000 koi_tce_plnt_num koi_quarters koi_bin_oedp_sig koi_steff \ count 9218.000000 8.422000e+03 8054.000000 9201.000000 mean 1.243654 9.151387e+30 0.409500 5706.823280 std 0.664573 4.153749e+30 0.500793 796.857947 min 1.000000 1.000000e+15 -1.000000 2661.000000 25% 1.000000 1.111100e+31 0.134650 5310.000000 50% 1.000000 1.111110e+31 0.486600 5767.000000 75% 1.000000 1.111110e+31 0.810375 6112.000000 max 8.000000 1.111110e+31 1.000000 15896.000000 koi_slogg koi_smet koi_srad koi_smass ra \ count 9201.000000 9178.000000 9201.000000 9201.000000 9564.000000 mean 4.310157 -0.124431 1.728712 1.023706 292.060163 std 0.432606 0.282111 6.127185 0.349447 4.766657 
min 0.047000 -2.500000 0.109000 0.000000 279.852720 25% 4.218000 -0.260000 0.829000 0.845000 288.660770 50% 4.438000 -0.100000 1.000000 0.974000 292.261125 75% 4.543000 0.070000 1.345000 1.101000 295.859160 max 5.364000 0.560000 229.908000 3.735000 301.720760 dec koi_kepmag koi_gmag koi_rmag koi_imag \ count 9564.000000 9563.000000 9523.000000 9555.000000 9410.000000 mean 43.810433 14.264606 14.830501 14.221565 14.075138 std 3.601243 1.385448 1.501885 1.383713 1.292573 min 36.577381 6.966000 7.225000 7.101000 7.627000 25% 40.777173 13.440000 13.896500 13.393000 13.294000 50% 43.677504 14.520000 15.064000 14.471000 14.317500 75% 46.714611 15.322000 15.935500 15.275000 15.063000 max 52.336010 20.003000 21.150000 19.960000 19.900000 koi_zmag koi_jmag koi_hmag koi_kmag koi_fwm_stat_sig \ count 8951.000000 9539.000000 9539.000000 9539.00000 8488.000000 mean 13.991724 12.993311 12.620604 12.54341 0.150994 std 1.230351 1.291912 1.267215 1.26818 0.252648 min 6.702000 4.097000 3.014000 2.31100 0.000000 25% 13.276000 12.253000 11.914500 11.84300 0.000000 50% 14.254000 13.236000 12.834000 12.74400 0.006000 75% 14.943000 13.968000 13.551000 13.48450 0.196250 max 17.403000 17.372000 17.615000 17.03800 1.000000 koi_fwm_sra koi_fwm_sdec koi_fwm_srao koi_fwm_sdeco koi_fwm_prao \ count 9058.000000 9058.000000 9109.000000 9109.000000 8734.000000 mean 19.471356 43.829239 -0.316136 -0.165817 -0.000097 std 0.319158 3.599553 20.254777 20.534655 0.058225 min 18.657036 36.576888 -742.430000 -417.900000 -4.000000 25% 19.243889 40.798688 -0.600000 -0.680000 -0.000210 50% 19.484983 43.694115 -0.000500 -0.034000 0.000000 75% 19.726785 46.720630 0.570000 0.500000 0.000240 max 20.114785 52.338190 549.500000 712.500000 1.190000 koi_fwm_pdeco koi_dicco_mra koi_dicco_mdec koi_dicco_msky \ count 8747.000000 8965.000000 8965.000000 8965.000000 mean -0.000714 -0.012281 -0.045420 1.866561 std 0.092987 2.406550 2.573558 2.988742 min -6.000000 -25.100000 -75.900000 0.000000 25% -0.000220 -0.320000 -0.387000 0.170000 50% 0.000000 0.000000 0.000000 0.610000 75% 0.000240 0.309000 0.300000 2.160000 max 5.000000 45.680000 27.500000 88.600000 koi_dikco_mra koi_dikco_mdec koi_dikco_msky count 8994.000000 8994.000000 8994.000000 mean -0.024244 -0.076749 1.812566 std 2.382286 2.553758 2.986376 min -27.800000 -76.600000 0.000000 25% -0.310000 -0.390000 0.210000 50% -0.004000 -0.017000 0.583000 75% 0.290000 0.300000 1.970000 max 46.570000 34.000000 89.600000
"""
Remove all descriptive variables to further simplify the df_final
In the interest of time, remove all categorical variables
"""
# remove_cols = ['rowid', 'kepid', 'kepoi_name', 'kepler_name', 'koi_vet_stat',
# 'koi_vet_date', 'koi_pdisposition', 'koi_fpflag_nt',
# 'koi_fpflag_ss', 'koi_fpflag_co', 'koi_fpflag_ec', 'koi_disp_prov',
# 'koi_comment', 'koi_limbdark_mod', 'koi_parm_prov', 'koi_tce_delivname',
# 'koi_trans_mod', 'koi_trans_mod', 'koi_datalink_dvr', 'koi_datalink_dvs',
# 'koi_sparprov', 'koi_fittype']
remove_cols = ['rowid', 'kepid', 'kepoi_name', 'kepler_name', 'koi_vet_stat',
'koi_vet_date', 'koi_fpflag_nt','koi_pdisposition',
'koi_fpflag_ss', 'koi_fpflag_co', 'koi_fpflag_ec', 'koi_disp_prov',
'koi_comment', 'koi_limbdark_mod', 'koi_parm_prov', 'koi_tce_delivname',
'koi_trans_mod', 'koi_trans_mod', 'koi_datalink_dvr', 'koi_datalink_dvs',
'koi_sparprov', 'koi_fittype']
df_cleaned = df_koi_cleaned.drop(remove_cols, axis=1)
# Separate labels from features
labels = df_cleaned['koi_disposition']
df_features = df_cleaned.drop(['koi_disposition'], axis=1)
# Impute missing values using the median of each feature
imputer = SimpleImputer(strategy="median")
imputer.fit(df_features)
X = imputer.transform(df_features)
df_final = pd.DataFrame(X, columns=df_features.columns, index=df_features.index)
# %% correlation matrix before dimensionality reduction
rcParams['figure.figsize'] = 20, 14
plt.matshow(df_final.corr())
plt.yticks(np.arange(df_final.shape[1]), df_final.columns)
plt.xticks(np.arange(df_final.shape[1]), df_final.columns, rotation='vertical')
plt.colorbar()
features, features_pca = pca(df_final)
print('Original number of features: {}'.format(features.shape[1]))
print('Reduced number of features: {}'.format(features_pca.shape[1]))
#df_final = pd.DataFrame(features_pca, columns=df_final.columns, index=df_final.index)
# Uncomment as needed to train on features with or without PCA
# df_features = pd.DataFrame(features_pca)
df_features = pd.DataFrame(features)
Original number of features: 53
Reduced number of features: 29
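# %% check how much variance the retained components explain
# The pca() helper above returns only the transformed matrices, not the fitted
# estimator, so this sketch refits a PCA at the same 0.95 threshold purely to
# inspect the explained-variance ratios (the refit is an assumption made here,
# not part of the original pipeline):
pca_check = PCA(n_components=0.95).fit(StandardScaler().fit_transform(df_final))
print(pca_check.explained_variance_ratio_.round(3))
print('Total variance retained:', pca_check.explained_variance_ratio_.sum().round(3))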
# %% train and test sets
# labels = np.array(labels)
train_features, test_features, train_labels, test_labels = train_test_split(
df_features, labels, test_size = 0.25, random_state = 42)
print('Training Features Shape:', train_features.shape)
print('Training Labels Shape:', train_labels.shape)
print('Testing Features Shape:', test_features.shape)
print('Testing Labels Shape:', test_labels.shape)
print('Training distribution: ',train_labels.value_counts(normalize=True))
print('Test distribution: ',test_labels.value_counts(normalize=True))
Training Features Shape: (7173, 53)
Training Labels Shape: (7173,)
Testing Features Shape: (2391, 53)
Testing Labels Shape: (2391,)
Training distribution:
FALSE POSITIVE    0.508853
CANDIDATE         0.246062
CONFIRMED         0.245086
Name: koi_disposition, dtype: float64
Test distribution:
FALSE POSITIVE    0.497700
CANDIDATE         0.251359
CONFIRMED         0.250941
Name: koi_disposition, dtype: float64
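# %% optional: stratified split
# The split above happens to preserve the class balance well, but passing
# `stratify` makes that explicit; a minimal sketch of the alternative call,
# left commented out so it does not alter the pipeline:
# train_features, test_features, train_labels, test_labels = train_test_split(
#     df_features, labels, test_size=0.25, random_state=42, stratify=labels)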
from pycaret.classification import setup
from pycaret.classification import compare_models, plot_model, evaluate_model, interpret_model
# setup the dataset
grid = setup(data=df_cleaned, target='koi_disposition', html=False, silent=True, verbose=False)
# evaluate models and compare models
best = compare_models()
# report the best model
print(best)
|  | Model | Accuracy | AUC | Recall | Prec. | F1 | Kappa | MCC | TT (Sec) |
|---|---|---|---|---|---|---|---|---|---|
| lightgbm | Light Gradient Boosting Machine | 0.8587 | 0.9645 | 0.8313 | 0.8557 | 0.8560 | 0.7726 | 0.7737 | 0.308 |
| gbc | Gradient Boosting Classifier | 0.8543 | 0.9627 | 0.8254 | 0.8512 | 0.8514 | 0.7653 | 0.7666 | 2.965 |
| rf | Random Forest Classifier | 0.8521 | 0.9610 | 0.8204 | 0.8481 | 0.8486 | 0.7611 | 0.7625 | 0.256 |
| et | Extra Trees Classifier | 0.8433 | 0.9574 | 0.8118 | 0.8398 | 0.8406 | 0.7480 | 0.7489 | 0.114 |
| ada | Ada Boost Classifier | 0.8154 | 0.9148 | 0.7825 | 0.8107 | 0.8115 | 0.7034 | 0.7047 | 0.197 |
| dt | Decision Tree Classifier | 0.7980 | 0.8478 | 0.7665 | 0.7996 | 0.7985 | 0.6772 | 0.6775 | 0.058 |
| lda | Linear Discriminant Analysis | 0.7889 | 0.9326 | 0.7503 | 0.7881 | 0.7872 | 0.6634 | 0.6646 | 0.022 |
| knn | K Neighbors Classifier | 0.6108 | 0.7783 | 0.5832 | 0.6223 | 0.6147 | 0.3888 | 0.3901 | 0.027 |
| svm | SVM - Linear Kernel | 0.3253 | 0.0000 | 0.3333 | 0.1188 | 0.1702 | 0.0000 | 0.0000 | 0.096 |
| lr | Logistic Regression | 0.2499 | 0.5000 | 0.3333 | 0.0625 | 0.0999 | 0.0000 | 0.0000 | 0.130 |
| nb | Naive Bayes | 0.2499 | 0.0000 | 0.3333 | 0.0625 | 0.0999 | 0.0000 | 0.0000 | 0.015 |
| qda | Quadratic Discriminant Analysis | 0.2499 | 0.0000 | 0.3333 | 0.0625 | 0.0999 | 0.0000 | 0.0000 | 0.017 |
| ridge | Ridge Classifier | 0.0000 | 0.0000 | 0.0000 | 0.0000 | 0.0000 | 0.0000 | 0.0000 | 0.013 |
LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0, importance_type='split', learning_rate=0.1, max_depth=-1, min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0, n_estimators=100, n_jobs=-1, num_leaves=31, objective=None, random_state=314, reg_alpha=0.0, reg_lambda=0.0, silent=True, subsample=1.0, subsample_for_bin=200000, subsample_freq=0)
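# %% hold-out check of the best model from compare_models
# Before tuning, pycaret's predict_model can score the best model on the
# hold-out split created by setup(); a minimal sketch assuming the pycaret
# session above is still active (`holdout_results` is just an illustrative name):
from pycaret.classification import predict_model
holdout_results = predict_model(best)
print(holdout_results.head())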
# tune the hyperparameters of the best model (LightGBM) on the KOI dataset
# from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from pycaret.classification import setup
from pycaret.classification import tune_model
# grid = setup(data=train_features, target=train_labels, html=False, silent=True, verbose=False)
# tune model hyperparameters
best = tune_model(LGBMClassifier(), n_iter=200, choose_better=True)
# report the best model
print(best)
Fitting 10 folds for each of 200 candidates, totalling 2000 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done 34 tasks | elapsed: 9.1s
[Parallel(n_jobs=-1)]: Done 184 tasks | elapsed: 30.8s
[Parallel(n_jobs=-1)]: Done 434 tasks | elapsed: 1.1min
[Parallel(n_jobs=-1)]: Done 784 tasks | elapsed: 2.0min
[Parallel(n_jobs=-1)]: Done 1234 tasks | elapsed: 2.8min
[Parallel(n_jobs=-1)]: Done 1784 tasks | elapsed: 4.1min
[Parallel(n_jobs=-1)]: Done 2000 out of 2000 | elapsed: 4.5min finished
|  | Accuracy | AUC | Recall | Prec. | F1 | Kappa | MCC |
|---|---|---|---|---|---|---|---|
| 0 | 0.8642 | 0.9698 | 0.8372 | 0.8617 | 0.8623 | 0.7823 | 0.7829 |
| 1 | 0.8761 | 0.9700 | 0.8501 | 0.8730 | 0.8739 | 0.8003 | 0.8010 |
| 2 | 0.8627 | 0.9667 | 0.8362 | 0.8590 | 0.8599 | 0.7785 | 0.7794 |
| 3 | 0.8448 | 0.9582 | 0.8177 | 0.8432 | 0.8435 | 0.7518 | 0.7522 |
| 4 | 0.8670 | 0.9631 | 0.8346 | 0.8633 | 0.8625 | 0.7839 | 0.7864 |
| 5 | 0.8640 | 0.9665 | 0.8336 | 0.8602 | 0.8601 | 0.7801 | 0.7819 |
| 6 | 0.8550 | 0.9656 | 0.8286 | 0.8538 | 0.8532 | 0.7684 | 0.7694 |
| 7 | 0.8879 | 0.9738 | 0.8694 | 0.8855 | 0.8859 | 0.8194 | 0.8202 |
| 8 | 0.8356 | 0.9557 | 0.8037 | 0.8325 | 0.8337 | 0.7350 | 0.7354 |
| 9 | 0.8520 | 0.9659 | 0.8335 | 0.8512 | 0.8515 | 0.7636 | 0.7637 |
| Mean | 0.8609 | 0.9655 | 0.8345 | 0.8584 | 0.8586 | 0.7763 | 0.7773 |
| SD | 0.0143 | 0.0051 | 0.0165 | 0.0140 | 0.0139 | 0.0225 | 0.0227 |
LGBMClassifier(bagging_fraction=0.9, bagging_freq=5, boosting_type='gbdt', class_weight=None, colsample_bytree=1.0, feature_fraction=0.8, importance_type='split', learning_rate=0.15, max_depth=-1, min_child_samples=26, min_child_weight=0.001, min_split_gain=0.8, n_estimators=260, n_jobs=-1, num_leaves=90, objective=None, random_state=None, reg_alpha=0.4, reg_lambda=0.3, silent=True, subsample=1.0, subsample_for_bin=200000, subsample_freq=0)
plot_model(best)
Finished loading model, total used 64 iterations
plot_model(best,plot = 'error')
Finished loading model, total used 64 iterations
plot_model(best,plot = 'feature')
Finished loading model, total used 64 iterations
evaluate_model(best)
Finished loading model, total used 64 iterations
|  | Parameters |
|---|---|
| boosting_type | gbdt |
| class_weight | None |
| colsample_bytree | 1.0 |
| importance_type | split |
| learning_rate | 0.15 |
| max_depth | -1 |
| min_child_samples | 26 |
| min_child_weight | 0.001 |
| min_split_gain | 0.8 |
| n_estimators | 260 |
| n_jobs | -1 |
| num_leaves | 90 |
| objective | None |
| random_state | None |
| reg_alpha | 0.4 |
| reg_lambda | 0.3 |
| silent | True |
| subsample | 1.0 |
| subsample_for_bin | 200000 |
| subsample_freq | 0 |
| feature_fraction | 0.8 |
| bagging_freq | 5 |
| bagging_fraction | 0.9 |
interpret_model(best)
from pycaret.classification import automl, predict_model
# select the best of the models trained in this session, optimizing for accuracy
automl_model = automl(optimize = 'Accuracy')
pred_holdouts = predict_model(automl_model)
pred_holdouts.head()
# score fresh data: drop the target column from the cleaned dataframe
new_data = df_cleaned.copy()
new_data.drop(['koi_disposition'], axis=1, inplace=True)
predictions = predict_model(automl_model, data=new_data)
predictions.head()
!mlflow ui
from pycaret.classification import *
# koi_disposition is categorical, so set up a classification experiment on the
# dataframe that still contains the target column and log it to mlflow
clf_experiment = setup(data=df_cleaned,
                       target='koi_disposition',
                       log_experiment=True,
                       experiment_name='exoplanets')
best_model = compare_models(fold=5)
# Instantiate model with 1000 decision trees
rf = RandomForestClassifier(n_estimators = 1000, random_state = 42)
rf.fit(train_features, train_labels)
predictions = rf.predict(test_features)
print("Accuracy score: ", accuracy_score(test_labels, predictions))
print("Recall score: ", recall_score(test_labels, predictions, average=None))
cv_score = cross_val_score(rf, train_features, train_labels, cv=3, scoring='accuracy')
print("Cross validation score: ", cv_score)
print(classification_report(test_labels,predictions))
train_pred = cross_val_predict(rf, train_features,train_labels, cv=3)
conf_matrix_rf = pd.DataFrame(confusion_matrix(train_labels,
train_pred,
labels=['CANDIDATE','CONFIRMED','FALSE POSITIVE']),
index = ['Actual Candidate', 'Actual Confirmed', 'Actual FP'],
columns = ['Predicted Candidate', 'Predicted Confirmed', 'Predicted FP']
)
print(conf_matrix_rf)
plot_cm(conf_matrix_rf)
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 100, stop = 2000, num = 10)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the random grid
random_grid = {'n_estimators': n_estimators,
'max_features': max_features,
'max_depth': max_depth,
'min_samples_split': min_samples_split,
'min_samples_leaf': min_samples_leaf,
'bootstrap': bootstrap}
rf_2 = RandomForestClassifier(random_state = 42)
# Random search over the parameter grid: 3-fold cross validation,
# 100 candidate combinations, using all available cores
rf_random = RandomizedSearchCV(estimator = rf_2,
param_distributions = random_grid,
n_iter = 100,
cv = 3,
verbose=2,
random_state=42,
n_jobs = -1)
# Fit the random search model
rf_random.fit(train_features, train_labels)
# %% best params - use these params for the next model
print(rf_random.best_params_)
rf_rs = RandomForestClassifier(n_estimators = 311,
min_samples_split = 5,
min_samples_leaf = 1,
max_features = 'auto',
max_depth = 90,
bootstrap = False)
rf_rs.fit(train_features, train_labels)
print(rf_rs.score(train_features, train_labels))
y_pred = rf_rs.predict(test_features)
print(accuracy_score(test_labels, y_pred))
print(classification_report(test_labels, y_pred))
# confusion matrix & accuracy
rs_pred = cross_val_predict(rf_rs, test_features,test_labels, cv=3)
conf_matrix_rf = confusion_matrix(test_labels, rs_pred)
conf_matrix_rf = pd.DataFrame(confusion_matrix(test_labels,
rs_pred,
labels=['CANDIDATE','CONFIRMED','FALSE POSITIVE']),
index = ['Actual Candidate', 'Actual Confirmed', 'Actual FP'],
columns = ['Predicted Candidate', 'Predicted Confirmed', 'Predicted FP']
)
print(conf_matrix_rf)
plot_cm(conf_matrix_rf)
print("Accuracy score: ", accuracy_score(test_labels, rs_pred))
print("Recall score: ", recall_score(test_labels, rs_pred, average=None))
print("Precision score: ", precision_score(test_labels, rs_pred, average=None))
TPOT is an open-source library for performing AutoML in Python. It makes use of the popular scikit-learn machine learning library for data transforms and machine learning algorithms, and uses a genetic programming stochastic global search procedure to efficiently discover a top-performing model pipeline for a given dataset.
# TPOT
from tpot import TPOTClassifier
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt','log2']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 1000,10)]
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10,14]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4,6,8]
# Create the random grid
param = {'n_estimators': n_estimators,
'max_features': max_features,
'max_depth': max_depth,
'min_samples_split': min_samples_split,
'min_samples_leaf': min_samples_leaf,
'criterion':['entropy','gini']}
tpot_classifier = TPOTClassifier(generations= 5,
population_size= 24,
offspring_size= 12,
verbosity= 2,
early_stop= 12,
config_dict={'sklearn.ensemble.RandomForestClassifier': param},
cv = 4,
scoring = 'accuracy')
tpot_classifier.fit(train_features, train_labels)
accuracy = tpot_classifier.score(test_features, test_labels)
print(accuracy)
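# %% export the winning TPOT pipeline
# TPOT can write the best pipeline it found to a standalone script via
# export(); the filename below is only an example:
tpot_classifier.export('tpot_koi_pipeline.py')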
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
ada_clf = AdaBoostClassifier(
DecisionTreeClassifier(max_depth=1), n_estimators=200,
algorithm='SAMME.R', learning_rate=0.5)
ada_clf.fit(train_features, train_labels)
ada_pred = ada_clf.predict(test_features)
print(classification_report(test_labels, ada_pred))
from sklearn.svm import SVC
svm_clf = SVC()
svm_clf.fit(train_features, train_labels)
svm_preds = svm_clf.predict(test_features)
print(classification_report(test_labels, svm_preds))
import xgboost as xgb
le = LabelEncoder()
train_labels_encoded = le.fit_transform(list(train_labels))
test_labels_encoded = le.transform(list(test_labels))  # reuse the fitted encoder so class codes match
D_train = xgb.DMatrix(train_features, label=train_labels_encoded)
D_test = xgb.DMatrix(test_features, label=test_labels_encoded)
param = {
'eta': 0.3,
'max_depth': 3,
'objective': 'multi:softprob',
'num_class': 3}
steps = 20
model = xgb.train(param, D_train, steps)
preds = model.predict(D_test)
best_preds = np.asarray([np.argmax(line) for line in preds])
print("Precision = {}".format(precision_score(test_labels_encoded, best_preds, average='macro')))
print("Recall = {}".format(recall_score(test_labels_encoded, best_preds, average='macro')))
print("Accuracy = {}".format(accuracy_score(test_labels_encoded, best_preds)))
clf = xgb.XGBClassifier()
parameters = {
"eta" : [0.05, 0.10, 0.15, 0.20, 0.25, 0.30 ] ,
"max_depth" : [ 3, 4, 5, 6, 8, 10, 12, 15],
"min_child_weight" : [ 1, 3, 5, 7 ],
"gamma" : [ 0.0, 0.1, 0.2 , 0.3, 0.4 ],
"colsample_bytree" : [ 0.3, 0.4, 0.5 , 0.7 ]
}
grid = GridSearchCV(clf,
parameters, n_jobs=4,
scoring="neg_log_loss",
cv=3)
grid.fit(train_features, train_labels_encoded)
# evaluate the model selected by the grid search (not the earlier booster)
best_preds = grid.predict(test_features)
print("Precision = {}".format(precision_score(test_labels_encoded, best_preds, average='macro')))
print("Recall = {}".format(recall_score(test_labels_encoded, best_preds, average='macro')))
print("Accuracy = {}".format(accuracy_score(test_labels_encoded, best_preds)))
**I did not get this to run, but is definitely something I'd like to revisit once the semester is over.**
# Vectorize the labels
from keras.utils.np_utils import to_categorical
# to_categorical expects integer class codes, so reuse the label-encoded targets
one_hot_train_labels = to_categorical(train_labels_encoded)
one_hot_test_labels = to_categorical(test_labels_encoded)
# Build and compile a model
from keras import models
from keras import layers
model = models.Sequential()
model.add(layers.Dense(64, activation='relu', input_shape=(53,)))
model.add(layers.Dense(64, activation='relu'))
# three output units: CANDIDATE, CONFIRMED, FALSE POSITIVE
model.add(layers.Dense(3, activation='softmax'))
model.compile(optimizer='rmsprop',
              loss='categorical_crossentropy',
              metrics=['accuracy'])
# Train the model on the one-hot encoded labels
history = model.fit(train_features,
                    one_hot_train_labels,
                    epochs=20,
                    batch_size=512,
                    validation_data=(test_features, one_hot_test_labels))
results = model.evaluate(test_features, one_hot_test_labels)
# Plot loss and accuracy curves
import matplotlib.pyplot as plt
loss = history.history['loss']
val_loss = history.history['val_loss']
epochs = range(1, len(loss) + 1)
plt.plot(epochs, loss, 'bo', label='Training loss')
plt.plot(epochs, val_loss, 'b', label='Validation loss')
plt.title('Training and validation loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()
plt.show()
plt.clf() # clear figure
acc = history.history['accuracy']
val_acc = history.history['val_accuracy']
plt.plot(epochs, acc, 'bo', label='Training acc')
plt.plot(epochs, val_acc, 'b', label='Validation acc')
plt.title('Training and validation accuracy')
plt.xlabel('Epochs')
plt.ylabel('Accuracy')
plt.legend()
plt.show()