# %% Import libraries
from datetime import datetime
import matplotlib.pyplot as plt
from matplotlib import rcParams
import numpy as np
import seaborn as sns
import pandas as pd
from pandas_profiling import ProfileReport
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split, \
    cross_val_score, \
    cross_val_predict
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.metrics import confusion_matrix, \
    accuracy_score, \
    precision_score, \
    recall_score, \
    classification_report
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
# function definitions
def pca(df):
    # standardize the features matrix
    features = StandardScaler().fit_transform(df)
    # Create a PCA that retains 95% of the variance
    pca = PCA(n_components = 0.95)
    features_pca = pca.fit_transform(features)
    return features, features_pca

def plot_cm(cm):
    # plot the confusion matrix as percentages of the grand total
    fig, ax = plt.subplots(figsize = (10,8))
    sns.heatmap(cm/np.asarray(cm).sum(), annot=True,
                fmt='.2%', cmap='Blues', annot_kws={'size':15})
    ax.set_title('Random Forest Confusion Matrix', fontsize = 18, loc='left')
    ax.set_xticklabels(ax.get_xmajorticklabels(), fontsize = 12)
    ax.set_yticklabels(ax.get_ymajorticklabels(), fontsize = 12)
    plt.show()
# %% read df_final
# Read the Kepler Objects of Interest (KOI) dataset and look at one observation
df_koi = pd.read_csv('../data/cumulative_2021.03.16_17.10.21.csv')
print(df_koi.shape)
print(df_koi[1:2].T)
(9564, 141) 1 rowid 2 kepid 10797460 kepoi_name K00752.02 kepler_name Kepler-227 c koi_disposition CONFIRMED koi_vet_stat Done koi_vet_date 8/16/2018 koi_pdisposition CANDIDATE koi_score 0.969 koi_fpflag_nt 0 koi_fpflag_ss 0 koi_fpflag_co 0 koi_fpflag_ec 0 koi_disp_prov q1_q17_dr25_sup_koi koi_comment NO_COMMENT koi_period 54.4184 koi_period_err1 0.000248 koi_period_err2 -0.000248 koi_time0bk 162.514 koi_time0bk_err1 0.00352 koi_time0bk_err2 -0.00352 koi_time0 2.455e+06 koi_time0_err1 0.00352 koi_time0_err2 -0.00352 koi_eccen 0 koi_eccen_err1 NaN koi_eccen_err2 NaN koi_longp NaN koi_longp_err1 NaN koi_longp_err2 NaN koi_impact 0.586 koi_impact_err1 0.059 koi_impact_err2 -0.443 koi_duration 4.507 koi_duration_err1 0.116 koi_duration_err2 -0.116 koi_ingress NaN koi_ingress_err1 NaN koi_ingress_err2 NaN koi_depth 875 koi_depth_err1 35.5 koi_depth_err2 -35.5 koi_ror 0.027954 koi_ror_err1 0.00908 koi_ror_err2 -0.00135 koi_srho 3.02368 koi_srho_err1 2.20489 koi_srho_err2 -2.49638 koi_fittype LS+MCMC koi_prad 2.83 koi_prad_err1 0.32 koi_prad_err2 -0.19 koi_sma 0.2734 koi_sma_err1 NaN koi_sma_err2 NaN koi_incl 89.57 koi_incl_err1 NaN koi_incl_err2 NaN koi_teq 443 koi_teq_err1 NaN koi_teq_err2 NaN koi_insol 9.11 koi_insol_err1 2.87 koi_insol_err2 -1.62 koi_dor 77.9 koi_dor_err1 28.4 koi_dor_err2 -28.4 koi_limbdark_mod Claret (2011 A&A 529 75) ATLAS LS koi_ldm_coeff4 0 koi_ldm_coeff3 0 koi_ldm_coeff2 0.2291 koi_ldm_coeff1 0.4603 koi_parm_prov q1_q17_dr25_koi koi_max_sngle_ev 7.02767 koi_max_mult_ev 20.1095 koi_model_snr 25.8 koi_count 2 koi_num_transits 25 koi_tce_plnt_num 2 koi_tce_delivname q1_q17_dr25_tce koi_quarters 1.11111e+31 koi_bin_oedp_sig 0.0023 koi_trans_mod Mandel and Agol (2002 ApJ 580 171) koi_model_dof NaN koi_model_chisq NaN koi_datalink_dvr 010/010797/010797460/dv/kplr010797460-20160209... koi_datalink_dvs 010/010797/010797460/dv/kplr010797460-002-2016... koi_steff 5455 koi_steff_err1 81 koi_steff_err2 -81 koi_slogg 4.467 koi_slogg_err1 0.064 koi_slogg_err2 -0.096 koi_smet 0.14 koi_smet_err1 0.15 koi_smet_err2 -0.15 koi_srad 0.927 koi_srad_err1 0.105 koi_srad_err2 -0.061 koi_smass 0.919 koi_smass_err1 0.052 koi_smass_err2 -0.046 koi_sage NaN koi_sage_err1 NaN koi_sage_err2 NaN koi_sparprov q1_q17_dr25_stellar ra 291.934 dec 48.1417 koi_kepmag 15.347 koi_gmag 15.89 koi_rmag 15.27 koi_imag 15.114 koi_zmag 15.006 koi_jmag 14.082 koi_hmag 13.751 koi_kmag 13.648 koi_fwm_stat_sig 0.003 koi_fwm_sra 19.4623 koi_fwm_sra_err 2e-05 koi_fwm_sdec 48.142 koi_fwm_sdec_err 0.00019 koi_fwm_srao -0.63 koi_fwm_srao_err 0.72 koi_fwm_sdeco 1.23 koi_fwm_sdeco_err 0.68 koi_fwm_prao 0.00066 koi_fwm_prao_err 0.00065 koi_fwm_pdeco -0.00105 koi_fwm_pdeco_err 0.00063 koi_dicco_mra 0.39 koi_dicco_mra_err 0.36 koi_dicco_mdec 0 koi_dicco_mdec_err 0.48 koi_dicco_msky 0.39 koi_dicco_msky_err 0.36 koi_dikco_mra 0.49 koi_dikco_mra_err 0.34 koi_dikco_mdec 0.12 koi_dikco_mdec_err 0.73 koi_dikco_msky 0.5 koi_dikco_msky_err 0.45
# Remove variables with no data
df_koi_cleaned = df_koi.dropna(axis=1, how='all')
# Remove variables that contain only zeros
df_koi_cleaned = df_koi_cleaned.loc[:, (df_koi_cleaned != 0).any(axis=0)]
# Remove the err columns
df_koi_cleaned = df_koi_cleaned[df_koi_cleaned.columns.drop(
list(df_koi_cleaned.filter(regex='_err')))]
# Still some variables that are all 0.0; will just drop them manually...
cols = ['koi_eccen','koi_ldm_coeff4','koi_ldm_coeff3']
df_koi_cleaned = df_koi_cleaned.drop(cols,axis=1)
df_koi_cleaned.shape
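# %% sanity check on constant columns
# Rather than listing the remaining all-zero columns by hand as above, a quick
# programmatic check can confirm nothing constant is left after the drops
# (a sketch; `constant_cols` is just an illustrative name):
constant_cols = [col for col in df_koi_cleaned.columns
                 if df_koi_cleaned[col].nunique(dropna=True) <= 1]
print('Remaining constant columns:', constant_cols)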
# %% describe
df_describe = pd.DataFrame(df_koi_cleaned.describe())
print(df_describe)
rowid kepid koi_score koi_fpflag_nt koi_fpflag_ss \ count 9564.000000 9.564000e+03 8054.000000 9564.000000 9564.000000 mean 4782.500000 7.690628e+06 0.480829 0.208595 0.232748 std 2761.033321 2.653459e+06 0.476928 4.767290 0.422605 min 1.000000 7.574500e+05 0.000000 0.000000 0.000000 25% 2391.750000 5.556034e+06 0.000000 0.000000 0.000000 50% 4782.500000 7.906892e+06 0.334000 0.000000 0.000000 75% 7173.250000 9.873066e+06 0.998000 0.000000 0.000000 max 9564.000000 1.293514e+07 1.000000 465.000000 1.000000 koi_fpflag_co koi_fpflag_ec koi_period koi_time0bk koi_time0 \ count 9564.000000 9564.000000 9564.000000 9564.000000 9.564000e+03 mean 0.197512 0.120033 75.671358 166.183251 2.454999e+06 std 0.398142 0.325018 1334.744046 67.918960 6.791896e+01 min 0.000000 0.000000 0.241843 120.515914 2.454954e+06 25% 0.000000 0.000000 2.733684 132.761718 2.454966e+06 50% 0.000000 0.000000 9.752831 137.224595 2.454970e+06 75% 0.000000 0.000000 40.715178 170.694603 2.455004e+06 max 1.000000 1.000000 129995.778400 1472.522306 2.456306e+06 koi_impact koi_duration koi_depth koi_ror koi_srho \ count 9201.000000 9564.000000 9.201000e+03 9201.000000 9243.000000 mean 0.735105 5.621606 2.379245e+04 0.283646 9.164414 std 3.348832 6.471554 8.224316e+04 3.306558 53.807967 min 0.000000 0.052000 0.000000e+00 0.001289 0.000040 25% 0.197000 2.437750 1.600000e+02 0.012341 0.229250 50% 0.537000 3.792600 4.210000e+02 0.021076 0.956720 75% 0.889000 6.276500 1.470000e+03 0.095348 2.897175 max 100.806000 138.540000 1.540000e+06 99.870651 980.854190 koi_prad koi_sma koi_incl koi_teq koi_insol \ count 9201.000000 9201.000000 9200.000000 9201.000000 9.243000e+03 mean 102.891778 0.223989 82.469147 1085.385828 7.745737e+03 std 3077.639126 0.566359 15.223627 856.351161 1.592047e+05 min 0.080000 0.005900 2.290000 25.000000 0.000000e+00 25% 1.400000 0.037700 83.920000 539.000000 2.015000e+01 50% 2.390000 0.085100 88.500000 878.000000 1.416000e+02 75% 14.930000 0.214400 89.770000 1379.000000 8.702900e+02 max 200346.000000 44.989200 90.000000 14667.000000 1.094755e+07 koi_dor koi_ldm_coeff2 koi_ldm_coeff1 koi_max_sngle_ev \ count 9201.000000 9201.000000 9201.000000 8422.000000 mean 76.736333 0.254439 0.407617 176.846052 std 845.274598 0.064806 0.106076 770.902357 min 0.373000 -0.120600 0.125400 2.417437 25% 5.358000 0.228600 0.326800 3.997856 50% 15.460000 0.271100 0.392000 5.589751 75% 45.370000 0.299800 0.464100 16.947631 max 79614.000000 0.482200 0.948600 22982.162000 koi_max_mult_ev koi_model_snr koi_count koi_num_transits \ count 8422.000000 9201.000000 9564.000000 8422.000000 mean 1025.664672 259.895001 1.406315 385.006768 std 4154.121620 795.806615 0.873289 545.756200 min 7.105086 0.000000 1.000000 0.000000 25% 10.733030 12.000000 1.000000 41.000000 50% 19.254412 23.000000 1.000000 143.000000 75% 71.998003 78.000000 1.000000 469.000000 max 120049.680000 9054.700000 7.000000 2664.000000 koi_tce_plnt_num koi_quarters koi_bin_oedp_sig koi_steff \ count 9218.000000 8.422000e+03 8054.000000 9201.000000 mean 1.243654 9.151387e+30 0.409500 5706.823280 std 0.664573 4.153749e+30 0.500793 796.857947 min 1.000000 1.000000e+15 -1.000000 2661.000000 25% 1.000000 1.111100e+31 0.134650 5310.000000 50% 1.000000 1.111110e+31 0.486600 5767.000000 75% 1.000000 1.111110e+31 0.810375 6112.000000 max 8.000000 1.111110e+31 1.000000 15896.000000 koi_slogg koi_smet koi_srad koi_smass ra \ count 9201.000000 9178.000000 9201.000000 9201.000000 9564.000000 mean 4.310157 -0.124431 1.728712 1.023706 292.060163 std 0.432606 0.282111 6.127185 0.349447 4.766657 
min 0.047000 -2.500000 0.109000 0.000000 279.852720 25% 4.218000 -0.260000 0.829000 0.845000 288.660770 50% 4.438000 -0.100000 1.000000 0.974000 292.261125 75% 4.543000 0.070000 1.345000 1.101000 295.859160 max 5.364000 0.560000 229.908000 3.735000 301.720760 dec koi_kepmag koi_gmag koi_rmag koi_imag \ count 9564.000000 9563.000000 9523.000000 9555.000000 9410.000000 mean 43.810433 14.264606 14.830501 14.221565 14.075138 std 3.601243 1.385448 1.501885 1.383713 1.292573 min 36.577381 6.966000 7.225000 7.101000 7.627000 25% 40.777173 13.440000 13.896500 13.393000 13.294000 50% 43.677504 14.520000 15.064000 14.471000 14.317500 75% 46.714611 15.322000 15.935500 15.275000 15.063000 max 52.336010 20.003000 21.150000 19.960000 19.900000 koi_zmag koi_jmag koi_hmag koi_kmag koi_fwm_stat_sig \ count 8951.000000 9539.000000 9539.000000 9539.00000 8488.000000 mean 13.991724 12.993311 12.620604 12.54341 0.150994 std 1.230351 1.291912 1.267215 1.26818 0.252648 min 6.702000 4.097000 3.014000 2.31100 0.000000 25% 13.276000 12.253000 11.914500 11.84300 0.000000 50% 14.254000 13.236000 12.834000 12.74400 0.006000 75% 14.943000 13.968000 13.551000 13.48450 0.196250 max 17.403000 17.372000 17.615000 17.03800 1.000000 koi_fwm_sra koi_fwm_sdec koi_fwm_srao koi_fwm_sdeco koi_fwm_prao \ count 9058.000000 9058.000000 9109.000000 9109.000000 8734.000000 mean 19.471356 43.829239 -0.316136 -0.165817 -0.000097 std 0.319158 3.599553 20.254777 20.534655 0.058225 min 18.657036 36.576888 -742.430000 -417.900000 -4.000000 25% 19.243889 40.798688 -0.600000 -0.680000 -0.000210 50% 19.484983 43.694115 -0.000500 -0.034000 0.000000 75% 19.726785 46.720630 0.570000 0.500000 0.000240 max 20.114785 52.338190 549.500000 712.500000 1.190000 koi_fwm_pdeco koi_dicco_mra koi_dicco_mdec koi_dicco_msky \ count 8747.000000 8965.000000 8965.000000 8965.000000 mean -0.000714 -0.012281 -0.045420 1.866561 std 0.092987 2.406550 2.573558 2.988742 min -6.000000 -25.100000 -75.900000 0.000000 25% -0.000220 -0.320000 -0.387000 0.170000 50% 0.000000 0.000000 0.000000 0.610000 75% 0.000240 0.309000 0.300000 2.160000 max 5.000000 45.680000 27.500000 88.600000 koi_dikco_mra koi_dikco_mdec koi_dikco_msky count 8994.000000 8994.000000 8994.000000 mean -0.024244 -0.076749 1.812566 std 2.382286 2.553758 2.986376 min -27.800000 -76.600000 0.000000 25% -0.310000 -0.390000 0.210000 50% -0.004000 -0.017000 0.583000 75% 0.290000 0.300000 1.970000 max 46.570000 34.000000 89.600000
"""
Remove all descriptive variables to further simplify the df_final
In the interest of time, remove all categorical variables
"""
# remove_cols = ['rowid', 'kepid', 'kepoi_name', 'kepler_name', 'koi_vet_stat',
# 'koi_vet_date', 'koi_pdisposition', 'koi_fpflag_nt',
# 'koi_fpflag_ss', 'koi_fpflag_co', 'koi_fpflag_ec', 'koi_disp_prov',
# 'koi_comment', 'koi_limbdark_mod', 'koi_parm_prov', 'koi_tce_delivname',
# 'koi_trans_mod', 'koi_trans_mod', 'koi_datalink_dvr', 'koi_datalink_dvs',
# 'koi_sparprov', 'koi_fittype']
remove_cols = ['rowid', 'kepid', 'kepoi_name', 'kepler_name', 'koi_vet_stat',
'koi_vet_date', 'koi_fpflag_nt','koi_pdisposition',
'koi_fpflag_ss', 'koi_fpflag_co', 'koi_fpflag_ec', 'koi_disp_prov',
'koi_comment', 'koi_limbdark_mod', 'koi_parm_prov', 'koi_tce_delivname',
'koi_trans_mod', 'koi_trans_mod', 'koi_datalink_dvr', 'koi_datalink_dvs',
'koi_sparprov', 'koi_fittype']
df_cleaned = df_koi_cleaned.drop(remove_cols, axis=1)
# Separate labels from features
labels = df_cleaned['koi_disposition']
df_features = df_cleaned.drop(['koi_disposition'], axis=1)
# Impute missing values using the median of each feature
imputer = SimpleImputer(strategy="median")
imputer.fit(df_features)
X = imputer.transform(df_features)
df_final = pd.DataFrame(X, columns=df_features.columns, index=df_features.index)
# %% correlation matrix before dimensionality reduction
rcParams['figure.figsize'] = 20, 14
plt.matshow(df_final.corr())
plt.yticks(np.arange(df_final.shape[1]), df_final.columns)
plt.xticks(np.arange(df_final.shape[1]), df_final.columns, rotation='vertical')
plt.colorbar()
features, features_pca = pca(df_final)
print('Original number of features: {}'.format(features.shape[1]))
print('Reduced number of features: {}'.format(features_pca.shape[1]))
#df_final = pd.DataFrame(features_pca, columns=df_final.columns, index=df_final.index)
# Uncomment as needed to train on features with or without PCA
# df_features = pd.DataFrame(features_pca)
df_features = pd.DataFrame(features)
Original number of features: 53
Reduced number of features: 29
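# %% check how much variance the retained components explain
# The pca() helper above returns only the transformed matrices, not the fitted
# estimator, so this sketch refits a PCA at the same 0.95 threshold purely to
# inspect the explained-variance ratios (the refit is an assumption made here,
# not part of the original pipeline):
pca_check = PCA(n_components=0.95).fit(StandardScaler().fit_transform(df_final))
print(pca_check.explained_variance_ratio_.round(3))
print('Total variance retained:', pca_check.explained_variance_ratio_.sum().round(3))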
# %% train and test sets
# labels = np.array(labels)
train_features, test_features, train_labels, test_labels = train_test_split(
df_features, labels, test_size = 0.25, random_state = 42)
print('Training Features Shape:', train_features.shape)
print('Training Labels Shape:', train_labels.shape)
print('Testing Features Shape:', test_features.shape)
print('Testing Labels Shape:', test_labels.shape)
print('Training distribution: ',train_labels.value_counts(normalize=True))
print('Test distribution: ',test_labels.value_counts(normalize=True))
Training Features Shape: (7173, 53)
Training Labels Shape: (7173,)
Testing Features Shape: (2391, 53)
Testing Labels Shape: (2391,)
Training distribution:
FALSE POSITIVE    0.508853
CANDIDATE         0.246062
CONFIRMED         0.245086
Name: koi_disposition, dtype: float64
Test distribution:
FALSE POSITIVE    0.497700
CANDIDATE         0.251359
CONFIRMED         0.250941
Name: koi_disposition, dtype: float64
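# %% optional: stratified split
# The split above happens to preserve the class balance well, but passing
# `stratify` makes that explicit; a minimal sketch of the alternative call,
# left commented out so it does not alter the pipeline:
# train_features, test_features, train_labels, test_labels = train_test_split(
#     df_features, labels, test_size=0.25, random_state=42, stratify=labels)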
from pycaret.classification import setup
from pycaret.classification import compare_models, plot_model, evaluate_model, interpret_model
# setup the dataset
grid = setup(data=df_cleaned, target='koi_disposition', html=False, silent=True, verbose=False)
# evaluate models and compare models
best = compare_models()
# report the best model
print(best)
|  | Model | Accuracy | AUC | Recall | Prec. | F1 | Kappa | MCC | TT (Sec) |
|---|---|---|---|---|---|---|---|---|---|
| lightgbm | Light Gradient Boosting Machine | 0.8587 | 0.9645 | 0.8313 | 0.8557 | 0.8560 | 0.7726 | 0.7737 | 0.308 |
| gbc | Gradient Boosting Classifier | 0.8543 | 0.9627 | 0.8254 | 0.8512 | 0.8514 | 0.7653 | 0.7666 | 2.965 |
| rf | Random Forest Classifier | 0.8521 | 0.9610 | 0.8204 | 0.8481 | 0.8486 | 0.7611 | 0.7625 | 0.256 |
| et | Extra Trees Classifier | 0.8433 | 0.9574 | 0.8118 | 0.8398 | 0.8406 | 0.7480 | 0.7489 | 0.114 |
| ada | Ada Boost Classifier | 0.8154 | 0.9148 | 0.7825 | 0.8107 | 0.8115 | 0.7034 | 0.7047 | 0.197 |
| dt | Decision Tree Classifier | 0.7980 | 0.8478 | 0.7665 | 0.7996 | 0.7985 | 0.6772 | 0.6775 | 0.058 |
| lda | Linear Discriminant Analysis | 0.7889 | 0.9326 | 0.7503 | 0.7881 | 0.7872 | 0.6634 | 0.6646 | 0.022 |
| knn | K Neighbors Classifier | 0.6108 | 0.7783 | 0.5832 | 0.6223 | 0.6147 | 0.3888 | 0.3901 | 0.027 |
| svm | SVM - Linear Kernel | 0.3253 | 0.0000 | 0.3333 | 0.1188 | 0.1702 | 0.0000 | 0.0000 | 0.096 |
| lr | Logistic Regression | 0.2499 | 0.5000 | 0.3333 | 0.0625 | 0.0999 | 0.0000 | 0.0000 | 0.130 |
| nb | Naive Bayes | 0.2499 | 0.0000 | 0.3333 | 0.0625 | 0.0999 | 0.0000 | 0.0000 | 0.015 |
| qda | Quadratic Discriminant Analysis | 0.2499 | 0.0000 | 0.3333 | 0.0625 | 0.0999 | 0.0000 | 0.0000 | 0.017 |
| ridge | Ridge Classifier | 0.0000 | 0.0000 | 0.0000 | 0.0000 | 0.0000 | 0.0000 | 0.0000 | 0.013 |
LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0, importance_type='split', learning_rate=0.1, max_depth=-1, min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0, n_estimators=100, n_jobs=-1, num_leaves=31, objective=None, random_state=314, reg_alpha=0.0, reg_lambda=0.0, silent=True, subsample=1.0, subsample_for_bin=200000, subsample_freq=0)
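# %% hold-out check of the best model from compare_models
# Before tuning, pycaret's predict_model can score the best model on the
# hold-out split created by setup(); a minimal sketch assuming the pycaret
# session above is still active (`holdout_results` is just an illustrative name):
from pycaret.classification import predict_model
holdout_results = predict_model(best)
print(holdout_results.head())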
# tune the hyperparameters of the best model (LightGBM) on the KOI dataset
# from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from pycaret.classification import setup
from pycaret.classification import tune_model
# grid = setup(data=train_features, target=train_labels, html=False, silent=True, verbose=False)
# tune model hyperparameters
best = tune_model(LGBMClassifier(), n_iter=200, choose_better=True)
# report the best model
print(best)
Fitting 10 folds for each of 200 candidates, totalling 2000 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done 34 tasks | elapsed: 9.1s
[Parallel(n_jobs=-1)]: Done 184 tasks | elapsed: 30.8s
[Parallel(n_jobs=-1)]: Done 434 tasks | elapsed: 1.1min
[Parallel(n_jobs=-1)]: Done 784 tasks | elapsed: 2.0min
[Parallel(n_jobs=-1)]: Done 1234 tasks | elapsed: 2.8min
[Parallel(n_jobs=-1)]: Done 1784 tasks | elapsed: 4.1min
[Parallel(n_jobs=-1)]: Done 2000 out of 2000 | elapsed: 4.5min finished
|  | Accuracy | AUC | Recall | Prec. | F1 | Kappa | MCC |
|---|---|---|---|---|---|---|---|
| 0 | 0.8642 | 0.9698 | 0.8372 | 0.8617 | 0.8623 | 0.7823 | 0.7829 |
| 1 | 0.8761 | 0.9700 | 0.8501 | 0.8730 | 0.8739 | 0.8003 | 0.8010 |
| 2 | 0.8627 | 0.9667 | 0.8362 | 0.8590 | 0.8599 | 0.7785 | 0.7794 |
| 3 | 0.8448 | 0.9582 | 0.8177 | 0.8432 | 0.8435 | 0.7518 | 0.7522 |
| 4 | 0.8670 | 0.9631 | 0.8346 | 0.8633 | 0.8625 | 0.7839 | 0.7864 |
| 5 | 0.8640 | 0.9665 | 0.8336 | 0.8602 | 0.8601 | 0.7801 | 0.7819 |
| 6 | 0.8550 | 0.9656 | 0.8286 | 0.8538 | 0.8532 | 0.7684 | 0.7694 |
| 7 | 0.8879 | 0.9738 | 0.8694 | 0.8855 | 0.8859 | 0.8194 | 0.8202 |
| 8 | 0.8356 | 0.9557 | 0.8037 | 0.8325 | 0.8337 | 0.7350 | 0.7354 |
| 9 | 0.8520 | 0.9659 | 0.8335 | 0.8512 | 0.8515 | 0.7636 | 0.7637 |
| Mean | 0.8609 | 0.9655 | 0.8345 | 0.8584 | 0.8586 | 0.7763 | 0.7773 |
| SD | 0.0143 | 0.0051 | 0.0165 | 0.0140 | 0.0139 | 0.0225 | 0.0227 |
LGBMClassifier(bagging_fraction=0.9, bagging_freq=5, boosting_type='gbdt', class_weight=None, colsample_bytree=1.0, feature_fraction=0.8, importance_type='split', learning_rate=0.15, max_depth=-1, min_child_samples=26, min_child_weight=0.001, min_split_gain=0.8, n_estimators=260, n_jobs=-1, num_leaves=90, objective=None, random_state=None, reg_alpha=0.4, reg_lambda=0.3, silent=True, subsample=1.0, subsample_for_bin=200000, subsample_freq=0)
plot_model(best)
Finished loading model, total used 64 iterations
plot_model(best,plot = 'error')
Finished loading model, total used 64 iterations
plot_model(best,plot = 'feature')
Finished loading model, total used 64 iterations
evaluate_model(best)
Finished loading model, total used 64 iterations
|  | Parameters |
|---|---|
| boosting_type | gbdt |
| class_weight | None |
| colsample_bytree | 1.0 |
| importance_type | split |
| learning_rate | 0.15 |
| max_depth | -1 |
| min_child_samples | 26 |
| min_child_weight | 0.001 |
| min_split_gain | 0.8 |
| n_estimators | 260 |
| n_jobs | -1 |
| num_leaves | 90 |
| objective | None |
| random_state | None |
| reg_alpha | 0.4 |
| reg_lambda | 0.3 |
| silent | True |
| subsample | 1.0 |
| subsample_for_bin | 200000 |
| subsample_freq | 0 |
| feature_fraction | 0.8 |
| bagging_freq | 5 |
| bagging_fraction | 0.9 |
interpret_model(best)
from pycaret.classification import automl, predict_model
# select the best of the models trained in this session, optimizing for accuracy
automl_model = automl(optimize = 'Accuracy')
pred_holdouts = predict_model(automl_model)
pred_holdouts.head()
# score fresh data: drop the target column from the cleaned dataframe
new_data = df_cleaned.copy()
new_data.drop(['koi_disposition'], axis=1, inplace=True)
predictions = predict_model(automl_model, data=new_data)
predictions.head()
!mlflow ui
from pycaret.classification import *
# koi_disposition is categorical, so set up a classification experiment on the
# dataframe that still contains the target column and log it to mlflow
clf_experiment = setup(data=df_cleaned,
                       target='koi_disposition',
                       log_experiment=True,
                       experiment_name='exoplanets')
best_model = compare_models(fold=5)
# Instantiate model with 1000 decision trees
rf = RandomForestClassifier(n_estimators = 1000, random_state = 42)
rf.fit(train_features, train_labels)
predictions = rf.predict(test_features)
print("Accuracy score: ", accuracy_score(test_labels, predictions))
print("Recall score: ", recall_score(test_labels, predictions, average=None))
cv_score = cross_val_score(rf, train_features, train_labels, cv=3, scoring='accuracy')
print("Cross validation score: ", cv_score)
print(classification_report(test_labels,predictions))
train_pred = cross_val_predict(rf, train_features,train_labels, cv=3)
conf_matrix_rf = pd.DataFrame(confusion_matrix(train_labels,
train_pred,
labels=['CANDIDATE','CONFIRMED','FALSE POSITIVE']),
index = ['Actual Candidate', 'Actual Confirmed', 'Actual FP'],
columns = ['Predicted Candidate', 'Predicted Confirmed', 'Predicted FP']
)
print(conf_matrix_rf)
plot_cm(conf_matrix_rf)
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 100, stop = 2000, num = 10)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the random grid
random_grid = {'n_estimators': n_estimators,
'max_features': max_features,
'max_depth': max_depth,
'min_samples_split': min_samples_split,
'min_samples_leaf': min_samples_leaf,
'bootstrap': bootstrap}
rf_2 = RandomForestClassifier(random_state = 42)
# Random search over the parameter grid: 3-fold cross validation,
# 100 candidate combinations, using all available cores
rf_random = RandomizedSearchCV(estimator = rf_2,
param_distributions = random_grid,
n_iter = 100,
cv = 3,
verbose=2,
random_state=42,
n_jobs = -1)
# Fit the random search model
rf_random.fit(train_features, train_labels)
# %% best params - use these params for the next model
print(rf_random.best_params_)
rf_rs = RandomForestClassifier(n_estimators = 311,
min_samples_split = 5,
min_samples_leaf = 1,
max_features = 'auto',
max_depth = 90,
bootstrap = False)
rf_rs.fit(train_features, train_labels)
print(rf_rs.score(train_features, train_labels))
y_pred = rf_rs.predict(test_features)
print(accuracy_score(test_labels, y_pred))
print(classification_report(test_labels, y_pred))
# confusion matrix & accuracy
rs_pred = cross_val_predict(rf_rs, test_features,test_labels, cv=3)
conf_matrix_rf = confusion_matrix(test_labels, rs_pred)
conf_matrix_rf = pd.DataFrame(confusion_matrix(test_labels,
rs_pred,
labels=['CANDIDATE','CONFIRMED','FALSE POSITIVE']),
index = ['Actual Candidate', 'Actual Confirmed', 'Actual FP'],
columns = ['Predicted Candidate', 'Predicted Confirmed', 'Predicted FP']
)
print(conf_matrix_rf)
plot_cm(conf_matrix_rf)
print("Accuracy score: ", accuracy_score(test_labels, rs_pred))
print("Recall score: ", recall_score(test_labels, rs_pred, average=None))
print("Precision score: ", precision_score(test_labels, rs_pred, average=None))
TPOT is an open-source library for performing AutoML in Python. It makes use of the popular scikit-learn machine learning library for data transforms and machine learning algorithms, and uses a genetic programming stochastic global search procedure to efficiently discover a top-performing model pipeline for a given dataset.
# TPOT
from tpot import TPOTClassifier
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt','log2']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 1000,10)]
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10,14]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4,6,8]
# Create the random grid
param = {'n_estimators': n_estimators,
'max_features': max_features,
'max_depth': max_depth,
'min_samples_split': min_samples_split,
'min_samples_leaf': min_samples_leaf,
'criterion':['entropy','gini']}
tpot_classifier = TPOTClassifier(generations= 5,
population_size= 24,
offspring_size= 12,
verbosity= 2,
early_stop= 12,
config_dict={'sklearn.ensemble.RandomForestClassifier': param},
cv = 4,
scoring = 'accuracy')
tpot_classifier.fit(train_features, train_labels)
accuracy = tpot_classifier.score(test_features, test_labels)
print(accuracy)
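# %% export the winning TPOT pipeline
# TPOT can write the best pipeline it found to a standalone script via
# export(); the filename below is only an example:
tpot_classifier.export('tpot_koi_pipeline.py')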
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
ada_clf = AdaBoostClassifier(
DecisionTreeClassifier(max_depth=1), n_estimators=200,
algorithm='SAMME.R', learning_rate=0.5)
ada_clf.fit(train_features, train_labels)
ada_pred = ada_clf.predict(test_features)
print(classification_report(test_labels, ada_pred))
from sklearn.svm import SVC
svm_clf = SVC()
svm_clf.fit(train_features, train_labels)
svm_preds = svm_clf.predict(test_features)
print(classification_report(test_labels, svm_preds))
import xgboost as xgb
le = LabelEncoder()
train_labels_encoded = le.fit_transform(list(train_labels))
test_labels_encoded = le.transform(list(test_labels))  # reuse the fitted encoder so class codes match
D_train = xgb.DMatrix(train_features, label=train_labels_encoded)
D_test = xgb.DMatrix(test_features, label=test_labels_encoded)
param = {
'eta': 0.3,
'max_depth': 3,
'objective': 'multi:softprob',
'num_class': 3}
steps = 20
model = xgb.train(param, D_train, steps)
preds = model.predict(D_test)
best_preds = np.asarray([np.argmax(line) for line in preds])
print("Precision = {}".format(precision_score(test_labels_encoded, best_preds, average='macro')))
print("Recall = {}".format(recall_score(test_labels_encoded, best_preds, average='macro')))
print("Accuracy = {}".format(accuracy_score(test_labels_encoded, best_preds)))
clf = xgb.XGBClassifier()
parameters = {
"eta" : [0.05, 0.10, 0.15, 0.20, 0.25, 0.30 ] ,
"max_depth" : [ 3, 4, 5, 6, 8, 10, 12, 15],
"min_child_weight" : [ 1, 3, 5, 7 ],
"gamma" : [ 0.0, 0.1, 0.2 , 0.3, 0.4 ],
"colsample_bytree" : [ 0.3, 0.4, 0.5 , 0.7 ]
}
grid = GridSearchCV(clf,
parameters, n_jobs=4,
scoring="neg_log_loss",
cv=3)
grid.fit(train_features, train_labels_encoded)
# evaluate the model selected by the grid search (not the earlier booster)
best_preds = grid.predict(test_features)
print("Precision = {}".format(precision_score(test_labels_encoded, best_preds, average='macro')))
print("Recall = {}".format(recall_score(test_labels_encoded, best_preds, average='macro')))
print("Accuracy = {}".format(accuracy_score(test_labels_encoded, best_preds)))
**I did not get this to run, but is definitely something I'd like to revisit once the semester is over.**
# Vectorize the labels
from keras.utils.np_utils import to_categorical
# to_categorical expects integer class codes, so reuse the label-encoded targets
one_hot_train_labels = to_categorical(train_labels_encoded)
one_hot_test_labels = to_categorical(test_labels_encoded)
# Build and compile a model
from keras import models
from keras import layers
model = models.Sequential()
model.add(layers.Dense(64, activation='relu', input_shape=(53,)))
model.add(layers.Dense(64, activation='relu'))
# three output units: CANDIDATE, CONFIRMED, FALSE POSITIVE
model.add(layers.Dense(3, activation='softmax'))
model.compile(optimizer='rmsprop',
              loss='categorical_crossentropy',
              metrics=['accuracy'])
# Train the model on the one-hot encoded labels
history = model.fit(train_features,
                    one_hot_train_labels,
                    epochs=20,
                    batch_size=512,
                    validation_data=(test_features, one_hot_test_labels))
results = model.evaluate(test_features, one_hot_test_labels)
# Plot loss and accuracy curves
import matplotlib.pyplot as plt
loss = history.history['loss']
val_loss = history.history['val_loss']
epochs = range(1, len(loss) + 1)
plt.plot(epochs, loss, 'bo', label='Training loss')
plt.plot(epochs, val_loss, 'b', label='Validation loss')
plt.title('Training and validation loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()
plt.show()
plt.clf() # clear figure
acc = history.history['accuracy']
val_acc = history.history['val_accuracy']
plt.plot(epochs, acc, 'bo', label='Training acc')
plt.plot(epochs, val_acc, 'b', label='Validation acc')
plt.title('Training and validation accuracy')
plt.xlabel('Epochs')
plt.ylabel('Accuracy')
plt.legend()
plt.show()