""" The following are sample solutions ot exercise set 5. It is a single python file, but you probably want to run it piece by piece, or on a python command line to undersatand, and experiment further. """ import pandas as pd import numpy as np from keras.models import Sequential from keras.layers import Dense, Activation from sklearn.cross_validation import StratifiedKFold, cross_val_score from sklearn.linear_model import LogisticRegression from sklearn.decomposition import PCA import matplotlib.pyplot as plt def train_test(model, x, y, folds): """ This function trains and tests a Keras model with k-fold cv. 'folds' is the array returned by sklearn *KFold splits. """ acc_sum = 0 for trn_i, test_i in folds: model.fit(x[trn_i], y[trn_i], nb_epoch=1) _ , acc = model.evaluate(x[test_i], y[test_i]) acc_sum += acc return acc_sum/len(folds) # data processing d = pd.read_csv('exercises5-data.csv') # # Keras requires the training/test sets to be matrices, slicing from # Pandas data frame creates "arrays", .as_matrix() method takes care # of this conversion. # x = d.iloc[:, 1:21].as_matrix() y = d.iloc[:, 0].as_matrix() # # This creates an index to be used cross validation # folds = StratifiedKFold(y, 10) # #1 sklearn LogisticRegression # we use direclty use cross validation evaluation of sklearn lm = LogisticRegression() lm_accuracy = cross_val_score(lm, x, y, cv=10, scoring='accuracy') print(lm_accuracy.mean()) #2 the single-layer model # single_layer = Sequential() single_layer.add(Dense(output_dim=1, input_dim=20, activation='sigmoid')) single_layer.compile(loss='binary_crossentropy', optimizer='sgd', metrics=['accuracy']) single_layer_accuracy = train_test(single_layer, x, y, folds) print(single_layer_accuracy.mean()) #3 MLP models with varying number of hidden units def mlp_model(nhidden): """ This function creates two-layer networks """ m = Sequential() m.add(Dense(input_dim=20, output_dim=nhidden, activation='relu')) m.add(Dense(output_dim=1, activation='sigmoid')) m.compile(loss='binary_crossentropy', optimizer='sgd', metrics=['accuracy']) return m mlp1 = mlp_model(1) mlp1_accuracy = train_test(mlp1, x, y, folds) mlp10 = mlp_model(10) mlp10_accuracy = train_test(mlp10, x, y, folds) mlp100 = mlp_model(100) mlp100_accuracy = train_test(mlp100, x, y, folds) mlp1000 = mlp_model(1000) mlp1000_accuracy = train_test(mlp1000, x, y, folds) print((mlp1_accuracy, mlp10_accuracy, mlp100_accuracy, mlp1000_accuracy)) #4 PCA pca = PCA() x_tr = pca.fit_transform(x) # Let's se how much variance is explained by each dimension: print(pca.explained_variance_ratio_.round(4)) # If you scale the data before preforming PCA, # d_norm = (d - d.mean()) / d.std() # x_norm = d_norm.iloc[:, 1:21].as_matrix() # you will see that more variaotion is explained by PC2 & PC3, but # still 95% of the variance is explained by PC1 # Although PC1 is contains 99% of the explained varience, # if you look at the transformed data closely, # you will see that the intersting part for classification is between # PC2 and PC2 (note that python indexes start from 0) plt.scatter(x_tr[:999,1], x_tr[:999,2], marker=".", color="red") plt.scatter(x_tr[1000:,1], x_tr[1000:,2], marker=".", color="blue") plt.show() #5 Logistic regression with transformed data # The configuration of PC2 & PC3 suggests that the classes are two # concentric circles (with some noise), so the radius of the circle # should be a good discriminator. remembering that r**2 = x**2 + # y**2, you can just do a quadratic transformation of these # variables. 
pc2 = x_tr[:, 1]
pc3 = x_tr[:, 2]
pc2_2 = pc2 * pc2
pc3_2 = pc3 * pc3
xnew = np.vstack((pc2_2, pc3_2)).T  # a plain two-column array is enough here
lm2 = LogisticRegression()
lm2_accuracy = cross_val_score(lm2, xnew, y, cv=10, scoring='accuracy')
print(lm2_accuracy.mean())

#
# Alternatively, the same transformation should work fine on the
# non-transformed original data, since PCA performs only a linear
# transformation.
#
# This would be more expensive (more predictors), but it would not
# have any information loss due to dimensionality reduction.
#
xnew = (d * d).iloc[:, 1:21].as_matrix()
lm3 = LogisticRegression()
lm3_accuracy = cross_val_score(lm3, xnew, y, cv=10, scoring='accuracy')
print(lm3_accuracy.mean())
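
#
# As a variant (a sketch, not part of the original solution), the same
# kind of quadratic transformation can be expressed with sklearn's
# PolynomialFeatures in a pipeline. Note that degree=2 keeps the
# original features and adds all pairwise products x_i * x_j as well
# as the squares, so the feature set is richer than (d * d) above.
#
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import PolynomialFeatures

quad_lm = make_pipeline(PolynomialFeatures(degree=2, include_bias=False),
                        LogisticRegression())
quad_lm_accuracy = cross_val_score(quad_lm, x, y, cv=10, scoring='accuracy')
print(quad_lm_accuracy.mean())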