Preface
Agenda
Review of K-fold cross-validation
Steps for cross-validation:
Benefits of cross-validation:
Drawback of cross-validation:
Review of parameter tuning using cross_val_score
Goal: Select the best tuning parameters (aka "hyperparameters") for KNN on the iris dataset
- test1.py
- #!/usr/bin/env python
- from sklearn.datasets import load_iris
- from sklearn.neighbors import KNeighborsClassifier
- from sklearn.cross_validation import cross_val_score
- import matplotlib.pyplot as plt
- # Read in the iris data
- iris = load_iris()
- # Create X(features) and y(response)
- X = iris.data
- y = iris.target
- # 10-fold cross-validation with K=5 for KNN
- knn = KNeighborsClassifier(n_neighbors=5)
- scores = cross_val_score(knn, X, y, cv=10, scoring='accuracy')
- print "KNN(n=5) with accuracy=%.02f on iris dataset." % scores.mean()
- # Search for an optimal value of K for KNN
- k_range = range(1, 31)
- k_scores = []
- for k in k_range:
-     knn = KNeighborsClassifier(n_neighbors=k)
-     scores = cross_val_score(knn, X, y, cv=10, scoring='accuracy')
-     k_scores.append(scores.mean())
- plt.plot(k_range, k_scores)
- plt.xlabel("Value of K for KNN")
- plt.ylabel("Cross-Validated Accuracy")
- plt.show()
Execution result:
More efficient parameter tuning using GridSearchCV
- test2.py
- #!/usr/bin/env python
- from sklearn.datasets import load_iris
- from sklearn.neighbors import KNeighborsClassifier
- from sklearn.cross_validation import cross_val_score
- import matplotlib.pyplot as plt
- from sklearn.grid_search import GridSearchCV
- # Read in the iris data
- iris = load_iris()
- # Create X(features) and y(response)
- X = iris.data
- y = iris.target
- # Instantiate the KNN classifier (n_neighbors is overridden by the grid search below)
- knn = KNeighborsClassifier(n_neighbors=5)
- # Define the parameter values that should be searched
- k_range = range(1, 31)
- # Create a parameter grid: map the parameter names to the values that should be searched
- param_grid = dict(n_neighbors=k_range)
- # Instantiate the grid
- grid = GridSearchCV(knn, param_grid, cv=10, scoring='accuracy')
- # fit the grid with data
- grid.fit(X, y)
- # Examine the first tuple
- print "Tuple0 using parameter=%s" % grid.grid_scores_[0].parameters
- print "Tuple0 scores of 10-fold CV scores:\n%s\n" % grid.grid_scores_[0].cv_validation_scores
- print "Tuple0 with mean of 10-fold CV score=%.02f" % grid.grid_scores_[0].mean_validation_score
- # Create a list of the mean scores only
- grid_mean_scores = [result.mean_validation_score for result in grid.grid_scores_]
- # Plot the results
- #import matplotlib.pyplot as plt
- #plt.plot(k_range, grid_mean_scores)
- #plt.xlabel("Value of K for KNN")
- #plt.ylabel("Cross-Validated Accuracy")
- #plt.show()
- # Examine the best model
- print "Best score=%.02f" % grid.best_score_
- print "Best param=%s" % grid.best_params_
- print "Best estimator:\n%s\n" % grid.best_estimator_
Searching multiple parameters simultaneously
- test3.py
- #!/usr/bin/env python
- from sklearn.datasets import load_iris
- from sklearn.neighbors import KNeighborsClassifier
- from sklearn.cross_validation import cross_val_score
- import matplotlib.pyplot as plt
- from sklearn.grid_search import GridSearchCV
- # Read in the iris data
- iris = load_iris()
- # Create X(features) and y(response)
- X = iris.data
- y = iris.target
- # Instantiate the KNN classifier (n_neighbors and weights are overridden by the grid search below)
- knn = KNeighborsClassifier(n_neighbors=5)
- # define the parameter values that should be searched
- k_range = range(1, 31)
- weight_options = ['uniform', 'distance']
- # Create a parameter grid: map the parameter names to the values that should be searched
- param_grid = dict(n_neighbors=k_range, weights=weight_options)
- # Instantiate and fit the grid
- grid = GridSearchCV(knn, param_grid, cv=10, scoring='accuracy')
- grid.fit(X, y)
- # Examine the first tuple
- print "Tuple0 using parameter=%s" % grid.grid_scores_[0].parameters
- print "Tuple0 scores of 10-fold CV scores:\n%s\n" % grid.grid_scores_[0].cv_validation_scores
- print "Tuple0 with mean of 10-fold CV score=%.02f" % grid.grid_scores_[0].mean_validation_score
- # Create a list of the mean scores only
- grid_mean_scores = [result.mean_validation_score for result in grid.grid_scores_]
- # Plot the results
- #import matplotlib.pyplot as plt
- #plt.plot(k_range, grid_mean_scores)
- #plt.xlabel("Value of K for KNN")
- #plt.ylabel("Cross-Validated Accuracy")
- #plt.show()
- # Examine the best model
- print "Best score=%.02f" % grid.best_score_
- print "Best param=%s" % grid.best_params_
- print "Best estimator:\n%s\n" % grid.best_estimator_
- # train your model using all data and the best known parameters
- knn = KNeighborsClassifier(n_neighbors=13, weights='uniform')
- knn.fit(X, y)
- # make a prediction on out-of-sample data
- knn.predict([[3, 5, 4, 2]])
- # Shortcut: GridSearchCV automatically refits the best model using all of the data
- grid.predict([[3, 5, 4, 2]])
* Searching many different parameters at once may be computationally infeasible.
* RandomizedSearchCV searches a random subset of the parameter combinations, and you control the computational "budget" via n_iter.
- test4.py
- #!/usr/bin/env python
- from sklearn.datasets import load_iris
- from sklearn.neighbors import KNeighborsClassifier
- from sklearn.cross_validation import cross_val_score
- import matplotlib.pyplot as plt
- from sklearn.grid_search import RandomizedSearchCV
- # Read in the iris data
- iris = load_iris()
- # Create X(features) and y(response)
- X = iris.data
- y = iris.target
- # Instantiate the KNN classifier (n_neighbors and weights are overridden by the randomized search below)
- knn = KNeighborsClassifier(n_neighbors=5)
- # define the parameter values that should be searched
- k_range = range(1, 31)
- weight_options = ['uniform', 'distance']
- # Specify "parameter distributions" rather than a "parameter grid"
- param_dist = dict(n_neighbors=k_range, weights=weight_options)
- # n_iter controls the number of random parameter combinations that are tried
- rand = RandomizedSearchCV(knn, param_dist, cv=10, scoring='accuracy', n_iter=10, random_state=5)
- rand.fit(X, y)
- print rand.grid_scores_
- # Examine the best model
- print "Best score=%.02f" % rand.best_score_
- print "Best param=%s" % rand.best_params_
- print "Best estimator:\n%s\n" % rand.best_estimator_
- # Run RandomizedSearchCV 20 times (with n_iter=10) and record the best score
- best_scores = []
- for i in range(20):
-     rand = RandomizedSearchCV(knn, param_dist, cv=10, scoring='accuracy', n_iter=10)
-     rand.fit(X, y)
-     best_scores.append(round(rand.best_score_, 3))
- print "Best scores collected over 20 runs:\n%s\n" % best_scores
* Grid search user guide: http://scikit-learn.org/stable/module...
* GridSearchCV documentation: http://scikit-learn.org/stable/module...
* RandomizedSearchCV documentation: http://scikit-learn.org/stable/module...
* Comparing randomized search and grid search: http://scikit-learn.org/stable/auto_e...
* Randomized search video: https://youtu.be/0wUF_Ov8b0A?t=17m38s
* Randomized search notebook: http://nbviewer.ipython.org/github/am...
* Random Search for Hyper-Parameter Optimization (paper): http://www.jmlr.org/papers/volume13/b...