{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "## sklearn - scikit-learn, machine learning made easy" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [], "source": [ "import matplotlib.pyplot as plt\n", "import seaborn as sns\n", "import sklearn\n", "import pandas as pd\n", "import numpy as np\n", "from sklearn.ensemble import RandomForestClassifier" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## General considerations\n", "\n", "In the end, you want to apply your classifier on data that you have never seen before.\n", "That means, your training-set should be representative of the whole population.\n", "Generally, you do not really know the full specs of the whole population.\n", "\n", "The easiest way is to just split your data into a train- and a test-set.\n", "Split your data, such that you train your model (and select your parameters) on the train-set, and evaluate it on the (before unseen) test-set\n" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [], "source": [ "iris = sns.load_dataset('iris')\n", "\n", "# convert string label (target) to number; `labels` maps each code back to its species name\n", "iris['species_num'], labels = pd.factorize(iris.species)\n", "\n", "# prepare train and test set, use roughly 75% as train\n", "# (a seeded generator keeps the split identical on every Restart & Run All)\n", "rng = np.random.RandomState(42)\n", "iris['train'] = rng.uniform(0, 1, len(iris)) <= .75\n", "train, test = iris[iris.train], iris[~iris.train]\n", "\n", "# create index of columns to use as features (the first four columns)\n", "features = iris.columns[:4]\n", "\n", "X_train = train[features]\n", "y_train = train.species_num\n", "\n", "X_test = test[features]" ] }, { "cell_type": "code", "execution_count": 21, "metadata": {}, "outputs": [ { "data": { "image/svg+xml": [ "\n", "\n", "\n", "\n", "\n", "\n", "Tree\n", "\n", "\n", "0\n", "\n", "petal_length ≤ 2.6\n", "gini = 0.666\n", "samples = 114\n", "value = [36, 38, 40]\n", "class = setosa\n", "\n", "\n", "1\n", "\n", "gini = 0.0\n", "samples = 36\n", "value = [36, 0, 0]\n", "class = setosa\n", "\n", "\n", 
"0->1\n", "\n", "\n", "True\n", "\n", "\n", "2\n", "\n", "petal_width ≤ 1.65\n", "gini = 0.5\n", "samples = 78\n", "value = [0, 38, 40]\n", "class = setosa\n", "\n", "\n", "0->2\n", "\n", "\n", "False\n", "\n", "\n", "3\n", "\n", "petal_length ≤ 4.95\n", "gini = 0.139\n", "samples = 40\n", "value = [0, 37, 3]\n", "class = setosa\n", "\n", "\n", "2->3\n", "\n", "\n", "\n", "\n", "8\n", "\n", "petal_length ≤ 4.85\n", "gini = 0.051\n", "samples = 38\n", "value = [0, 1, 37]\n", "class = setosa\n", "\n", "\n", "2->8\n", "\n", "\n", "\n", "\n", "4\n", "\n", "gini = 0.0\n", "samples = 36\n", "value = [0, 36, 0]\n", "class = setosa\n", "\n", "\n", "3->4\n", "\n", "\n", "\n", "\n", "5\n", "\n", "petal_width ≤ 1.55\n", "gini = 0.375\n", "samples = 4\n", "value = [0, 1, 3]\n", "class = setosa\n", "\n", "\n", "3->5\n", "\n", "\n", "\n", "\n", "6\n", "\n", "gini = 0.0\n", "samples = 3\n", "value = [0, 0, 3]\n", "class = setosa\n", "\n", "\n", "5->6\n", "\n", "\n", "\n", "\n", "7\n", "\n", "gini = 0.0\n", "samples = 1\n", "value = [0, 1, 0]\n", "class = setosa\n", "\n", "\n", "5->7\n", "\n", "\n", "\n", "\n", "9\n", "\n", "sepal_width ≤ 3.1\n", "gini = 0.444\n", "samples = 3\n", "value = [0, 1, 2]\n", "class = setosa\n", "\n", "\n", "8->9\n", "\n", "\n", "\n", "\n", "12\n", "\n", "gini = 0.0\n", "samples = 35\n", "value = [0, 0, 35]\n", "class = setosa\n", "\n", "\n", "8->12\n", "\n", "\n", "\n", "\n", "10\n", "\n", "gini = 0.0\n", "samples = 2\n", "value = [0, 0, 2]\n", "class = setosa\n", "\n", "\n", "9->10\n", "\n", "\n", "\n", "\n", "11\n", "\n", "gini = 0.0\n", "samples = 1\n", "value = [0, 1, 0]\n", "class = setosa\n", "\n", "\n", "9->11\n", "\n", "\n", "\n", "\n", "\n" ], "text/plain": [ "" ] }, "execution_count": 21, "metadata": {}, "output_type": "execute_result" } ], "source": [ "##### Train a decision tree classifier\n", "from sklearn import tree\n", "clf = tree.DecisionTreeClassifier()\n", "clf.fit(X_train, y_train)\n", "# predict on test-set and evaluate\n", "preds 
= clf.predict(X_test)\n", "pd.crosstab(test.species, labels[preds], rownames=['Actual species'], colnames=['Predicted species'] )\n", "\n", "import graphviz \n", "# class_names must be the class names in ascending order of the encoded target,\n", "# i.e. the factorized `labels` - not the per-row species column, which would\n", "# label every node with one of its first entries ('setosa')\n", "dot_data = tree.export_graphviz(\n", "    clf,\n", "    out_file=None,\n", "    feature_names=list(features),\n", "    class_names=list(labels),\n", "    filled=True,\n", "    rounded=True,\n", "    special_characters=True,\n", ")\n", "graph = graphviz.Source(dot_data) \n", "graph" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Support Vector Machine\n", "\n", "* Maximizes margin of datapoints when computing separating hyperplane\n", "* Multi-class: one-vs-one classification (trains `n_classes * (n_classes - 1) / 2` classifiers)\n", "\n", "\n", "\n", "* Kernel trick\n", "\n", "\n", "\n", "[source] http://www.codecops.in/2015/11/using-svm-classifier.html\n", "[source] http://blog.hackerearth.com/simple-tutorial-svm-parameter-tuning-python-r" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Predicted speciessetosaversicolorvirginica
Actual species
setosa1400
versicolor0120
virginica0010
\n", "
" ], "text/plain": [ "Predicted species setosa versicolor virginica\n", "Actual species \n", "setosa 14 0 0\n", "versicolor 0 12 0\n", "virginica 0 0 10" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "##### Train a support vector machine\n", "from sklearn.svm import SVC\n", "clf = SVC()\n", "clf.fit(X_train, y_train)\n", "# predict on test-set and evaluate\n", "preds = clf.predict(X_test)\n", "pd.crosstab(test.species, labels[preds], rownames=['Actual species'], colnames=['Predicted species'] )" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Random Forest\n", "\n", "\n", "\n", "[source] https://www.kdnuggets.com/2017/10/random-forests-explained.html" ] }, { "cell_type": "code", "execution_count": 22, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Predicted speciessetosaversicolorvirginica
Actual species
setosa1400
versicolor0111
virginica0010
\n", "
" ], "text/plain": [ "Predicted species setosa versicolor virginica\n", "Actual species \n", "setosa 14 0 0\n", "versicolor 0 11 1\n", "virginica 0 0 10" ] }, "execution_count": 22, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# train a RandomForestClassifier\n", "clf = RandomForestClassifier(n_jobs=4, random_state=0)\n", "clf.fit(X_train, y_train)\n", "# predict on test-set and evaluate\n", "preds = clf.predict(X_test)\n", "pd.crosstab(test.species, labels[preds], rownames=['Actual species'], colnames=['Predicted species'] )" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Model evaluation\n", "\n", "Similar to the unified classifier-interface, sklearn offers a wide variety of algorithms and metrics\n", "to evaluate your models (more at: http://scikit-learn.org/stable/modules/model_evaluation.html).\n", "You can easily calculate and print most of the common metrics and scorings:" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0.9444444444444444" ] }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# evaluate metrics\n", "from sklearn.metrics import accuracy_score\n", "accuracy_score(test.species, labels[preds])" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " precision recall f1-score support\n", "\n", " setosa 1.00 1.00 1.00 14\n", " versicolor 0.92 0.92 0.92 12\n", " virginica 0.90 0.90 0.90 10\n", "\n", "avg / total 0.94 0.94 0.94 36\n", "\n" ] } ], "source": [ "from sklearn.metrics import classification_report\n", "print(classification_report(test.species, labels[preds],))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Model evaluation/tuning\n", "Now that we get a feeling of how the classifiers behave, how can we select the best classifier and its parameters based on the data we have?" 
] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Cross validation is like the train/test-split on steroids; it gives a statistically more reliable estimate of the model's generalization capabilities:\n", "\n", "\n", "\n", "[source] https://upload.wikimedia.org/wikipedia/commons/1/1c/K-fold_cross_validation_EN.jpg" ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([0.96, 0.98, 0.94])" ] }, "execution_count": 16, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# cross validation - several train/test splits\n", "from sklearn.model_selection import KFold, cross_val_score\n", "\n", "# fixed random_state: every model scored with this k_fold sees the same shuffled folds\n", "k_fold = KFold(n_splits=3, shuffle=True, random_state=0)\n", "cross_val_score(clf, iris[features], iris.species_num, cv=k_fold, n_jobs=-1)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "We can evaluate several models on the same k splits and decide which one performs best (the metric can also be selected).\n", "\n", "Hyper-parameters of models can be tuned using one of the following methods:\n", "\n", "More info here: http://scikit-learn.org/stable/tutorial/statistical_inference/model_selection.html\n", "\n", "* GridSearchCV\n", "* RandomizedSearchCV\n", "* Hyperopt (https://github.com/hyperopt/hyperopt)" ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "GridSearchCV(cv=None, error_score='raise',\n", " estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,\n", " max_features=None, max_leaf_nodes=None,\n", " min_impurity_decrease=0.0, min_impurity_split=None,\n", " min_samples_leaf=1, min_samples_split=2,\n", " min_weight_fraction_leaf=0.0, presort=False, random_state=None,\n", " splitter='best'),\n", " fit_params=None, iid=True, n_jobs=-1,\n", " param_grid={'max_features': [1, 2, 3], 'min_samples_split': [2, 3, 10], 'max_depth': [3, 10, 15]},\n", " pre_dispatch='2*n_jobs', refit=True, 
return_train_score=True,\n", " scoring=None, verbose=0)" ] }, "execution_count": 17, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# grid search over decision-tree hyper-parameters\n", "from sklearn.model_selection import GridSearchCV, cross_val_score\n", "from sklearn.tree import DecisionTreeClassifier\n", "\n", "param_grid = {\n", "    \"max_depth\": [3, 10, 15],\n", "    \"max_features\": [1, 2, 3],\n", "    \"min_samples_split\": [2, 3, 10],\n", "}\n", "# use an explicit estimator rather than whichever `clf` was fitted last, so the\n", "# search does not depend on the order in which earlier cells were executed\n", "gs = GridSearchCV(estimator=DecisionTreeClassifier(random_state=0), param_grid=param_grid, n_jobs=-1)\n", "gs.fit(X_train, y_train)" ] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0.9649122807017544" ] }, "execution_count": 18, "metadata": {}, "output_type": "execute_result" } ], "source": [ "gs.best_score_" ] }, { "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=10,\n", " max_features=2, max_leaf_nodes=None, min_impurity_decrease=0.0,\n", " min_impurity_split=None, min_samples_leaf=1,\n", " min_samples_split=3, min_weight_fraction_leaf=0.0,\n", " presort=False, random_state=None, splitter='best')" ] }, "execution_count": 19, "metadata": {}, "output_type": "execute_result" } ], "source": [ "gs.best_estimator_" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 2", "language": "python", "name": "python2" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 2 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython2", "version": "2.7.14" } }, "nbformat": 4, "nbformat_minor": 2 }