{
  "cells": [
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "collapsed": false
      },
      "outputs": [],
      "source": [
        "%matplotlib inline"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "# Face recognition (Olivetti faces)\n\nThis dataset contains a set of face images taken between April 1992\nand April 1994 at AT&T Laboratories Cambridge.\nImage data is typically embedded in very high-dimensional spaces,\nwhich might be prone to hubness.\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "collapsed": false
      },
      "outputs": [],
      "source": [
        "import numpy as np\n",
        "from sklearn.datasets import fetch_olivetti_faces\n",
        "from sklearn.model_selection import cross_val_score, StratifiedKFold, RandomizedSearchCV\n",
        "\n",
        "from skhubness import Hubness\n",
        "from skhubness.neighbors import KNeighborsClassifier\n",
        "\n",
        "# Hubness reduction settings compared throughout this example\n",
        "# (None means: no hubness reduction).\n",
        "HUBNESS_METHODS = [None, 'dsl', 'ls', 'mp']\n",
        "\n",
        "# Fetch data and have a look.\n",
        "# The loader is imported directly from `sklearn.datasets` (its public location)\n",
        "# instead of going through the `olivetti_faces` submodule, which recent\n",
        "# scikit-learn releases no longer expose.\n",
        "faces = fetch_olivetti_faces()\n",
        "X, y = faces['data'], faces['target']\n",
        "print(f'Data shape: {X.shape}')\n",
        "print(f'Label shape: {y.shape}')\n",
        "# (400, 4096)\n",
        "# (400,)\n",
        "\n",
        "# The data is embedded in a high-dimensional space.\n",
        "# Is there hubness, and can we reduce it?\n",
        "for method in HUBNESS_METHODS:\n",
        "    hub = Hubness(k=10, hubness=method, return_value='k_skewness')\n",
        "    hub.fit(X)\n",
        "    skewness = hub.score()\n",
        "    print(f'Hubness (10-skew): {skewness:.3f} with hubness reduction: {method}')\n",
        "# Hubness (10-skew): 1.972 with hubness reduction: None\n",
        "# Hubness (10-skew): 1.526 with hubness reduction: dsl\n",
        "# Hubness (10-skew): 0.943 with hubness reduction: ls\n",
        "# Hubness (10-skew): 0.184 with hubness reduction: mp\n",
        "\n",
        "# There is some hubness, and all hubness reduction methods can reduce it (to varying degree)\n",
        "# Let's assess the best kNN strategy and its estimated performance.\n",
        "# Two independently seeded splitters: `cv_select` drives the inner\n",
        "# hyperparameter search, `cv_perf` the outer performance estimate.\n",
        "cv_perf = StratifiedKFold(n_splits=5, shuffle=True, random_state=7263)\n",
        "cv_select = StratifiedKFold(n_splits=5, shuffle=True, random_state=32634)\n",
        "\n",
        "knn = KNeighborsClassifier(algorithm_params={'n_candidates': 100})\n",
        "\n",
        "# specify parameters and distributions to sample from\n",
        "param_dist = {'n_neighbors': np.arange(1, 26),\n",
        "              'weights': ['uniform', 'distance'],\n",
        "              'hubness': HUBNESS_METHODS}\n",
        "\n",
        "# Inner cross-validation to select best hyperparameters (incl hubness reduction method)\n",
        "search = RandomizedSearchCV(estimator=knn,\n",
        "                            param_distributions=param_dist,\n",
        "                            n_iter=100,\n",
        "                            cv=cv_select,\n",
        "                            random_state=2345,\n",
        "                            verbose=1)\n",
        "\n",
        "# Outer cross-validation to estimate performance\n",
        "scores = cross_val_score(search, X, y, cv=cv_perf, verbose=1)\n",
        "print(f'Scores: {scores}')\n",
        "print(f'Mean acc = {scores.mean():.3f} +/- {scores.std():.3f}')\n",
        "\n",
        "# Select model that maximizes accuracy\n",
        "search.fit(X, y)\n",
        "\n",
        "# The best model's parameters\n",
        "print(search.best_params_)\n",
        "\n",
        "# Does it correspond to the results of hubness reduction above?\n",
        "# Scores: [0.95   0.9625 1.     0.95   0.925 ]\n",
        "# Mean acc = 0.957 +/- 0.024\n",
        "# {'weights': 'distance', 'n_neighbors': 23, 'hubness': 'mp'}"
      ]
    }
  ],
  "metadata": {
    "kernelspec": {
      "display_name": "Python 3",
      "language": "python",
      "name": "python3"
    },
    "language_info": {
      "codemirror_mode": {
        "name": "ipython",
        "version": 3
      },
      "file_extension": ".py",
      "mimetype": "text/x-python",
      "name": "python",
      "nbconvert_exporter": "python",
      "pygments_lexer": "ipython3",
      "version": "3.7.3"
    }
  },
  "nbformat": 4,
  "nbformat_minor": 0
}