{
  "cells": [
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "collapsed": false
      },
      "outputs": [],
      "source": [
        "%matplotlib inline"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "\n# Retrieving GLOVE word vectors\n\n\nIn this example we will retrieve similar words from\nGLOVE embeddings with an ANNG graph.\n\nPrecomputed ground-truth nearest neighbors are available\nfrom [ANN benchmarks](http://ann-benchmarks.com/index.html#datasets).\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "collapsed": false
      },
      "outputs": [],
      "source": [
        "# For this example, the `h5py` package is required in addition to the requirements of scikit-hubness.\n",
        "# You may install it from PyPI by the following command (if you're in an IPython/Jupyter environment):\n",
        "# !pip install h5py\n",
        "\n",
        "import numpy as np\n",
        "import h5py\n",
        "from skhubness.neighbors import NearestNeighbors\n",
        "\n",
        "\n",
        "def recall_per_query(neigh_true, neigh_pred):\n",
        "    \"\"\"Return the recall of each query as a 1D float array.\n",
        "\n",
        "    Row ``i`` of ``neigh_pred`` is compared against row ``i`` of ``neigh_true`` only.\n",
        "    Intersecting with the whole prediction matrix instead would count a true neighbor\n",
        "    as recalled whenever it was returned for *any* query, because ``np.intersect1d``\n",
        "    flattens its inputs.\n",
        "\n",
        "    Parameters\n",
        "    ----------\n",
        "    neigh_true : 2D array, one row of ground-truth neighbor indices per query\n",
        "    neigh_pred : 2D array, one row of predicted neighbor indices per query\n",
        "\n",
        "    Returns\n",
        "    -------\n",
        "    recall : 1D array, fraction of ground-truth neighbors retrieved per query\n",
        "    \"\"\"\n",
        "    n_true = neigh_true.shape[1]\n",
        "    return np.array([np.intersect1d(true_i, pred_i).size / n_true\n",
        "                     for true_i, pred_i in zip(neigh_true, neigh_pred)])\n",
        "\n",
        "\n",
        "# Download the dataset with the following command.\n",
        "# If the dataset is already available in the current working dir, you can skip this:\n",
        "# !wget http://ann-benchmarks.com/glove-100-angular.hdf5\n",
        "# The datasets are read into memory (`[:]`), so that the file can be closed right away.\n",
        "with h5py.File('glove-100-angular.hdf5', 'r') as f:\n",
        "    # Extract the split and ground-truth\n",
        "    X_train = f['train'][:]\n",
        "    X_test = f['test'][:]\n",
        "    neigh_true = f['neighbors'][:]\n",
        "    dist = f['distances'][:]\n",
        "\n",
        "    # How many objects have we got?\n",
        "    for k in f.keys():\n",
        "        print(f'{k}: shape = {f[k].shape}')\n",
        "\n",
        "# APPROXIMATE NEAREST NEIGHBOR SEARCH\n",
        "# In order to retrieve most similar words from the GLOVE embeddings,\n",
        "# we use the unsupervised `skhubness.neighbors.NearestNeighbors` class.\n",
        "# The (approximate) nearest neighbor algorithm is set to NNG by passing `algorithm='nng'`.\n",
        "# We can pass additional parameters to `NNG` via the `algorithm_params` dict.\n",
        "# Here we set `n_jobs=8` to enable parallelism.\n",
        "# Create the nearest neighbor index\n",
        "nn_plain = NearestNeighbors(n_neighbors=100,\n",
        "                            algorithm='nng',\n",
        "                            algorithm_params={'n_candidates': 1_000,\n",
        "                                              'index_dir': 'auto',\n",
        "                                              'n_jobs': 8},\n",
        "                            verbose=2,\n",
        "                            )\n",
        "nn_plain.fit(X_train)\n",
        "\n",
        "# Note that NNG must save its index. By setting `index_dir='auto'`,\n",
        "# NNG will try to save it to shared memory, if available, otherwise to $TMP.\n",
        "# This index is NOT removed automatically, as one will typically want to build an index once and use it often.\n",
        "# Retrieve nearest neighbors for each test object\n",
        "neigh_pred_plain = nn_plain.kneighbors(X_test,\n",
        "                                       n_neighbors=100,\n",
        "                                       return_distance=False)\n",
        "\n",
        "# Calculate the recall per test object\n",
        "recall_plain = recall_per_query(neigh_true, neigh_pred_plain)\n",
        "\n",
        "# Statistics\n",
        "print(f'Mean = {recall_plain.mean():.4f}, '\n",
        "      f'stdev = {recall_plain.std():.4f}')\n",
        "\n",
        "\n",
        "# ANN with HUBNESS REDUCTION\n",
        "# Here we set `n_candidates=1000`, so that for each query,\n",
        "# 1000 neighbors will be retrieved first by `NNG`,\n",
        "# that are subsequently refined by hubness reduction.\n",
        "# Hubness reduction is performed by local scaling as specified with `hubness='ls'`.\n",
        "# Creating the NN index with hubness reduction enabled\n",
        "nn = NearestNeighbors(n_neighbors=100,\n",
        "                      algorithm='nng',\n",
        "                      algorithm_params={'n_candidates': 1_000,\n",
        "                                        'n_jobs': 8},\n",
        "                      hubness='ls',\n",
        "                      verbose=2,\n",
        "                      )\n",
        "nn.fit(X_train)\n",
        "\n",
        "# Retrieve nearest neighbors for each test object\n",
        "neigh_pred = nn.kneighbors(X_test,\n",
        "                           n_neighbors=100,\n",
        "                           return_distance=False)\n",
        "\n",
        "# Measure recall per object and on average\n",
        "recall = recall_per_query(neigh_true, neigh_pred)\n",
        "print(f'Mean = {recall.mean():.4f}, '\n",
        "      f'stdev = {recall.std():.4f}')\n",
        "\n",
        "# If the second results are significantly better than the first,\n",
        "# this could indicate that the chosen ANN method is more prone\n",
        "# to hubness than exact NN, which might be an interesting research question."
      ]
    }
  ],
  "metadata": {
    "kernelspec": {
      "display_name": "Python 3",
      "language": "python",
      "name": "python3"
    },
    "language_info": {
      "codemirror_mode": {
        "name": "ipython",
        "version": 3
      },
      "file_extension": ".py",
      "mimetype": "text/x-python",
      "name": "python",
      "nbconvert_exporter": "python",
      "pygments_lexer": "ipython3",
      "version": "3.7.3"
    }
  },
  "nbformat": 4,
  "nbformat_minor": 0
}