{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 125,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "from sklearn import datasets\n",
    "from sklearn.model_selection import train_test_split\n",
    "import matplotlib.pyplot as plt"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Preparing train/test sets"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 126,
   "metadata": {},
   "outputs": [],
   "source": [
    "#directory = \"/Users/padjiman/data/KDDCUP1998/\"\n",
    "directory = \"/Users/padjiman/data/bankVW/\"\n",
    "data_file = \"train.vw\"\n",
    "data = pd.read_csv(directory+data_file, header=None)\n",
    "train, test = train_test_split(data, test_size=0.20 , random_state = 26 )\n",
    "train.to_csv(directory+'split_train.vw', index=False, header=None)\n",
    "test.to_csv(directory+'split_test.vw', index=False, header=None)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Actual train and test"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 128,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "creating features for following interactions: ic \n",
      "final_regressor = model.vw\n",
      "Num weight bits = 26\n",
      "learning rate = 0.5\n",
      "initial_t = 0\n",
      "power_t = 0.5\n",
      "decay_learning_rate = 1\n",
      "using cache_file = split_train.vw.cache\n",
      "ignoring text input in favor of cache input\n",
      "num sources = 1\n",
      "average  since         example        example  current  current  current\n",
      "loss     last          counter         weight    label  predict features\n",
      "0.693147 0.693147            1            1.0   1.0000   0.0000       60\n",
      "1.109843 1.526538            2            2.0  -1.0000   1.2815       60\n",
      "0.760195 0.410548            4            4.0  -1.0000  -0.2754       60\n",
      "0.465710 0.171224            8            8.0  -1.0000  -0.7350       50\n",
      "0.318235 0.170760           16           16.0  -1.0000  -2.7830       60\n",
      "0.531552 0.744869           32           32.0  -1.0000  -6.1178       60\n",
      "2.579228 4.626905           64           64.0  -1.0000  -3.2457       60\n",
      "2.194169 1.809109          128          128.0  -1.0000  -2.9788       60\n",
      "2.114863 2.035556          256          256.0  -1.0000  -1.4744       60\n",
      "1.616420 1.117978          512          512.0  -1.0000  -1.9029       60\n",
      "1.143370 0.670321         1024         1024.0  -1.0000  -3.5995       60\n",
      "0.766199 0.389027         2048         2048.0  -1.0000  -3.3793       60\n",
      "0.530801 0.295404         4096         4096.0  -1.0000  -4.4217       60\n",
      "0.389938 0.249075         8192         8192.0  -1.0000  -2.8836       60\n",
      "0.318798 0.247658        16384        16384.0  -1.0000  -3.9569       70\n",
      "0.263437 0.263437        32768        32768.0  -1.0000  -4.6966       60 h\n",
      "0.252186 0.240939        65536        65536.0  -1.0000  -4.0019       60 h\n",
      "\n",
      "finished run\n",
      "number of examples per pass = 32552\n",
      "passes used = 4\n",
      "weighted example sum = 130208.000000\n",
      "weighted label sum = -99792.000000\n",
      "average loss = 0.239227 h\n",
      "best constant = -2.023110\n",
      "best constant's loss = 0.360496\n",
      "total feature number = 7949680\n",
      "creating features for following interactions: ic \n",
      "only testing\n",
      "predictions = preds.txt\n",
      "Num weight bits = 26\n",
      "learning rate = 0.5\n",
      "initial_t = 0\n",
      "power_t = 0.5\n",
      "using no cache\n",
      "Reading datafile = split_test.vw\n",
      "num sources = 1\n",
      "average  since         example        example  current  current  current\n",
      "loss     last          counter         weight    label  predict features\n",
      "1.358600 1.358600            1            1.0  -1.0000   0.1029       60\n",
      "7.592927 13.827254            2            2.0  -1.0000   0.0088       60\n",
      "9.232505 10.872083            4            4.0   1.0000   0.1225       60\n",
      "9.071785 8.911065            8            8.0  -1.0000   0.1698       60\n",
      "9.430997 9.790208           16           16.0  -1.0000   0.0333       60\n",
      "8.353756 7.276516           32           32.0   1.0000   0.2188       60\n",
      "8.469951 8.586146           64           64.0  -1.0000   0.0321       60\n",
      "7.585703 6.701455          128          128.0  -1.0000   0.0199       60\n",
      "7.449527 7.313351          256          256.0  -1.0000   0.2331       60\n",
      "7.174785 6.900042          512          512.0  -1.0000   0.1774       60\n",
      "7.090911 7.007037         1024         1024.0  -1.0000   0.1080       60\n",
      "7.169777 7.248644         2048         2048.0  -1.0000   0.0220       60\n",
      "7.188963 7.208148         4096         4096.0  -1.0000   0.0207       60\n",
      "7.211928 7.234893         8192         8192.0  -1.0000   0.0065       60\n",
      "\n",
      "finished run\n",
      "number of examples per pass = 9043\n",
      "passes used = 1\n",
      "weighted example sum = 9043.000000\n",
      "weighted label sum = -6955.000000\n",
      "average loss = 7.205263\n",
      "best constant = -0.769103\n",
      "best constant's loss = 0.408480\n",
      "total feature number = 552460\n"
     ]
    }
   ],
   "source": [
    "!cd $directory && vw split_train.vw -c --passes 4 -f model.vw --loss_function logistic --interactions ic -b 26\n",
    "!cd $directory && vw split_test.vw -t -i model.vw -p preds.txt  --link logistic"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Calculating the AUC"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 129,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0.914318312778\n"
     ]
    }
   ],
   "source": [
    "preds = pd.read_csv(directory+'preds.txt', header=None)\n",
    "test_split = pd.read_csv(directory+'split_test.vw', header=None, sep = '|')\n",
    "from sklearn import metrics\n",
    "fpr, tpr, thresholds = metrics.roc_curve(test_split[0].values, preds[0].values)\n",
    "auc = metrics.auc(fpr, tpr)\n",
    "print(auc)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.5.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}