{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Vowpal Wabbit logistic model: hold-out split and AUC\n",
    "\n",
    "Splits a VW-format file (`train.vw`) 80/20, trains a logistic model with the `vw` command-line tool, scores the held-out 20% and computes the ROC AUC.\n",
    "\n",
    "The last recorded run of this notebook (seed 26, 4 passes, `ic` interaction, 26 weight bits) printed an AUC of 0.914318312778."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "from sklearn import datasets\n",
    "from sklearn import metrics\n",
    "from sklearn.model_selection import train_test_split\n",
    "import matplotlib.pyplot as plt"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Directory holding train.vw. Set the VW_DATA_DIR environment variable to run\n",
    "# against another dataset directory without editing the notebook.\n",
    "DATA_DIR = os.environ.get('VW_DATA_DIR', '/Users/padjiman/data/bankVW/')\n",
    "DATA_FILE = 'train.vw'\n",
    "\n",
    "TEST_SIZE = 0.20  # fraction of examples held out for evaluation\n",
    "SEED = 26         # fixes the shuffle so the split is reproducible"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Preparing train/test sets\n",
    "\n",
    "The VW file is handled as plain text, one example per line. Going through a CSV parser/writer instead would split or re-quote any example that contains a comma or a double quote."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def read_vw_lines(path):\n",
    "    '''Return the non-blank lines of a VW-format text file, newline stripped.'''\n",
    "    with open(path, encoding='utf-8') as handle:\n",
    "        return [line.rstrip('\\n') for line in handle if line.strip()]\n",
    "\n",
    "\n",
    "def write_vw_lines(path, lines):\n",
    "    '''Write one VW example per line, byte-for-byte as read (no CSV quoting).'''\n",
    "    with open(path, 'w', encoding='utf-8') as handle:\n",
    "        handle.writelines(line + '\\n' for line in lines)\n",
    "\n",
    "\n",
    "def read_vw_labels(path):\n",
    "    '''Return the label of every example in a VW file as a float array.\n",
    "\n",
    "    The label is the first whitespace-separated token before the first '|',\n",
    "    so an optional importance weight or tag after it is ignored.\n",
    "    '''\n",
    "    labels = []\n",
    "    with open(path, encoding='utf-8') as handle:\n",
    "        for line in handle:\n",
    "            if not line.strip():\n",
    "                continue\n",
    "            header_tokens = line.split('|', 1)[0].split()\n",
    "            if not header_tokens:\n",
    "                raise ValueError('unlabeled example in {}: {!r}'.format(path, line))\n",
    "            labels.append(float(header_tokens[0]))\n",
    "    return np.array(labels)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "examples = read_vw_lines(os.path.join(DATA_DIR, DATA_FILE))\n",
    "train_lines, test_lines = train_test_split(examples, test_size=TEST_SIZE, random_state=SEED)\n",
    "\n",
    "# Every example must land in exactly one of the two splits.\n",
    "assert len(train_lines) + len(test_lines) == len(examples)\n",
    "\n",
    "write_vw_lines(os.path.join(DATA_DIR, 'split_train.vw'), train_lines)\n",
    "write_vw_lines(os.path.join(DATA_DIR, 'split_test.vw'), test_lines)\n",
    "\n",
    "len(train_lines), len(test_lines)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Actual train and test\n",
    "\n",
    "* `-c -k` rebuilds the example cache needed for the multi-pass run. Without `-k`, an existing `split_train.vw.cache` is picked up instead of the text file (the earlier log showed *ignoring text input in favor of cache input*), which is unsafe because the split file is rewritten above.\n",
    "* `--loss_function logistic` is repeated at test time so that the reported *average loss* is the logistic loss. Without it the test run reported squared loss on the raw scores, which is why the earlier log showed a test loss of about 7.2 next to a training loss of about 0.24. The predictions written to `preds.txt` are not affected.\n",
    "* `--link logistic` makes `preds.txt` contain probabilities in [0, 1]."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "!cd \"$DATA_DIR\" && vw split_train.vw -c -k --passes 4 -f model.vw --loss_function logistic --interactions ic -b 26\n",
    "!cd \"$DATA_DIR\" && vw split_test.vw -t -i model.vw -p preds.txt --link logistic --loss_function logistic"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Calculating the AUC\n",
    "\n",
    "Predictions come from `preds.txt` (one per test example, in file order); the true labels are parsed from `split_test.vw`."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "preds = pd.read_csv(os.path.join(DATA_DIR, 'preds.txt'), header=None)[0].values\n",
    "labels = read_vw_labels(os.path.join(DATA_DIR, 'split_test.vw'))\n",
    "\n",
    "# A length mismatch means preds.txt was produced from a different split file.\n",
    "assert len(preds) == len(labels), 'preds.txt and split_test.vw are out of sync'\n",
    "\n",
    "auc = metrics.roc_auc_score(labels, preds)\n",
    "auc"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.5.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}