See More

{ "metadata": { "name": "" }, "nbformat": 3, "nbformat_minor": 0, "worksheets": [ { "cells": [ { "cell_type": "code", "collapsed": false, "input": [ "import statsmodels.formula.api as sm\n", "import pandas as pd\n", "import numpy as np" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "\n" ] } ], "prompt_number": 32 }, { "cell_type": "code", "collapsed": false, "input": [ "# NOTE: machine-specific absolute paths; point read_csv at your local copy of 2013_NCAA_Game.csv\n", "#df = pd.read_csv(\"/Users/williamliu/Dropbox/NYC-DAT-08/Homework_5/input/2013_NCAA_Game.csv\", sep=',')\n", "df = pd.read_csv(r\"C:\\Users\\wliu\\Dropbox\\NYC-DAT-08\\Homework_5\\input\\2013_NCAA_Game.csv\", sep=',')" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 33 }, { "cell_type": "code", "collapsed": false, "input": [ "print df.head()" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ " Date Team Opponent Team Score Opponent Score \\\n", "0 3/10/13 Albany NY Stony Brook 61 59 \n", "1 3/10/13 Stony Brook Albany NY 59 61 \n", "2 3/10/13 Indiana Michigan 72 71 \n", "3 3/10/13 Michigan Indiana 71 72 \n", "4 3/10/13 Michigan St Northwestern 71 61 \n", "\n", " Location Team Margin Team Result Team Location \\\n", "0 Albany NY 2 Win Home \n", "1 Albany NY -2 Loss Away \n", "2 Michigan 1 Win Away \n", "3 Michigan -1 Loss Home \n", "4 Michigan St 10 Win Home \n", "\n", " Team Avg Scoring Margin Opponent Average Scoring Margin \n", "0 4.12 9.90 \n", "1 9.90 4.12 \n", "2 18.55 11.93 \n", "3 11.93 18.55 \n", "4 8.67 -2.58 \n", "\n", "[5 rows x 11 columns]\n" ] } ], "prompt_number": 34 }, { "cell_type": "code", "collapsed": false, "input": [ "df.columns = ['Date', 'Team', 'Opponent', 'TeamScore', 'OpponentScore', 'Location', 'TeamMargin', 'TeamResult', 'TeamLocation', 'TeamAvgScoringMargin', 'OpponentAverageScoringMargin']" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 35 }, { "cell_type": "code", "collapsed": false, "input": [ 
"#print df.columns # Note from Haski: Use list comprehension, can reassign quicker" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 36 }, { "cell_type": "code", "collapsed": false, "input": [ "# No-op when df.columns has already been reassigned in an earlier cell; kept as the dict-based alternative\n", "df = df.rename(columns={'Team Score':'TeamScore', 'Opponent Score':'OpponentScore', 'Team Margin':'TeamMargin', 'Team Result':'TeamResult', 'Team Location':'TeamLocation', 'Team Avg Scoring Margin':'TeamAvgScoringMargin', 'Opponent Average Scoring Margin':'OpponentAverageScoringMargin'})" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 37 }, { "cell_type": "code", "collapsed": false, "input": [ "mycolumns = df.columns" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 38 }, { "cell_type": "code", "collapsed": false, "input": [ "print type(mycolumns)" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "\n" ] } ], "prompt_number": 39 }, { "cell_type": "code", "collapsed": false, "input": [ "# Get training and test samples, 25% test, 75% training\n", "from sklearn.cross_validation import train_test_split\n", "\n", "# Fixed seed so the split, and every score computed from it, is reproducible\n", "train, test = train_test_split(df, test_size=0.25, random_state=42)" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 40 }, { "cell_type": "code", "collapsed": false, "input": [ "# Training and test samples come back as NumPy arrays, not DataFrames\n", "print \"Train is type\", type(train)\n", "print \"Test is type\", type(test)" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "Train is type \n", "Test is type \n" ] } ], "prompt_number": 41 }, { "cell_type": "code", "collapsed": false, "input": [ "print test" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "[['1/9/13' 'Michigan' 'Nebraska' ..., 'Home' 11.93 -4.9]\n", " ['2/21/13' \"St Mary's CA\" 'BYU' ..., 'Home' 12.7 6.8]\n", " ['1/19/13' 'Nevada' 'Fresno St' ..., 'Away' -4.93
-3.15]\n", " ..., \n", " ['12/29/12' 'Tulsa' 'Florida St' ..., 'Neutral' -1.38 -1.0]\n", " ['3/2/13' 'Wofford' 'Citadel' ..., 'Home' -1.63 -11.41]\n", " ['11/15/12' 'Louisville' 'Samford' ..., 'Home' 15.55 -5.73]]\n" ] } ], "prompt_number": 42 }, { "cell_type": "code", "collapsed": false, "input": [ "test = pd.DataFrame(data=test, columns=mycolumns)\n", "train = pd.DataFrame(data=train, columns=mycolumns)" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 43 }, { "cell_type": "code", "collapsed": false, "input": [ "print train.head()" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ " Date Team Opponent TeamScore OpponentScore \\\n", "0 1/22/13 Memphis Tulane 71 60 \n", "1 2/7/13 Colorado Oregon 48 47 \n", "2 2/23/13 Cleveland St W Illinois 60 54 \n", "3 1/10/13 IPFW N Dakota St 55 67 \n", "4 11/9/12 St Bonaventure Bethune-Cookman 65 55 \n", "\n", " Location TeamMargin TeamResult TeamLocation TeamAvgScoringMargin \\\n", "0 Memphis 11 Win Home 10.58 \n", "1 Oregon 1 Win Away 4.8 \n", "2 W Illinois 6 Win Away -7.17 \n", "3 N Dakota St -12 Loss Away -1.9 \n", "4 St Bonaventure 10 Win Home 0.72 \n", "\n", " OpponentAverageScoringMargin \n", "0 4.33 \n", "1 7.9 \n", "2 4.92 \n", "3 9.96 \n", "4 -3.48 \n", "\n", "[5 rows x 11 columns]\n" ] } ], "prompt_number": 44 }, { "cell_type": "code", "collapsed": false, "input": [ "print type(train.Date)" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "\n" ] } ], "prompt_number": 45 }, { "cell_type": "code", "collapsed": false, "input": [ "# The frames were rebuilt from a mixed string/number NumPy array, so the numeric\n", "# columns must be cast back to float; patsy treats non-numeric columns as categorical.\n", "train.TeamScore = train.TeamScore.astype(np.float64)\n", "train.OpponentScore = train.OpponentScore.astype(np.float64)\n", "train.TeamMargin = train.TeamMargin.astype(np.float64)\n", "train.TeamAvgScoringMargin = train.TeamAvgScoringMargin.astype(np.float64)\n", "# Also a model input below: without this cast it is dummy-encoded instead of used as one numeric regressor\n", "train.OpponentAverageScoringMargin = train.OpponentAverageScoringMargin.astype(np.float64)\n" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 46 }, { 
"cell_type": "code", "collapsed": false, "input": [ "print type (train.TeamMargin)" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "\n" ] } ], "prompt_number": 47 }, { "cell_type": "code", "collapsed": false, "input": [ "print type(train.Team)" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "\n" ] } ], "prompt_number": 48 }, { "cell_type": "code", "collapsed": false, "input": [ "from patsy import dmatrices\n", "\n", "y, X = dmatrices('TeamMargin ~ Team + Opponent + TeamLocation + TeamAvgScoringMargin + OpponentAverageScoringMargin', data=train, return_type='dataframe')" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 49 }, { "cell_type": "code", "collapsed": false, "input": [ "from sklearn.linear_model import LinearRegression\n", "\n", "model = LinearRegression() # Using Ordinary Least Squares, r-squared is .58\n", "model = model.fit(X,y)\n", "print \" Classifier coefficient is: \", model.coef_\n", "print \" Classifier intercept is: \", model.intercept_\n", "print \" Classifier score is:\", model.score(X, y)" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ " Classifier coefficient is: [[ 0.00000000e+00 2.18520429e+12 6.51201778e+11 ..., -1.18382208e+10\n", " 1.94839646e+10 -2.72469363e+11]]\n", " Classifier intercept is: [ 3.04281271e+11]\n", " Classifier score is: 0.581173786337\n" ] } ], "prompt_number": 60 }, { "cell_type": "code", "collapsed": false, "input": [ "X.head()" ], "language": "python", "metadata": {}, "outputs": [ { "html": [ "

\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
InterceptTeam[T.Akron]Team[T.Alabama]Team[T.Alabama A&M]Team[T.Alabama St]Team[T.Albany NY]Team[T.Alcorn St]Team[T.American Univ]Team[T.Appalachian St]Team[T.Arizona]Team[T.Arizona St]Team[T.Ark Little Rock]Team[T.Ark Pine Bluff]Team[T.Arkansas]Team[T.Arkansas St]Team[T.Army]Team[T.Auburn]Team[T.Austin Peay]Team[T.BYU]Team[T.Ball St]
0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0...
1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0...
2 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0...
3 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0...
4 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0...
\n", "

5 rows \u00d7 1012 columns

\n", "
" ], "metadata": {}, "output_type": "pyout", "prompt_number": 61, "text": [ " Intercept Team[T.Akron] Team[T.Alabama] Team[T.Alabama A&M] \\\n", "0 1 0 0 0 \n", "1 1 0 0 0 \n", "2 1 0 0 0 \n", "3 1 0 0 0 \n", "4 1 0 0 0 \n", "\n", " Team[T.Alabama St] Team[T.Albany NY] Team[T.Alcorn St] \\\n", "0 0 0 0 \n", "1 0 0 0 \n", "2 0 0 0 \n", "3 0 0 0 \n", "4 0 0 0 \n", "\n", " Team[T.American Univ] Team[T.Appalachian St] Team[T.Arizona] \\\n", "0 0 0 0 \n", "1 0 0 0 \n", "2 0 0 0 \n", "3 0 0 0 \n", "4 0 0 0 \n", "\n", " Team[T.Arizona St] Team[T.Ark Little Rock] Team[T.Ark Pine Bluff] \\\n", "0 0 0 0 \n", "1 0 0 0 \n", "2 0 0 0 \n", "3 0 0 0 \n", "4 0 0 0 \n", "\n", " Team[T.Arkansas] Team[T.Arkansas St] Team[T.Army] Team[T.Auburn] \\\n", "0 0 0 0 0 \n", "1 0 0 0 0 \n", "2 0 0 0 0 \n", "3 0 0 0 0 \n", "4 0 0 0 0 \n", "\n", " Team[T.Austin Peay] Team[T.BYU] Team[T.Ball St] \n", "0 0 0 0 ... \n", "1 0 0 0 ... \n", "2 0 0 0 ... \n", "3 0 0 0 ... \n", "4 0 0 0 ... \n", "\n", "[5 rows x 1012 columns]" ] } ], "prompt_number": 61 }, { "cell_type": "code", "collapsed": false, "input": [ "from patsy import dmatrices\n", "\n", "y, X = dmatrices('TeamMargin ~ Team * Opponent + TeamLocation + TeamAvgScoringMargin * OpponentAverageScoringMargin', data=train, return_type='dataframe')" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": "*" }, { "cell_type": "code", "collapsed": false, "input": [ "from sklearn import linear_model\n", "\n", "model = linear_model.Ridge(alpha = .5)\n", "model.fit(X,y) # Ridge Regression, penalty on size of coefficients\n", "\n", "print \" Classifier coefficient is: \", model.coef_\n", "print \" Classifier intercept is: \", model.intercept_\n", "print \" Classifier score is:\", model.score(X, y) # r-squared is .5809\n" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": "*" }, { "cell_type": "code", "collapsed": false, "input": [], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 53 }, { 
"cell_type": "code", "collapsed": false, "input": [ "from sklearn import linear_model\n", "\n", "model = linear_model.RidgeCV(alphas=[0.1, 1.0, 10.0])\n", "model.fit(X,y) # Ridge Regression with built-in Cross Validation\n", "\n", "print \" Classifier coefficient is: \", model.coef_\n", "print \" Classifier intercept is: \", model.intercept_\n", "print \" Classifier score is:\", model.score(X, y) # .58049\n", "print \" Classifier alpha is: \", model.alpha_" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ " Classifier coefficient is: [[ 0. -2.6251065 1.26128338 ..., -12.0694826 -10.99317295\n", " 1.18552653]]\n", " Classifier intercept is: [-3.20834861]\n", " Classifier score is: 0.580491402406\n", " Classifier alpha is: 1.0\n" ] } ], "prompt_number": 64 }, { "cell_type": "code", "collapsed": false, "input": [ "from sklearn import linear_model\n", "\n", "model = linear_model.Lasso(alpha=0.2) # Terrible, .29 score\n", "model.fit(X,y) # Lasso regularization (estimates sparse coefficients), higher the alpha, the fewer features selected\n", "\n", "print \" Classifier coefficient is: \", model.coef_\n", "print \" Classifier intercept is: \", model.intercept_\n", "print \" Classifier score is:\", model.score(X, y) " ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ " Classifier coefficient is: [ 0. -0. -0. ..., -0. -0. 0.93443521]\n", " Classifier intercept is: [-4.10401916]\n", " Classifier score is: 0.299180550411\n" ] } ], "prompt_number": 71 }, { "cell_type": "code", "collapsed": false, "input": [], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [], "language": "python", "metadata": {}, "outputs": [] } ], "metadata": {} } ] }