-
Notifications
You must be signed in to change notification settings - Fork 31
Expand file tree
/
Copy pathkdd2009tree.Rmd
More file actions
96 lines (82 loc) · 2.49 KB
/
kdd2009tree.Rmd
File metadata and controls
96 lines (82 loc) · 2.49 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
---
title: "kdd2009tree"
author: "Win-Vector LLC"
date: "July 18, 2016"
output: html_document
---
```{r setup}
# Show the source of every chunk in the rendered document.
knitr::opts_chunk$set(echo = TRUE)
# Attach the packages; the attach order is kept because it decides masking.
library("vtreat")
library("WVPlots") # see: https://github.com/WinVector/WVPlots
library("rpart")
library("caret")
# library("doMC")
```
```{r inith2o}
# Size the worker pool from the number of cores the machine reports.
# parallel::detectCores() is documented to return NA when the count cannot
# be determined, and makeCluster() cannot use NA, so fall back to one worker.
ncore <- parallel::detectCores()
if (is.na(ncore) || ncore < 1L) {
  ncore <- 1L
}
# registerDoMC(cores = ncore)
# Cluster handle handed to the vtreat calls through parallelCluster= and
# released again in the shutdown chunk.
cl <- parallel::makeCluster(ncore)
```
```{r loaddata}
# see: https://github.com/WinVector/PreparingDataWorkshop/tree/master/KDD2009
# Read the tab-separated feature table; both "NA" and empty fields are
# treated as missing. TRUE/FALSE are spelled out because T and F are
# ordinary variables that can be reassigned.
d <- read.table(
  "orange_small_train.data.gz",
  header = TRUE, sep = "\t", na.strings = c("NA", ""),
  strip.white = TRUE,
  stringsAsFactors = FALSE
)
# Read the churn labels; the file has no header row, so read.table names
# the first column V1.
churn <- read.table(
  "orange_small_train_churn.labels.txt",
  header = FALSE, sep = "\t",
  strip.white = TRUE,
  stringsAsFactors = FALSE
)
# Labels are attached to feature rows by position, so both files must have
# the same number of rows; fail loudly instead of letting `$<-` recycle.
stopifnot(nrow(churn) == nrow(d))
d$churn <- churn$V1
# Fixed seed so the train/test assignment below is reproducible.
set.seed(729375)
# One uniform draw per row; rows with a draw <= 0.9 go to the training set.
rgroup <- runif(nrow(d))
dTrain <- d[rgroup <= 0.9, ] # set for building models
dTest <- d[rgroup > 0.9, ] # set for evaluation
rm(list = c("d", "churn"))
# Column names that must not be used as model inputs.
outcomes <- c("churn", "appetency", "upselling")
nonvars <- c(outcomes, "rgroup")
vars <- setdiff(colnames(dTrain), nonvars)
# Outcome column and the value of it that counts as the positive class.
yName <- "churn"
yTarget <- 1
```
```{r preparedata}
# Fix the RNG state before the treatment plan is built.
set.seed(239525)
# Build the treatment plan and the cross-frame from the training rows,
# using the cluster created earlier.
trainPlan <- mkCrossFrameCExperiment(
  dTrain,
  vars, yName, yTarget,
  smFactor = 2.0,
  parallelCluster = cl
)
print(trainPlan$method)
treatedTrainM <- trainPlan$crossFrame
treatmentsC <- trainPlan$treatments
# kddSig = 1/nrow(treatmentsC$scoreFrame)
# Every cross-frame column that is not an outcome name is requested for
# the test frame as well.
selvars <- setdiff(colnames(treatedTrainM), outcomes)
# Recode the outcome column as logical: TRUE where it equals yTarget.
treatedTrainM[[yName]] <- treatedTrainM[[yName]] == yTarget
# Apply the same treatments to the held-out rows.
treatedTest <- prepare(
  treatmentsC,
  dTest,
  varRestriction = selvars,
  pruneSig = NULL,
  parallelCluster = cl
)
treatedTest[[yName]] <- treatedTest[[yName]] == yTarget
```
```{r fit1}
# Keep the variables whose significance is below 1/(number of scored
# variables). which() drops rows whose sig is NA; a plain logical subset
# would keep them as NA names and paste a literal "NA" term into the formula.
scoreFrame <- treatmentsC$scoreFrame
goodvars <- scoreFrame$varName[which(scoreFrame$sig < 1 / nrow(scoreFrame))]
# An empty right-hand side would produce an unparseable formula; stop with a
# clear message instead.
if (length(goodvars) == 0) {
  stop("no variables passed the significance filter", call. = FALSE)
}
form <- paste(yName, paste(goodvars, collapse = " + "), sep = " ~ ")
# Fit the tree with complexity parameter cp = 0.001.
rcontrol <- rpart.control(cp = 0.001)
m <- rpart(form, treatedTrainM, control = rcontrol)
summary(m)
# Score the training frame and plot its ROC curve.
pTrain <- predict(m, newdata = treatedTrainM)
treatedTrainM$pred <- as.numeric(pTrain)
WVPlots::ROCPlot(treatedTrainM, "pred", yName, "prediction on train")
# Score the held-out frame and plot its ROC curve.
pTest <- predict(m, newdata = treatedTest)
treatedTest$pred <- as.numeric(pTest)
WVPlots::ROCPlot(treatedTest, "pred", yName, "prediction on test")
```
```{r shutdown}
# Stop the parallel workers, then clear the handle so a repeated run of this
# chunk does not try to stop the same cluster twice.
if (!is.null(cl)) {
  parallel::stopCluster(cl)
  cl <- NULL
}
```