patsy/patsy/splines.py at master · bashtage/patsy

History

429 lines (380 loc) · 16.6 KB

Raw

100

101

102

103

104

105

106

107

108

109

110

111

112

113

114

115

116

117

118

119

120

121

122

123

124

125

126

127

128

129

130

131

132

133

134

135

136

137

138

139

140

141

142

143

144

145

146

147

148

149

150

151

152

153

154

155

156

157

158

159

160

161

162

163

164

165

166

167

168

169

170

171

172

173

174

175

176

177

178

179

180

181

182

183

184

185

186

187

188

189

190

191

192

193

194

195

196

197

198

199

200

201

202

203

204

205

206

207

208

209

210

211

212

213

214

215

216

217

218

219

220

221

222

223

224

225

226

227

228

229

230

231

232

233

234

235

236

237

238

239

240

241

242

243

244

245

246

247

248

249

250

251

252

253

254

255

256

257

258

259

260

261

262

263

264

265

266

267

268

269

270

271

272

273

274

275

276

277

278

279

280

281

282

283

284

285

286

287

288

289

290

291

292

293

294

295

296

297

298

299

300

301

302

303

304

305

306

307

308

309

310

311

312

313

314

315

316

317

318

319

320

321

322

323

324

325

326

327

328

329

330

331

332

333

334

335

336

337

338

339

340

341

342

343

344

345

346

347

348

349

350

351

352

353

354

355

356

357

358

359

360

361

362

363

364

365

366

367

368

369

370

371

372

373

374

375

376

377

378

379

380

381

382

383

384

385

386

387

388

389

390

391

392

393

394

395

396

397

398

399

400

401

402

403

404

405

406

407

408

409

410

411

412

413

414

415

416

417

418

419

420

421

422

423

424

425

426

427

428

429

# This file is part of Patsy

# See file LICENSE.txt for license information.

# R-compatible spline basis functions

# These are made available in the patsy.* namespace

__all__ = ["bs"]

import numpy as np

from patsy.util import have_pandas, no_pickling, assert_no_pickling

from patsy.state import stateful_transform

if have_pandas:

import pandas

def _eval_bspline_basis(x, knots, degree):

try:

from scipy.interpolate import splev

except ImportError: # pragma: no cover

raise ImportError("spline functionality requires scipy")

# 'knots' are assumed to be already pre-processed. E.g. usually you

# want to include duplicate copies of boundary knots; you should do

# that *before* calling this constructor.

knots = np.atleast_1d(np.asarray(knots, dtype=float))

assert knots.ndim == 1

knots.sort()

degree = int(degree)

x = np.atleast_1d(x)

if x.ndim == 2 and x.shape[1] == 1:

x = x[:, 0]

assert x.ndim == 1

# XX FIXME: when points fall outside of the boundaries, splev and R seem

# to handle them differently. I don't know why yet. So until we understand

# this and decide what to do with it, I'm going to play it safe and

# disallow such points.

if np.min(x) < np.min(knots) or np.max(x) > np.max(knots):

raise NotImplementedError(

"some data points fall outside the "

"outermost knots, and I'm not sure how "

"to handle them. (Patches accepted!)"

)

# Thanks to Charles Harris for explaining splev. It's not well

# documented, but basically it computes an arbitrary b-spline basis

# given knots and degree on some specified points (or derivatives

# thereof, but we don't use that functionality), and then returns some

# linear combination of these basis functions. To get out the basis

# functions themselves, we use linear combinations like [1, 0, 0], [0,

# 1, 0], [0, 0, 1].

# NB: This probably makes it rather inefficient (though I haven't checked

# to be sure -- maybe the fortran code actually skips computing the basis

# function for coefficients that are zero).

# Note: the order of a spline is the same as its degree + 1.

# Note: there are (len(knots) - order) basis functions.

n_bases = len(knots) - (degree + 1)

basis = np.empty((x.shape[0], n_bases), dtype=float)

for i in range(n_bases):

coefs = np.zeros((n_bases,))

coefs[i] = 1

basis[:, i] = splev(x, (knots, coefs, degree))

return basis

def _R_compat_quantile(x, probs):

# return np.percentile(x, 100 * np.asarray(probs))

probs = np.asarray(probs)

quantiles = np.asarray(

[np.percentile(x, 100 * prob) for prob in probs.ravel(order="C")]

)

return quantiles.reshape(probs.shape, order="C")

def test__R_compat_quantile():

def t(x, prob, expected):

assert np.allclose(_R_compat_quantile(x, prob), expected)

t([10, 20], 0.5, 15)

t([10, 20], 0.3, 13)

t([10, 20], [0.3, 0.7], [13, 17])

t(list(range(10)), [0.3, 0.7], [2.7, 6.3])

class BS(object):

"""bs(x, df=None, knots=None, degree=3, include_intercept=False, lower_bound=None, upper_bound=None)

Generates a B-spline basis for ``x``, allowing non-linear fits. The usual

usage is something like::

y ~ 1 + bs(x, 4)

to fit ``y`` as a smooth function of ``x``, with 4 degrees of freedom

given to the smooth.

:arg df: The number of degrees of freedom to use for this spline. The

return value will have this many columns. You must specify at least one

of ``df`` and ``knots``.

:arg knots: The interior knots to use for the spline. If unspecified, then

equally spaced quantiles of the input data are used. You must specify at

least one of ``df`` and ``knots``.

:arg degree: The degree of the spline to use.

:arg include_intercept: If ``True``, then the resulting

spline basis will span the intercept term (i.e., the constant

function). If ``False`` (the default) then this will not be the case,

which is useful for avoiding overspecification in models that include

multiple spline terms and/or an intercept term.

:arg lower_bound: The lower exterior knot location.

:arg upper_bound: The upper exterior knot location.

A spline with ``degree=0`` is piecewise constant with breakpoints at each

knot, and the default knot positions are quantiles of the input. So if you

find yourself in the situation of wanting to quantize a continuous

variable into ``num_bins`` equal-sized bins with a constant effect across

each bin, you can use ``bs(x, num_bins - 1, degree=0)``. (The ``- 1`` is

because one degree of freedom will be taken by the intercept;

alternatively, you could leave the intercept term out of your model and

use ``bs(x, num_bins, degree=0, include_intercept=True)``.

A spline with ``degree=1`` is piecewise linear with breakpoints at each

knot.

The default is ``degree=3``, which gives a cubic b-spline.

This is a stateful transform (for details see

:ref:`stateful-transforms`). If ``knots``, ``lower_bound``, or

``upper_bound`` are not specified, they will be calculated from the data

and then the chosen values will be remembered and re-used for prediction

from the fitted model.

Using this function requires scipy be installed.

.. note:: This function is very similar to the R function of the same

name. In cases where both return output at all (e.g., R's ``bs`` will

raise an error if ``degree=0``, while patsy's will not), they should

produce identical output given identical input and parameter settings.

.. warning:: I'm not sure on what the proper handling of points outside

the lower/upper bounds is, so for now attempting to evaluate a spline

basis at such points produces an error. Patches gratefully accepted.

.. versionadded:: 0.2.0

"""

def __init__(self):

self._tmp = {}

self._degree = None

self._all_knots = None

def memorize_chunk(

self,

df=None,

knots=None,

degree=3,

include_intercept=False,

lower_bound=None,

upper_bound=None,

args = {

"df": df,

"knots": knots,

"degree": degree,

"include_intercept": include_intercept,

"lower_bound": lower_bound,

"upper_bound": upper_bound,

}

self._tmp["args"] = args

# XX: check whether we need x values before saving them

x = np.atleast_1d(x)

if x.ndim == 2 and x.shape[1] == 1:

x = x[:, 0]

if x.ndim > 1:

raise ValueError("input to 'bs' must be 1-d, or a 2-d column vector")

# There's no better way to compute exact quantiles than memorizing

# all data.

self._tmp.setdefault("xs", []).append(x)

def memorize_finish(self):

tmp = self._tmp

args = tmp["args"]

del self._tmp

if args["degree"] < 0:

raise ValueError(

"degree must be greater than 0 (not %r)" % (args["degree"],)

)

if int(args["degree"]) != args["degree"]:

raise ValueError("degree must be an integer (not %r)" % (self._degree,))

# These are guaranteed to all be 1d vectors by the code above

x = np.concatenate(tmp["xs"])

if args["df"] is None and args["knots"] is None:

raise ValueError("must specify either df or knots")

order = args["degree"] + 1

if args["df"] is not None:

n_inner_knots = args["df"] - order

if not args["include_intercept"]:

n_inner_knots += 1

if n_inner_knots < 0:

raise ValueError(

"df=%r is too small for degree=%r and "

"include_intercept=%r; must be >= %s"

% (

args["df"],

args["degree"],

args["include_intercept"],

# We know that n_inner_knots is negative;

# if df were that much larger, it would

# have been zero, and things would work.

args["df"] - n_inner_knots,

)

if args["knots"] is not None:

if len(args["knots"]) != n_inner_knots:

raise ValueError(

"df=%s with degree=%r implies %s knots, "

"but %s knots were provided"

% (

args["df"],

args["degree"],

n_inner_knots,

len(args["knots"]),

)

else:

# Need to compute inner knots

knot_quantiles = np.linspace(0, 1, n_inner_knots + 2)[1:-1]

inner_knots = _R_compat_quantile(x, knot_quantiles)

if args["knots"] is not None:

inner_knots = args["knots"]

if args["lower_bound"] is not None:

lower_bound = args["lower_bound"]

else:

lower_bound = np.min(x)

if args["upper_bound"] is not None:

upper_bound = args["upper_bound"]

else:

upper_bound = np.max(x)

if lower_bound > upper_bound:

raise ValueError(

"lower_bound > upper_bound (%r > %r)" % (lower_bound, upper_bound)

)

inner_knots = np.asarray(inner_knots)

if inner_knots.ndim > 1:

raise ValueError("knots must be 1 dimensional")

if np.any(inner_knots < lower_bound):

raise ValueError(

"some knot values (%s) fall below lower bound "

"(%r)" % (inner_knots[inner_knots < lower_bound], lower_bound)

)

if np.any(inner_knots > upper_bound):

raise ValueError(

"some knot values (%s) fall above upper bound "

"(%r)" % (inner_knots[inner_knots > upper_bound], upper_bound)

)

all_knots = np.concatenate(([lower_bound, upper_bound] * order, inner_knots))

all_knots.sort()

self._degree = args["degree"]

self._all_knots = all_knots

def transform(

self,

df=None,

knots=None,

degree=3,

include_intercept=False,

lower_bound=None,

upper_bound=None,

basis = _eval_bspline_basis(x, self._all_knots, self._degree)

if not include_intercept:

basis = basis[:, 1:]

if have_pandas:

if isinstance(x, (pandas.Series, pandas.DataFrame)):

basis = pandas.DataFrame(basis)

basis.index = x.index

return basis

__getstate__ = no_pickling

bs = stateful_transform(BS)

def test_bs_compat():

from patsy.test_state import check_stateful

from patsy.test_splines_bs_data import R_bs_test_x, R_bs_test_data, R_bs_num_tests

lines = R_bs_test_data.split("\n")

tests_ran = 0

start_idx = lines.index("--BEGIN TEST CASE--")

while True:

if not lines[start_idx] == "--BEGIN TEST CASE--":

break

start_idx += 1

stop_idx = lines.index("--END TEST CASE--", start_idx)

block = lines[start_idx:stop_idx]

test_data = {}

for line in block:

key, value = line.split("=", 1)

test_data[key] = value

# Translate the R output into Python calling conventions

kwargs = {

"degree": int(test_data["degree"]),

# integer, or None

"df": eval(test_data["df"]),

# np.array() call, or None

"knots": eval(test_data["knots"]),

}

if test_data["Boundary.knots"] != "None":

lower, upper = eval(test_data["Boundary.knots"])

kwargs["lower_bound"] = lower

kwargs["upper_bound"] = upper

kwargs["include_intercept"] = test_data["intercept"] == "TRUE"

# Special case: in R, setting intercept=TRUE increases the effective

# dof by 1. Adjust our arguments to match.

# if kwargs["df"] is not None and kwargs["include_intercept"]:

# kwargs["df"] += 1

output = np.asarray(eval(test_data["output"]))

if kwargs["df"] is not None:

assert output.shape[1] == kwargs["df"]

# Do the actual test

check_stateful(BS, False, R_bs_test_x, output, **kwargs)

tests_ran += 1

# Set up for the next one

start_idx = stop_idx + 1

assert tests_ran == R_bs_num_tests

test_bs_compat.slow = 1

# This isn't checked by the above, because R doesn't have zero degree

# b-splines.

def test_bs_0degree():

x = np.logspace(-1, 1, 10)

result = bs(x, knots=[1, 4], degree=0, include_intercept=True)

assert result.shape[1] == 3

expected_0 = np.zeros(10)

expected_0[x < 1] = 1

assert np.array_equal(result[:, 0], expected_0)

expected_1 = np.zeros(10)

expected_1[(x >= 1) & (x < 4)] = 1

assert np.array_equal(result[:, 1], expected_1)

expected_2 = np.zeros(10)

expected_2[x >= 4] = 1

assert np.array_equal(result[:, 2], expected_2)

# Check handling of points that exactly fall on knots. They arbitrarily

# get included into the larger region, not the smaller. This is consistent

# with Python's half-open interval convention -- each basis function is

# constant on [knot[i], knot[i + 1]).

assert np.array_equal(

bs([0, 1, 2], degree=0, knots=[1], include_intercept=True),

[[1, 0], [0, 1], [0, 1]],

)

result_int = bs(x, knots=[1, 4], degree=0, include_intercept=True)

result_no_int = bs(x, knots=[1, 4], degree=0, include_intercept=False)

assert np.array_equal(result_int[:, 1:], result_no_int)

def test_bs_errors():

import pytest

x = np.linspace(-10, 10, 20)

# error checks:

# out of bounds

pytest.raises(NotImplementedError, bs, x, 3, lower_bound=0)

pytest.raises(NotImplementedError, bs, x, 3, upper_bound=0)

# must specify df or knots

pytest.raises(ValueError, bs, x)

# df/knots match/mismatch (with and without intercept)

# match:

bs(x, df=10, include_intercept=False, knots=[0] * 7)

bs(x, df=10, include_intercept=True, knots=[0] * 6)

bs(x, df=10, include_intercept=False, knots=[0] * 9, degree=1)

bs(x, df=10, include_intercept=True, knots=[0] * 8, degree=1)

# too many knots:

pytest.raises(ValueError, bs, x, df=10, include_intercept=False, knots=[0] * 8)

pytest.raises(ValueError, bs, x, df=10, include_intercept=True, knots=[0] * 7)

pytest.raises(

ValueError, bs, x, df=10, include_intercept=False, knots=[0] * 10, degree=1

)

pytest.raises(

ValueError, bs, x, df=10, include_intercept=True, knots=[0] * 9, degree=1

)

# too few knots:

pytest.raises(ValueError, bs, x, df=10, include_intercept=False, knots=[0] * 6)

pytest.raises(ValueError, bs, x, df=10, include_intercept=True, knots=[0] * 5)

pytest.raises(

ValueError, bs, x, df=10, include_intercept=False, knots=[0] * 8, degree=1

)

pytest.raises(

ValueError, bs, x, df=10, include_intercept=True, knots=[0] * 7, degree=1

)

# df too small

pytest.raises(ValueError, bs, x, df=1, degree=3)

pytest.raises(ValueError, bs, x, df=3, degree=5)

# bad degree

pytest.raises(ValueError, bs, x, df=10, degree=-1)

pytest.raises(ValueError, bs, x, df=10, degree=1.5)

# upper_bound < lower_bound

pytest.raises(ValueError, bs, x, 3, lower_bound=1, upper_bound=-1)

# multidimensional input

pytest.raises(ValueError, bs, np.column_stack((x, x)), 3)

# unsorted knots are okay, and get sorted

assert np.array_equal(bs(x, knots=[1, 4]), bs(x, knots=[4, 1]))

# 2d knots

pytest.raises(ValueError, bs, x, knots=[[0], [20]])

# knots > upper_bound

pytest.raises(ValueError, bs, x, knots=[0, 20])

pytest.raises(ValueError, bs, x, knots=[0, 4], upper_bound=3)

# knots < lower_bound

pytest.raises(ValueError, bs, x, knots=[-20, 0])

pytest.raises(ValueError, bs, x, knots=[-4, 0], lower_bound=-3)

# differences between bs and ns (since the R code is a pile of copy-paste):

# - degree is always 3

# - different number of interior knots given df (b/c fewer dof used at edges I

# guess)

# - boundary knots always repeated exactly 4 times (same as bs with degree=3)

# - complications at the end to handle boundary conditions

# the 'rcs' function uses slightly different conventions -- in particular it

# picks boundary knots that are not quite at the edges of the data, which

# makes sense for a natural spline.

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

FilesExpand file tree

splines.py

Latest commit

History

splines.py

File metadata and controls