machinelearning/src/Microsoft.ML.FastTree/GamTrainer.cs at master · devhttps/machinelearning

History

1486 lines (1293 loc) · 70.2 KB

Raw

100

101

102

103

104

105

106

107

108

109

110

111

112

113

114

115

116

117

118

119

120

121

122

123

124

125

126

127

128

129

130

131

132

133

134

135

136

137

138

139

140

141

142

143

144

145

146

147

148

149

150

151

152

153

154

155

156

157

158

159

160

161

162

163

164

165

166

167

168

169

170

171

172

173

174

175

176

177

178

179

180

181

182

183

184

185

186

187

188

189

190

191

192

193

194

195

196

197

198

199

200

201

202

203

204

205

206

207

208

209

210

211

212

213

214

215

216

217

218

219

220

221

222

223

224

225

226

227

228

229

230

231

232

233

234

235

236

237

238

239

240

241

242

243

244

245

246

247

248

249

250

251

252

253

254

255

256

257

258

259

260

261

262

263

264

265

266

267

268

269

270

271

272

273

274

275

276

277

278

279

280

281

282

283

284

285

286

287

288

289

290

291

292

293

294

295

296

297

298

299

300

301

302

303

304

305

306

307

308

309

310

311

312

313

314

315

316

317

318

319

320

321

322

323

324

325

326

327

328

329

330

331

332

333

334

335

336

337

338

339

340

341

342

343

344

345

346

347

348

349

350

351

352

353

354

355

356

357

358

359

360

361

362

363

364

365

366

367

368

369

370

371

372

373

374

375

376

377

378

379

380

381

382

383

384

385

386

387

388

389

390

391

392

393

394

395

396

397

398

399

400

401

402

403

404

405

406

407

408

409

410

411

412

413

414

415

416

417

418

419

420

421

422

423

424

425

426

427

428

429

430

431

432

433

434

435

436

437

438

439

440

441

442

443

444

445

446

447

448

449

450

451

452

453

454

455

456

457

458

459

460

461

462

463

464

465

466

467

468

469

470

471

472

473

474

475

476

477

478

479

480

481

482

483

484

485

486

487

488

489

490

491

492

493

494

495

496

497

498

499

500

501

502

503

504

505

506

507

508

509

510

511

512

513

514

515

516

517

518

519

520

521

522

523

524

525

526

527

528

529

530

531

532

533

534

535

536

537

538

539

540

541

542

543

544

545

546

547

548

549

550

551

552

553

554

555

556

557

558

559

560

561

562

563

564

565

566

567

568

569

570

571

572

573

574

575

576

577

578

579

580

581

582

583

584

585

586

587

588

589

590

591

592

593

594

595

596

597

598

599

600

601

602

603

604

605

606

607

608

609

610

611

612

613

614

615

616

617

618

619

620

621

622

623

624

625

626

627

628

629

630

631

632

633

634

635

636

637

638

639

640

641

642

643

644

645

646

647

648

649

650

651

652

653

654

655

656

657

658

659

660

661

662

663

664

665

666

667

668

669

670

671

672

673

674

675

676

677

678

679

680

681

682

683

684

685

686

687

688

689

690

691

692

693

694

695

696

697

698

699

700

701

702

703

704

705

706

707

708

709

710

711

712

713

714

715

716

717

718

719

720

721

722

723

724

725

726

727

728

729

730

731

732

733

734

735

736

737

738

739

740

741

742

743

744

745

746

747

748

749

750

751

752

753

754

755

756

757

758

759

760

761

762

763

764

765

766

767

768

769

770

771

772

773

774

775

776

777

778

779

780

781

782

783

784

785

786

787

788

789

790

791

792

793

794

795

796

797

798

799

800

801

802

803

804

805

806

807

808

809

810

811

812

813

814

815

816

817

818

819

820

821

822

823

824

825

826

827

828

829

830

831

832

833

834

835

836

837

838

839

840

841

842

843

844

845

846

847

848

849

850

851

852

853

854

855

856

857

858

859

860

861

862

863

864

865

866

867

868

869

870

871

872

873

874

875

876

877

878

879

880

881

882

883

884

885

886

887

888

889

890

891

892

893

894

895

896

897

898

899

900

901

902

903

904

905

906

907

908

909

910

911

912

913

914

915

916

917

918

919

920

921

922

923

924

925

926

927

928

929

930

931

932

933

934

935

936

937

938

939

940

941

942

943

944

945

946

947

948

949

950

951

952

953

954

955

956

957

958

959

960

961

962

963

964

965

966

967

968

969

970

971

972

973

974

975

976

977

978

979

980

981

982

983

984

985

986

987

988

989

990

991

992

993

994

995

996

997

998

999

1000

// Licensed to the .NET Foundation under one or more agreements.

// The .NET Foundation licenses this file to you under the MIT license.

// See the LICENSE file in the project root for more information.

using System;

using System.Collections.Generic;

using System.IO;

using System.Linq;

using System.Threading;

using Microsoft.ML;

using Microsoft.ML.Command;

using Microsoft.ML.CommandLine;

using Microsoft.ML.Core.Data;

using Microsoft.ML.Data;

using Microsoft.ML.EntryPoints;

using Microsoft.ML.Internal.Calibration;

using Microsoft.ML.Internal.CpuMath;

using Microsoft.ML.Internal.Internallearn;

using Microsoft.ML.Internal.Utilities;

using Microsoft.ML.Model;

using Microsoft.ML.Trainers.FastTree;

using Microsoft.ML.Trainers.FastTree.Internal;

using Microsoft.ML.Training;

using Timer = Microsoft.ML.Trainers.FastTree.Internal.Timer;

[assembly: LoadableClass(typeof(GamModelParametersBase.VisualizationCommand), typeof(GamModelParametersBase.VisualizationCommand.Arguments), typeof(SignatureCommand),

"GAM Vizualization Command", GamModelParametersBase.VisualizationCommand.LoadName, "gamviz", DocName = "command/GamViz.md")]

[assembly: LoadableClass(typeof(void), typeof(Gam), null, typeof(SignatureEntryPointModule), "GAM")]

namespace Microsoft.ML.Trainers.FastTree

{

using AutoResetEvent = System.Threading.AutoResetEvent;

using SplitInfo = LeastSquaresRegressionTreeLearner.SplitInfo;

/// <summary>

/// Generalized Additive Model Trainer.

/// </summary>

/// <remarks>

/// <para>

/// Generalized Additive Models, or GAMs, model the data as a set of linearly independent features

/// similar to a linear model. For each feature, the GAM trainer learns a non-linear function,

/// called a "shape function", that computes the response as a function of the feature's value.

/// (In contrast, a linear model fits a linear response (e.g. a line) to each feature.)

/// To score an example, the outputs of all the shape functions are summed and the score is the total value.

/// </para>

/// <para>

/// This GAM trainer is implemented using shallow gradient boosted trees (e.g. tree stumps) to learn nonparametric

/// shape functions, and is based on the method described in Lou, Caruana, and Gehrke.

/// <a href='http://www.cs.cornell.edu/~yinlou/papers/lou-kdd12.pdf'>"Intelligible Models for Classification and Regression."</a> KDD'12, Beijing, China. 2012.

/// After training, an intercept is added to represent the average prediction over the training set,

/// and the shape functions are normalized to represent the deviation from the average prediction. This results

/// in models that are easily interpreted simply by inspecting the intercept and the shape functions.

/// See the sample below for an example of how to train a GAM model and inspect and interpret the results.

/// </para>

/// </remarks>

/// <example>

/// <format type="text/markdown">

/// <![CDATA[

/// [!code-csharp[GAM](~/../docs/samples/docs/samples/Microsoft.ML.Samples/Dynamic/GeneralizedAdditiveModels.cs)]

/// ]]>

/// </format>

/// </example>

public abstract partial class GamTrainerBase<TArgs, TTransformer, TPredictor> : TrainerEstimatorBase<TTransformer, TPredictor>

where TTransformer: ISingleFeaturePredictionTransformer<TPredictor>

where TArgs : GamTrainerBase<TArgs, TTransformer, TPredictor>.ArgumentsBase, new()

where TPredictor : IPredictorProducing<float>

{

/// <summary>
/// Arguments shared by the GAM trainers derived from this base class.
/// </summary>
public abstract class ArgumentsBase : LearnerInputBaseWithWeight
{
    /// <summary>
    /// Entropy (regularization) coefficient. The arguments-based trainer constructor requires a value in [0, 1].
    /// </summary>
    [Argument(ArgumentType.LastOccurenceWins, HelpText = "The entropy (regularization) coefficient between 0 and 1", ShortName = "e")]
    public double EntropyCoefficient;

    /// <summary>
    /// Only consider a gain if its likelihood versus a random choice gain is above a certain value.
    /// So 0.95 would mean restricting to gains that have less than a 0.05 chance of being generated randomly through choice of a random split.
    /// </summary>
    /// <remarks>
    /// This is a fractional value in [0, 1), so it must be a floating-point field; an integer field could only ever hold 0.
    /// </remarks>
    [Argument(ArgumentType.LastOccurenceWins, HelpText = "Tree fitting gain confidence requirement (should be in the range [0,1) ).", ShortName = "gainconf")]
    public double GainConfidenceLevel;

    /// <summary>
    /// Number of boosting iterations; each iteration visits every feature.
    /// </summary>
    [Argument(ArgumentType.LastOccurenceWins, HelpText = "Total number of iterations over all features", ShortName = "iter", SortOrder = 1)]
    [TGUI(SuggestedSweeps = "200,1500,9500")]
    [TlcModule.SweepableDiscreteParamAttribute("NumIterations", new object[] { 200, 1500, 9500 })]
    public int NumIterations = GamDefaults.NumIterations;

    /// <summary>
    /// Number of training threads; null falls back to the processor count.
    /// </summary>
    [Argument(ArgumentType.LastOccurenceWins, HelpText = "The number of threads to use", ShortName = "t", NullName = "<Auto>")]
    public int? NumThreads = null;

    /// <summary>
    /// Multiplier applied to each stump's leaf outputs before they are added to the model.
    /// </summary>
    [Argument(ArgumentType.LastOccurenceWins, HelpText = "The learning rate", ShortName = "lr", SortOrder = 4)]
    [TGUI(SuggestedSweeps = "0.001,0.1;log")]
    [TlcModule.SweepableFloatParamAttribute("LearningRates", 0.001f, 0.1f, isLogScale: true)]
    public double LearningRates = GamDefaults.LearningRates;

    /// <summary>
    /// Explicit transpose preference; null lets the trainer decide from the input data view.
    /// </summary>
    [Argument(ArgumentType.LastOccurenceWins, HelpText = "Whether to utilize the disk or the data's native transposition facilities (where applicable) when performing the transpose", ShortName = "dt")]
    public bool? DiskTranspose;

    /// <summary>
    /// Maximum number of bins per feature.
    /// </summary>
    [Argument(ArgumentType.LastOccurenceWins, HelpText = "Maximum number of distinct values (bins) per feature", ShortName = "mb")]
    public int MaxBins = GamDefaults.MaxBins;

    /// <summary>
    /// Upper bound on the absolute value of a single output.
    /// </summary>
    [Argument(ArgumentType.AtMostOnce, HelpText = "Upper bound on absolute value of single output", ShortName = "mo")]
    public double MaxOutput = Double.PositiveInfinity;

    /// <summary>
    /// Sampling rate used by the GetDerivatives function (1 in k).
    /// </summary>
    [Argument(ArgumentType.AtMostOnce, HelpText = "Sample each query 1 in k times in the GetDerivatives function", ShortName = "sr")]
    public int GetDerivativesSampleRate = 1;

    /// <summary>
    /// Seed of the random number generator.
    /// </summary>
    [Argument(ArgumentType.LastOccurenceWins, HelpText = "The seed of the random number generator", ShortName = "r1")]
    public int RngSeed = 123;

    /// <summary>
    /// Minimum number of training instances required on each side of a split.
    /// </summary>
    [Argument(ArgumentType.LastOccurenceWins, HelpText = "Minimum number of training instances required to form a partition", ShortName = "mi", SortOrder = 3)]
    [TGUI(SuggestedSweeps = "1,10,50")]
    [TlcModule.SweepableDiscreteParamAttribute("MinDocuments", new object[] { 1, 10, 50 })]
    public int MinDocuments = 10;

    /// <summary>
    /// Whether features are grouped into flocks during dataset preparation.
    /// </summary>
    [Argument(ArgumentType.LastOccurenceWins, HelpText = "Whether to collectivize features during dataset preparation to speed up training", ShortName = "flocks", Hide = true)]
    public bool FeatureFlocks = true;

    /// <summary>
    /// Whether to prune back to the best iteration after training; only takes effect when a pruning test exists.
    /// </summary>
    [Argument(ArgumentType.AtMostOnce, HelpText = "Enable post-training pruning to avoid overfitting. (a validation set is required)", ShortName = "pruning")]
    public bool EnablePruning = true;
}

// Human-readable description of what the trainer does.
internal const string Summary = "Trains a gradient boosted stump per feature, on all features simultaneously, " +
"to fit target values using least-squares. It mantains " +
"no interactions between features.";
// NOTE(review): not referenced in the visible part of this file; presumably a registration name - confirm.
private const string RegisterName = "GamTraining";
// Parameters of training.
// The trainer arguments; assigned once by whichever constructor runs.
protected readonly TArgs Args;
// Square of Probit(1 - (1 - GainConfidenceLevel) * 0.5), computed in the constructors and
// passed to the split finder in TrainingIteration.
private readonly double _gainConfidenceInSquaredStandardDeviations;
// Args.EntropyCoefficient scaled by 1e-6, computed in the constructors.
private readonly double _entropyCoefficient;
// Dataset information.
// Binned training data; assigned in ConvertData.
protected Dataset TrainSet;
// Binned validation data; assigned in ConvertData only when validation data is supplied.
protected Dataset ValidSet;
/// <summary>
/// Whether a validation set was passed in
/// </summary>
protected bool HasValidSet => ValidSet != null;
// Running scores for the training and validation sets; created in DefineScoreTrackers.
protected ScoreTracker TrainSetScore;
protected ScoreTracker ValidSetScore;
// Test history consulted by CombineGraphs to pick the best iteration.
// NOTE(review): presumably assigned by DefinePruningTest in derived classes - confirm.
protected TestHistory PruningTest;
// Index into PruningTest.ComputeTests() of the loss used for pruning.
protected int PruningLossIndex;
// Length of the input feature vector; set in TrainBase from the feature column type.
protected int InputLength;
// Split search state; created in Initialize and InitializeGamHistograms.
private LeastSquaresRegressionTreeLearner.LeafSplitCandidates _leafSplitCandidates;
// One sufficient-statistics object per feature flock.
private SufficientStatsBase[] _histogram;
private ILeafSplitStatisticsCalculator _leafSplitHelper;
// Created at the start of TrainMainEffectsModel via CreateObjectiveFunction.
private ObjectiveFunctionBase _objectiveFunction;
// True when the training set carries sample weights; false while TrainSet is still null.
private bool HasWeights => TrainSet?.SampleWeights != null;
// Training data structures.
// One stump per (feature, iteration); allocated in Initialize.
private SubGraph _subGraph;
// Results of training.
// Intercept: sum of the per-feature mean effects, computed in CenterGraph.
protected double MeanEffect;
// Per-feature, per-bin effects; built in CombineGraphs and centered in CenterGraph.
protected double[][] BinEffects;
// Feature map reported by the bin converter in ConvertData; may be null.
protected int[] FeatureMap;
public override TrainerInfo Info { get; }
// Derived trainers override this to request calibration; the value feeds TrainerInfo in the constructors.
private protected virtual bool NeedCalibration => false;
// Set to a SingleTrainer in InitializeThreads.
protected IParallelTraining ParallelTraining;

/// <summary>
/// Builds a trainer from individual settings instead of a pre-populated arguments object.
/// </summary>
/// <param name="env">Host environment; must not be null.</param>
/// <param name="name">Name used to register the trainer with <paramref name="env"/>.</param>
/// <param name="label">Description of the label column.</param>
/// <param name="featureColumn">Name of the feature column.</param>
/// <param name="weightColumn">Name of the weight column, or null to leave the weight column setting as it is.</param>
/// <param name="numIterations">Number of boosting iterations.</param>
/// <param name="learningRate">Learning rate applied to each stump.</param>
/// <param name="maxBins">Maximum number of bins per feature.</param>
/// <param name="advancedSettings">Optional callback that can adjust the arguments. It runs after the three
/// explicit settings are stored and before the column names are written.</param>
private protected GamTrainerBase(IHostEnvironment env,
    string name,
    SchemaShape.Column label,
    string featureColumn,
    string weightColumn,
    int numIterations,
    double learningRate,
    int maxBins,
    Action<TArgs> advancedSettings)
    : base(Contracts.CheckRef(env, nameof(env)).Register(name), TrainerUtils.MakeR4VecFeature(featureColumn), label, TrainerUtils.MakeR4ScalarWeightColumn(weightColumn))
{
    Args = new TArgs
    {
        NumIterations = numIterations,
        LearningRates = learningRate,
        MaxBins = maxBins
    };

    // Give the caller a chance to tweak anything else (or override the values above).
    advancedSettings?.Invoke(Args);

    // Column names passed to this constructor always win over what the callback may have set.
    Args.LabelColumn = label.Name;
    Args.FeatureColumn = featureColumn;
    if (weightColumn != null)
    {
        Args.WeightColumn = weightColumn;
    }

    Info = new TrainerInfo(normalization: false, calibration: NeedCalibration, caching: false, supportValid: true);

    var probit = ProbabilityFunctions.Probit(1 - (1 - Args.GainConfidenceLevel) * 0.5);
    _gainConfidenceInSquaredStandardDeviations = Math.Pow(probit, 2);
    _entropyCoefficient = Args.EntropyCoefficient * 1e-6;

    InitializeThreads();
}

/// <summary>
/// Builds a trainer from a pre-populated arguments object and validates the argument ranges.
/// </summary>
/// <param name="env">Host environment; must not be null.</param>
/// <param name="args">Trainer arguments. Note that the base-constructor call dereferences this before
/// the null check in the body runs.</param>
/// <param name="name">Name used to register the trainer with <paramref name="env"/>.</param>
/// <param name="label">Description of the label column.</param>
private protected GamTrainerBase(IHostEnvironment env, TArgs args, string name, SchemaShape.Column label)
    : base(Contracts.CheckRef(env, nameof(env)).Register(name), TrainerUtils.MakeR4VecFeature(args.FeatureColumn),
        label, TrainerUtils.MakeR4ScalarWeightColumn(args.WeightColumn, args.WeightColumn.IsExplicit))
{
    Contracts.CheckValue(env, nameof(env));
    Host.CheckValue(args, nameof(args));

    Host.CheckParam(args.LearningRates > 0, nameof(args.LearningRates), "Must be positive.");
    Host.CheckParam(args.NumThreads == null || args.NumThreads > 0, nameof(args.NumThreads), "Must be positive.");
    Host.CheckParam(0 <= args.EntropyCoefficient && args.EntropyCoefficient <= 1, nameof(args.EntropyCoefficient), "Must be in [0, 1].");
    Host.CheckParam(0 <= args.GainConfidenceLevel && args.GainConfidenceLevel < 1, nameof(args.GainConfidenceLevel), "Must be in [0, 1).");
    Host.CheckParam(0 < args.MaxBins, nameof(args.MaxBins), "Must be positive.");
    Host.CheckParam(0 < args.NumIterations, nameof(args.NumIterations), "Must be positive.");
    Host.CheckParam(0 < args.MinDocuments, nameof(args.MinDocuments), "Must be positive.");

    Args = args;

    Info = new TrainerInfo(normalization: false, calibration: NeedCalibration, caching: false, supportValid: true);

    // Derived constants handed to the split finder on every training iteration.
    _gainConfidenceInSquaredStandardDeviations = Math.Pow(ProbabilityFunctions.Probit(1 - (1 - Args.GainConfidenceLevel) * 0.5), 2);
    _entropyCoefficient = Args.EntropyCoefficient * 1e-6;

    InitializeThreads();
}

/// <summary>
/// Runs a full training pass: bins the data, sets up score tracking (plus the pruning test when a
/// validation set exists), records the input vector length and then trains the model.
/// </summary>
/// <param name="context">Provides the training set and the optional validation set.</param>
private protected void TrainBase(TrainContext context)
{
    using (var channel = Host.Start("Training"))
    {
        channel.CheckValue(context, nameof(context));

        // Bin the training data (and the validation data, if any).
        ConvertData(context.TrainingSet, context.ValidationSet);

        // Score trackers are always needed; the pruning test only with a validation set.
        DefineScoreTrackers();
        if (HasValidSet)
        {
            DefinePruningTest();
        }

        InputLength = context.TrainingSet.Schema.Feature.Value.Type.ValueCount;

        TrainCore(channel);
    }
}

/// <summary>
/// Creates the score tracker for the training set and, when a validation set exists, for that set too.
/// </summary>
private void DefineScoreTrackers()
{
    TrainSetScore = new ScoreTracker("train", TrainSet, null);

    if (!HasValidSet)
    {
        return;
    }

    ValidSetScore = new ScoreTracker("valid", ValidSet, null);
}

/// <summary>
/// Sets up the test used for post-training pruning. TrainBase calls this only when a validation set exists.
/// </summary>
protected abstract void DefinePruningTest();
/// <summary>
/// Validates the label column of the training data; called from ConvertData before binning.
/// </summary>
/// <param name="data">The training data whose label is checked.</param>
private protected abstract void CheckLabel(RoleMappedData data);

/// <summary>
/// Validates the training data and converts it (and the optional validation data) into binned datasets.
/// </summary>
/// <param name="trainData">Training data; its feature, weight and label columns are checked first.</param>
/// <param name="validationData">Optional validation data, binned compatibly with the training set; may be null.</param>
private void ConvertData(RoleMappedData trainData, RoleMappedData validationData)
{
    trainData.CheckFeatureFloatVector();
    trainData.CheckOptFloatWeight();
    CheckLabel(trainData);

    bool transpose = UseTranspose(Args.DiskTranspose, trainData);
    var binConverter = new ExamplesToFastTreeBins(Host, Args.MaxBins, transpose, !Args.FeatureFlocks, Args.MinDocuments, float.PositiveInfinity);

    ParallelTraining.InitEnvironment();
    TrainSet = binConverter.FindBinsAndReturnDataset(trainData, PredictionKind, ParallelTraining, null, false);
    FeatureMap = binConverter.FeatureMap;

    if (validationData != null)
    {
        ValidSet = binConverter.GetCompatibleDataset(validationData, PredictionKind, null, false);
    }

    // A feature map, when present, has exactly one entry per dataset feature.
    Host.Assert(FeatureMap == null || FeatureMap.Length == TrainSet.NumFeatures);
}

/// <summary>
/// Decides whether the transposed form of the data should be used.
/// </summary>
/// <param name="useTranspose">Explicit preference; when set it is returned unchanged.</param>
/// <param name="data">Data to inspect when no explicit preference was given.</param>
/// <returns>The explicit preference, or otherwise whether the data is a transpose data view that
/// reports a slot type for the feature column.</returns>
private bool UseTranspose(bool? useTranspose, RoleMappedData data)
{
    Host.AssertValue(data);
    Host.Assert(data.Schema.Feature.HasValue);

    return useTranspose
        ?? (data.Data is ITransposeDataView td && td.TransposeSchema.GetSlotType(data.Schema.Feature.Value.Index) != null);
}

/// <summary>
/// Initializes the training structures and trains the model while holding the shared FastTree training lock.
/// </summary>
/// <param name="ch">The channel to write to; must not be null.</param>
private void TrainCore(IChannel ch)
{
    Contracts.CheckValue(ch, nameof(ch));

    // REVIEW: Get rid of this lock when we completely remove all static classes from Gam such as BlockingThreadPool.
    lock (FastTreeShared.TrainLock)
    {
        using (Timer.Time(TimerEvent.TotalInitialization))
        {
            Initialize(ch);
        }

        using (Timer.Time(TimerEvent.TotalTrain))
        {
            TrainMainEffectsModel(ch);
        }
    }
}

/// <summary>
/// Training algorithm for the single-feature functions f(x): for each iteration it computes the
/// gradient, sums it into the per-flock histograms, fits one stump per feature and updates the
/// tracked scores. After the last iteration the stumps are combined into per-bin effects.
/// </summary>
/// <param name="ch">The channel to write to</param>
private void TrainMainEffectsModel(IChannel ch)
{
    Contracts.AssertValue(ch);
    int iterations = Args.NumIterations;

    ch.Info("Starting to train ...");

    using (var pch = Host.StartProgressChannel("GAM training"))
    {
        _objectiveFunction = CreateObjectiveFunction();
        var sumWeights = HasWeights ? TrainSet.SampleWeights.Sum() : 0;

        // Declared outside the loop because the progress callback below captures it.
        int iteration = 0;
        pch.SetHeader(new ProgressHeader("iterations"), e => e.SetProgress(0, iteration, iterations));
        for (; iteration < iterations; iteration++)
        {
            using (Timer.Time(TimerEvent.Iteration))
            {
                var gradient = _objectiveFunction.GetGradient(ch, TrainSetScore.Scores);
                var sumTargets = gradient.Sum();

                SumUpsAcrossFlocks(gradient, sumTargets, sumWeights);
                TrainOnEachFeature(gradient, TrainSetScore.Scores, sumTargets, sumWeights, iteration);
                UpdateScores(iteration);
            }
        }
    }

    CombineGraphs(ch);
}

/// <summary>
/// Accumulates the gradient into the histogram of every feature flock, one task item per flock.
/// </summary>
/// <param name="gradient">Gradient values passed to each flock's sum-up.</param>
/// <param name="sumTargets">Sum of all gradient values.</param>
/// <param name="sumWeights">Sum of the sample weights (0 when the data is unweighted).</param>
private void SumUpsAcrossFlocks(double[] gradient, double sumTargets, double sumWeights)
{
    int flockCount = TrainSet.NumFlocks;

    var task = ThreadTaskManager.MakeTask(
        (flock) =>
        {
            var stats = _histogram[flock];
            stats.Sumup(
                TrainSet.FlockToFirstFeature(flock),
                null,
                TrainSet.NumDocs,
                sumTargets,
                sumWeights,
                gradient,
                TrainSet.SampleWeights,
                null);
        }, flockCount);

    task.RunTask();
}

/// <summary>
/// Runs one training iteration for every feature of the training set, one task item per feature.
/// </summary>
/// <param name="gradient">Current gradient.</param>
/// <param name="scores">Current training scores.</param>
/// <param name="sumTargets">Sum of all gradient values.</param>
/// <param name="sumWeights">Sum of the sample weights.</param>
/// <param name="iteration">Zero-based index of the current iteration.</param>
private void TrainOnEachFeature(double[] gradient, double[] scores, double sumTargets, double sumWeights, int iteration)
{
    int featureCount = TrainSet.NumFeatures;

    var task = ThreadTaskManager.MakeTask(
        (featureIndex) =>
        {
            TrainingIteration(featureIndex, gradient, scores, sumTargets, sumWeights, iteration);
        }, featureCount);

    task.RunTask();
}

/// <summary>
/// Finds the best split for one feature and, when that split has positive gain, stores it as the
/// feature's stump for the given iteration.
/// </summary>
/// <param name="globalFeatureIndex">Index of the feature within the training set.</param>
/// <param name="gradient">Current gradient (not read directly by this method).</param>
/// <param name="scores">Current training scores (not read directly by this method).</param>
/// <param name="sumTargets">Sum of all gradient values.</param>
/// <param name="sumWeights">Sum of the sample weights.</param>
/// <param name="iteration">Zero-based index of the current iteration.</param>
private void TrainingIteration(int globalFeatureIndex, double[] gradient, double[] scores,
    double sumTargets, double sumWeights, int iteration)
{
    TrainSet.MapFeatureToFlockAndSubFeature(globalFeatureIndex, out int flock, out int subFeature);

    // Search this feature's histogram for its best split.
    _histogram[flock].FindBestSplitForFeature(_leafSplitHelper, _leafSplitCandidates,
        _leafSplitCandidates.Targets.Length, sumTargets, sumWeights,
        globalFeatureIndex, flock, subFeature, Args.MinDocuments, HasWeights,
        _gainConfidenceInSquaredStandardDeviations, _entropyCoefficient,
        TrainSet.Flocks[flock].Trust(subFeature), 0);

    // Only a split with positive gain changes the model.
    bool hasGain = _leafSplitCandidates.FeatureSplitInfo[globalFeatureIndex].Gain > 0;
    if (hasGain)
    {
        ConvertTreeToGraph(globalFeatureIndex, iteration);
    }
}

/// <summary>
/// Applies the stumps learned in the given iteration to the scores of every tracked dataset
/// and notifies each tracker that its scores changed.
/// </summary>
/// <param name="iteration">Zero-based index of the iteration whose stumps are applied.</param>
private void UpdateScores(int iteration)
{
    // The score arrays are updated in place, so the update notification is raised manually.
    UpdateScoresForSet(TrainSet, TrainSetScore.Scores, iteration);
    TrainSetScore.SendScoresUpdatedMessage();

    if (!HasValidSet)
    {
        return;
    }

    UpdateScoresForSet(ValidSet, ValidSetScore.Scores, iteration);
    ValidSetScore.SendScoresUpdatedMessage();
}

/// <summary>
/// Adds, for every feature, the output of that feature's stump for the given iteration to the
/// score of each document. Documents are split into contiguous blocks, one per thread.
/// </summary>
/// <param name="dataset">The dataset to use.</param>
/// <param name="scores">The current scores for this dataset; updated in place.</param>
/// <param name="iteration">The iteration of the algorithm.
/// Used to look up the sub-graph to use to update the score.</param>
private void UpdateScoresForSet(Dataset dataset, double[] scores, int iteration)
{
    DefineDocumentThreadBlocks(dataset.NumDocs, BlockingThreadPool.NumThreads, out int[] blockBounds);

    var task = ThreadTaskManager.MakeTask(
        (blockIndex) =>
        {
            int firstDoc = blockBounds[blockIndex];
            int endDoc = blockBounds[blockIndex + 1];

            for (int feature = 0; feature < _subGraph.Splits.Length; feature++)
            {
                var indexer = dataset.GetIndexer(feature);
                var stump = _subGraph.Splits[feature][iteration];

                for (int doc = firstDoc; doc < endDoc; doc++)
                {
                    // Bins at or below the split point take the "less than or equal" output.
                    scores[doc] += indexer[doc] <= stump.SplitPoint ? stump.LteValue : stump.GtValue;
                }
            }
        }, BlockingThreadPool.NumThreads);

    task.RunTask();
}

/// <summary>
/// Combine the single-feature single-tree graphs to a single-feature model: picks the number of
/// iterations to keep (all of them, or the pruning test's best iteration), sums the kept stumps
/// into per-bin effects for every feature and finally centers the result.
/// </summary>
/// <param name="ch">The channel to write to.</param>
private void CombineGraphs(IChannel ch)
{
    // Prune backwards to the best iteration
    int bestIteration = Args.NumIterations;
    if (Args.EnablePruning && PruningTest != null)
    {
        ch.Info("Pruning");
        var finalResult = PruningTest.ComputeTests().ToArray()[PruningLossIndex];
        string lossFunctionName = finalResult.LossFunctionName;

        bestIteration = PruningTest.BestIteration;
        double bestLoss = PruningTest.BestResult.FinalValue;

        if (bestIteration != Args.NumIterations)
            ch.Info($"Best Iteration ({lossFunctionName}): {bestIteration} @ {bestLoss:G6} (vs {Args.NumIterations} @ {finalResult.FinalValue:G6}).");
        else
            ch.Info("No pruning necessary. More iterations may be necessary.");
    }

    // Combine the graphs to compute the per-feature (binned) Effects
    BinEffects = new double[TrainSet.NumFeatures][];
    for (int featureIndex = 0; featureIndex < TrainSet.NumFeatures; featureIndex++)
    {
        TrainSet.MapFeatureToFlockAndSubFeature(featureIndex, out int flockIndex, out int subFeatureIndex);
        int numOfBins = TrainSet.Flocks[flockIndex].BinCount(subFeatureIndex);
        BinEffects[featureIndex] = new double[numOfBins];

        for (int iteration = 0; iteration < bestIteration; iteration++)
        {
            var splitPoint = _subGraph.Splits[featureIndex][iteration].SplitPoint;

            // Bins up to and including the split point get the "less than or equal" output...
            for (int bin = 0; bin <= splitPoint; bin++)
                BinEffects[featureIndex][bin] += _subGraph.Splits[featureIndex][iteration].LteValue;

            // ...and every bin after it gets the "greater than" output.
            for (int bin = (int)splitPoint + 1; bin < numOfBins; bin++)
                BinEffects[featureIndex][bin] += _subGraph.Splits[featureIndex][iteration].GtValue;
        }
    }

    // Center the graph around zero
    CenterGraph();
}

/// <summary>
/// Splits the documents into contiguous blocks, one per thread. Block sizes differ by at most one:
/// the first <c>numDocs % numThreads</c> blocks each receive one extra document.
/// </summary>
/// <param name="numDocs">The number of documents in the dataset</param>
/// <param name="numThreads">The number of threads used.</param>
/// <param name="blocks">Receives an array of <c>numThreads + 1</c> boundaries: entry t is the inclusive
/// start for thread t and entry t + 1 is its exclusive end.</param>
private void DefineDocumentThreadBlocks(int numDocs, int numThreads, out int[] blocks)
{
    int remainder = numDocs % numThreads;
    int baseSize = numDocs / numThreads;

    blocks = new int[numThreads + 1];
    blocks[0] = 0;

    for (int thread = 0; thread < numThreads; thread++)
    {
        int size = thread < remainder ? baseSize + 1 : baseSize;
        blocks[thread + 1] = blocks[thread] + size;
    }
}

/// <summary>
/// Center the graph using the mean response per feature on the training set: the mean effect of
/// every feature is subtracted from that feature's bins and added to the intercept
/// (<c>MeanEffect</c>).
/// </summary>
private void CenterGraph()
{
    // Define this once
    DefineDocumentThreadBlocks(TrainSet.NumDocs, BlockingThreadPool.NumThreads, out int[] trainThreadBlocks);

    // Compute the mean of each Effect
    var meanEffects = new double[BinEffects.Length];
    var updateTask = ThreadTaskManager.MakeTask(
        (threadIndex) =>
        {
            int startIndexInclusive = trainThreadBlocks[threadIndex];
            int endIndexExclusive = trainThreadBlocks[threadIndex + 1];
            for (int featureIndex = 0; featureIndex < BinEffects.Length; featureIndex++)
            {
                var featureIndexer = TrainSet.GetIndexer(featureIndex);
                for (int doc = startIndexInclusive; doc < endIndexExclusive; doc++)
                {
                    var bin = featureIndexer[doc];
                    double totalEffect;
                    double newTotalEffect;

                    // Update the shared effect, being careful of threading: re-read the current
                    // total and recompute the sum on every attempt, retrying until the
                    // compare-and-swap observes the value the sum was based on.
                    do
                    {
                        totalEffect = meanEffects[featureIndex];
                        newTotalEffect = totalEffect + BinEffects[featureIndex][bin];
                    } while (totalEffect !=
                        Interlocked.CompareExchange(ref meanEffects[featureIndex], newTotalEffect, totalEffect));
                }
            }
        }, BlockingThreadPool.NumThreads);
    updateTask.RunTask();

    // Compute the intercept and center each graph
    MeanEffect = 0.0;
    for (int featureIndex = 0; featureIndex < BinEffects.Length; featureIndex++)
    {
        // Compute the mean effect
        meanEffects[featureIndex] /= TrainSet.NumDocs;

        // Shift the mean from the bins into the intercept
        MeanEffect += meanEffects[featureIndex];
        for (int bin = 0; bin < BinEffects[featureIndex].Length; ++bin)
            BinEffects[featureIndex][bin] -= meanEffects[featureIndex];
    }
}

/// <summary>
/// Stores the best split found for a feature as that feature's stump for the given iteration,
/// scaling both leaf outputs by the learning rate.
/// </summary>
/// <param name="globalFeatureIndex">Index of the feature within the training set.</param>
/// <param name="iteration">Zero-based index of the current iteration.</param>
private void ConvertTreeToGraph(int globalFeatureIndex, int iteration)
{
    SplitInfo best = _leafSplitCandidates.FeatureSplitInfo[globalFeatureIndex];

    // Write through a reference so the stump stored in the array is updated in place.
    ref var stump = ref _subGraph.Splits[globalFeatureIndex][iteration];
    stump.SplitPoint = best.Threshold;
    stump.LteValue = Args.LearningRates * best.LteOutput;
    stump.GtValue = Args.LearningRates * best.GTOutput;
}

/// <summary>
/// Allocates one sufficient-statistics object per feature flock of the training set.
/// </summary>
private void InitializeGamHistograms()
{
    int flockCount = TrainSet.Flocks.Length;
    _histogram = new SufficientStatsBase[flockCount];

    for (int flock = 0; flock < TrainSet.Flocks.Length; flock++)
    {
        _histogram[flock] = TrainSet.Flocks[flock].CreateSufficientStats(HasWeights);
    }
}

/// <summary>
/// Allocates the structures used during training: the per-flock histograms, the stump storage,
/// the split candidates and the leaf statistics helper.
/// </summary>
/// <param name="ch">The channel to write to (not used by this method).</param>
private void Initialize(IChannel ch)
{
    using (Timer.Time(TimerEvent.InitializeTraining))
    {
        InitializeGamHistograms();

        // One stump slot per (feature, iteration) pair.
        _subGraph = new SubGraph(TrainSet.NumFeatures, Args.NumIterations);

        _leafSplitCandidates = new LeastSquaresRegressionTreeLearner.LeafSplitCandidates(TrainSet);
        _leafSplitHelper = new LeafSplitHelper(HasWeights);
    }
}

/// <summary>
/// Sets up single-machine training and initializes the thread task manager. The thread count is
/// the configured value (or the processor count when unset), capped at the host's concurrency
/// factor when that factor is positive.
/// </summary>
private void InitializeThreads()
{
    ParallelTraining = new SingleTrainer();

    int threadCount = Args.NumThreads ?? Environment.ProcessorCount;

    bool exceedsHostLimit = Host.ConcurrencyFactor > 0 && threadCount > Host.ConcurrencyFactor;
    if (exceedsHostLimit)
    {
        using (var ch = Host.Start("GamTrainer"))
        {
            threadCount = Host.ConcurrencyFactor;
            ch.Warning("The number of threads specified in trainer arguments is larger than the concurrency factor "
                + "setting of the environment. Using {0} training threads instead.", threadCount);
        }
    }

    ThreadTaskManager.Initialize(threadCount);
}

/// <summary>
/// Creates the objective function whose gradient drives training; called once at the start of
/// TrainMainEffectsModel.
/// </summary>
protected abstract ObjectiveFunctionBase CreateObjectiveFunction();

/// <summary>
/// Computes leaf gains and leaf outputs for the split finder, with separate formulas for
/// weighted and unweighted data.
/// </summary>
private class LeafSplitHelper : ILeafSplitStatisticsCalculator
{
    private readonly bool _hasWeights;

    /// <summary>
    /// Creates the helper.
    /// </summary>
    /// <param name="hasWeights">Whether the training data carries sample weights.</param>
    public LeafSplitHelper(bool hasWeights)
    {
        _hasWeights = hasWeights;
    }

    /// <summary>
    /// Returns the split gain for a particular leaf. Used on two leaves to calculate
    /// the squared error gain for a particular leaf.
    /// </summary>
    /// <param name="count">Number of documents in this leaf</param>
    /// <param name="sumTargets">Sum of the target values for this leaf</param>
    /// <param name="sumWeights">Sum of the weights for this leaf, not meaningful if
    /// the data is unweighted</param>
    /// <returns>The gain in least squared error</returns>
    public double GetLeafSplitGain(int count, double sumTargets, double sumWeights)
    {
        return _hasWeights
            ? -4.0 * (Math.Abs(sumTargets) + sumWeights)
            : (sumTargets * sumTargets) / count;
    }

    /// <summary>
    /// Calculates the output value for a leaf after splitting.
    /// </summary>
    /// <param name="count">Number of documents in this leaf</param>
    /// <param name="sumTargets">Sum of the target values for this leaf</param>
    /// <param name="sumWeights">Sum of the weights for this leaf, not meaningful if
    /// the data is unweighted</param>
    /// <returns>The output value for a leaf</returns>
    public double CalculateSplittedLeafOutput(int count, double sumTargets, double sumWeights)
    {
        if (_hasWeights)
        {
            Contracts.Assert(sumWeights != 0);
            return sumTargets / sumWeights;
        }

        return sumTargets / count;
    }
}

/// <summary>
/// Storage for the stumps learned during training: one stump per feature per iteration.
/// </summary>
private struct SubGraph
{
    /// <summary>Stumps indexed first by feature, then by iteration.</summary>
    public Stump[][] Splits;

    /// <summary>
    /// Allocates a stump for every (feature, iteration) pair, each initialized to split point 0
    /// with both outputs 0.
    /// </summary>
    /// <param name="numFeatures">Number of features.</param>
    /// <param name="numIterations">Number of iterations.</param>
    public SubGraph(int numFeatures, int numIterations)
    {
        Splits = new Stump[numFeatures][];
        for (int feature = 0; feature < numFeatures; ++feature)
        {
            var perIteration = new Stump[numIterations];
            for (int iteration = 0; iteration < numIterations; iteration++)
            {
                perIteration[iteration] = new Stump(0, 0, 0);
            }

            Splits[feature] = perIteration;
        }
    }

    /// <summary>
    /// A single split on one feature: bins at or below <see cref="SplitPoint"/> produce
    /// <see cref="LteValue"/>, all other bins produce <see cref="GtValue"/>.
    /// </summary>
    public struct Stump
    {
        public uint SplitPoint;
        public double LteValue;
        public double GtValue;

        /// <summary>
        /// Creates a stump from its split point and its two outputs.
        /// </summary>
        public Stump(uint splitPoint, double lteValue, double gtValue)
        {
            SplitPoint = splitPoint;
            LteValue = lteValue;
            GtValue = gtValue;
        }
    }
}

public abstract class GamModelParametersBase : ModelParametersBase<float>, IValueMapper, ICalculateFeatureContribution,

IFeatureContributionMapper, ICanSaveInTextFormat, ICanSaveSummary

{

// Per-feature bin upper bounds and effects. The training constructor keeps only the bins at
// which the effect changes, so the two arrays for a feature always have the same length.
private readonly double[][] _binUpperBounds;
private readonly double[][] _binEffects;
// The model intercept (the mean effect supplied at construction, or the value read on load).
public readonly double Intercept;
// Number of features that have a shape function.
private readonly int _numFeatures;
// Input is a float vector of length _inputLength; output is a single float.
private readonly ColumnType _inputType;
private readonly ColumnType _outputType;
// These would be the bins for a totally sparse input.
private readonly int[] _binsAtAllZero;
// The output value for all zeros
private readonly double _valueAtAllZero;
// Entry i is the input-vector index of dataset feature i.
private readonly int[] _featureMap;
// Length of the input feature vector.
private readonly int _inputLength;
// Inverse of _featureMap: input-vector index to dataset feature index.
private readonly Dictionary<int, int> _inputFeatureToDatasetFeatureMap;
ColumnType IValueMapper.InputType => _inputType;
ColumnType IValueMapper.OutputType => _outputType;
// Creates a new calculator on every access.
public FeatureContributionCalculator FeatureContributionClaculator => new FeatureContributionCalculator(this);

/// <summary>
/// Builds the model parameters from the results of training. Validates the inputs, builds the
/// map from input-vector index to dataset feature index, and compresses every feature's
/// per-bin effects so that only the bins at which the effect changes are kept.
/// </summary>
/// <param name="env">Host environment.</param>
/// <param name="name">Registration name passed to the base class.</param>
/// <param name="inputLength">Length of the input feature vector; must be at least the number of dataset features.</param>
/// <param name="trainSet">The binned training set; supplies the bin upper bounds.</param>
/// <param name="meanEffect">The intercept.</param>
/// <param name="binEffects">Per-feature, per-bin effects; one non-null array per dataset feature.</param>
/// <param name="featureMap">Maps dataset feature index to input-vector index; null means identity.</param>
private protected GamModelParametersBase(IHostEnvironment env, string name,
    int inputLength, Dataset trainSet, double meanEffect, double[][] binEffects, int[] featureMap)
    : base(env, name)
{
    Host.CheckValue(trainSet, nameof(trainSet));
    Host.CheckParam(trainSet.NumFeatures <= inputLength, nameof(inputLength), "Must be at least as large as dataset number of features");
    Host.CheckParam(featureMap == null || featureMap.Length == trainSet.NumFeatures, nameof(featureMap), "Not of right size");
    Host.CheckValue(binEffects, nameof(binEffects));
    Host.CheckParam(binEffects.Length == trainSet.NumFeatures, nameof(binEffects), "Not of right size");

    _inputLength = inputLength;

    _numFeatures = binEffects.Length;
    _inputType = new VectorType(NumberType.Float, _inputLength);
    _outputType = NumberType.Float;
    _featureMap = featureMap;

    Intercept = meanEffect;

    //No features were filtered.
    if (_featureMap == null)
        _featureMap = Utils.GetIdentityPermutation(trainSet.NumFeatures);

    _inputFeatureToDatasetFeatureMap = new Dictionary<int, int>(_featureMap.Length);
    for (int i = 0; i < _featureMap.Length; i++)
    {
        Host.CheckParam(0 <= _featureMap[i] && _featureMap[i] < inputLength, nameof(_featureMap), "Contains out of range feature value");
        // The dictionary is keyed by input feature index, so a repeated input index shows up as an existing key.
        Host.CheckParam(!_inputFeatureToDatasetFeatureMap.ContainsKey(_featureMap[i]), nameof(_featureMap), "Contains duplicate mappings");
        _inputFeatureToDatasetFeatureMap[_featureMap[i]] = i;
    }

    //keep only bin effect and upperbounds where the effect changes.
    int flockIndex;
    int subFeatureIndex;
    _binUpperBounds = new double[_numFeatures][];
    _binEffects = new double[_numFeatures][];
    var newBinEffects = new List<double>();
    var newBinBoundaries = new List<double>();
    _binsAtAllZero = new int[_numFeatures];

    for (int i = 0; i < _numFeatures; i++)
    {
        trainSet.MapFeatureToFlockAndSubFeature(i, out flockIndex, out subFeatureIndex);
        double[] binUpperBound = trainSet.Flocks[flockIndex].BinUpperBounds(subFeatureIndex);
        double[] binEffect = binEffects[i];
        Host.CheckValue(binEffect, nameof(binEffects), "Array contained null entries");
        Host.CheckParam(binUpperBound.Length == binEffect.Length, nameof(binEffects), "Array contained wrong number of effects");

        // Emit one (upper bound, effect) pair for each run of equal effects: a run is closed
        // when the next bin's effect differs, using the upper bound of the run's last bin.
        double value = binEffect[0];
        for (int j = 0; j < binEffect.Length; j++)
        {
            double element = binEffect[j];
            if (element != value)
            {
                newBinEffects.Add(value);
                newBinBoundaries.Add(binUpperBound[j - 1]);
                value = element;
            }
        }

        // The final run always ends at the last bin.
        newBinBoundaries.Add(binUpperBound[binEffect.Length - 1]);
        newBinEffects.Add(binEffect[binEffect.Length - 1]);

        _binUpperBounds[i] = newBinBoundaries.ToArray();
        _binEffects[i] = newBinEffects.ToArray();

        // NOTE(review): this takes the first bin's effect as the feature's contribution for an
        // all-zero input and leaves _binsAtAllZero[i] at 0; confirm that zero always falls in the first bin.
        _valueAtAllZero += _binEffects[i][0];
        newBinEffects.Clear();
        newBinBoundaries.Clear();
    }
}

/// <summary>
/// Deserializing constructor. Reads, in order: the feature count, the input length, the intercept,
/// one bin-effect array per feature, one bin-upper-bound array per feature (each as long as the
/// matching effect array), and finally the input-index to feature-index mapping as a count
/// followed by (key, value) pairs.
/// </summary>
protected GamModelParametersBase(IHostEnvironment env, string name, ModelLoadContext ctx)
    : base(env, name)
{
    Host.CheckValue(ctx, nameof(ctx));
    BinaryReader input = ctx.Reader;

    _numFeatures = input.ReadInt32();
    Host.CheckDecode(_numFeatures >= 0);
    _inputLength = input.ReadInt32();
    Host.CheckDecode(_inputLength >= 0);
    Intercept = input.ReadDouble();

    _binEffects = new double[_numFeatures][];
    _binUpperBounds = new double[_numFeatures][];
    _binsAtAllZero = new int[_numFeatures];

    // Every feature must carry at least one bin effect.
    for (int f = 0; f < _numFeatures; f++)
    {
        _binEffects[f] = input.ReadDoubleArray();
        Host.CheckDecode(Utils.Size(_binEffects[f]) >= 1);
    }

    for (int f = 0; f < _numFeatures; f++)
    {
        int binCount = _binEffects[f].Length;
        _binUpperBounds[f] = input.ReadDoubleArray(binCount);

        // Ideally we would verify that the accumulated sum matches _baseOutput, but
        // JIT differences over time (among other things) mean the sum can legitimately
        // drift even when the model is not corrupted, so no such check is made.
        _valueAtAllZero += GetBinEffect(f, 0, out _binsAtAllZero[f]);
    }

    int mappingCount = input.ReadInt32();
    Host.CheckDecode(mappingCount >= 0);
    _inputFeatureToDatasetFeatureMap = new Dictionary<int, int>(mappingCount);
    _featureMap = Utils.CreateArray(_numFeatures, -1);
    for (int remaining = mappingCount; remaining > 0; remaining--)
    {
        int inputIndex = input.ReadInt32();
        Host.CheckDecode(inputIndex >= 0 && inputIndex < _inputLength);
        int datasetIndex = input.ReadInt32();
        Host.CheckDecode(datasetIndex >= 0 && datasetIndex < _numFeatures);

        // Each input index and each dataset feature may be mapped at most once.
        Host.CheckDecode(!_inputFeatureToDatasetFeatureMap.ContainsKey(inputIndex));
        Host.CheckDecode(_featureMap[datasetIndex] == -1);

        _inputFeatureToDatasetFeatureMap[inputIndex] = datasetIndex;
        _featureMap[datasetIndex] = inputIndex;
    }

    _inputType = new VectorType(NumberType.Float, _inputLength);
    _outputType = NumberType.Float;
}

/// <summary>
/// Writes the model in the layout the deserializing constructor reads back:
/// the feature count, the input length, the intercept, one bin-effect array per feature,
/// one bin-upper-bound array per feature written without a count (its length equals that of the
/// matching effect array), then the number of input-index to feature-index mappings followed
/// by each (key, value) pair.
/// </summary>
/// <param name="ctx">The save context whose writer receives the model.</param>
private protected override void SaveCore(ModelSaveContext ctx)
{
    Host.CheckValue(ctx, nameof(ctx));

    Host.Assert(_numFeatures >= 0);
    ctx.Writer.Write(_numFeatures);
    Host.Assert(_inputLength >= 0);
    ctx.Writer.Write(_inputLength);
    ctx.Writer.Write(Intercept);

    for (int i = 0; i < _numFeatures; i++)
        ctx.Writer.WriteDoubleArray(_binEffects[i]);

    for (int i = 0; i < _numFeatures; i++)
    {
        // The reader sizes this array from the effect array, so the lengths must agree.
        Host.Assert(_binUpperBounds[i].Length == _binEffects[i].Length);
        ctx.Writer.WriteDoublesNoCount(_binUpperBounds[i]);
    }

    ctx.Writer.Write(_inputFeatureToDatasetFeatureMap.Count);
    foreach (KeyValuePair<int, int> kvp in _inputFeatureToDatasetFeatureMap)
    {
        ctx.Writer.Write(kvp.Key);
        ctx.Writer.Write(kvp.Value);
    }
}

/// <summary>
/// Returns the scoring delegate. Only the VBuffer&lt;float&gt; to float mapping is supported;
/// any other type arguments fail the checks below.
/// </summary>
ValueMapper<TIn, TOut> IValueMapper.GetMapper<TIn, TOut>()
{
    Host.Check(typeof(TIn) == typeof(VBuffer<float>));
    Host.Check(typeof(TOut) == typeof(float));

    // Go through Delegate so the strongly typed mapper can be cast to the generic signature.
    Delegate mapper = new ValueMapper<VBuffer<float>, float>(Map);
    return (ValueMapper<TIn, TOut>)mapper;
}

/// <summary>
/// Scores one feature vector: the intercept plus the bin effect of every mapped feature.
/// </summary>
/// <param name="features">The input vector; its length must equal the model's input length.</param>
/// <param name="response">Receives the score, narrowed to float.</param>
private void Map(in VBuffer<float> features, ref float response)
{
    Host.CheckParam(features.Length == _inputLength, nameof(features), "Bad length of input");

    double score = Intercept;
    var values = features.GetValues();
    if (!features.IsDense)
    {
        var indices = features.GetIndices();

        // Start from the precomputed total for an all-zero input...
        score += _valueAtAllZero;
        for (int k = 0; k < values.Length; ++k)
        {
            if (!_inputFeatureToDatasetFeatureMap.TryGetValue(indices[k], out int feature))
                continue;

            // ...then swap the at-zero effect for the actual one on each explicit entry.
            score += GetBinEffect(feature, values[k]) - GetBinEffect(feature, 0);
        }
    }
    else
    {
        for (int k = 0; k < values.Length; ++k)
        {
            if (!_inputFeatureToDatasetFeatureMap.TryGetValue(k, out int feature))
                continue;

            score += GetBinEffect(feature, values[k]);
        }
    }

    response = (float)score;
}

/// <summary>
/// Scores one feature vector and records, for each mapped feature, the index of the bin
/// its value fell into.
/// </summary>
/// <param name="features">The input vector; its length must equal the model's input length.</param>
/// <param name="bins">Output buffer with one slot per model feature; receives the bin indices.</param>
/// <returns>The intercept plus the bin effect of every mapped feature.</returns>
internal double GetFeatureBinsAndScore(in VBuffer<float> features, int[] bins)
{
    Host.CheckParam(features.Length == _inputLength, nameof(features));
    Host.CheckParam(Utils.Size(bins) == _numFeatures, nameof(bins));

    double score = Intercept;
    var values = features.GetValues();
    if (!features.IsDense)
    {
        var indices = features.GetIndices();

        // Seed both the score and the bins with the precomputed all-zero results.
        score += _valueAtAllZero;
        Array.Copy(_binsAtAllZero, bins, _numFeatures);

        // Then overwrite them for the entries that are explicitly present.
        for (int k = 0; k < values.Length; ++k)
        {
            if (!_inputFeatureToDatasetFeatureMap.TryGetValue(indices[k], out int feature))
                continue;

            // Add the actual effect and remove the at-zero effect counted above.
            score += GetBinEffect(feature, values[k], out bins[feature]) - GetBinEffect(feature, 0);
        }
    }
    else
    {
        for (int k = 0; k < values.Length; ++k)
        {
            if (!_inputFeatureToDatasetFeatureMap.TryGetValue(k, out int feature))
                continue;

            score += GetBinEffect(feature, values[k], out bins[feature]);
        }
    }

    return score;
}

/// <summary>
/// Looks up the effect of a model feature at a given value: the bin is the one located by
/// Algorithms.FindFirstGE over the feature's bin upper bounds.
/// </summary>
/// <param name="featureIndex">Index into the model's per-feature arrays.</param>
/// <param name="featureValue">The feature value to look up.</param>
/// <returns>The effect stored for the located bin.</returns>
private double GetBinEffect(int featureIndex, double featureValue)
{
    Contracts.Assert(featureIndex >= 0 && featureIndex < _numFeatures);

    double[] upperBounds = _binUpperBounds[featureIndex];
    double[] effects = _binEffects[featureIndex];
    return effects[Algorithms.FindFirstGE(upperBounds, featureValue)];
}

/// <summary>
/// Looks up the effect of a model feature at a given value and also reports which bin was hit.
/// The bin is the one located by Algorithms.FindFirstGE over the feature's bin upper bounds.
/// </summary>
/// <param name="featureIndex">Index into the model's per-feature arrays.</param>
/// <param name="featureValue">The feature value to look up.</param>
/// <param name="binIndex">Receives the index of the located bin.</param>
/// <returns>The effect stored for the located bin.</returns>
private double GetBinEffect(int featureIndex, double featureValue, out int binIndex)
{
    Contracts.Assert(featureIndex >= 0 && featureIndex < _numFeatures);

    double[] upperBounds = _binUpperBounds[featureIndex];
    binIndex = Algorithms.FindFirstGE(upperBounds, featureValue);

    double[] effects = _binEffects[featureIndex];
    return effects[binIndex];
}

/// <summary>
/// Get the bin upper bounds for a feature.
/// </summary>
/// <param name="featureIndex">The index of the feature (in the training vector) to get.</param>
/// <returns>A copy of the bin upper bounds, or an empty array (never null) when the model has
/// no bins for this feature.</returns>
public double[] GetFeatureBinUpperBounds(int featureIndex)
{
    // featureIndex addresses the input vector, and the lookup below is keyed by input index,
    // so the valid range is [0, _inputLength) rather than the number of model features.
    Contracts.Assert(0 <= featureIndex && featureIndex < _inputLength);

    if (!_inputFeatureToDatasetFeatureMap.TryGetValue(featureIndex, out int j))
        return new double[0];

    // Hand out a copy so callers cannot modify the model's own array.
    return (double[])_binUpperBounds[j].Clone();
}

/// <summary>
/// Get the binned weights for a feature.
/// </summary>
/// <param name="featureIndex">The index of the feature (in the training vector) to get.</param>
/// <returns>A copy of the per-bin weights (effects), or an empty array (never null) when the
/// model has no bins for this feature.</returns>
public double[] GetFeatureWeights(int featureIndex)
{
    // featureIndex addresses the input vector, and the lookup below is keyed by input index,
    // so the valid range is [0, _inputLength) rather than the number of model features.
    Contracts.Assert(0 <= featureIndex && featureIndex < _inputLength);

    if (!_inputFeatureToDatasetFeatureMap.TryGetValue(featureIndex, out int j))
        return new double[0];

    // Copy the effects array itself (sized from it, not from the upper bounds) so callers
    // cannot modify the model's own array.
    return (double[])_binEffects[j].Clone();
}

/// <summary>
/// Writes a tab-separated text description of the model: first a table of feature indices and
/// names (with the intercept listed as index -1), then one row per bin giving the feature index,
/// the bin upper bound and the bin effect.
/// </summary>
/// <param name="writer">Destination for the text.</param>
/// <param name="schema">Schema used to look up feature slot names; may be null.</param>
void ICanSaveInTextFormat.SaveAsText(TextWriter writer, RoleMappedSchema schema)
{
    Host.CheckValue(writer, nameof(writer));
    Host.CheckValueOrNull(schema);

    // --- Section 1: feature index table ---
    writer.WriteLine("\xfeffFeature index table"); // add BOM to tell excel this is UTF-8
    writer.WriteLine($"Number of features:\t{_numFeatures+1:D}");
    writer.WriteLine("Feature Index\tFeature Name");

    // REVIEW: Text exporting (here and in other learners) badly needs unit tests.
    // A useful case for this one would be a model trained with:
    // maml.exe train data=Samples\breast-cancer-withheader.txt loader=text{header+ col=Label:0 col=F1:1-4 col=F2:4 col=F3:5-*}
    // xf =expr{col=F2 expr=x:0.0} xf=concat{col=Features:F1,F2,F3} tr=gam out=bubba2.zip

    // The intercept is listed first, under index -1.
    writer.WriteLine("-1\tIntercept");

    var slotNames = default(VBuffer<ReadOnlyMemory<char>>);
    MetadataUtils.GetSlotNames(schema, RoleMappedSchema.ColumnRole.Feature, _inputLength, ref slotNames);
    for (int k = 0; k < _numFeatures; k++)
    {
        int featureIndex = _featureMap[k];
        var name = slotNames.GetItemOrDefault(featureIndex);

        // Fall back to a generated "Feature N" label when the slot has no name.
        string rowFormat = name.IsEmpty ? "{0}\tFeature {0}" : "{0}\t{1}";
        writer.WriteLine(rowFormat, featureIndex, name);
    }

    // --- Section 2: per-feature binned effects ---
    writer.WriteLine();
    writer.WriteLine("Per feature binned effects:");
    writer.WriteLine("Feature Index\tFeature Value Bin Upper Bound\tOutput (effect on label)");
    writer.WriteLine($"{-1:D}\t{float.MaxValue:R}\t{Intercept:R}");
    for (int k = 0; k < _numFeatures; k++)
    {
        int featureIndex = _featureMap[k];
        double[] upperBounds = _binUpperBounds[k];
        double[] binValues = _binEffects[k];
        for (int bin = 0; bin < binValues.Length; ++bin)
        {
            writer.WriteLine($"{featureIndex:D}\t{upperBounds[bin]:R}\t{binValues[bin]:R}");
        }
    }
}

/// <summary>
/// Writes the model summary, which is exactly the text-format export of this model.
/// </summary>
void ICanSaveSummary.SaveSummary(TextWriter writer, RoleMappedSchema schema)
{
    ICanSaveInTextFormat textSaver = this;
    textSaver.SaveAsText(writer, schema);
}

ValueMapper<TSrc, VBuffer<float>> IFeatureContributionMapper.GetFeatureContributionMapper<TSrc, TDstContributions>

(int top, int bottom, bool normalize)

{

Contracts.Check(typeof(TSrc) == typeof(VBuffer<float>));

View remainder of file in raw view

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

FilesExpand file tree

GamTrainer.cs

Latest commit

History

GamTrainer.cs

File metadata and controls