AliSQL/src/execution/join_hashtable.cpp at master · BitaminKim/AliSQL

History

1744 lines (1507 loc) · 68 KB

Raw

100

101

102

103

104

105

106

107

108

109

110

111

112

113

114

115

116

117

118

119

120

121

122

123

124

125

126

127

128

129

130

131

132

133

134

135

136

137

138

139

140

141

142

143

144

145

146

147

148

149

150

151

152

153

154

155

156

157

158

159

160

161

162

163

164

165

166

167

168

169

170

171

172

173

174

175

176

177

178

179

180

181

182

183

184

185

186

187

188

189

190

191

192

193

194

195

196

197

198

199

200

201

202

203

204

205

206

207

208

209

210

211

212

213

214

215

216

217

218

219

220

221

222

223

224

225

226

227

228

229

230

231

232

233

234

235

236

237

238

239

240

241

242

243

244

245

246

247

248

249

250

251

252

253

254

255

256

257

258

259

260

261

262

263

264

265

266

267

268

269

270

271

272

273

274

275

276

277

278

279

280

281

282

283

284

285

286

287

288

289

290

291

292

293

294

295

296

297

298

299

300

301

302

303

304

305

306

307

308

309

310

311

312

313

314

315

316

317

318

319

320

321

322

323

324

325

326

327

328

329

330

331

332

333

334

335

336

337

338

339

340

341

342

343

344

345

346

347

348

349

350

351

352

353

354

355

356

357

358

359

360

361

362

363

364

365

366

367

368

369

370

371

372

373

374

375

376

377

378

379

380

381

382

383

384

385

386

387

388

389

390

391

392

393

394

395

396

397

398

399

400

401

402

403

404

405

406

407

408

409

410

411

412

413

414

415

416

417

418

419

420

421

422

423

424

425

426

427

428

429

430

431

432

433

434

435

436

437

438

439

440

441

442

443

444

445

446

447

448

449

450

451

452

453

454

455

456

457

458

459

460

461

462

463

464

465

466

467

468

469

470

471

472

473

474

475

476

477

478

479

480

481

482

483

484

485

486

487

488

489

490

491

492

493

494

495

496

497

498

499

500

501

502

503

504

505

506

507

508

509

510

511

512

513

514

515

516

517

518

519

520

521

522

523

524

525

526

527

528

529

530

531

532

533

534

535

536

537

538

539

540

541

542

543

544

545

546

547

548

549

550

551

552

553

554

555

556

557

558

559

560

561

562

563

564

565

566

567

568

569

570

571

572

573

574

575

576

577

578

579

580

581

582

583

584

585

586

587

588

589

590

591

592

593

594

595

596

597

598

599

600

601

602

603

604

605

606

607

608

609

610

611

612

613

614

615

616

617

618

619

620

621

622

623

624

625

626

627

628

629

630

631

632

633

634

635

636

637

638

639

640

641

642

643

644

645

646

647

648

649

650

651

652

653

654

655

656

657

658

659

660

661

662

663

664

665

666

667

668

669

670

671

672

673

674

675

676

677

678

679

680

681

682

683

684

685

686

687

688

689

690

691

692

693

694

695

696

697

698

699

700

701

702

703

704

705

706

707

708

709

710

711

712

713

714

715

716

717

718

719

720

721

722

723

724

725

726

727

728

729

730

731

732

733

734

735

736

737

738

739

740

741

742

743

744

745

746

747

748

749

750

751

752

753

754

755

756

757

758

759

760

761

762

763

764

765

766

767

768

769

770

771

772

773

774

775

776

777

778

779

780

781

782

783

784

785

786

787

788

789

790

791

792

793

794

795

796

797

798

799

800

801

802

803

804

805

806

807

808

809

810

811

812

813

814

815

816

817

818

819

820

821

822

823

824

825

826

827

828

829

830

831

832

833

834

835

836

837

838

839

840

841

842

843

844

845

846

847

848

849

850

851

852

853

854

855

856

857

858

859

860

861

862

863

864

865

866

867

868

869

870

871

872

873

874

875

876

877

878

879

880

881

882

883

884

885

886

887

888

889

890

891

892

893

894

895

896

897

898

899

900

901

902

903

904

905

906

907

908

909

910

911

912

913

914

915

916

917

918

919

920

921

922

923

924

925

926

927

928

929

930

931

932

933

934

935

936

937

938

939

940

941

942

943

944

945

946

947

948

949

950

951

952

953

954

955

956

957

958

959

960

961

962

963

964

965

966

967

968

969

970

971

972

973

974

975

976

977

978

979

980

981

982

983

984

985

986

987

988

989

990

991

992

993

994

995

996

997

998

999

1000

#include "duckdb/execution/join_hashtable.hpp"

#include "duckdb/common/exception.hpp"

#include "duckdb/common/radix_partitioning.hpp"

#include "duckdb/common/vector_operations/vector_operations.hpp"

#include "duckdb/execution/ht_entry.hpp"

#include "duckdb/main/client_context.hpp"

#include "duckdb/storage/buffer_manager.hpp"

namespace duckdb {

// File-local shorthands for types nested inside JoinHashTable, so that the definitions below can refer to them
// without the JoinHashTable:: qualifier.
using ValidityBytes = JoinHashTable::ValidityBytes;

using ScanStructure = JoinHashTable::ScanStructure;

using ProbeSpill = JoinHashTable::ProbeSpill;

// Note: this alias is shorter than the nested type's name (it drops "Append").
using ProbeSpillLocalState = JoinHashTable::ProbeSpillLocalAppendState;

//! Constructs the state that ProbeState and InsertState both build on: a UBIGINT vector that holds salts and two
//! selection vectors (keys that still have to be compared / keys that did not match), each sized to
//! STANDARD_VECTOR_SIZE.
JoinHashTable::SharedState::SharedState()

: salt_v(LogicalType::UBIGINT), keys_to_compare_sel(STANDARD_VECTOR_SIZE), keys_no_match_sel(STANDARD_VECTOR_SIZE) {

}

//! Constructs the probe-side state: on top of SharedState it allocates a UBIGINT vector for hash table offsets,
//! a HASH vector that holds the densified hashes, and a selection vector sized to STANDARD_VECTOR_SIZE.
JoinHashTable::ProbeState::ProbeState()

: SharedState(), ht_offsets_v(LogicalType::UBIGINT), hashes_dense_v(LogicalType::HASH),

non_empty_sel(STANDARD_VECTOR_SIZE) {

}

//! Constructs the insert-side state for `ht`: on top of SharedState it allocates two selection vectors sized to
//! STANDARD_VECTOR_SIZE and a POINTER vector for the locations of rows that are already in the hash table.
//! The chunk (lhs_data) and chunk state used to gather keys for comparison are initialized for the
//! equality-predicate columns of `ht` only.
JoinHashTable::InsertState::InsertState(const JoinHashTable &ht)
    : SharedState(), remaining_sel(STANDARD_VECTOR_SIZE), key_match_sel(STANDARD_VECTOR_SIZE),
      rhs_row_locations(LogicalType::POINTER) {
	const auto &key_columns = ht.equality_predicate_columns;
	auto &collection = *ht.data_collection;

	collection.InitializeChunk(lhs_data, key_columns);
	collection.InitializeChunkState(chunk_state, key_columns);
}

//! Creates an empty join hash table.
//!
//! The join conditions are split into equality predicates (COMPARE_EQUAL / COMPARE_NOT_DISTINCT_FROM) and all
//! remaining predicates. From the condition and build types the row layout
//!     [condition columns, build columns, (optional "found" BOOLEAN), HASH]
//! is derived, the row matchers are initialized, and the tuple data collections that hold the build side are
//! created.
//!
//! context_p:        used to obtain the buffer manager and, for SINGLE joins, the client config
//! op_p:             stored in `op`
//! conditions_p:     the join conditions; equality conditions must come before all other conditions
//! btypes:           types of the build-side payload columns (moved into build_types)
//! type_p:           the join type
//! output_columns_p: stored in `output_columns`
JoinHashTable::JoinHashTable(ClientContext &context_p, const PhysicalOperator &op_p,
                             const vector<JoinCondition> &conditions_p, vector<LogicalType> btypes, JoinType type_p,
                             const vector<idx_t> &output_columns_p)
    : context(context_p), op(op_p), buffer_manager(BufferManager::GetBufferManager(context)), conditions(conditions_p),
      build_types(std::move(btypes)), output_columns(output_columns_p), entry_size(0), tuple_size(0),
      vfound(Value::BOOLEAN(false)), join_type(type_p), finalized(false), has_null(false),
      radix_bits(INITIAL_RADIX_BITS) {
	for (idx_t i = 0; i < conditions.size(); ++i) {
		auto &condition = conditions[i];
		D_ASSERT(condition.left->return_type == condition.right->return_type);
		auto type = condition.left->return_type;
		if (condition.comparison == ExpressionType::COMPARE_EQUAL ||
		    condition.comparison == ExpressionType::COMPARE_NOT_DISTINCT_FROM) {
			// ensure that all equality conditions are at the front,
			// and that all other conditions are at the back
			D_ASSERT(equality_types.size() == condition_types.size());
			equality_types.push_back(type);
			equality_predicates.push_back(condition.comparison);
			equality_predicate_columns.push_back(i);
		} else {
			// all non-equality conditions are at the back
			non_equality_predicates.push_back(condition.comparison);
			non_equality_predicate_columns.push_back(i);
		}

		// NULLs compare as equal only for the (NOT) DISTINCT FROM comparisons
		null_values_are_equal.push_back(condition.comparison == ExpressionType::COMPARE_DISTINCT_FROM ||
		                                condition.comparison == ExpressionType::COMPARE_NOT_DISTINCT_FROM);

		condition_types.push_back(type);
	}
	// at least one equality is necessary
	D_ASSERT(!equality_types.empty());

	// Types for the layout
	auto layout = make_shared_ptr<TupleDataLayout>();
	vector<LogicalType> layout_types(condition_types);
	layout_types.insert(layout_types.end(), build_types.begin(), build_types.end());
	if (PropagatesBuildSide(join_type)) {
		// full/right outer joins need an extra bool to keep track of whether or not a tuple has found a matching entry
		// we place the bool before the NEXT pointer
		layout_types.emplace_back(LogicalType::BOOLEAN);
	}
	layout_types.emplace_back(LogicalType::HASH);
	layout->Initialize(layout_types, false);
	layout_ptr = std::move(layout);

	// The probe-side row matchers are only needed (and only created) if there are non-equality predicates
	needs_chain_matcher = !non_equality_predicates.empty();
	if (needs_chain_matcher) {
		row_matcher_probe = make_uniq<RowMatcher>();
		row_matcher_probe_no_match_sel = make_uniq<RowMatcher>();

		row_matcher_probe->Initialize(false, *layout_ptr, non_equality_predicates, non_equality_predicate_columns);
		row_matcher_probe_no_match_sel->Initialize(true, *layout_ptr, non_equality_predicates,
		                                           non_equality_predicate_columns);
	}

	chains_longer_than_one = false;
	row_matcher_build.Initialize(true, *layout_ptr, equality_predicates);

	const auto &offsets = layout_ptr->GetOffsets();
	// offset of the first column that follows the condition and build columns
	tuple_size = offsets[condition_types.size() + build_types.size()];
	// offset of the last layout column, i.e. the HASH column appended above
	pointer_offset = offsets.back();
	entry_size = layout_ptr->GetRowWidth();

	data_collection = make_uniq<TupleDataCollection>(buffer_manager, layout_ptr);
	sink_collection =
	    make_uniq<RadixPartitionedTupleData>(buffer_manager, layout_ptr, radix_bits, layout_ptr->ColumnCount() - 1);

	// a zero-filled buffer that is exactly one row wide
	dead_end = make_unsafe_uniq_array_uninitialized<data_t>(entry_size);
	memset(dead_end.get(), 0, entry_size);

	if (join_type == JoinType::SINGLE) {
		auto &config = ClientConfig::GetConfig(context);
		single_join_error_on_multiple_rows = config.scalar_subquery_error_on_multiple_rows;
	}

	InitializePartitionMasks();
}

//! The destructor has no work of its own; it is defined out of line in this translation unit.
JoinHashTable::~JoinHashTable() = default;

//! Merges the contents of `other` into this hash table.
//!
//! The data collection is combined while holding data_lock. For MARK joins the NULL flag is merged and, if the
//! mark join is correlated, the correlated counts are combined, both while holding the mark join lock. The sink
//! collection is combined for every join type.
void JoinHashTable::Merge(JoinHashTable &other) {
	{
		// keep the critical section limited to the data collection
		lock_guard<mutex> guard(data_lock);
		data_collection->Combine(*other.data_collection);
	}

	if (join_type == JoinType::MARK) {
		auto &info = correlated_mark_join_info;
		lock_guard<mutex> mj_lock(info.mj_lock);
		has_null = has_null || other.has_null;
		if (!info.correlated_types.empty()) {
			auto &other_info = other.correlated_mark_join_info;
			info.correlated_counts->Combine(*other_info.correlated_counts);
		}
	}

	sink_collection->Combine(*other.sink_collection);
}

//! Splits every hash in `hashes_v` into a salt and a hash table offset (build side).
//!
//! On return both vectors are flat: `salt_v` holds ht_entry_t::ExtractSalt(hash) for each of the `count` rows and
//! `hashes_v` has been overwritten in place with `hash & bitmask`.
static void ApplyBitmaskAndGetSaltBuild(Vector &hashes_v, Vector &salt_v, const idx_t &count, const idx_t &bitmask) {
	if (hashes_v.GetVectorType() == VectorType::CONSTANT_VECTOR) {
		// constant input: compute salt and offset once, then expand both vectors to `count` rows
		auto &hash = *ConstantVector::GetData<hash_t>(hashes_v);
		salt_v.SetVectorType(VectorType::CONSTANT_VECTOR);

		*ConstantVector::GetData<hash_t>(salt_v) = ht_entry_t::ExtractSalt(hash);
		salt_v.Flatten(count);

		hash = hash & bitmask;
		hashes_v.Flatten(count);
	} else {
		hashes_v.Flatten(count);
		auto salts = FlatVector::GetData<hash_t>(salt_v);
		auto hashes = FlatVector::GetData<hash_t>(hashes_v);
		for (idx_t i = 0; i < count; i++) {
			// the salt has to be extracted before the hash is masked down to the offset
			salts[i] = ht_entry_t::ExtractSalt(hashes[i]);
			hashes[i] &= bitmask;
		}
	}
}

//! Translates a position into a row index: with HAS_SEL the position is looked up in `sel`, without it the
//! position is returned unchanged and `sel` is not dereferenced.
template <bool HAS_SEL>
idx_t GetOptionalIndex(const SelectionVector *sel, const idx_t idx) {
	if (HAS_SEL) {
		return sel->get_index(idx);
	}
	return idx;
}

//! Registers the row `row_index` for key comparison against the hash table entry `entry`.
//!
//! The row index is appended to state.keys_to_compare_sel at position `keys_to_compare_count`; the entry's row
//! pointer and the hash table offset are stored at `row_index` in `pointers_result_v` and state.ht_offsets_v.
//! `keys_to_compare_count` is incremented by one.
static void AddPointerToCompare(JoinHashTable::ProbeState &state, const ht_entry_t &entry, Vector &pointers_result_v,
                                idx_t row_ht_offset, idx_t &keys_to_compare_count, const idx_t &row_index) {
	const auto candidate_pointers = FlatVector::GetData<data_ptr_t>(pointers_result_v);
	const auto offsets = FlatVector::GetData<idx_t>(state.ht_offsets_v);

	// the selection is filled densely, the pointer and offset vectors are addressed by row index
	candidate_pointers[row_index] = entry.GetPointer();
	offsets[row_index] = row_ht_offset;
	state.keys_to_compare_sel.set_index(keys_to_compare_count, row_index);

	++keys_to_compare_count;
}

//! Looks up a candidate hash table entry for each of the `count` dense hashes in state.hashes_dense_v.
//!
//! With USE_SALTS, linear probing continues until an empty entry (no candidate) or an entry with a matching salt
//! is found. Without USE_SALTS only the first slot is inspected. Every candidate is registered through
//! AddPointerToCompare under its row index (position `i` mapped through `row_sel` if HAS_SEL is set).
//!
//! Returns the number of rows that were registered for key comparison.
template <bool USE_SALTS, bool HAS_SEL>
static idx_t ProbeForPointersInternal(JoinHashTable::ProbeState &state, JoinHashTable &ht, ht_entry_t *entries,
                                      Vector &hashes_v, Vector &pointers_result_v, const SelectionVector *row_sel,
                                      idx_t &count) {
	auto hashes_dense = FlatVector::GetData<hash_t>(state.hashes_dense_v);

	idx_t keys_to_compare_count = 0;

	for (idx_t i = 0; i < count; i++) {
		auto row_hash = hashes_dense[i]; // hashes has been flattened before -> always access dense
		auto row_ht_offset = row_hash & ht.bitmask;

		if (USE_SALTS) {
			// the salt only depends on the hash, so it is extracted once per row rather than once per probe step
			const hash_t row_salt = ht_entry_t::ExtractSalt(row_hash);

			// increment the ht_offset of the entry as long as next entry is occupied and salt does not match
			while (true) {
				const ht_entry_t entry = entries[row_ht_offset];
				const bool occupied = entry.IsOccupied();

				// the entry is empty -> no match possible
				if (!occupied) {
					break;
				}

				const bool salt_match = entry.GetSalt() == row_salt;
				if (salt_match) {
					// we know that the entry is occupied and the salt matches -> compare the keys
					auto row_index = GetOptionalIndex<HAS_SEL>(row_sel, i);
					AddPointerToCompare(state, entry, pointers_result_v, row_ht_offset, keys_to_compare_count,
					                    row_index);
					break;
				}

				// full and salt does not match -> continue probing
				IncrementAndWrap(row_ht_offset, ht.bitmask);
			}
		} else {
			const ht_entry_t entry = entries[row_ht_offset];
			const bool occupied = entry.IsOccupied();
			if (occupied) {
				// the entry is occupied -> compare the keys
				auto row_index = GetOptionalIndex<HAS_SEL>(row_sel, i);
				AddPointerToCompare(state, entry, pointers_result_v, row_ht_offset, keys_to_compare_count, row_index);
			}
		}
	}

	return keys_to_compare_count;
}

/// For each entry, do linear probing until
/// a) an empty entry is found
///    -> no match
/// b) an occupied entry is found (whose salt also matches, if USE_SALTS is true)
///    -> candidate match, add to the compare sel and increase the found count
///
/// Dispatches to the HAS_SEL variant of ProbeForPointersInternal that corresponds to `has_row_sel` and returns
/// the number of rows that have to be compared.
template <bool USE_SALTS>
static idx_t ProbeForPointers(JoinHashTable::ProbeState &state, JoinHashTable &ht, ht_entry_t *entries,
                              Vector &hashes_v, Vector &pointers_result_v, const SelectionVector *row_sel, idx_t count,
                              const bool has_row_sel) {
	if (has_row_sel) {
		return ProbeForPointersInternal<USE_SALTS, true>(state, ht, entries, hashes_v, pointers_result_v, row_sel,
		                                                 count);
	} else {
		return ProbeForPointersInternal<USE_SALTS, false>(state, ht, entries, hashes_v, pointers_result_v, row_sel,
		                                                  count);
	}
}

//! Gets a pointer to the entry in the HT for each of the hashes_v using linear probing. Will update the match_sel

//! vector and the count argument to the number and position of the matches

//!
//! Each round of the loop below (1) probes for candidate entries, (2) compares the keys of the candidates with
//! ht.row_matcher_build, (3) appends the matching rows to match_sel and (4) advances the rows whose keys did not
//! match to the next hash table slot. The loop ends once no candidates or no mismatches remain.
template <bool USE_SALTS>

static void GetRowPointersInternal(DataChunk &keys, TupleDataChunkState &key_state, JoinHashTable::ProbeState &state,

Vector &hashes_v, const SelectionVector *row_sel, idx_t &count, JoinHashTable &ht,

ht_entry_t *entries, Vector &pointers_result_v, SelectionVector &match_sel,

bool has_row_sel) {

// in case of a hash collision, we need this information to correctly retrieve the salt of this hash

bool uses_unified = false;

UnifiedVectorFormat hashes_unified_v;

// densify hashes: If there is no sel, flatten the hashes, else densify via UnifiedVectorFormat

if (has_row_sel) {

hashes_v.ToUnifiedFormat(count, hashes_unified_v);

uses_unified = true;

auto hashes_unified = UnifiedVectorFormat::GetData<hash_t>(hashes_unified_v);

// NOTE(review): fetched as idx_t here but as hash_t everywhere else in this function

auto hashes_dense = FlatVector::GetData<idx_t>(state.hashes_dense_v);

for (idx_t i = 0; i < count; i++) {

const auto row_index = row_sel->get_index(i);

const auto uvf_index = hashes_unified_v.sel->get_index(row_index);

hashes_dense[i] = hashes_unified[uvf_index];

}

} else {

VectorOperations::Copy(hashes_v, state.hashes_dense_v, count, 0, 0);

}

// the number of keys that match for all iterations of the following loop

idx_t match_count = 0;

// only assigned inside the loop; it is not read when the loop is left through the break below

idx_t keys_no_match_count;

idx_t elements_to_probe_count = count;

do {

const idx_t keys_to_compare_count = ProbeForPointers<USE_SALTS>(state, ht, entries, hashes_v, pointers_result_v,

row_sel, elements_to_probe_count, has_row_sel);

// if there are no keys to compare, we are done

if (keys_to_compare_count == 0) {

break;

}

// Perform row comparisons, after the Match function call keys_to_compare_sel will point to the keys that match

keys_no_match_count = 0;

const idx_t keys_match_count = ht.row_matcher_build.Match(

keys, key_state.vector_data, state.keys_to_compare_sel, keys_to_compare_count, *ht.layout_ptr,

pointers_result_v, &state.keys_no_match_sel, keys_no_match_count);

D_ASSERT(keys_match_count + keys_no_match_count == keys_to_compare_count);

// add the indices to the match_sel

for (idx_t i = 0; i < keys_match_count; i++) {

const auto row_index = state.keys_to_compare_sel.get_index(i);

match_sel.set_index(match_count, row_index);

match_count++;

}

// Linear probing for collisions: Move to the next entry in the HT

auto hashes_unified = UnifiedVectorFormat::GetData<hash_t>(hashes_unified_v);

auto hashes_dense = FlatVector::GetData<hash_t>(state.hashes_dense_v);

auto ht_offsets = FlatVector::GetData<idx_t>(state.ht_offsets_v);

for (idx_t i = 0; i < keys_no_match_count; i++) {

const auto row_index = state.keys_no_match_sel.get_index(i);

// The ProbeForPointers function calculates the ht_offset from the hash; therefore, we have to write the

// new offset into the hashes_v; otherwise the next iteration will start at the old position. This might

// seem as an overhead but assures that the first call of ProbeForPointers is optimized as succeeding

// calls are unlikely (Max 1-(65535/65536)^VectorSize = 3.1%)

auto ht_offset = ht_offsets[row_index];

IncrementAndWrap(ht_offset, ht.bitmask);

// Get original hash from unified vector format to extract the salt if hashes_dense was populated that way

// NOTE(review): without a unified format the hash is re-read from hashes_dense[row_index], but hashes_dense

// is overwritten at the dense position i a few lines below in every round - confirm that the salt bits read

// here still belong to this row once a row fails to match in more than one round

hash_t hash;

if (uses_unified) {

const auto uvf_index = hashes_unified_v.sel->get_index(row_index);

hash = hashes_unified[uvf_index];

} else {

hash = hashes_dense[row_index];

}

// combine the advanced offset with the salt bits of the hash

const auto offset_and_salt = ht_offset | (hash & ht_entry_t::SALT_MASK);

hashes_dense[i] = offset_and_salt; // populate dense again

}

// in the next iteration, we have a selection vector with the keys that do not match

row_sel = &state.keys_no_match_sel;

has_row_sel = true;

elements_to_probe_count = keys_no_match_count;

} while (DUCKDB_UNLIKELY(keys_no_match_count > 0));

// set the count to the number of matches

count = match_count;

}

inline bool JoinHashTable::UseSalt() const {

// only use salt for large hash tables

return this->capacity > USE_SALT_THRESHOLD;

}

//! Finds the matching hash table rows for the probe keys in `keys`.
//!
//! Forwards to GetRowPointersInternal, instantiated with or without salts depending on UseSalt(). On return
//! `count` holds the number of matches, `match_sel` the matching row indices and `pointers_result_v` the
//! pointers of the matched rows.
void JoinHashTable::GetRowPointers(DataChunk &keys, TupleDataChunkState &key_state, ProbeState &state, Vector &hashes_v,
                                   const SelectionVector *sel, idx_t &count, Vector &pointers_result_v,
                                   SelectionVector &match_sel, const bool has_sel) {
	if (UseSalt()) {
		GetRowPointersInternal<true>(keys, key_state, state, hashes_v, sel, count, *this, entries, pointers_result_v,
		                             match_sel, has_sel);
	} else {
		GetRowPointersInternal<false>(keys, key_state, state, hashes_v, sel, count, *this, entries, pointers_result_v,
		                              match_sel, has_sel);
	}
}

//! Computes the combined hash of the equality key columns of `keys` into `hashes`.
//!
//! Only the first equality_types.size() columns are hashed. If `count` equals the chunk size all rows are
//! hashed directly, otherwise only the `count` rows selected by `sel` are hashed.
void JoinHashTable::Hash(DataChunk &keys, const SelectionVector &sel, idx_t count, Vector &hashes) {
	if (count == keys.size()) {
		// no null values are filtered: use regular hash functions
		VectorOperations::Hash(keys.data[0], hashes, keys.size());
		for (idx_t i = 1; i < equality_types.size(); i++) {
			VectorOperations::CombineHash(hashes, keys.data[i], keys.size());
		}
	} else {
		// null values were filtered: use selection vector
		VectorOperations::Hash(keys.data[0], hashes, sel, count);
		for (idx_t i = 1; i < equality_types.size(); i++) {
			VectorOperations::CombineHash(hashes, keys.data[i], sel, count);
		}
	}
}

//! Copies the entries of `sel` whose key in `vdata` is valid (not NULL) into `result`.
//!
//! Only the first `count` entries of `sel` are considered and their relative order is kept.
//! Returns the number of entries written to `result`.
static idx_t FilterNullValues(UnifiedVectorFormat &vdata, const SelectionVector &sel, idx_t count,
                              SelectionVector &result) {
	idx_t result_count = 0;
	for (idx_t i = 0; i < count; i++) {
		auto idx = sel.get_index(i);
		// the validity mask is addressed through the selection of the unified format
		auto key_idx = vdata.sel->get_index(idx);
		if (vdata.validity.RowIsValid(key_idx)) {
			result.set_index(result_count++, idx);
		}
	}
	return result_count;
}

//! Appends one chunk of build-side data (`keys` plus `payload`) to the sink collection.
//!
//! For correlated MARK joins the keys are additionally pushed into the correlated counts. The rows are laid out
//! as [keys, payload, (optional "found" boolean), hash]. Rows that PrepareKeys filters out are not appended, and
//! has_null is set if any row was filtered.
void JoinHashTable::Build(PartitionedTupleDataAppendState &append_state, DataChunk &keys, DataChunk &payload) {
	D_ASSERT(!finalized);
	D_ASSERT(keys.size() == payload.size());
	if (keys.size() == 0) {
		return;
	}

	// special case: correlated mark join
	const bool is_correlated_mark = join_type == JoinType::MARK && !correlated_mark_join_info.correlated_types.empty();
	if (is_correlated_mark) {
		auto &mark_info = correlated_mark_join_info;
		lock_guard<mutex> mj_lock(mark_info.mj_lock);
		// for the correlated mark join we need to keep track of COUNT(*) and COUNT(COLUMN) for each of the correlated
		// columns: push them into the aggregate hash table
		D_ASSERT(mark_info.correlated_counts);

		const idx_t correlated_column_count = mark_info.correlated_types.size();
		mark_info.group_chunk.SetCardinality(keys);
		for (idx_t col = 0; col < correlated_column_count; col++) {
			mark_info.group_chunk.data[col].Reference(keys.data[col]);
		}

		// the payload of the counts is the first key column after the correlated columns
		auto &counted_column = mark_info.correlated_payload;
		if (counted_column.data.empty()) {
			vector<LogicalType> types;
			types.push_back(keys.data[correlated_column_count].GetType());
			counted_column.InitializeEmpty(types);
		}
		counted_column.SetCardinality(keys);
		counted_column.data[0].Reference(keys.data[correlated_column_count]);
		mark_info.correlated_counts->AddChunk(mark_info.group_chunk, counted_column, AggregateType::NON_DISTINCT);
	}

	// assemble the chunk that is appended: [keys, payload, (optional "found" boolean), hash]
	DataChunk row_chunk;
	row_chunk.InitializeEmpty(layout_ptr->GetTypes());

	idx_t next_col = 0;
	for (idx_t key_col = 0; key_col < keys.ColumnCount(); key_col++) {
		row_chunk.data[next_col++].Reference(keys.data[key_col]);
	}
	D_ASSERT(build_types.size() == payload.ColumnCount());
	for (idx_t payload_col = 0; payload_col < payload.ColumnCount(); payload_col++) {
		row_chunk.data[next_col++].Reference(payload.data[payload_col]);
	}
	if (PropagatesBuildSide(join_type)) {
		// for FULL/RIGHT OUTER joins initialize the "found" boolean to false
		row_chunk.data[next_col++].Reference(vfound);
	}
	// next_col now addresses the hash column
	Vector hash_values(LogicalType::HASH);
	row_chunk.data[next_col].Reference(hash_values);
	row_chunk.SetCardinality(keys);

	// bring the whole chunk into unified format; the hash column is redone below once it has been computed
	TupleDataCollection::ToUnifiedFormat(append_state.chunk_state, row_chunk);

	// determine which rows are appended at all
	const SelectionVector *active_sel = nullptr;
	SelectionVector filtered_sel(STANDARD_VECTOR_SIZE);
	const idx_t append_count = PrepareKeys(keys, append_state.chunk_state.vector_data, active_sel, filtered_sel, true);
	if (append_count < keys.size()) {
		has_null = true;
	}
	if (append_count == 0) {
		return;
	}

	// hash the keys; note that only the keys used in the equality comparison are hashed
	Hash(keys, *active_sel, append_count, hash_values);

	// Re-reference and ToUnifiedFormat the hash column after computing it
	row_chunk.data[next_col].Reference(hash_values);
	hash_values.ToUnifiedFormat(row_chunk.size(), append_state.chunk_state.vector_data.back().unified);

	// TupleDataCollection::ToUnifiedFormat was already called above, so AppendUnified can be used here
	sink_collection->AppendUnified(append_state, row_chunk, *active_sel, append_count);
}

//! Determines which rows of `keys` take part in the join, filtering out rows with NULL keys where required.
//!
//! `current_sel` is set to the selection of the remaining rows: the incremental selection vector if nothing was
//! filtered, otherwise `sel`. Nothing is filtered on the build side of joins that propagate the build side, nor
//! for correlated MARK joins. Columns whose comparison treats NULLs as equal are never filtered on.
//!
//! Returns the number of remaining rows.
idx_t JoinHashTable::PrepareKeys(DataChunk &keys, vector<TupleDataVectorFormat> &vector_data,
                                 const SelectionVector *&current_sel, SelectionVector &sel, bool build_side) {
	// start out with every row of the chunk
	current_sel = FlatVector::IncrementalSelectionVector();
	idx_t remaining_count = keys.size();

	// in case of a right or full outer join, we cannot remove NULL keys from the build side
	if (build_side && PropagatesBuildSide(join_type)) {
		return remaining_count;
	}
	// see internal issue 3717: for correlated mark joins no column is filtered, so no column has to be visited
	if (join_type == JoinType::MARK && !correlated_mark_join_info.correlated_types.empty()) {
		return remaining_count;
	}

	const idx_t column_count = keys.ColumnCount();
	for (idx_t col_idx = 0; col_idx < column_count; col_idx++) {
		if (null_values_are_equal[col_idx]) {
			continue;
		}
		auto &col_key_data = vector_data[col_idx].unified;
		if (col_key_data.validity.AllValid()) {
			continue;
		}
		// null values are NOT equal for this column, filter them out
		remaining_count = FilterNullValues(col_key_data, *current_sel, remaining_count, sel);
		current_sel = &sel;
	}
	return remaining_count;
}

//! Writes `pointer` to the memory at `target`, encoded as a 64-bit unsigned integer.
static void StorePointer(const const_data_ptr_t &pointer, const data_ptr_t &target) {
	const auto encoded_pointer = cast_pointer_to_uint64(pointer);
	Store<uint64_t>(encoded_pointer, target);
}

//! Reads a 64-bit unsigned integer from the memory at `source` and returns it as a pointer.
static data_ptr_t LoadPointer(const const_data_ptr_t &source) {
	const auto encoded_pointer = Load<uint64_t>(source);
	return cast_uint64_to_pointer(encoded_pointer);
}

//! Inserts the row `row_ptr_to_insert` at the head of the chain that starts at `entry`.
//!
//! The previous head of the chain (or nullptr) is written into the row at `pointer_offset`, after which the
//! entry is replaced by an entry made of `salt` and the row pointer.
//!
//! PARALLEL:     the entry is updated with compare-exchange instead of a plain load and store
//! EXPECT_EMPTY: (parallel only) the entry is expected to be empty. If it was filled in the meantime the insert
//!               does not happen, and the pointer of the row with which the new entry would have collided is
//!               returned.
//!
//! In any other case a nullptr is returned.
template <bool PARALLEL, bool EXPECT_EMPTY>
static inline data_ptr_t InsertRowToEntry(atomic<ht_entry_t> &entry, const data_ptr_t &row_ptr_to_insert,
                                          const hash_t &salt, const idx_t &pointer_offset) {
	const ht_entry_t desired_entry(salt, row_ptr_to_insert);
	if (PARALLEL) {
		if (EXPECT_EMPTY) {
			// Add nullptr to the end of the list to mark the end
			StorePointer(nullptr, row_ptr_to_insert + pointer_offset);

			ht_entry_t expected_entry;
			entry.compare_exchange_strong(expected_entry, desired_entry, std::memory_order_acquire,
			                              std::memory_order_relaxed);

			// The expected entry is updated with the encountered entry by the compare exchange
			// So, this returns a nullptr if it was empty, and a non-null if it was not (which cancels the insert)
			return expected_entry.GetPointerOrNull();
		} else {
			// At this point we know that the keys match, so we can try to insert until we succeed
			ht_entry_t expected_entry = entry.load(std::memory_order_relaxed);
			D_ASSERT(expected_entry.IsOccupied());
			do {
				// a failed exchange refreshes expected_entry, so the next pointer is rewritten on every attempt
				data_ptr_t current_row_pointer = expected_entry.GetPointer();
				StorePointer(current_row_pointer, row_ptr_to_insert + pointer_offset);
			} while (!entry.compare_exchange_weak(expected_entry, desired_entry, std::memory_order_release,
			                                      std::memory_order_relaxed));

			return nullptr;
		}
	} else {
		// If we are not in parallel mode, we can just do the operation without any checks
		data_ptr_t current_row_pointer = entry.load(std::memory_order_relaxed).GetPointerOrNull();
		StorePointer(current_row_pointer, row_ptr_to_insert + pointer_offset);
		entry = desired_entry;
		return nullptr;
	}
}

//! Compares the keys of the rows that are about to be inserted with the keys of the rows they collided with.
//!
//! The equality-predicate columns of the `count` rows selected by state.keys_to_compare_sel are gathered from
//! `row_locations` into state.lhs_data and matched against the rows in state.rhs_row_locations. Afterwards
//! state.key_match_sel / `key_match_count` describe the rows whose keys match and state.keys_no_match_sel /
//! `key_no_match_count` the rows whose keys differ; both refer to positions within the `count` compared rows.
static inline void PerformKeyComparison(JoinHashTable::InsertState &state, JoinHashTable &ht,
                                        const TupleDataCollection &data_collection, Vector &row_locations,
                                        const idx_t count, idx_t &key_match_count, idx_t &key_no_match_count) {
	auto &gathered_keys = state.lhs_data;
	auto &gather_state = state.chunk_state;
	const auto &key_columns = ht.equality_predicate_columns;

	// Get the data for the rows that need to be compared
	gathered_keys.Reset();
	gathered_keys.SetCardinality(count); // the right size

	// The target selection vector says where to write the results into the gathered chunk, we just want to write
	// sequentially as otherwise we trigger a bug in the Gather function
	data_collection.ResetCachedCastVectors(gather_state, key_columns);
	data_collection.Gather(row_locations, state.keys_to_compare_sel, count, key_columns, gathered_keys,
	                       *FlatVector::IncrementalSelectionVector(), gather_state.cached_cast_vectors);
	TupleDataCollection::ToUnifiedFormat(gather_state, gathered_keys);

	// every gathered row takes part in the comparison
	for (idx_t pos = 0; pos != count; ++pos) {
		state.key_match_sel.set_index(pos, pos);
	}

	// Perform row comparisons
	key_match_count =
	    ht.row_matcher_build.Match(gathered_keys, gather_state.vector_data, state.key_match_sel, count, *ht.layout_ptr,
	                               state.rhs_row_locations, &state.keys_no_match_sel, key_no_match_count);

	D_ASSERT(key_match_count + key_no_match_count == count);
}

//! Finishes one round of key comparison during insertion.
//!
//! Rows whose keys matched (state.key_match_sel) are inserted into the chain of the entry they were compared
//! with. Rows whose keys did not match (state.keys_no_match_sel) have their hash table offset advanced by one
//! slot (wrapping with `capacity_mask`) and are collected in state.remaining_sel for the next round.
//! If at least one key matched, ht.chains_longer_than_one is set.
template <bool PARALLEL>
static inline void InsertMatchesAndIncrementMisses(atomic<ht_entry_t> entries[], JoinHashTable::InsertState &state,
                                                   JoinHashTable &ht, const data_ptr_t lhs_row_locations[],
                                                   idx_t ht_offsets[], const hash_t hash_salts[],
                                                   const idx_t capacity_mask, const idx_t key_match_count,
                                                   const idx_t key_no_match_count) {
	if (key_match_count != 0) {
		ht.chains_longer_than_one = true;
	}

	// Insert the rows that match
	for (idx_t i = 0; i < key_match_count; i++) {
		// the match selection refers to positions in the compare selection, which in turn holds the row index
		const auto need_compare_idx = state.key_match_sel.get_index(i);
		const auto entry_index = state.keys_to_compare_sel.get_index(need_compare_idx);

		const auto &ht_offset = ht_offsets[entry_index];
		auto &entry = entries[ht_offset];
		const auto row_ptr_to_insert = lhs_row_locations[entry_index];

		const auto salt = hash_salts[entry_index];
		InsertRowToEntry<PARALLEL, false>(entry, row_ptr_to_insert, salt, ht.pointer_offset);
	}

	// Linear probing: each of the entries that do not match move to the next entry in the HT
	for (idx_t i = 0; i < key_no_match_count; i++) {
		const auto need_compare_idx = state.keys_no_match_sel.get_index(i);
		const auto entry_index = state.keys_to_compare_sel.get_index(need_compare_idx);

		auto &ht_offset = ht_offsets[entry_index];
		IncrementAndWrap(ht_offset, capacity_mask);

		state.remaining_sel.set_index(i, entry_index);
	}
}

//! Inserts `count` rows into the pointer table `entries`.
//!
//! `hashes_v` holds the hashes of the rows and `row_locations` their addresses. The hashes are first split into
//! hash table offsets (written back into `hashes_v`) and salts (state.salt_v). Then, round by round, every
//! remaining row probes linearly until it finds an empty entry, into which it is inserted, or an entry with a
//! matching salt, in which case its key is compared with the key of the row in that entry. Rows with matching
//! keys are added to the chain of that entry; rows with different keys continue probing in the next round.
//!
//! With PARALLEL set, entries are updated with compare-exchange operations; a row that loses the race for an
//! empty entry is compared against the row that won it.
template <bool PARALLEL>
static void InsertHashesLoop(atomic<ht_entry_t> entries[], Vector &row_locations, Vector &hashes_v, const idx_t &count,
                             JoinHashTable::InsertState &state, const TupleDataCollection &data_collection,
                             JoinHashTable &ht) {
	D_ASSERT(hashes_v.GetType().id() == LogicalType::HASH);
	ApplyBitmaskAndGetSaltBuild(hashes_v, state.salt_v, count, ht.bitmask);

	// the hash table offset and the salt for each row to insert
	const auto ht_offsets = FlatVector::GetData<idx_t>(hashes_v);
	const auto hash_salts = FlatVector::GetData<hash_t>(state.salt_v);
	// the row locations of the rows that are already in the hash table
	const auto rhs_row_locations = FlatVector::GetData<data_ptr_t>(state.rhs_row_locations);
	// the row locations of the rows that are to be inserted
	const auto lhs_row_locations = FlatVector::GetData<data_ptr_t>(row_locations);

	// we start off with the entire chunk
	idx_t remaining_count = count;
	const auto *remaining_sel = FlatVector::IncrementalSelectionVector();

	if (PropagatesBuildSide(ht.join_type)) {
		// if we propagate the build side, we may have added rows with NULL keys to the HT
		// these may need to be filtered out depending on the comparison type (exactly like PrepareKeys does)
		for (idx_t col_idx = 0; col_idx < ht.conditions.size(); col_idx++) {
			// if null values are NOT equal for this column we filter them out
			if (ht.NullValuesAreEqual(col_idx)) {
				continue;
			}

			// NOTE(review): entry_idx and idx_in_entry are computed here but not read below
			idx_t entry_idx;
			idx_t idx_in_entry;
			ValidityBytes::GetEntryIndex(col_idx, entry_idx, idx_in_entry);

			// compacting in place is fine: the write position never runs ahead of the read position
			idx_t new_remaining_count = 0;
			for (idx_t i = 0; i < remaining_count; i++) {
				const auto idx = remaining_sel->get_index(i);
				if (ValidityBytes(lhs_row_locations[idx], count).RowIsValidUnsafe(col_idx)) {
					state.remaining_sel.set_index(new_remaining_count++, idx);
				}
			}
			remaining_count = new_remaining_count;
			remaining_sel = &state.remaining_sel;
		}
	}

	// use the ht bitmask to make the modulo operation faster but keep the salt bits intact
	idx_t capacity_mask = ht.bitmask | ht_entry_t::SALT_MASK;
	while (remaining_count > 0) {
		idx_t salt_match_count = 0;

		// iterate over each entry to find out whether it belongs to an existing list or will start a new list
		for (idx_t i = 0; i < remaining_count; i++) {
			const idx_t row_index = remaining_sel->get_index(i);
			auto &ht_offset = ht_offsets[row_index];
			auto &salt = hash_salts[row_index];

			// increment the ht_offset of the entry as long as next entry is occupied and salt does not match
			ht_entry_t entry;
			bool occupied;
			while (true) {
				atomic<ht_entry_t> &atomic_entry = entries[ht_offset];
				entry = atomic_entry.load(std::memory_order_relaxed);
				occupied = entry.IsOccupied();

				// condition for incrementing the ht_offset: occupied and row_salt does not match -> move to next entry
				if (!occupied) {
					break;
				}
				if (entry.GetSalt() == salt) {
					break;
				}

				IncrementAndWrap(ht_offset, capacity_mask);
			}

			if (!occupied) { // insert into free
				auto &atomic_entry = entries[ht_offset];
				const auto row_ptr_to_insert = lhs_row_locations[row_index];
				const auto potential_collided_ptr =
				    InsertRowToEntry<PARALLEL, true>(atomic_entry, row_ptr_to_insert, salt, ht.pointer_offset);

				if (PARALLEL) {
					// if the insertion was not successful, the entry was occupied in the meantime, so we have to
					// compare the keys and insert the row to the next entry
					if (DUCKDB_UNLIKELY(potential_collided_ptr != nullptr)) {
						state.keys_to_compare_sel.set_index(salt_match_count, row_index);
						rhs_row_locations[salt_match_count] = potential_collided_ptr;
						salt_match_count += 1;
					}
				}
			} else { // compare with full entry
				state.keys_to_compare_sel.set_index(salt_match_count, row_index);
				rhs_row_locations[salt_match_count] = entry.GetPointer();
				salt_match_count += 1;
			}
		}

		// at this step, for all the rows to insert we stepped either until we found an empty entry or an entry with
		// a matching salt, we now need to compare the keys for the ones that have a matching salt
		idx_t key_no_match_count = 0;
		if (salt_match_count != 0) {
			idx_t key_match_count = 0;
			PerformKeyComparison(state, ht, data_collection, row_locations, salt_match_count, key_match_count,
			                     key_no_match_count);
			InsertMatchesAndIncrementMisses<PARALLEL>(entries, state, ht, lhs_row_locations, ht_offsets, hash_salts,
			                                          capacity_mask, key_match_count, key_no_match_count);
		}

		// update the overall selection vector to only point the entries that still need to be inserted
		// as there was no match found for them yet
		remaining_sel = &state.remaining_sel;
		remaining_count = key_no_match_count;
	}
}

// Inserts `count` rows of the current chunk into the pointer table.
//   hashes_v     - vector holding the hashes of the rows to insert
//   count        - number of rows in the chunk
//   chunk_state  - chunk state whose row_locations identify the rows
//   insert_state - scratch state forwarded to InsertHashesLoop
//   parallel     - selects the InsertHashesLoop<true> or InsertHashesLoop<false> instantiation
void JoinHashTable::InsertHashes(Vector &hashes_v, const idx_t count, TupleDataChunkState &chunk_state,
                                 InsertState &insert_state, bool parallel) {
	// both instantiations of the insert loop address the pointer table as atomic entries
	auto atomic_entries = reinterpret_cast<atomic<ht_entry_t> *>(this->entries);
	auto row_locations = chunk_state.row_locations;

	if (parallel) {
		InsertHashesLoop<true>(atomic_entries, row_locations, hashes_v, count, insert_state, *data_collection, *this);
	} else {
		InsertHashesLoop<false>(atomic_entries, row_locations, hashes_v, count, insert_state, *data_collection, *this);
	}
}

// Makes sure a pointer table large enough for PointerTableCapacity(Count()) entries is available.
// An existing allocation is kept when it can already hold that many entries; otherwise a new one is
// allocated. `capacity`, `entries` and `bitmask` are updated to describe the table that ends up in use.
void JoinHashTable::AllocatePointerTable() {
	capacity = PointerTableCapacity(Count());
	D_ASSERT(IsPowerOfTwo(capacity));

	// decide whether the allocation we already hold (if any) is big enough to be reused
	bool reuse_existing_table = false;
	if (hash_map.get()) {
		const auto existing_capacity = hash_map.GetSize() / sizeof(ht_entry_t);
		if (existing_capacity >= capacity) {
			// keep the current hash map and adopt its (possibly larger) capacity
			capacity = existing_capacity;
			reuse_existing_table = true;
		}
	}

	if (!reuse_existing_table) {
		// there is no hash map yet, or the current one is too small: allocate a fresh one
		hash_map = buffer_manager.GetBufferAllocator().Allocate(capacity * sizeof(ht_entry_t));
		entries = reinterpret_cast<ht_entry_t *>(hash_map.get());
	}

	D_ASSERT(hash_map.GetSize() == capacity * sizeof(ht_entry_t));
	// capacity is a power of two, so capacity - 1 masks an offset into the table
	bitmask = capacity - 1;

	DUCKDB_LOG(context, PhysicalOperatorLogType, op, "JoinHashTable", "Build",
	           {{"rows", to_string(data_collection->Count())},
	            {"size", to_string(data_collection->SizeInBytes() + hash_map.GetSize())}});
}

// Resets the pointer table entries in the half-open range [entry_idx_from, entry_idx_to) to
// default-constructed ht_entry_t values.
void JoinHashTable::InitializePointerTable(idx_t entry_idx_from, idx_t entry_idx_to) {
	const auto entries_to_clear = entry_idx_to - entry_idx_from;
	auto first_entry = entries + entry_idx_from;
	std::fill_n(first_entry, entries_to_clear, ht_entry_t());
}

// Inserts the rows of the chunks [chunk_idx_from, chunk_idx_to) of the data collection into the pointer table.
// For every chunk the hash stored in each row (at pointer_offset) is loaded into a scratch vector, which is
// then handed to InsertHashes. `parallel` is forwarded to InsertHashes unchanged.
void JoinHashTable::Finalize(idx_t chunk_idx_from, idx_t chunk_idx_to, bool parallel) {
	// the pointer table must have been allocated before rows can be inserted
	D_ASSERT(hash_map.get());

	// scratch vector that receives the hashes of one chunk at a time
	Vector hash_vector(LogicalType::HASH);
	auto stored_hashes = FlatVector::GetData<hash_t>(hash_vector);

	TupleDataChunkIterator chunk_iter(*data_collection, TupleDataPinProperties::KEEP_EVERYTHING_PINNED,
	                                  chunk_idx_from, chunk_idx_to, false);
	const auto row_ptrs = chunk_iter.GetRowLocations();

	InsertState insert_state(*this);
	do {
		const auto chunk_count = chunk_iter.GetCurrentChunkCount();

		// read back the hash that is stored inside each row
		idx_t row = 0;
		while (row < chunk_count) {
			stored_hashes[row] = Load<hash_t>(row_ptrs[row] + pointer_offset);
			++row;
		}

		InsertHashes(hash_vector, chunk_count, chunk_iter.GetChunkState(), insert_state, parallel);
	} while (chunk_iter.Next());
}

// Prepares `scan_structure` for probing with the given keys.
//   scan_structure - reset (is_null / finished cleared) and filled with the prepared key count
//   keys           - probe keys; converted to unified format into key_state
//   key_state      - receives the unified-format vector data for `keys`
//   current_sel    - passed by reference to PrepareKeys (expected to be set there; callers read it afterwards)
void JoinHashTable::InitializeScanStructure(ScanStructure &scan_structure, DataChunk &keys,
                                            TupleDataChunkState &key_state, const SelectionVector *&current_sel) {
	D_ASSERT(Count() > 0); // should be handled before
	D_ASSERT(finalized);

	// set up the scan structure
	scan_structure.is_null = false;
	scan_structure.finished = false;
	if (join_type != JoinType::INNER) {
		// found_match is only cleared for non-INNER joins
		memset(scan_structure.found_match.get(), 0, sizeof(bool) * STANDARD_VECTOR_SIZE);
	}

	// first prepare the keys for probing
	TupleDataCollection::ToUnifiedFormat(key_state, keys);
	scan_structure.count = PrepareKeys(keys, key_state.vector_data, current_sel, scan_structure.sel_vector, false);

	// if fewer keys remain than were passed in, some rows were filtered out while preparing the keys
	scan_structure.has_null_value_filter = scan_structure.count < keys.size();
}

// Starts a probe of the hash table for the given keys.
// Initializes `scan_structure`, then resolves the initial row pointers for every remaining key via
// GetRowPointers. When `precomputed_hashes` is provided those hashes are used; otherwise the keys are hashed
// here first. Returns early, without touching the pointers, when no keys remain after preparation.
void JoinHashTable::Probe(ScanStructure &scan_structure, DataChunk &keys, TupleDataChunkState &key_state,
                          ProbeState &probe_state, optional_ptr<Vector> precomputed_hashes) {
	const SelectionVector *current_sel;
	InitializeScanStructure(scan_structure, keys, key_state, current_sel);
	if (scan_structure.count == 0) {
		return;
	}

	if (precomputed_hashes) {
		GetRowPointers(keys, key_state, probe_state, *precomputed_hashes, current_sel, scan_structure.count,
		               scan_structure.pointers, scan_structure.sel_vector, scan_structure.has_null_value_filter);
	} else {
		Vector hashes(LogicalType::HASH);
		// hash all the keys
		Hash(keys, *current_sel, scan_structure.count, hashes);

		// now initialize the pointers of the scan structure based on the hashes
		GetRowPointers(keys, key_state, probe_state, hashes, current_sel, scan_structure.count,
		               scan_structure.pointers, scan_structure.sel_vector, scan_structure.has_null_value_filter);
	}
}

// Creates a scan structure bound to hash table `ht_p` and key state `key_state_p`.
// The structure starts with count 0, not finished, and is_null set; all selection vectors are sized to
// STANDARD_VECTOR_SIZE. found_match is allocated through make_unsafe_uniq_array_uninitialized, so its
// contents must be written before they are read (InitializeScanStructure clears it for non-INNER joins).
ScanStructure::ScanStructure(JoinHashTable &ht_p, TupleDataChunkState &key_state_p)
    : key_state(key_state_p),
      pointers(LogicalType::POINTER),
      count(0),
      sel_vector(STANDARD_VECTOR_SIZE),
      chain_match_sel_vector(STANDARD_VECTOR_SIZE),
      chain_no_match_sel_vector(STANDARD_VECTOR_SIZE),
      found_match(make_unsafe_uniq_array_uninitialized<bool>(STANDARD_VECTOR_SIZE)),
      ht(ht_p),
      finished(false),
      is_null(true),
      rhs_pointers(LogicalType::POINTER),
      lhs_sel_vector(STANDARD_VECTOR_SIZE),
      last_match_count(0),
      last_sel_vector(STANDARD_VECTOR_SIZE) {
}

// Produces the next batch of join output by dispatching on the join type of the hash table.
//   keys   - probe keys; must have the same size as `left`
//   left   - probe-side chunk
//   result - output chunk (not passed to the RIGHT_SEMI / RIGHT_ANTI handler)
// Does nothing once the scan is marked finished. Throws InternalException for join types without a handler.
void ScanStructure::Next(DataChunk &keys, DataChunk &left, DataChunk &result) {
	D_ASSERT(keys.size() == left.size());
	if (finished) {
		return;
	}

	switch (ht.join_type) {
	case JoinType::INNER:
	case JoinType::RIGHT:
		NextInnerJoin(keys, left, result);
		break;
	case JoinType::SEMI:
		NextSemiJoin(keys, left, result);
		break;
	case JoinType::MARK:
		NextMarkJoin(keys, left, result);
		break;
	case JoinType::ANTI:
		NextAntiJoin(keys, left, result);
		break;
	case JoinType::RIGHT_ANTI:
	case JoinType::RIGHT_SEMI:
		NextRightSemiOrAntiJoin(keys);
		break;
	case JoinType::OUTER:
	case JoinType::LEFT:
		NextLeftJoin(keys, left, result);
		break;
	case JoinType::SINGLE:
		NextSingleJoin(keys, left, result);
		break;
	default:
		throw InternalException("Unhandled join type in JoinHashTable");
	}
}

bool ScanStructure::PointersExhausted() const {

// AdvancePointers creates a "new_count" for every pointer advanced during the

// previous advance pointers call. If no pointers are advanced, new_count = 0.

// count is then set ot new_count.

return count == 0;

}

// Determines which of the currently selected rows satisfy the join predicates.
//   keys         - probe keys handed to the row matcher
//   match_sel    - receives the indices of the matching rows
//   no_match_sel - optional; when non-null the "no match" matcher variant is used and receives it
// Returns the number of matching rows. When the hash table needs no chain matcher, every currently
// selected row is reported as a match and no_match_sel is left untouched.
idx_t ScanStructure::ResolvePredicates(DataChunk &keys, SelectionVector &match_sel, SelectionVector *no_match_sel) {
	// seed match_sel with the rows currently selected by sel_vector
	for (idx_t i = 0; i < this->count; ++i) {
		match_sel.set_index(i, this->sel_vector.get_index(i));
	}

	if (!ht.needs_chain_matcher) {
		// nothing to filter: the no-match selection is empty, all selected rows match
		return this->count;
	}

	// there is a matcher for the probing side because of non-equality predicates: use it
	idx_t no_match_count = 0;
	auto &matcher = no_match_sel ? ht.row_matcher_probe_no_match_sel : ht.row_matcher_probe;
	D_ASSERT(matcher);

	// only the columns used in the probe phase take part in the match, namely the non-equality columns
	return matcher->Match(keys, key_state.vector_data, match_sel, this->count, *ht.layout_ptr, pointers,
	                      no_match_sel, no_match_count, ht.non_equality_predicate_columns);
}

// Walks the pointer chains until at least one row matches or no pointers are left.
//   keys          - probe keys used to resolve the predicates
//   result_vector - receives the indices of the rows that matched in the current step
// Returns the number of matches found in the current step, or 0 once the chains are exhausted.
// Every matching row is also flagged in found_match when that array is present.
idx_t ScanStructure::ScanInnerJoin(DataChunk &keys, SelectionVector &result_vector) {
	while (true) {
		// resolve the predicates for this set of keys
		idx_t result_count = ResolvePredicates(keys, result_vector, nullptr);

		// after doing all the comparisons set the found_match vector
		if (found_match) {
			for (idx_t i = 0; i < result_count; i++) {
				auto idx = result_vector.get_index(i);
				found_match[idx] = true;
			}
		}
		if (result_count > 0) {
			return result_count;
		}

		// no matches found: check the next set of pointers
		AdvancePointers();
		if (this->count == 0) {
			return 0;
		}
	}
}

// Moves every pointer selected by `sel` to the next element of its chain.
//   sel       - selection of the pointer slots to advance
//   sel_count - number of entries in `sel`
// Slots whose next pointer is non-null are written to sel_vector, and `count` is set to how many there are.
// When the hash table has no chains longer than one there is nothing to follow, so `count` becomes 0.
void ScanStructure::AdvancePointers(const SelectionVector &sel, const idx_t sel_count) {
	if (!ht.chains_longer_than_one) {
		this->count = 0;
		return;
	}

	// now for all the pointers, we move on to the next set of pointers
	idx_t new_count = 0;
	auto ptrs = FlatVector::GetData<data_ptr_t>(this->pointers);
	for (idx_t i = 0; i < sel_count; i++) {
		auto idx = sel.get_index(i);
		// the next-pointer of a row is stored inside the row at pointer_offset
		ptrs[idx] = LoadPointer(ptrs[idx] + ht.pointer_offset);
		if (ptrs[idx]) {
			this->sel_vector.set_index(new_count++, idx);
		}
	}
	this->count = new_count;
}

void ScanStructure::AdvancePointers() {

AdvancePointers(this->sel_vector, this->count);

}

// Gathers column `col_no` of the hash table's data collection into `result`, reading from the rows referenced
// by `pointers`. `sel_vector` / `count` select the pointers to read; `result_vector` is passed to Gather as
// the selection on the result side.
void ScanStructure::GatherResult(Vector &result, const SelectionVector &result_vector,
                                 const SelectionVector &sel_vector, const idx_t count, const idx_t col_no) {
	auto &collection = *ht.data_collection;
	collection.Gather(pointers, sel_vector, count, col_no, result, result_vector, nullptr);
}

// Convenience overload: gathers column `col_idx` for the pointers selected by `sel_vector`, using the
// incremental selection vector on the result side.
void ScanStructure::GatherResult(Vector &result, const SelectionVector &sel_vector, const idx_t count,
                                 const idx_t col_idx) {
	const auto &incremental_sel = *FlatVector::IncrementalSelectionVector();
	GatherResult(result, incremental_sel, sel_vector, count, col_idx);
}

// Gathers column `col_idx` for the first `count` entries of rhs_pointers into `result`, using the incremental
// selection vector on both the pointer side and the result side.
void ScanStructure::GatherResult(Vector &result, const idx_t count, const idx_t col_idx) {
	auto &collection = *ht.data_collection;
	collection.Gather(rhs_pointers, *FlatVector::IncrementalSelectionVector(), count, col_idx, result,
	                  *FlatVector::IncrementalSelectionVector(), nullptr);
}

// Appends `result_count` matches to the compaction buffer, starting at position `base_count`.
//   base_count    - number of matches already buffered; new entries are written behind them
//   result_vector - indices of the rows that matched
//   result_count  - number of entries in result_vector
void ScanStructure::UpdateCompactionBuffer(idx_t base_count, SelectionVector &result_vector, idx_t result_count) {
	// LHS side: remember which probe rows matched
	idx_t write_pos = base_count;
	for (idx_t match = 0; match < result_count; ++match, ++write_pos) {
		lhs_sel_vector.set_index(write_pos, result_vector.get_index(match));
	}

	// RHS side: copy the matching row pointers to the same positions in rhs_pointers
	VectorOperations::Copy(pointers, rhs_pointers, result_vector, result_count, 0, base_count);
}

void ScanStructure::NextInnerJoin(DataChunk &keys, DataChunk &left, DataChunk &result) {

if (ht.join_type != JoinType::RIGHT_SEMI && ht.join_type != JoinType::RIGHT_ANTI) {

D_ASSERT(result.ColumnCount() == left.ColumnCount() + ht.output_columns.size());

}

idx_t base_count = 0;

idx_t result_count;

while (this->count > 0) {

// if we have saved the match result, we need not call ScanInnerJoin again

if (last_match_count == 0) {

result_count = ScanInnerJoin(keys, chain_match_sel_vector);

} else {

chain_match_sel_vector.Initialize(last_sel_vector);

result_count = last_match_count;

last_match_count = 0;

}

if (result_count > 0) {

View remainder of file in raw view

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

FilesExpand file tree

join_hashtable.cpp

Latest commit

History

join_hashtable.cpp

File metadata and controls