warpcore
0.0.1
Hashing at the Speed of Light on modern CUDA-accelerators
multi_bucket_hash_table.cuh
Go to the documentation of this file.
1
#
ifndef
WARPCORE_MULTI_BUCKET_HASH_TABLE_CUH
2
#
define
WARPCORE_MULTI_BUCKET_HASH_TABLE_CUH
3
4
#
include
"hash_set.cuh"
5
6
namespace
warpcore
7
{
8
9
template
<
10
class
Value,
11
std::uint32_t BucketSize = 1>
12
struct
ArrayBucket
{
13
using
value_type = Value;
14
using
index_type = std::uint32_t;
15
16
static_assert
(
17
BucketSize > 0,
18
"invalid bucket size of 0"
);
19
20
HOSTDEVICEQUALIFIER INLINEQUALIFIER
21
explicit
ArrayBucket
(
value_type
value
)
noexcept
22
{
23
#
ifdef
__CUDA_ARCH__
24
#
pragma
unroll
25
#
endif
26
for
(
index_type
i
= 0;
i
<
bucket_size
(); ++
i
)
27
values_
[
i
] =
value
;
28
}
29
30
HOSTDEVICEQUALIFIER
INLINEQUALIFIER
31
ArrayBucket
(
const
ArrayBucket
&
other
)
noexcept
32
{
33
#
ifdef
__CUDA_ARCH__
34
#
pragma
unroll
35
#
endif
36
for
(
index_type
i
= 0;
i
<
bucket_size
(); ++
i
)
37
values_
[
i
] =
other
.
values_
[
i
];
38
}
39
40
HOSTDEVICEQUALIFIER
INLINEQUALIFIER
41
ArrayBucket
&
operator
=(
const
ArrayBucket
&
other
)
noexcept
42
{
43
#
ifdef
__CUDA_ARCH__
44
#
pragma
unroll
45
#
endif
46
for
(
index_type
i
= 0;
i
<
bucket_size
(); ++
i
)
47
values_
[
i
] =
other
.
values_
[
i
];
48
return
*
this
;
49
}
50
51
/*! \brief get bucket size
52
* \return bucket size
53
*/
54
HOSTDEVICEQUALIFIER
INLINEQUALIFIER
55
static
constexpr
index_type
bucket_size
()
noexcept
56
{
57
return
BucketSize
;
58
}
59
60
/*! \brief accessor
61
* \param[in] i index to access
62
* \return value at position \c i
63
*/
64
DEVICEQUALIFIER
INLINEQUALIFIER
65
constexpr
value_type
&
operator
[](
const
index_type
i
)
noexcept
66
{
67
return
values_
[
i
];
68
}
69
70
/*! \brief const accessor
71
* \param[in] i index to access
72
* \return value at position \c i
73
*/
74
DEVICEQUALIFIER
INLINEQUALIFIER
75
constexpr
const
value_type
&
operator
[](
const
index_type
i
)
const
noexcept
76
{
77
return
values_
[
i
];
78
}
79
80
value_type
values_
[
BucketSize
];
81
};
82
83
84
/*! \brief multi-value hash table
85
* \tparam Key key type ( \c std::uint32_t or \c std::uint64_t )
86
* \tparam Value value type
87
* \tparam EmptyKey key which represents an empty slot
88
* \tparam TombstoneKey key which represents an erased slot
89
* \tparam ProbingScheme probing scheme from \c warpcore::probing_schemes
90
* \tparam TableStorage memory layout from \c warpcore::storage::key_value
91
* \tparam TempMemoryBytes size of temporary storage (typically a few kB)
92
*/
93
template
<
94
class
Key
,
95
class
Value
,
96
Key
EmptyKey
=
defaults
::
empty_key
<
Key
>(),
97
Key
TombstoneKey
=
defaults
::
tombstone_key
<
Key
>(),
98
Value
EmptyValue
=
defaults
::
empty_key
<
Value
>(),
99
class
ProbingScheme
=
defaults
::
probing_scheme_t
<
Key
, 8>,
100
class
TableStorage
=
defaults
::
table_storage_t
<
Key
,
ArrayBucket
<
Value
,2>>,
101
index_t
TempMemoryBytes
=
defaults
::
temp_memory_bytes
()>
102
class
MultiBucketHashTable
103
{
104
static_assert
(
105
checks
::
is_valid_key_type
<
Key
>(),
106
"invalid key type"
);
107
108
static_assert
(
109
checks
::
is_valid_slot_type
<
Value
>(),
110
"invalid value type"
);
111
112
static_assert
(
113
EmptyKey
!=
TombstoneKey
,
114
"empty key and tombstone key must not be identical"
);
115
116
static_assert
(
117
checks
::
is_cycle_free_probing_scheme
<
ProbingScheme
>(),
118
"not a valid probing scheme type"
);
119
120
static_assert
(
121
std
::
is_same
<
typename
ProbingScheme
::
key_type
,
Key
>::
value
,
122
"probing key type differs from table's key type"
);
123
124
static_assert
(
125
checks
::
is_key_value_storage
<
TableStorage
>(),
126
"not a valid storage type"
);
127
128
static_assert
(
129
std
::
is_same
<
typename
TableStorage
::
key_type
,
Key
>::
value
,
130
"storage's key type differs from table's key type"
);
131
132
static_assert
(
133
std
::
is_same
<
typename
TableStorage
::
value_type
::
value_type
,
Value
>::
value
,
134
"storage's value type differs from table's value type"
);
135
136
static_assert
(
137
TempMemoryBytes
>=
sizeof
(
index_t
),
138
"temporary storage must at least be of size index_type"
);
139
140
using
temp_type
=
storage
::
CyclicStore
<
index_t
>;
141
142
public
:
143
using
key_type
=
Key
;
144
using
value_type
=
Value
;
145
using
bucket_type
=
typename
TableStorage
::
value_type
;
146
using
index_type
=
index_t
;
147
using
status_type
=
Status
;
148
using
probing_scheme_type
=
ProbingScheme
;
149
150
/*! \brief get empty key
151
* \return empty key
152
*/
153
HOSTDEVICEQUALIFIER
INLINEQUALIFIER
154
static
constexpr
key_type
empty_key
()
noexcept
155
{
156
return
EmptyKey
;
157
}
158
159
/*! \brief get tombstone key
160
* \return tombstone key
161
*/
162
HOSTDEVICEQUALIFIER
INLINEQUALIFIER
163
static
constexpr
key_type
tombstone_key
()
noexcept
164
{
165
return
TombstoneKey
;
166
}
167
168
/*! \brief get empty value
169
* \return empty value
170
*/
171
HOSTDEVICEQUALIFIER
INLINEQUALIFIER
172
static
constexpr
value_type
empty_value
()
noexcept
173
{
174
return
EmptyValue
;
175
}
176
177
178
/*! \brief get cooperative group size
179
* \return cooperative group size
180
*/
181
HOSTDEVICEQUALIFIER
INLINEQUALIFIER
182
static
constexpr
index_type
cg_size
()
noexcept
183
{
184
return
ProbingScheme
::
cg_size
();
185
}
186
187
/*! \brief get bucket size
188
* \return bucket size
189
*/
190
HOSTDEVICEQUALIFIER
INLINEQUALIFIER
191
static
constexpr
index_type
bucket_size
()
noexcept
192
{
193
return
TableStorage
::
value_type
::
bucket_size
();
194
}
195
196
/*! \brief constructor
197
* \param[in] min_capacity minimum number of slots in the hash table
198
* \param[in] seed random seed
199
* \param[in] max_values_per_key maximum number of values to store per key
200
* \param[in] no_init whether to initialize the table at construction or not
201
*/
202
HOSTQUALIFIER
INLINEQUALIFIER
203
explicit
MultiBucketHashTable
(
204
const
index_type
min_capacity
,
205
const
key_type
seed
=
defaults
::
seed
<
key_type
>(),
206
const
index_type
max_values_per_key
=
207
std
::
numeric_limits
<
index_type
>::
max
(),
208
const
bool
no_init
=
false
)
noexcept
:
209
status_
(
nullptr
),
210
table_
(
detail
::
get_valid_capacity
(
min_capacity
,
cg_size
())),
211
temp_
(
TempMemoryBytes
/
sizeof
(
index_type
)),
212
seed_
(
seed
),
213
max_values_per_key_
(
max_values_per_key
),
214
num_keys_
(
nullptr
),
215
num_occupied_
(
nullptr
),
216
is_copy_
(
false
),
217
is_initialized_
(
false
)
218
{
219
cudaMalloc
(&
status_
,
sizeof
(
status_type
));
220
cudaMalloc
(&
num_keys_
,
sizeof
(
index_type
));
221
cudaMalloc
(&
num_occupied_
,
sizeof
(
index_type
));
222
223
assign_status
(
table_
.
status
() +
temp_
.
status
());
224
225
if
(!
no_init
)
init
();
226
}
227
228
/*! \brief copy-constructor (shallow)
229
* \param[in] object to be copied
230
*/
231
HOSTDEVICEQUALIFIER
INLINEQUALIFIER
232
MultiBucketHashTable
(
const
MultiBucketHashTable
&
o
)
noexcept
:
233
status_
(
o
.
status_
),
234
table_
(
o
.
table_
),
235
temp_
(
o
.
temp_
),
236
seed_
(
o
.
seed_
),
237
max_values_per_key_
(
o
.
max_values_per_key_
),
238
num_keys_
(
o
.
num_keys_
),
239
num_occupied_
(
o
.
num_occupied_
),
240
is_copy_
(
true
),
241
is_initialized_
(
o
.
is_initialized_
)
242
{}
243
244
/*! \brief move-constructor
245
* \param[in] object to be moved
246
*/
247
HOSTQUALIFIER
INLINEQUALIFIER
248
MultiBucketHashTable
(
MultiBucketHashTable
&&
o
)
noexcept
:
249
status_
(
std
::
move
(
o
.
status_
)),
250
table_
(
std
::
move
(
o
.
table_
)),
251
temp_
(
std
::
move
(
o
.
temp_
)),
252
seed_
(
std
::
move
(
o
.
seed_
)),
253
max_values_per_key_
(
std
::
move
(
o
.
max_values_per_key_
)),
254
num_keys_
(
std
::
move
(
o
.
num_keys_
)),
255
num_occupied_
(
std
::
move
(
o
.
num_occupied_
)),
256
is_copy_
(
std
::
move
(
o
.
is_copy_
)),
257
is_initialized_
(
std
::
move
(
o
.
is_initialized_
))
258
{
259
o
.
is_copy_
=
true
;
260
}
261
262
#
ifndef
__CUDA_ARCH__
263
/*! \brief destructor
264
*/
265
HOSTQUALIFIER
INLINEQUALIFIER
266
~
MultiBucketHashTable
()
noexcept
267
{
268
if
(!
is_copy_
)
269
{
270
if
(
status_
!=
nullptr
)
cudaFree
(
status_
);
271
if
(
num_keys_
!=
nullptr
)
cudaFree
(
num_keys_
);
272
if
(
num_occupied_
!=
nullptr
)
cudaFree
(
num_occupied_
);
273
}
274
}
275
#
endif
276
277
/*! \brief (re)initialize the hash table
278
* \param[in] stream CUDA stream in which this operation is executed in
279
*/
280
HOSTQUALIFIER
INLINEQUALIFIER
281
void
init
(
const
cudaStream_t
stream
= 0)
noexcept
282
{
283
is_initialized_
=
false
;
284
285
if
(!
table_
.
status
().
has_not_initialized
() &&
286
!
temp_
.
status
().
has_not_initialized
())
287
{
288
table_
.
init_keys
(
empty_key
(),
stream
);
289
table_
.
init_values
(
bucket_type
(
empty_value
()),
stream
);
290
291
assign_status
(
table_
.
status
() +
temp_
.
status
(),
stream
);
292
293
cudaMemsetAsync
(
num_keys_
, 0,
sizeof
(
index_type
),
stream
);
294
cudaMemsetAsync
(
num_occupied_
, 0,
sizeof
(
index_type
),
stream
);
295
296
is_initialized_
=
true
;
297
}
298
}
299
300
private
:
301
DEVICEQUALIFIER
INLINEQUALIFIER
302
bool
insert_into_bucket
(
303
const
index_type
last_key_pos
,
304
const
value_type
value_in
,
305
const
cg
::
thread_block_tile
<
cg_size
()>&
group
,
306
index_type
num_values
,
307
status_type
&
status
)
noexcept
308
{
309
#
pragma
unroll
310
for
(
index_type
i
= 0;
311
i
<
SDIV
(
bucket_size
(),
cg_size
())*
cg_size
();
312
i
+=
cg_size
())
313
{
314
// first bucket value always written after key insert
315
const
value_type
table_value
=
316
((0 <
group
.
thread_rank
()) && (
i
+
group
.
thread_rank
() <
bucket_size
())) ?
317
table_
[
last_key_pos
].
value
[
group
.
thread_rank
()] :
318
~
empty_value
();
319
320
auto
empty_value_mask
=
group
.
ballot
(
is_empty_value
(
table_value
));
321
322
num_values
+=
min
(
bucket_size
(),
cg_size
()) -
__popc
(
empty_value_mask
);
323
324
if
(
num_values
>=
max_values_per_key_
)
325
{
326
status
=
status_type
::
duplicate_key
() +
327
status_type
::
max_values_for_key_reached
();
328
device_join_status
(
status
);
329
return
true
;
330
}
331
332
bool
success
=
false
;
333
334
while
(
empty_value_mask
)
335
{
336
const
auto
leader
=
ffs
(
empty_value_mask
) - 1;
337
338
if
(
group
.
thread_rank
() ==
leader
)
339
{
340
const
auto
old
=
341
atomicCAS
(&(
table_
[
last_key_pos
].
value
[
i
+
group
.
thread_rank
()]),
table_value
,
value_in
);
342
343
success
= (
old
==
table_value
);
344
}
345
346
if
(
group
.
any
(
success
))
347
{
348
status
= (
num_values
> 0) ?
349
status_type
::
duplicate_key
() :
status_type
::
none
();
350
return
true
;
351
}
352
353
++
num_values
;
354
if
(
num_values
>=
max_values_per_key_
)
355
{
356
status
=
status_type
::
duplicate_key
() +
357
status_type
::
max_values_for_key_reached
();
358
device_join_status
(
status
);
359
return
true
;
360
}
361
362
empty_value_mask
^= 1UL <<
leader
;
363
}
364
}
365
366
return
false
;
367
}
368
369
public
:
370
/*! \brief inserts a key into the hash table
371
* \param[in] key_in key to insert into the hash table
372
* \param[in] value_in value that corresponds to \c key_in
373
* \param[in] group cooperative group
374
* \param[in] probing_length maximum number of probing attempts
375
* \return status (per thread)
376
*/
377
DEVICEQUALIFIER
INLINEQUALIFIER
378
status_type
insert
(
379
const
key_type
key_in
,
380
const
value_type
value_in
,
381
const
cg
::
thread_block_tile
<
cg_size
()>&
group
,
382
const
index_type
probing_length
=
defaults
::
probing_length
())
noexcept
383
{
384
if
(!
is_initialized_
)
385
{
386
return
status_type
::
not_initialized
();
387
}
388
389
if
(!
is_valid_key
(
key_in
))
390
{
391
device_join_status
(
status_type
::
invalid_key
());
392
return
status_type
::
invalid_key
();
393
}
394
395
if
(!
is_valid_value
(
value_in
))
396
{
397
device_join_status
(
status_type
::
invalid_value
());
398
return
status_type
::
invalid_value
();
399
}
400
401
ProbingScheme
iter
(
capacity
(),
probing_length
,
group
);
402
index_type
num_values_plus_bucket_size
= 0;
// count one bucket less
403
404
index_type
last_key_pos
=
std
::
numeric_limits
<
index_type
>::
max
();
405
for
(
index_type
i
=
iter
.
begin
(
key_in
,
seed_
);
i
!=
iter
.
end
();
i
=
iter
.
next
())
406
{
407
const
key_type
table_key
=
cub
::
ThreadLoad
<
cub
::
LOAD_VOLATILE
>(&
table_
[
i
].
key
);
408
409
auto
empty_key_mask
=
group
.
ballot
(
is_empty_key
(
table_key
));
410
411
const
auto
key_found_mask
=
group
.
ballot
(
table_key
==
key_in
);
412
413
const
auto
new_last_key_pos
=
group
.
shfl
(
i
, 31 -
__clz
(
key_found_mask
));
414
415
last_key_pos
=
key_found_mask
?
new_last_key_pos
:
last_key_pos
;
416
417
num_values_plus_bucket_size
+=
bucket_size
() *
__popc
(
key_found_mask
);
418
419
// early exit
420
if
(
num_values_plus_bucket_size
>=
max_values_per_key_
)
421
{
422
if
(
bucket_size
() == 1)
423
{
424
// num values = num buckets, so no space left
425
status_type
status
=
status_type
::
duplicate_key
() +
426
status_type
::
max_values_for_key_reached
();
427
device_join_status
(
status
);
428
return
status
;
429
}
430
else
431
{
432
status_type
status
=
status_type
::
unknown_error
();
433
// check if space left in last bucket
434
insert_into_bucket
(
last_key_pos
,
value_in
,
group
,
435
num_values_plus_bucket_size
-
bucket_size
(),
status
);
436
return
status
;
437
}
438
}
439
440
while
(
empty_key_mask
)
441
{
442
status_type
status
;
443
if
((
bucket_size
() > 1) &&
444
(
last_key_pos
<
std
::
numeric_limits
<
index_type
>::
max
()) &&
445
insert_into_bucket
(
last_key_pos
,
value_in
,
group
,
446
num_values_plus_bucket_size
-
bucket_size
(),
status
))
447
return
status
;
448
449
// insert key
450
bool
success
=
false
;
451
bool
key_collision
=
false
;
452
453
const
auto
leader
=
ffs
(
empty_key_mask
) - 1;
454
455
if
(
group
.
thread_rank
() ==
leader
)
456
{
457
const
auto
old
=
458
atomicCAS
(&(
table_
[
i
].
key
),
table_key
,
key_in
);
459
460
success
= (
old
==
table_key
);
461
key_collision
= (
old
==
key_in
);
462
463
if
(
success
)
464
{
465
// relaxed write to first slot in value array
466
table_
[
i
].
value
[0] =
value_in
;
467
468
helpers
::
atomicAggInc
(
num_occupied_
);
469
470
if
(
num_values_plus_bucket_size
== 0)
471
{
472
helpers
::
atomicAggInc
(
num_keys_
);
473
}
474
}
475
}
476
477
if
(
group
.
any
(
success
))
478
{
479
return
(
num_values_plus_bucket_size
> 0) ?
480
status_type
::
duplicate_key
() :
status_type
::
none
();
481
}
482
483
key_collision
=
group
.
any
(
key_collision
);
484
num_values_plus_bucket_size
+=
key_collision
*
bucket_size
();
485
486
if
(
bucket_size
() == 1)
487
{
488
if
(
num_values_plus_bucket_size
>=
max_values_per_key_
)
489
{
490
status_type
status
=
status_type
::
duplicate_key
() +
491
status_type
::
max_values_for_key_reached
();
492
device_join_status
(
status
);
493
return
status
;
494
}
495
}
496
else
497
{
498
// check position in next iteration
499
const
auto
new_last_key_pos
=
group
.
shfl
(
i
,
leader
);
500
last_key_pos
=
key_collision
?
new_last_key_pos
:
last_key_pos
;
501
}
502
503
empty_key_mask
^= 1UL <<
leader
;
504
}
505
}
506
507
status_type
status
;
508
if
((
bucket_size
() > 1) &&
509
(
last_key_pos
<
std
::
numeric_limits
<
index_type
>::
max
()) &&
510
insert_into_bucket
(
last_key_pos
,
value_in
,
group
,
511
num_values_plus_bucket_size
-
bucket_size
(),
status
))
512
return
status
;
513
514
status
= (
num_values_plus_bucket_size
> 0) ?
515
status_type
::
probing_length_exceeded
() +
status_type
::
duplicate_key
() :
516
status_type
::
probing_length_exceeded
();
517
device_join_status
(
status
);
518
return
status
;
519
}
520
521
/*! \brief insert a set of keys into the hash table
522
* \tparam StatusHandler handles returned status per key (see \c status_handlers)
523
* \param[in] keys_in pointer to keys to insert into the hash table
524
* \param[in] values_in corresponds values to \c keys_in
525
* \param[in] num_in number of keys to insert
526
* \param[in] stream CUDA stream in which this operation is executed in
527
* \param[in] probing_length maximum number of probing attempts
528
* \param[out] status_out status information per key
529
*/
530
template
<
class
StatusHandler
=
defaults
::
status_handler_t
>
531
HOSTQUALIFIER
INLINEQUALIFIER
532
void
insert
(
533
const
key_type
*
const
keys_in
,
534
const
value_type
*
const
values_in
,
535
const
index_type
num_in
,
536
const
cudaStream_t
stream
= 0,
537
const
index_type
probing_length
=
defaults
::
probing_length
(),
538
typename
StatusHandler
::
base_type
*
const
status_out
=
nullptr
)
noexcept
539
{
540
static_assert
(
541
checks
::
is_status_handler
<
StatusHandler
>(),
542
"not a valid status handler type"
);
543
544
if
(!
is_initialized_
)
return
;
545
546
kernels
::
insert
<
MultiBucketHashTable
,
StatusHandler
>
547
<<<
SDIV
(
num_in
*
cg_size
(),
MAXBLOCKSIZE
),
MAXBLOCKSIZE
, 0,
stream
>>>
548
(
keys_in
,
values_in
,
num_in
, *
this
,
probing_length
,
status_out
);
549
}
550
551
/*! \brief retrieves all values to a corresponding key
552
* \param[in] key_in key to retrieve from the hash table
553
* \param[out] values_out values for \c key_in
554
* \param[out] num_out number of retrieved values
555
* \param[in] group cooperative group
556
* \param[in] probing_length maximum number of probing attempts
557
* \return status (per thread)
558
*/
559
DEVICEQUALIFIER
INLINEQUALIFIER
560
status_type
retrieve
(
561
const
key_type
key_in
,
562
value_type
*
const
values_out
,
563
index_type
&
num_out
,
564
const
cg
::
thread_block_tile
<
cg_size
()>&
group
,
565
const
index_type
probing_length
=
defaults
::
probing_length
())
const
noexcept
566
{
567
if
(
values_out
==
nullptr
)
568
{
569
const
auto
status
=
num_values
(
key_in
,
num_out
,
group
,
probing_length
);
570
device_join_status
(
status_type
::
dry_run
());
571
return
status_type
::
dry_run
() +
status
;
572
}
573
else
574
{
575
return
for_each
([=, *
this
]
DEVICEQUALIFIER
576
(
const
key_type
/* key */
,
const
value_type
&
value
,
const
index_type
index
)
577
{
578
values_out
[
index
] =
value
;
579
},
580
key_in
,
581
num_out
,
582
group
,
583
probing_length
);
584
}
585
}
586
587
/*! \brief retrieve a set of keys from the hash table
588
* \note this method has a dry-run mode where it only calculates the needed array sizes in case no memory (aka \c nullptr ) is provided
589
* \note \c end_offsets_out can be \c begin_offsets_out+1
590
* \tparam StatusHandler handles returned status per key (see \c status_handlers)
591
* \param[in] keys_in pointer to keys to retrieve from the hash table
592
* \param[in] num_in number of keys to retrieve
593
* \param[out] begin_offsets_out begin of value range for a corresponding key in \c values_out
594
* \param[out] end_offsets_out end of value range for a corresponding key in \c values_out
595
* \param[out] num_out total number of values retrieved by this operation
596
* \param[in] stream CUDA stream in which this operation is executed in
597
* \param[in] probing_length maximum number of probing attempts
598
* \param[out] status_out status information (per key)
599
*/
600
template
<
class
StatusHandler
=
defaults
::
status_handler_t
>
601
HOSTQUALIFIER
INLINEQUALIFIER
602
void
retrieve
(
603
const
key_type
*
const
keys_in
,
604
const
index_type
num_in
,
605
index_type
*
const
begin_offsets_out
,
606
index_type
*
const
end_offsets_out
,
607
value_type
*
const
values_out
,
608
index_type
&
num_out
,
609
const
cudaStream_t
stream
= 0,
610
const
index_type
probing_length
=
defaults
::
probing_length
(),
611
typename
StatusHandler
::
base_type
*
const
status_out
=
nullptr
)
const
noexcept
612
{
613
static_assert
(
614
checks
::
is_status_handler
<
StatusHandler
>(),
615
"not a valid status handler type"
);
616
617
if
(!
is_initialized_
)
return
;
618
619
// cub::DeviceScan::InclusiveSum takes input sizes of type int
620
if
(
num_in
>
std
::
numeric_limits
<
int
>::
max
())
621
{
622
join_status
(
status_type
::
index_overflow
(),
stream
);
623
624
return
;
625
}
626
627
num_values
(
628
keys_in
,
629
num_in
,
630
num_out
,
631
end_offsets_out
,
632
stream
,
633
probing_length
);
634
635
if
(
values_out
!=
nullptr
)
636
{
637
index_type
temp_bytes
=
num_out
*
sizeof
(
value_type
);
638
639
cub
::
DeviceScan
::
InclusiveSum
(
640
values_out
,
641
temp_bytes
,
642
end_offsets_out
,
643
end_offsets_out
,
644
num_in
,
645
stream
);
646
647
cudaMemsetAsync
(
begin_offsets_out
, 0,
sizeof
(
index_type
),
stream
);
648
649
if
(
end_offsets_out
!=
begin_offsets_out
+ 1)
650
{
651
cudaMemcpyAsync
(
652
begin_offsets_out
+ 1,
653
end_offsets_out
,
654
sizeof
(
index_type
) * (
num_in
- 1),
655
D2D
,
656
stream
);
657
}
658
659
kernels
::
retrieve
<
MultiBucketHashTable
,
StatusHandler
>
660
<<<
SDIV
(
num_in
*
cg_size
(),
MAXBLOCKSIZE
),
MAXBLOCKSIZE
, 0,
stream
>>>
661
(
662
keys_in
,
663
num_in
,
664
begin_offsets_out
,
665
end_offsets_out
,
666
values_out
,
667
*
this
,
668
probing_length
,
669
status_out
);
670
}
671
else
672
{
673
if
(
status_out
!=
nullptr
)
674
{
675
helpers
::
lambda_kernel
676
<<<
SDIV
(
num_in
,
MAXBLOCKSIZE
),
MAXBLOCKSIZE
, 0,
stream
>>>
677
([=, *
this
]
DEVICEQUALIFIER
678
{
679
const
index_type
tid
=
helpers
::
global_thread_id
();
680
681
if
(
tid
<
num_in
)
682
{
683
StatusHandler
::
handle
(
Status
::
dry_run
(),
status_out
,
tid
);
684
}
685
});
686
}
687
688
join_status
(
status_type
::
dry_run
(),
stream
);
689
}
690
691
if
(
stream
== 0)
692
{
693
cudaStreamSynchronize
(
stream
);
694
}
695
}
696
697
/*! \brief retrieves all elements from the hash table
698
* \note this method has a dry-run mode where it only calculates the needed array sizes in case no memory (aka \c nullptr ) is provided
699
* \note this method implements a multi-stage dry-run mode
700
* \param[out] keys_out pointer to the set of unique keys
701
* \param[out] num_keys_out number of unique keys
702
* \param[out] begin_offsets_out begin of value range for a corresponding key in \c values_out
703
* \param[out] end_offsets_out end of value range for a corresponding key in \c values_out
704
* \param[out] values_out array which holds all retrieved values
705
* \param[out] num_values_out total number of values retrieved by this operation
706
* \param[in] stream CUDA stream in which this operation is executed in
707
*/
708
HOSTQUALIFIER
INLINEQUALIFIER
709
void
retrieve_all
(
710
key_type
*
const
keys_out
,
711
index_type
&
num_keys_out
,
712
index_type
*
const
begin_offsets_out
,
713
index_type
*
const
end_offsets_out
,
714
value_type
*
const
values_out
,
715
value_type
&
num_values_out
,
716
const
cudaStream_t
stream
= 0)
const
noexcept
717
{
718
if
(!
is_initialized_
)
return
;
719
720
retrieve_all_keys
(
keys_out
,
num_keys_out
,
stream
);
721
722
if
(
keys_out
!=
nullptr
)
723
{
724
retrieve
(
725
keys_out
,
726
num_keys_out
,
727
begin_offsets_out
,
728
end_offsets_out
,
729
values_out
,
730
num_values_out
,
731
stream
);
732
}
733
734
if
(
stream
== 0)
735
{
736
cudaStreamSynchronize
(
stream
);
737
}
738
}
739
740
/*! \brief retrieve all unqiue keys
741
* \info this method has a dry-run mode where it only calculates the needed array sizes in case no memory (aka \c nullptr ) is provided
742
* \param[out] keys_out retrieved unqiue keys
743
* \param[out] num_out numof unique keys
744
* \param[in] stream CUDA stream in which this operation is executed in
745
*/
746
HOSTQUALIFIER
INLINEQUALIFIER
747
void
retrieve_all_keys
(
748
key_type
*
const
keys_out
,
749
index_type
&
num_out
,
750
const
cudaStream_t
stream
= 0)
const
noexcept
751
{
752
if
(!
is_initialized_
)
return
;
753
754
if
(
keys_out
!=
nullptr
)
755
{
756
index_type
*
const
tmp
=
temp_
.
get
();
757
cudaMemsetAsync
(
tmp
, 0,
sizeof
(
index_type
),
stream
);
758
759
kernels
::
for_each_unique_key
760
<<<
SDIV
(
capacity
()*
cg_size
(),
MAXBLOCKSIZE
),
MAXBLOCKSIZE
, 0,
stream
>>>
761
([=]
DEVICEQUALIFIER
(
const
key_type
&
key
)
762
{
763
index_type
out
=
helpers
::
atomicAggInc
(
tmp
);
764
keys_out
[
out
] =
key
;
765
}, *
this
);
766
767
cudaMemcpyAsync
(&
num_out
,
tmp
,
sizeof
(
index_type
),
D2H
,
stream
);
768
769
if
(
stream
== 0)
770
{
771
cudaStreamSynchronize
(
stream
);
772
}
773
}
774
else
775
{
776
num_out
=
num_keys
(
stream
);
777
join_status
(
status_type
::
dry_run
(),
stream
);
778
}
779
}
780
781
/*! \brief applies a funtion over all values of a specified key
782
* \tparam Func type of map i.e. CUDA device lambda
783
* \param[in] f map to apply
784
* \param[in] key_in key to consider
785
* \param[out] num_values_out number of values associated to \c key_in
786
* \param[in] group cooperative group
787
* \param[in] probing_length maximum number of probing attempts
788
* \return status (per thread)
789
*/
790
template
<
class
Func
>
791
DEVICEQUALIFIER
INLINEQUALIFIER
792
status_type
for_each
(
793
Func
f
,
794
const
key_type
key_in
,
795
index_type
&
num_values_out
,
796
const
cg
::
thread_block_tile
<
cg_size
()>&
group
,
797
const
index_type
probing_length
=
defaults
::
probing_length
())
const
noexcept
798
{
799
if
(!
is_initialized_
)
return
status_type
::
not_initialized
();
800
801
if
(!
is_valid_key
(
key_in
))
802
{
803
num_values_out
= 0;
804
device_join_status
(
status_type
::
invalid_key
());
805
return
status_type
::
invalid_key
();
806
}
807
808
ProbingScheme
iter
(
capacity
(),
min
(
probing_length
,
capacity
()),
group
);
809
810
index_type
num
= 0;
811
for
(
index_type
i
=
iter
.
begin
(
key_in
,
seed_
);
i
!=
iter
.
end
();
i
=
iter
.
next
())
812
{
813
const
auto
table_key
=
table_
[
i
].
key
;
814
const
auto
hit
= (
table_key
==
key_in
);
815
const
auto
hit_mask
=
group
.
ballot
(
hit
);
816
817
index_type
num_empty
= 0;
818
if
(
hit
)
819
{
820
const
auto
j
=
821
num
+
bucket_size
() *
__popc
(
hit_mask
& ((1U <<
group
.
thread_rank
()) - 1));
822
823
const
auto
bucket
=
table_
[
i
].
value
;
824
#
pragma
unroll
825
for
(
index_type
b
= 0;
b
<
bucket_size
(); ++
b
) {
826
const
auto
&
value
=
bucket
[
b
];
827
// if(value != empty_value() && j+b < max_values_per_key_)
828
if
(
value
!=
empty_value
())
829
f
(
key_in
,
value
,
j
+
b
);
830
else
831
++
num_empty
;
832
}
833
}
834
835
// get num_empty from last bucket in group
836
// if not hit this return 0 from last thread
837
num_empty
=
group
.
shfl
(
num_empty
, 31 -
__clz
(
hit_mask
));
838
839
num
+=
bucket_size
() *
__popc
(
hit_mask
) -
num_empty
;
840
841
if
(
group
.
any
(
is_empty_key
(
table_key
) ||
num
>=
max_values_per_key_
))
842
{
843
num_values_out
=
num
;
844
845
if
(
num
== 0)
846
{
847
device_join_status
(
status_type
::
key_not_found
());
848
return
status_type
::
key_not_found
();
849
}
850
else
851
{
852
return
status_type
::
none
();
853
}
854
}
855
}
856
857
num_values_out
=
num
;
858
device_join_status
(
status_type
::
probing_length_exceeded
());
859
return
status_type
::
probing_length_exceeded
();
860
}
861
862
/*! \brief applies a funtion over all key bucket pairs inside the table
863
* \tparam Func type of map i.e. CUDA device lambda
864
* \param[in] f map to apply
865
* \param[in] stream CUDA stream in which this operation is executed in
866
* \param[in] size of dynamic shared memory to reserve for this execution
867
*/
868
template
<
class
Func
>
869
HOSTQUALIFIER
INLINEQUALIFIER
870
void
for_each_bucket
(
871
Func
f
,
// TODO const?
872
const
cudaStream_t
stream
= 0,
873
const
index_type
smem_bytes
= 0)
const
noexcept
874
{
875
if
(!
is_initialized_
)
return
;
876
877
kernels
::
for_each
878
<<<
SDIV
(
capacity
(),
MAXBLOCKSIZE
),
MAXBLOCKSIZE
,
smem_bytes
,
stream
>>>
879
(
f
, *
this
);
880
}
881
882
/*! \brief applies a funtion over all key value pairs inside the table
883
* \tparam Func type of map i.e. CUDA device lambda
884
* \param[in] f map to apply
885
* \param[in] stream CUDA stream in which this operation is executed in
886
* \param[in] size of dynamic shared memory to reserve for this execution
887
*/
888
template
<
class
Func
>
889
HOSTQUALIFIER
INLINEQUALIFIER
890
void
for_each_value
(
891
Func
f
,
// TODO const?
892
const
cudaStream_t
stream
= 0,
893
const
index_type
smem_bytes
= 0)
const
noexcept
894
{
895
if
(!
is_initialized_
)
return
;
896
897
auto
bucket_f
= [=,
f
=
std
::
move
(
f
)]
DEVICEQUALIFIER
898
(
const
key_type
key
,
const
bucket_type
bucket
)
mutable
899
{
900
#
pragma
unroll
901
for
(
index_type
b
= 0;
b
<
bucket_size
(); ++
b
) {
902
const
auto
&
value
=
bucket
[
b
];
903
if
(
value
!=
empty_value
())
904
f
(
key
,
value
);
905
}
906
};
907
908
kernels
::
for_each
909
<<<
SDIV
(
capacity
(),
MAXBLOCKSIZE
),
MAXBLOCKSIZE
,
smem_bytes
,
stream
>>>
910
(
bucket_f
, *
this
);
911
}
912
913
/*! \brief applies a funtion over all key value pairs
914
* \tparam Func type of map i.e. CUDA device lambda
915
* \tparam StatusHandler handles returned status per key (see \c status_handlers)
916
* \param[in] f map to apply
917
* \param[in] keys_in keys to consider
918
* \param[in] num_in number of keys
919
* \param[in] stream CUDA stream in which this operation is executed in
920
* \param[in] probing_length maximum number of probing attempts
921
* \param[out] status_out status information (per key)
922
* \param[in] size of dynamic shared memory to reserve for this execution
923
*/
924
template
<
class
Func
,
class
StatusHandler
=
defaults
::
status_handler_t
>
925
HOSTQUALIFIER
INLINEQUALIFIER
926
void
for_each
(
927
Func
f
,
// TODO const?
928
const
key_type
*
const
keys_in
,
929
const
index_type
num_in
,
930
const
cudaStream_t
stream
= 0,
931
const
index_type
probing_length
=
defaults
::
probing_length
(),
932
typename
StatusHandler
::
base_type
*
const
status_out
=
nullptr
,
933
const
index_type
smem_bytes
= 0)
const
noexcept
934
{
935
static_assert
(
936
checks
::
is_status_handler
<
StatusHandler
>(),
937
"not a valid status handler type"
);
938
939
if
(!
is_initialized_
)
return
;
940
941
kernels
::
for_each
<
Func
,
MultiBucketHashTable
>
942
<<<
SDIV
(
capacity
(),
MAXBLOCKSIZE
),
MAXBLOCKSIZE
,
smem_bytes
,
stream
>>>
943
(
f
,
keys_in
,
num_in
, *
this
,
status_out
);
944
}
945
946
/*! \brief number of unique keys inside the table
947
* \param[in] stream CUDA stream in which this operation is executed in
948
* \return number of unique keys
949
*/
950
HOSTQUALIFIER
INLINEQUALIFIER
951
index_type
num_keys
(
const
cudaStream_t
stream
= 0)
const
noexcept
952
{
953
index_type
num
= 0;
954
955
cudaMemcpyAsync
(&
num
,
num_keys_
,
sizeof
(
index_type
),
D2H
,
stream
);
956
957
cudaStreamSynchronize
(
stream
);
958
959
return
num
;
960
}
961
962
/*! \brief number of occupied slots in the hash table
963
* \param[in] stream CUDA stream in which this operation is executed in
964
* \return the number of occupied slots
965
*/
966
HOSTQUALIFIER
INLINEQUALIFIER
967
index_type
num_occupied
(
const
cudaStream_t
stream
= 0)
const
noexcept
968
{
969
index_type
num
= 0;
970
971
cudaMemcpyAsync
(&
num
,
num_occupied_
,
sizeof
(
index_type
),
D2H
,
stream
);
972
973
cudaStreamSynchronize
(
stream
);
974
975
return
num
;
976
}
977
978
/*! \brief total number of values inside the table
979
* \param[in] key_in key to be probed
980
* \param[out] num_out number of values associated to \c key_in*
981
* \param[in] group cooperative group
982
* \param[in] probing_length maximum number of probing attempts
983
* \return status (per thread)
984
*/
985
DEVICEQUALIFIER
INLINEQUALIFIER
986
status_type
num_values
(
987
const
key_type
key_in
,
988
index_type
&
num_out
,
989
const
cg
::
thread_block_tile
<
cg_size
()>&
group
,
990
const
index_type
probing_length
=
defaults
::
probing_length
())
const
noexcept
991
{
992
return
for_each
([=]
DEVICEQUALIFIER
(
993
const
key_type
/* key */
,
994
const
value_type
&
/* value */
,
995
const
index_type
/* index */
) {},
996
key_in
,
997
num_out
,
998
group
,
999
probing_length
);
1000
}
1001
1002
/*! \brief number of values associated to a set of keys
1003
* \info this function returns only \c num_out if \c num_per_key_out==nullptr
1004
* \tparam StatusHandler handles returned status per key (see \c status_handlers)
1005
* \param[in] keys_in keys to consider
1006
* \param[in] num_in number of keys
1007
* \param[out] num_out total number of values
1008
* \param[out] num_per_key_out number of values per key
1009
* \param[in] stream CUDA stream in which this operation is executed in
1010
* \param[in] probing_length maximum number of probing attempts
1011
* \param[out] status_out status information (per key)
1012
*/
1013
template
<
class
StatusHandler
=
defaults
::
status_handler_t
>
1014
HOSTQUALIFIER
INLINEQUALIFIER
1015
void
num_values
(
1016
const
key_type
*
const
keys_in
,
1017
const
index_type
num_in
,
1018
index_type
&
num_out
,
1019
index_type
*
const
num_per_key_out
=
nullptr
,
1020
const
cudaStream_t
stream
= 0,
1021
const
index_type
probing_length
=
defaults
::
probing_length
(),
1022
typename
StatusHandler
::
base_type
*
const
status_out
=
nullptr
)
const
noexcept
1023
{
1024
if
(!
is_initialized_
)
return
;
1025
1026
// TODO check if shared memory is benefitial
1027
1028
index_type
*
const
tmp
=
temp_
.
get
();
1029
cudaMemsetAsync
(
tmp
, 0,
sizeof
(
index_type
),
stream
);
1030
1031
kernels
::
num_values
<
MultiBucketHashTable
,
StatusHandler
>
1032
<<<
SDIV
(
num_in
*
cg_size
(),
MAXBLOCKSIZE
),
MAXBLOCKSIZE
, 0,
stream
>>>
1033
(
keys_in
,
num_in
,
tmp
,
num_per_key_out
, *
this
,
probing_length
,
status_out
);
1034
1035
cudaMemcpyAsync
(&
num_out
,
tmp
,
sizeof
(
index_type
),
D2H
,
stream
);
1036
1037
if
(
stream
== 0)
1038
{
1039
cudaStreamSynchronize
(
stream
);
1040
}
1041
}
1042
1043
/*! \brief number of values stored inside the hash table
1044
* \info alias for \c size()
1045
* \param[in] stream CUDA stream in which this operation is executed in
1046
* \return the number of values
1047
*/
1048
HOSTQUALIFIER
INLINEQUALIFIER
1049
index_type
num_values
(
const
cudaStream_t
stream
= 0)
const
noexcept
1050
{
1051
return
size
(
stream
);
1052
}
1053
1054
/*! \brief number of values stored inside the hash table
1055
* \param[in] stream CUDA stream in which this operation is executed in
1056
* \return the number of values
1057
*/
1058
HOSTQUALIFIER
INLINEQUALIFIER
1059
index_type
size
(
const
cudaStream_t
stream
= 0)
const
noexcept
1060
{
1061
if
(!
is_initialized_
)
return
0;
1062
1063
index_type
out
;
1064
index_type
*
tmp
=
temp_
.
get
();
1065
1066
cudaMemsetAsync
(
tmp
, 0,
sizeof
(
index_t
),
stream
);
1067
1068
kernels
::
num_values
1069
<<<
SDIV
(
capacity
(),
MAXBLOCKSIZE
),
MAXBLOCKSIZE
, 0,
stream
>>>
1070
(
tmp
, *
this
);
1071
1072
cudaMemcpyAsync
(
1073
&
out
,
1074
tmp
,
1075
sizeof
(
index_type
),
1076
D2H
,
1077
stream
);
1078
1079
cudaStreamSynchronize
(
stream
);
1080
1081
return
out
;
1082
}
1083
1084
/*! \brief current load factor of the hash table
1085
* \param[in] stream CUDA stream in which this operation is executed in
1086
* \return load factor
1087
*/
1088
HOSTQUALIFIER
INLINEQUALIFIER
1089
float
key_load_factor
(
const
cudaStream_t
stream
= 0)
const
noexcept
1090
{
1091
return
float
(
num_occupied
(
stream
)) /
float
(
capacity
());
1092
}
1093
1094
/*! \brief current load factor of the hash table
1095
* \param[in] stream CUDA stream in which this operation is executed in
1096
* \return load factor
1097
*/
1098
HOSTQUALIFIER
INLINEQUALIFIER
1099
float
value_load_factor
(
const
cudaStream_t
stream
= 0)
const
noexcept
1100
{
1101
return
float
(
num_values
(
stream
)) /
float
(
capacity
()*
bucket_size
());
1102
}
1103
1104
/*! \brief current storage density of the hash table
1105
* \param[in] stream CUDA stream in which this operation is executed in
1106
* \return storage density
1107
*/
1108
HOSTQUALIFIER
INLINEQUALIFIER
1109
float
storage_density
(
const
cudaStream_t
stream
= 0)
const
noexcept
1110
{
1111
const
index_type
key_bytes
=
num_keys
(
stream
) *
sizeof
(
key_type
);
1112
const
index_type
value_bytes
=
num_values
(
stream
) *
sizeof
(
value_type
);
1113
const
index_type
table_bytes
=
bytes_total
();
1114
1115
return
float
(
key_bytes
+
value_bytes
) /
float
(
table_bytes
);
1116
}
1117
1118
/*! \brief current relative storage density of the hash table
1119
* \param stream CUDA stream in which this operation is executed in
1120
* \return storage density
1121
*/
1122
HOSTQUALIFIER
INLINEQUALIFIER
1123
float
relative_storage_density
(
const
cudaStream_t
stream
= 0)
const
noexcept
1124
{
1125
const
index_type
key_bytes
=
num_keys
(
stream
) *
sizeof
(
key_type
);
1126
const
index_type
value_bytes
=
num_values
(
stream
) *
sizeof
(
value_type
);
1127
const
index_type
occupied_bytes
=
1128
num_occupied
(
stream
) *
sizeof
(
key_type
) +
value_bytes
;
1129
1130
return
float
(
key_bytes
+
value_bytes
) / (
occupied_bytes
);
1131
}
1132
1133
/*! \brief get the key capacity of the hash table
1134
* \return number of key slots in the hash table
1135
*/
1136
HOSTDEVICEQUALIFIER
INLINEQUALIFIER
1137
index_type
capacity
()
const
noexcept
1138
{
1139
return
table_
.
capacity
();
1140
}
1141
1142
/*! \brief get the maximum value capacity of the hash table
1143
* \return maximum value capacity
1144
*/
1145
HOSTDEVICEQUALIFIER
INLINEQUALIFIER
1146
index_type
value_capacity
()
const
noexcept
1147
{
1148
return
table_
.
capacity
() *
bucket_size
();
1149
}
1150
1151
/*! \brief get the total number of bytes occupied by this data structure
1152
* \return bytes
1153
*/
1154
HOSTQUALIFIER
INLINEQUALIFIER
1155
index_type
bytes_total
()
const
noexcept
1156
{
1157
return
table_
.
bytes_total
() +
sizeof
(
index_type
);
1158
}
1159
1160
/*! \brief indicates if the hash table is properly initialized
1161
* \return \c true iff the hash table is properly initialized
1162
*/
1163
HOSTDEVICEQUALIFIER
INLINEQUALIFIER
1164
bool
is_initialized
()
const
noexcept
1165
{
1166
return
is_initialized_
;
1167
}
1168
1169
/*! \brief get the status of the hash table
1170
* \param[in] stream CUDA stream in which this operation is executed in
1171
* \return the status
1172
*/
1173
HOSTQUALIFIER
INLINEQUALIFIER
1174
status_type
peek_status
(
const
cudaStream_t
stream
= 0)
const
noexcept
1175
{
1176
status_type
status
=
status_type
::
not_initialized
();
1177
1178
if
(
status_
!=
nullptr
)
1179
{
1180
cudaMemcpyAsync
(
1181
&
status
,
1182
status_
,
1183
sizeof
(
status_type
),
1184
D2H
,
1185
stream
);
1186
1187
cudaStreamSynchronize
(
stream
);
1188
}
1189
1190
return
status
;
1191
}
1192
1193
/*! \brief get and reset the status of the hash table
1194
* \param[in] stream CUDA stream in which this operation is executed in
1195
* \return the status
1196
*/
1197
HOSTQUALIFIER
INLINEQUALIFIER
1198
status_type
pop_status
(
const
cudaStream_t
stream
= 0)
noexcept
1199
{
1200
status_type
status
=
status_type
::
not_initialized
();
1201
1202
if
(
status_
!=
nullptr
)
1203
{
1204
cudaMemcpyAsync
(
1205
&
status
,
1206
status_
,
1207
sizeof
(
status_type
),
1208
D2H
,
1209
stream
);
1210
1211
assign_status
(
table_
.
status
(),
stream
);
1212
}
1213
1214
return
status
;
1215
}
1216
1217
/*! \brief checks if \c key is equal to \c EmptyKey
1218
* \return \c bool
1219
*/
1220
HOSTDEVICEQUALIFIER
INLINEQUALIFIER
1221
static
constexpr
bool
is_empty_key
(
const
key_type
key
)
noexcept
1222
{
1223
return
(
key
==
empty_key
());
1224
}
1225
1226
/*! \brief checks if \c key is equal to \c TombstoneKey
1227
* \return \c bool
1228
*/
1229
HOSTDEVICEQUALIFIER
INLINEQUALIFIER
1230
static
constexpr
bool
is_tombstone_key
(
const
key_type
key
)
noexcept
1231
{
1232
return
(
key
==
tombstone_key
());
1233
}
1234
1235
/*! \brief checks if \c key is not equal to \c (EmptyKey||TombstoneKey)
1236
* \return \c bool
1237
*/
1238
HOSTDEVICEQUALIFIER
INLINEQUALIFIER
1239
static
constexpr
bool
is_valid_key
(
const
key_type
key
)
noexcept
1240
{
1241
return
(
key
!=
empty_key
() &&
key
!=
tombstone_key
());
1242
}
1243
1244
/*! \brief checks if \c value is equal to \c EmptyValue
1245
* \return \c bool
1246
*/
1247
HOSTDEVICEQUALIFIER
INLINEQUALIFIER
1248
static
constexpr
bool
is_empty_value
(
const
value_type
value
)
noexcept
1249
{
1250
return
(
value
==
empty_value
());
1251
}
1252
1253
/*! \brief checks if \c value is equal not to \c EmptyValue
1254
* \return \c bool
1255
*/
1256
HOSTDEVICEQUALIFIER
INLINEQUALIFIER
1257
static
constexpr
bool
is_valid_value
(
const
value_type
value
)
noexcept
1258
{
1259
return
(
value
!=
empty_value
());
1260
}
1261
1262
/*! \brief indicates if this object is a shallow copy
1263
* \return \c bool
1264
*/
1265
HOSTDEVICEQUALIFIER
INLINEQUALIFIER
1266
bool
is_copy
()
const
noexcept
1267
{
1268
return
is_copy_
;
1269
}
1270
1271
private
:
1272
/*! \brief assigns the hash table's status
1273
* \info \c const on purpose
1274
* \param[in] status new status
1275
* \param[in] stream CUDA stream in which this operation is executed in
1276
*/
1277
HOSTQUALIFIER
INLINEQUALIFIER
1278
void
assign_status
(
1279
const
status_type
status
,
1280
const
cudaStream_t
stream
= 0)
const
noexcept
1281
{
1282
if
(
status_
!=
nullptr
)
1283
{
1284
cudaMemcpyAsync
(
1285
status_
,
1286
&
status
,
1287
sizeof
(
status_type
),
1288
H2D
,
1289
stream
);
1290
1291
cudaStreamSynchronize
(
stream
);
1292
}
1293
}
1294
1295
/*! \brief joins additional flags to the hash table's status
1296
* \info \c const on purpose
1297
* \param[in] status new status
1298
* \param[in] stream CUDA stream in which this operation is executed in
1299
*/
1300
HOSTQUALIFIER
INLINEQUALIFIER
1301
void
join_status
(
1302
const
status_type
status
,
1303
const
cudaStream_t
stream
= 0)
const
noexcept
1304
{
1305
if
(
status_
!=
nullptr
)
1306
{
1307
status_type
peeked
=
peek_status
(
stream
);
1308
const
status_type
joined
=
peeked
+
status
;
1309
1310
if
(
joined
!=
peeked
)
1311
{
1312
assign_status
(
joined
,
stream
);
1313
}
1314
}
1315
}
1316
1317
/*! \brief joins additional flags to the hash table's status
1318
* \info \c const on purpose
1319
* \param[in] status new status
1320
*/
1321
DEVICEQUALIFIER
INLINEQUALIFIER
1322
void
device_join_status
(
const
status_type
status
)
const
noexcept
1323
{
1324
if
(
status_
!=
nullptr
)
1325
{
1326
status_
->
atomic_join
(
status
);
1327
}
1328
}
1329
1330
status_type
*
status_
;
//< pointer to status
1331
TableStorage
table_
;
//< actual key/value storage
1332
temp_type
temp_
;
//< temporary memory
1333
key_type
seed_
;
//< random seed
1334
index_type
max_values_per_key_
;
//< maximum number of values to store per key
1335
index_type
*
num_keys_
;
//< pointer to the count of unique keys
1336
index_type
*
num_occupied_
;
//< pointer to the count of occupied key slots
1337
bool
is_copy_
;
//< indicates if table is a shallow copy
1338
bool
is_initialized_
;
//< indicates if table is properly initialized
1339
1340
template
<
class
Core
>
1341
GLOBALQUALIFIER
1342
friend
void
kernels
::
size
(
index_type
*
const
,
const
Core
);
1343
1344
template
<
class
Core
>
1345
GLOBALQUALIFIER
1346
friend
void
kernels
::
num_values
(
index_type
*
const
,
const
Core
);
1347
1348
template
<
class
Func
,
class
Core
>
1349
GLOBALQUALIFIER
1350
friend
void
kernels
::
for_each
(
Func
,
const
Core
);
1351
1352
template
<
class
Func
,
class
Core
>
1353
GLOBALQUALIFIER
1354
friend
void
kernels
::
for_each_unique_key
(
Func
,
const
Core
);
1355
1356
template
<
class
Core
,
class
StatusHandler
>
1357
GLOBALQUALIFIER
1358
friend
void
kernels
::
retrieve
(
1359
const
typename
Core
::
key_type
*
const
,
1360
const
index_type
,
1361
const
index_type
*
const
,
1362
const
index_type
*
const
,
1363
typename
Core
::
value_type
*
const
,
1364
const
Core
,
1365
const
index_type
,
1366
typename
StatusHandler
::
base_type
*
const
);
1367
1368
};
// class MultiBucketHashTable
1369
1370
}
// namespace warpcore
1371
1372
#
endif
/* WARPCORE_MULTI_BUCKET_HASH_TABLE_CUH */
include
warpcore
multi_bucket_hash_table.cuh
Generated by
1.9.1