warpcore 0.0.1
Hashing at the Speed of Light on modern CUDA-accelerators
random_distributions.cuh
Go to the documentation of this file.
1 #ifndef WARPCORE_RANDOM_DISTRIBUTIONS_CUH
2 #define WARPCORE_RANDOM_DISTRIBUTIONS_CUH
3 
4 #include <limits>
5 #include <assert.h>
6 #include <iostream>
7 #include <kiss/kiss.cuh>
8 #include <warpcore/bloom_filter.cuh>
9 
10 namespace warpcore
11 {
12 
13 /*! \brief transforms interval (a, b) into interval (c, d)
14 * \tparam T data type
15 * \param[in] x value to be transformed into new interval
16 * \param[in] a lower bound of input interval
17 * \param[in] b upper bound of input interval
18 * \param[in] c lower bound of output interval
19 * \param[in] d upper bound of output interval
20 * \return value in new interval
21 */
22 template<class T>
23 HOSTDEVICEQUALIFIER INLINEQUALIFIER
24 constexpr T transform_range(
25  const T x,
26  const T a,
27  const T b,
28  const T c,
29  const T d) noexcept
30 {
31  // check is intervals are valid
32  assert(a < b);
33  assert(c < d);
34  // check if x is inside input interval
35  assert(x >= a && x <= b);
36 
37  // see https://math.stackexchange.com/a/914843
38  // TODO check for correctness
39  //double tmp = ((d - c) / (b - a)) * (x - a) * 1.0;
40  return c + x % (d - c);
41 }
42 
43 /*! \brief generates \c n uniform random elements on a CUDA device
44 * \tparam T data type to be generated
45 * \tparam Rng type of random number generator
46 * \param[out] out pointer to the output array
47 * \param[in] n number of elements to generate
48 * \param[in] seed initial random seed for the RNG
49 */
50 template<class T, class Rng>
51 HOSTQUALIFIER INLINEQUALIFIER
53  T * out,
54  std::uint64_t n,
55  std::uint32_t seed) noexcept
56 {
57  namespace wc = warpcore;
58 
59  // execute kernel
60  helpers::lambda_kernel<<<4096, 32>>>
61  ([=] DEVICEQUALIFIER
62  {
63  const std::uint32_t tid = blockDim.x * blockIdx.x + threadIdx.x;
64 
65  // generate initial local seed per thread
66  const std::uint32_t local_seed =
67  wc::hashers::MurmurHash<std::uint32_t>::hash(seed+tid);
68  Rng rng{local_seed};
69  T x;
70 
71  // grid-stride loop
72  const auto grid_stride = blockDim.x * gridDim.x;
73  for(std::uint64_t i = tid; i < n; i += grid_stride)
74  {
75  // generate random element
76  x = rng.template next<T>();
77 
78  // write output
79  out[i] = x;
80  }
81  })
82  ; CUERR
83 }
84 
85 template<class Rng = kiss::Kiss<std::uint32_t>>
86 HOSTQUALIFIER INLINEQUALIFIER
88  std::uint32_t * out,
89  std::uint64_t n,
90  std::uint32_t seed) noexcept
91 {
92  uniform_distribution<std::uint32_t, Rng>(
93  out,
94  n,
95  seed);
96 }
97 
98 template<class Rng = kiss::Kiss<std::uint64_t>>
99 HOSTQUALIFIER INLINEQUALIFIER
101  std::uint64_t * out,
102  std::uint64_t n,
103  std::uint32_t seed) noexcept
104 {
105  uniform_distribution<std::uint64_t, Rng>(
106  out,
107  n,
108  seed);
109 }
110 
111 template<class Rng = kiss::Kiss<std::uint32_t>>
112 HOSTQUALIFIER INLINEQUALIFIER
114  float * out,
115  std::uint64_t n,
116  std::uint32_t seed) noexcept
117 {
118  uniform_distribution<std::uint32_t, Rng>(
119  reinterpret_cast<std::uint32_t*>(out),
120  n,
121  seed);
122 }
123 
124 template<class Rng = kiss::Kiss<std::uint32_t>>
125 HOSTQUALIFIER INLINEQUALIFIER
127  double * out,
128  std::uint64_t n,
129  std::uint32_t seed) noexcept
130 {
131  uniform_distribution<std::uint32_t, Rng>(
132  reinterpret_cast<std::uint32_t*>(out),
133  n,
134  seed);
135 }
136 
137 /*! \brief generates \c n unique random elements on a CUDA device
138 * \tparam T data type to be generated
139 * \tparam Rng type of random number generator
140 * \param[out] out pointer to the output array
141 * \param[in] n number of elements to generate
142 * \param[in] seed initial random seed for the RNG
143 */
144 template<class T, class Rng>
145 HOSTQUALIFIER INLINEQUALIFIER
147  T * out,
148  std::uint64_t n,
149  std::uint32_t seed) noexcept
150 {
151  namespace wc = warpcore;
152  namespace cg = cooperative_groups;
153  using hasher_t = wc::hashers::MurmurHash<std::uint32_t>;
154  using filter_t = wc::BloomFilter<T>;
155 
156  filter_t bf{n/4096, 8, T(seed)}; // TODO parameter search
157 
158  // execute kernel
159  helpers::lambda_kernel
160  <<<4096, 32>>>
161  ([=] DEVICEQUALIFIER () mutable
162  {
163  const std::uint32_t tid = blockDim.x * blockIdx.x + threadIdx.x;
164  // generate initial local seed per thread
165  const std::uint32_t local_seed =
166  hasher_t::hash(hasher_t::hash(tid) + seed);
167  Rng rng{local_seed};
168  // cooperative group to use for bloom filter
169  const auto group =
170  cg::tiled_partition<filter_t::cg_size()>(cg::this_thread_block());
171  T x;
172  bool is_unique;
173 
174  // grid-stride loop
175  const auto grid_stride = blockDim.x * gridDim.x;
176  for(std::uint64_t i = tid; i < n; i += grid_stride)
177  {
178  // do while randomly drawn element is not unique
179  do
180  {
181  // generate random element
182  x = rng.template next<T>();
183 
184  // insert into bloom filter and check if unique
185  is_unique = bf.insert_and_query(x, group);
186  }
187  while(!is_unique);
188 
189  // write output
190  out[i] = x;
191  }
192  }
193  ); CUERR
194 }
195 
196 template<class Rng = kiss::Kiss<std::uint32_t>>
197 HOSTQUALIFIER INLINEQUALIFIER
199  std::uint32_t * out,
200  std::uint64_t n,
201  std::uint32_t seed) noexcept
202 {
203  unique_distribution<std::uint32_t, Rng>(
204  out,
205  n,
206  seed);
207 }
208 
209 template<class Rng = kiss::Kiss<std::uint64_t>>
210 HOSTQUALIFIER INLINEQUALIFIER
212  std::uint64_t * out,
213  std::uint64_t n,
214  std::uint32_t seed) noexcept
215 {
216  unique_distribution<std::uint64_t, Rng>(
217  out,
218  n,
219  seed);
220 }
221 
222 template<class Rng = kiss::Kiss<std::uint32_t>>
223 HOSTQUALIFIER INLINEQUALIFIER
225  float * out,
226  std::uint64_t n,
227  std::uint32_t seed) noexcept
228 {
229  unique_distribution<std::uint32_t, Rng>(
230  reinterpret_cast<std::uint32_t*>(out),
231  n,
232  seed);
233 }
234 
235 template<class Rng = kiss::Kiss<std::uint32_t>>
236 HOSTQUALIFIER INLINEQUALIFIER
238  double * out,
239  std::uint64_t n,
240  std::uint32_t seed) noexcept
241 {
242  unique_distribution<std::uint32_t, Rng>(
243  reinterpret_cast<std::uint32_t*>(out),
244  n,
245  seed);
246 }
247 
248 /*! \brief generates \c n zipf distributed random elements on a CUDA device
249 * \tparam T data type to be generated
250 * \tparam Rng type of random number generator
251 * \tparam P data type used for probabilities
252 * \param[in] in set of unique elements to be considered
253 * \param[in] n_in cardinality of input set
254 * \param[out] out pointer to the output array
255 * \param[in] n_out number of elements to generate
256 * \param[in] s skew of the distribution
257 * \param[in] seed initial random seed for the RNG
258 */
259 template<class T, class Rng = kiss::Kiss<std::uint32_t>, class P = double>
260 HOSTQUALIFIER INLINEQUALIFIER
262  T * in,
263  std::uint64_t n_in,
264  T * out,
265  std::uint64_t n_out,
266  P s,
267  std::uint32_t seed) noexcept
268 {
269  namespace wc = warpcore;
270 
271  // zipf is undefined for s = 1.0
272  assert(s != 1.0);
273 
274  helpers::lambda_kernel
275  <<<4096, 32>>>
276  ([=] DEVICEQUALIFIER
277  {
278  const std::uint32_t tid = blockDim.x * blockIdx.x + threadIdx.x;
279  // generate initial local seed per thread
280  const std::uint32_t local_seed =
281  wc::hashers::MurmurHash<std::uint32_t>::hash(seed+tid);
282  Rng rng{local_seed};
283 
284  // see https://medium.com/@jasoncrease/rejection-sampling-the-zipf-distribution-6b359792cffa
285  const P t = (pow(double(n_in), 1.0 - s) - s) / (1.0 - s);
286  std::uint64_t k;
287  P y;
288  P rat;
289 
290  // grid-stride loop
291  const auto grid_stride = blockDim.x * gridDim.x;
292  for(std::uint64_t i = tid; i < n_out; i += grid_stride)
293  {
294  do
295  {
296  // generate random element
297  P p = rng.template next<P>();
298  // calculate bounding function of inverse CDF
299  P inv_b =
300  ((p * t) <= 1.0) ? p * t : pow((p * t) * (1.0 - s) + s, 1.0 / (1.0 - s));
301  // draw a target rank
302  k = (T)(inv_b + 1.0);
303  // generate random element
304  y = rng.template next<P>();
305  // added corner case if k <= 1.0
306  P b = (k <= 1.0) ? 1.0 / t : pow(inv_b, -s) / t;
307  // calculate acceptance boundary
308  rat = pow(double(k), -s) / (b * t);
309  }
310  while(y >= rat);
311 
312  // iff T is of index type and pointer in is a nullptr return rank k
313  if((std::is_same<T, std::uint32_t>::value ||
314  std::is_same<T, std::uint64_t>::value) &&
315  in == nullptr)
316  {
317  out[i] = k + 1;
318  }
319  else
320  {
321  out[i] = in[k];
322  }
323  }
324  }
325  ); CUERR
326 }
327 
328 } // namespace warpcore
329 
330 #endif /* WARPCORE_RANDOM_DISTRIBUTIONS_CUH */