Austin Schuh | 36244a1 | 2019-09-21 17:52:38 -0700 | [diff] [blame^] | 1 | // Copyright 2017 The Abseil Authors. |
| 2 | // |
| 3 | // Licensed under the Apache License, Version 2.0 (the "License"); |
| 4 | // you may not use this file except in compliance with the License. |
| 5 | // You may obtain a copy of the License at |
| 6 | // |
| 7 | // https://www.apache.org/licenses/LICENSE-2.0 |
| 8 | // |
| 9 | // Unless required by applicable law or agreed to in writing, software |
| 10 | // distributed under the License is distributed on an "AS IS" BASIS, |
| 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| 12 | // See the License for the specific language governing permissions and |
| 13 | // limitations under the License. |
| 14 | |
| 15 | #include "absl/random/zipf_distribution.h" |
| 16 | |
| 17 | #include <algorithm> |
| 18 | #include <cstddef> |
| 19 | #include <cstdint> |
| 20 | #include <iterator> |
| 21 | #include <random> |
| 22 | #include <string> |
| 23 | #include <utility> |
| 24 | #include <vector> |
| 25 | |
| 26 | #include "gmock/gmock.h" |
| 27 | #include "gtest/gtest.h" |
| 28 | #include "absl/base/internal/raw_logging.h" |
| 29 | #include "absl/random/internal/chi_square.h" |
| 30 | #include "absl/random/internal/sequence_urbg.h" |
| 31 | #include "absl/random/random.h" |
| 32 | #include "absl/strings/str_cat.h" |
| 33 | #include "absl/strings/str_replace.h" |
| 34 | #include "absl/strings/strip.h" |
| 35 | |
| 36 | namespace { |
| 37 | |
| 38 | using ::absl::random_internal::kChiSquared; |
| 39 | using ::testing::ElementsAre; |
| 40 | |
| 41 | template <typename IntType> |
| 42 | class ZipfDistributionTypedTest : public ::testing::Test {}; |
| 43 | |
| 44 | using IntTypes = ::testing::Types<int, int8_t, int16_t, int32_t, int64_t, |
| 45 | uint8_t, uint16_t, uint32_t, uint64_t>; |
| 46 | TYPED_TEST_CASE(ZipfDistributionTypedTest, IntTypes); |
| 47 | |
| 48 | TYPED_TEST(ZipfDistributionTypedTest, SerializeTest) { |
| 49 | using param_type = typename absl::zipf_distribution<TypeParam>::param_type; |
| 50 | |
| 51 | constexpr int kCount = 1000; |
| 52 | absl::InsecureBitGen gen; |
| 53 | for (const auto& param : { |
| 54 | param_type(), |
| 55 | param_type(32), |
| 56 | param_type(100, 3, 2), |
| 57 | param_type(std::numeric_limits<TypeParam>::max(), 4, 3), |
| 58 | param_type(std::numeric_limits<TypeParam>::max() / 2), |
| 59 | }) { |
| 60 | // Validate parameters. |
| 61 | const auto k = param.k(); |
| 62 | const auto q = param.q(); |
| 63 | const auto v = param.v(); |
| 64 | |
| 65 | absl::zipf_distribution<TypeParam> before(k, q, v); |
| 66 | EXPECT_EQ(before.k(), param.k()); |
| 67 | EXPECT_EQ(before.q(), param.q()); |
| 68 | EXPECT_EQ(before.v(), param.v()); |
| 69 | |
| 70 | { |
| 71 | absl::zipf_distribution<TypeParam> via_param(param); |
| 72 | EXPECT_EQ(via_param, before); |
| 73 | } |
| 74 | |
| 75 | // Validate stream serialization. |
| 76 | std::stringstream ss; |
| 77 | ss << before; |
| 78 | absl::zipf_distribution<TypeParam> after(4, 5.5, 4.4); |
| 79 | |
| 80 | EXPECT_NE(before.k(), after.k()); |
| 81 | EXPECT_NE(before.q(), after.q()); |
| 82 | EXPECT_NE(before.v(), after.v()); |
| 83 | EXPECT_NE(before.param(), after.param()); |
| 84 | EXPECT_NE(before, after); |
| 85 | |
| 86 | ss >> after; |
| 87 | |
| 88 | EXPECT_EQ(before.k(), after.k()); |
| 89 | EXPECT_EQ(before.q(), after.q()); |
| 90 | EXPECT_EQ(before.v(), after.v()); |
| 91 | EXPECT_EQ(before.param(), after.param()); |
| 92 | EXPECT_EQ(before, after); |
| 93 | |
| 94 | // Smoke test. |
| 95 | auto sample_min = after.max(); |
| 96 | auto sample_max = after.min(); |
| 97 | for (int i = 0; i < kCount; i++) { |
| 98 | auto sample = after(gen); |
| 99 | EXPECT_GE(sample, after.min()); |
| 100 | EXPECT_LE(sample, after.max()); |
| 101 | if (sample > sample_max) sample_max = sample; |
| 102 | if (sample < sample_min) sample_min = sample; |
| 103 | } |
| 104 | ABSL_INTERNAL_LOG(INFO, |
| 105 | absl::StrCat("Range: ", +sample_min, ", ", +sample_max)); |
| 106 | } |
| 107 | } |
| 108 | |
| 109 | class ZipfModel { |
| 110 | public: |
| 111 | ZipfModel(size_t k, double q, double v) : k_(k), q_(q), v_(v) {} |
| 112 | |
| 113 | double mean() const { return mean_; } |
| 114 | |
| 115 | // For the other moments of the Zipf distribution, see, for example, |
| 116 | // http://mathworld.wolfram.com/ZipfDistribution.html |
| 117 | |
| 118 | // PMF(k) = (1 / k^s) / H(N,s) |
| 119 | // Returns the probability that any single invocation returns k. |
| 120 | double PMF(size_t i) { return i >= hnq_.size() ? 0.0 : hnq_[i] / sum_hnq_; } |
| 121 | |
| 122 | // CDF = H(k, s) / H(N,s) |
| 123 | double CDF(size_t i) { |
| 124 | if (i >= hnq_.size()) { |
| 125 | return 1.0; |
| 126 | } |
| 127 | auto it = std::begin(hnq_); |
| 128 | double h = 0.0; |
| 129 | for (const auto end = it; it != end; it++) { |
| 130 | h += *it; |
| 131 | } |
| 132 | return h / sum_hnq_; |
| 133 | } |
| 134 | |
| 135 | // The InverseCDF returns the k values which bound p on the upper and lower |
| 136 | // bound. Since there is no closed-form solution, this is implemented as a |
| 137 | // bisction of the cdf. |
| 138 | std::pair<size_t, size_t> InverseCDF(double p) { |
| 139 | size_t min = 0; |
| 140 | size_t max = hnq_.size(); |
| 141 | while (max > min + 1) { |
| 142 | size_t target = (max + min) >> 1; |
| 143 | double x = CDF(target); |
| 144 | if (x > p) { |
| 145 | max = target; |
| 146 | } else { |
| 147 | min = target; |
| 148 | } |
| 149 | } |
| 150 | return {min, max}; |
| 151 | } |
| 152 | |
| 153 | // Compute the probability totals, which are based on the generalized harmonic |
| 154 | // number, H(N,s). |
| 155 | // H(N,s) == SUM(k=1..N, 1 / k^s) |
| 156 | // |
| 157 | // In the limit, H(N,s) == zetac(s) + 1. |
| 158 | // |
| 159 | // NOTE: The mean of a zipf distribution could be computed here as well. |
| 160 | // Mean := H(N, s-1) / H(N,s). |
| 161 | // Given the parameter v = 1, this gives the following function: |
| 162 | // (Hn(100, 1) - Hn(1,1)) / (Hn(100,2) - Hn(1,2)) = 6.5944 |
| 163 | // |
| 164 | void Init() { |
| 165 | if (!hnq_.empty()) { |
| 166 | return; |
| 167 | } |
| 168 | hnq_.clear(); |
| 169 | hnq_.reserve(std::min(k_, size_t{1000})); |
| 170 | |
| 171 | sum_hnq_ = 0; |
| 172 | double qm1 = q_ - 1.0; |
| 173 | double sum_hnq_m1 = 0; |
| 174 | for (size_t i = 0; i < k_; i++) { |
| 175 | // Partial n-th generalized harmonic number |
| 176 | const double x = v_ + i; |
| 177 | |
| 178 | // H(n, q-1) |
| 179 | const double hnqm1 = |
| 180 | (q_ == 2.0) ? (1.0 / x) |
| 181 | : (q_ == 3.0) ? (1.0 / (x * x)) : std::pow(x, -qm1); |
| 182 | sum_hnq_m1 += hnqm1; |
| 183 | |
| 184 | // H(n, q) |
| 185 | const double hnq = |
| 186 | (q_ == 2.0) ? (1.0 / (x * x)) |
| 187 | : (q_ == 3.0) ? (1.0 / (x * x * x)) : std::pow(x, -q_); |
| 188 | sum_hnq_ += hnq; |
| 189 | hnq_.push_back(hnq); |
| 190 | if (i > 1000 && hnq <= 1e-10) { |
| 191 | // The harmonic number is too small. |
| 192 | break; |
| 193 | } |
| 194 | } |
| 195 | assert(sum_hnq_ > 0); |
| 196 | mean_ = sum_hnq_m1 / sum_hnq_; |
| 197 | } |
| 198 | |
| 199 | private: |
| 200 | const size_t k_; |
| 201 | const double q_; |
| 202 | const double v_; |
| 203 | |
| 204 | double mean_; |
| 205 | std::vector<double> hnq_; |
| 206 | double sum_hnq_; |
| 207 | }; |
| 208 | |
| 209 | using zipf_u64 = absl::zipf_distribution<uint64_t>; |
| 210 | |
| 211 | class ZipfTest : public testing::TestWithParam<zipf_u64::param_type>, |
| 212 | public ZipfModel { |
| 213 | public: |
| 214 | ZipfTest() : ZipfModel(GetParam().k(), GetParam().q(), GetParam().v()) {} |
| 215 | |
| 216 | absl::InsecureBitGen rng_; |
| 217 | }; |
| 218 | |
| 219 | TEST_P(ZipfTest, ChiSquaredTest) { |
| 220 | const auto& param = GetParam(); |
| 221 | Init(); |
| 222 | |
| 223 | size_t trials = 10000; |
| 224 | |
| 225 | // Find the split-points for the buckets. |
| 226 | std::vector<size_t> points; |
| 227 | std::vector<double> expected; |
| 228 | { |
| 229 | double last_cdf = 0.0; |
| 230 | double min_p = 1.0; |
| 231 | for (double p = 0.01; p < 1.0; p += 0.01) { |
| 232 | auto x = InverseCDF(p); |
| 233 | if (points.empty() || points.back() < x.second) { |
| 234 | const double p = CDF(x.second); |
| 235 | points.push_back(x.second); |
| 236 | double q = p - last_cdf; |
| 237 | expected.push_back(q); |
| 238 | last_cdf = p; |
| 239 | if (q < min_p) { |
| 240 | min_p = q; |
| 241 | } |
| 242 | } |
| 243 | } |
| 244 | if (last_cdf < 0.999) { |
| 245 | points.push_back(std::numeric_limits<size_t>::max()); |
| 246 | double q = 1.0 - last_cdf; |
| 247 | expected.push_back(q); |
| 248 | if (q < min_p) { |
| 249 | min_p = q; |
| 250 | } |
| 251 | } else { |
| 252 | points.back() = std::numeric_limits<size_t>::max(); |
| 253 | expected.back() += (1.0 - last_cdf); |
| 254 | } |
| 255 | // The Chi-Squared score is not completely scale-invariant; it works best |
| 256 | // when the small values are in the small digits. |
| 257 | trials = static_cast<size_t>(8.0 / min_p); |
| 258 | } |
| 259 | ASSERT_GT(points.size(), 0); |
| 260 | |
| 261 | // Generate n variates and fill the counts vector with the count of their |
| 262 | // occurrences. |
| 263 | std::vector<int64_t> buckets(points.size(), 0); |
| 264 | double avg = 0; |
| 265 | { |
| 266 | zipf_u64 dis(param); |
| 267 | for (size_t i = 0; i < trials; i++) { |
| 268 | uint64_t x = dis(rng_); |
| 269 | ASSERT_LE(x, dis.max()); |
| 270 | ASSERT_GE(x, dis.min()); |
| 271 | avg += static_cast<double>(x); |
| 272 | auto it = std::upper_bound(std::begin(points), std::end(points), |
| 273 | static_cast<size_t>(x)); |
| 274 | buckets[std::distance(std::begin(points), it)]++; |
| 275 | } |
| 276 | avg = avg / static_cast<double>(trials); |
| 277 | } |
| 278 | |
| 279 | // Validate the output using the Chi-Squared test. |
| 280 | for (auto& e : expected) { |
| 281 | e *= trials; |
| 282 | } |
| 283 | |
| 284 | // The null-hypothesis is that the distribution is a poisson distribution with |
| 285 | // the provided mean (not estimated from the data). |
| 286 | const int dof = static_cast<int>(expected.size()) - 1; |
| 287 | |
| 288 | // NOTE: This test runs about 15x per invocation, so a value of 0.9995 is |
| 289 | // approximately correct for a test suite failure rate of 1 in 100. In |
| 290 | // practice we see failures slightly higher than that. |
| 291 | const double threshold = absl::random_internal::ChiSquareValue(dof, 0.9999); |
| 292 | |
| 293 | const double chi_square = absl::random_internal::ChiSquare( |
| 294 | std::begin(buckets), std::end(buckets), std::begin(expected), |
| 295 | std::end(expected)); |
| 296 | |
| 297 | const double p_actual = |
| 298 | absl::random_internal::ChiSquarePValue(chi_square, dof); |
| 299 | |
| 300 | // Log if the chi_squared value is above the threshold. |
| 301 | if (chi_square > threshold) { |
| 302 | ABSL_INTERNAL_LOG(INFO, "values"); |
| 303 | for (size_t i = 0; i < expected.size(); i++) { |
| 304 | ABSL_INTERNAL_LOG(INFO, absl::StrCat(points[i], ": ", buckets[i], |
| 305 | " vs. E=", expected[i])); |
| 306 | } |
| 307 | ABSL_INTERNAL_LOG(INFO, absl::StrCat("trials ", trials)); |
| 308 | ABSL_INTERNAL_LOG(INFO, |
| 309 | absl::StrCat("mean ", avg, " vs. expected ", mean())); |
| 310 | ABSL_INTERNAL_LOG(INFO, absl::StrCat(kChiSquared, "(data, ", dof, ") = ", |
| 311 | chi_square, " (", p_actual, ")")); |
| 312 | ABSL_INTERNAL_LOG(INFO, |
| 313 | absl::StrCat(kChiSquared, " @ 0.9995 = ", threshold)); |
| 314 | FAIL() << kChiSquared << " value of " << chi_square |
| 315 | << " is above the threshold."; |
| 316 | } |
| 317 | } |
| 318 | |
| 319 | std::vector<zipf_u64::param_type> GenParams() { |
| 320 | using param = zipf_u64::param_type; |
| 321 | const auto k = param().k(); |
| 322 | const auto q = param().q(); |
| 323 | const auto v = param().v(); |
| 324 | const uint64_t k2 = 1 << 10; |
| 325 | return std::vector<zipf_u64::param_type>{ |
| 326 | // Default |
| 327 | param(k, q, v), |
| 328 | // vary K |
| 329 | param(4, q, v), param(1 << 4, q, v), param(k2, q, v), |
| 330 | // vary V |
| 331 | param(k2, q, 0.5), param(k2, q, 1.5), param(k2, q, 2.5), param(k2, q, 10), |
| 332 | // vary Q |
| 333 | param(k2, 1.5, v), param(k2, 3, v), param(k2, 5, v), param(k2, 10, v), |
| 334 | // Vary V & Q |
| 335 | param(k2, 1.5, 0.5), param(k2, 3, 1.5), param(k, 10, 10)}; |
| 336 | } |
| 337 | |
| 338 | std::string ParamName( |
| 339 | const ::testing::TestParamInfo<zipf_u64::param_type>& info) { |
| 340 | const auto& p = info.param; |
| 341 | std::string name = absl::StrCat("k_", p.k(), "__q_", absl::SixDigits(p.q()), |
| 342 | "__v_", absl::SixDigits(p.v())); |
| 343 | return absl::StrReplaceAll(name, {{"+", "_"}, {"-", "_"}, {".", "_"}}); |
| 344 | } |
| 345 | |
| 346 | INSTANTIATE_TEST_SUITE_P(All, ZipfTest, ::testing::ValuesIn(GenParams()), |
| 347 | ParamName); |
| 348 | |
| 349 | // NOTE: absl::zipf_distribution is not guaranteed to be stable. |
| 350 | TEST(ZipfDistributionTest, StabilityTest) { |
| 351 | // absl::zipf_distribution stability relies on |
| 352 | // absl::uniform_real_distribution, std::log, std::exp, std::log1p |
| 353 | absl::random_internal::sequence_urbg urbg( |
| 354 | {0x0003eb76f6f7f755ull, 0xFFCEA50FDB2F953Bull, 0xC332DDEFBE6C5AA5ull, |
| 355 | 0x6558218568AB9702ull, 0x2AEF7DAD5B6E2F84ull, 0x1521B62829076170ull, |
| 356 | 0xECDD4775619F1510ull, 0x13CCA830EB61BD96ull, 0x0334FE1EAA0363CFull, |
| 357 | 0xB5735C904C70A239ull, 0xD59E9E0BCBAADE14ull, 0xEECC86BC60622CA7ull}); |
| 358 | |
| 359 | std::vector<int> output(10); |
| 360 | |
| 361 | { |
| 362 | absl::zipf_distribution<int32_t> dist; |
| 363 | std::generate(std::begin(output), std::end(output), |
| 364 | [&] { return dist(urbg); }); |
| 365 | EXPECT_THAT(output, ElementsAre(10031, 0, 0, 3, 6, 0, 7, 47, 0, 0)); |
| 366 | } |
| 367 | urbg.reset(); |
| 368 | { |
| 369 | absl::zipf_distribution<int32_t> dist(std::numeric_limits<int32_t>::max(), |
| 370 | 3.3); |
| 371 | std::generate(std::begin(output), std::end(output), |
| 372 | [&] { return dist(urbg); }); |
| 373 | EXPECT_THAT(output, ElementsAre(44, 0, 0, 0, 0, 1, 0, 1, 3, 0)); |
| 374 | } |
| 375 | } |
| 376 | |
| 377 | TEST(ZipfDistributionTest, AlgorithmBounds) { |
| 378 | absl::zipf_distribution<int32_t> dist; |
| 379 | |
| 380 | // Small values from absl::uniform_real_distribution map to larger Zipf |
| 381 | // distribution values. |
| 382 | const std::pair<uint64_t, int32_t> kInputs[] = { |
| 383 | {0xffffffffffffffff, 0x0}, {0x7fffffffffffffff, 0x0}, |
| 384 | {0x3ffffffffffffffb, 0x1}, {0x1ffffffffffffffd, 0x4}, |
| 385 | {0xffffffffffffffe, 0x9}, {0x7ffffffffffffff, 0x12}, |
| 386 | {0x3ffffffffffffff, 0x25}, {0x1ffffffffffffff, 0x4c}, |
| 387 | {0xffffffffffffff, 0x99}, {0x7fffffffffffff, 0x132}, |
| 388 | {0x3fffffffffffff, 0x265}, {0x1fffffffffffff, 0x4cc}, |
| 389 | {0xfffffffffffff, 0x999}, {0x7ffffffffffff, 0x1332}, |
| 390 | {0x3ffffffffffff, 0x2665}, {0x1ffffffffffff, 0x4ccc}, |
| 391 | {0xffffffffffff, 0x9998}, {0x7fffffffffff, 0x1332f}, |
| 392 | {0x3fffffffffff, 0x2665a}, {0x1fffffffffff, 0x4cc9e}, |
| 393 | {0xfffffffffff, 0x998e0}, {0x7ffffffffff, 0x133051}, |
| 394 | {0x3ffffffffff, 0x265ae4}, {0x1ffffffffff, 0x4c9ed3}, |
| 395 | {0xffffffffff, 0x98e223}, {0x7fffffffff, 0x13058c4}, |
| 396 | {0x3fffffffff, 0x25b178e}, {0x1fffffffff, 0x4a062b2}, |
| 397 | {0xfffffffff, 0x8ee23b8}, {0x7ffffffff, 0x10b21642}, |
| 398 | {0x3ffffffff, 0x1d89d89d}, {0x1ffffffff, 0x2fffffff}, |
| 399 | {0xffffffff, 0x45d1745d}, {0x7fffffff, 0x5a5a5a5a}, |
| 400 | {0x3fffffff, 0x69ee5846}, {0x1fffffff, 0x73ecade3}, |
| 401 | {0xfffffff, 0x79a9d260}, {0x7ffffff, 0x7cc0532b}, |
| 402 | {0x3ffffff, 0x7e5ad146}, {0x1ffffff, 0x7f2c0bec}, |
| 403 | {0xffffff, 0x7f95adef}, {0x7fffff, 0x7fcac0da}, |
| 404 | {0x3fffff, 0x7fe55ae2}, {0x1fffff, 0x7ff2ac0e}, |
| 405 | {0xfffff, 0x7ff955ae}, {0x7ffff, 0x7ffcaac1}, |
| 406 | {0x3ffff, 0x7ffe555b}, {0x1ffff, 0x7fff2aac}, |
| 407 | {0xffff, 0x7fff9556}, {0x7fff, 0x7fffcaab}, |
| 408 | {0x3fff, 0x7fffe555}, {0x1fff, 0x7ffff2ab}, |
| 409 | {0xfff, 0x7ffff955}, {0x7ff, 0x7ffffcab}, |
| 410 | {0x3ff, 0x7ffffe55}, {0x1ff, 0x7fffff2b}, |
| 411 | {0xff, 0x7fffff95}, {0x7f, 0x7fffffcb}, |
| 412 | {0x3f, 0x7fffffe5}, {0x1f, 0x7ffffff3}, |
| 413 | {0xf, 0x7ffffff9}, {0x7, 0x7ffffffd}, |
| 414 | {0x3, 0x7ffffffe}, {0x1, 0x7fffffff}, |
| 415 | }; |
| 416 | |
| 417 | for (const auto& instance : kInputs) { |
| 418 | absl::random_internal::sequence_urbg urbg({instance.first}); |
| 419 | EXPECT_EQ(instance.second, dist(urbg)); |
| 420 | } |
| 421 | } |
| 422 | |
| 423 | } // namespace |