Austin Schuh | 36244a1 | 2019-09-21 17:52:38 -0700 | [diff] [blame^] | 1 | // Copyright 2017 The Abseil Authors. |
| 2 | // |
| 3 | // Licensed under the Apache License, Version 2.0 (the "License"); |
| 4 | // you may not use this file except in compliance with the License. |
| 5 | // You may obtain a copy of the License at |
| 6 | // |
| 7 | // https://www.apache.org/licenses/LICENSE-2.0 |
| 8 | // |
| 9 | // Unless required by applicable law or agreed to in writing, software |
| 10 | // distributed under the License is distributed on an "AS IS" BASIS, |
| 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| 12 | // See the License for the specific language governing permissions and |
| 13 | // limitations under the License. |
| 14 | |
| 15 | #include "absl/random/discrete_distribution.h" |
| 16 | |
| 17 | #include <cmath> |
| 18 | #include <cstddef> |
| 19 | #include <cstdint> |
| 20 | #include <iterator> |
| 21 | #include <numeric> |
| 22 | #include <random> |
| 23 | #include <sstream> |
| 24 | #include <string> |
| 25 | #include <vector> |
| 26 | |
| 27 | #include "gmock/gmock.h" |
| 28 | #include "gtest/gtest.h" |
| 29 | #include "absl/base/internal/raw_logging.h" |
| 30 | #include "absl/random/internal/chi_square.h" |
| 31 | #include "absl/random/internal/distribution_test_util.h" |
| 32 | #include "absl/random/internal/sequence_urbg.h" |
| 33 | #include "absl/random/random.h" |
| 34 | #include "absl/strings/str_cat.h" |
| 35 | #include "absl/strings/strip.h" |
| 36 | |
| 37 | namespace { |
| 38 | |
| 39 | template <typename IntType> |
| 40 | class DiscreteDistributionTypeTest : public ::testing::Test {}; |
| 41 | |
| 42 | using IntTypes = ::testing::Types<int8_t, uint8_t, int16_t, uint16_t, int32_t, |
| 43 | uint32_t, int64_t, uint64_t>; |
| 44 | TYPED_TEST_SUITE(DiscreteDistributionTypeTest, IntTypes); |
| 45 | |
| 46 | TYPED_TEST(DiscreteDistributionTypeTest, ParamSerializeTest) { |
| 47 | using param_type = |
| 48 | typename absl::discrete_distribution<TypeParam>::param_type; |
| 49 | |
| 50 | absl::discrete_distribution<TypeParam> empty; |
| 51 | EXPECT_THAT(empty.probabilities(), testing::ElementsAre(1.0)); |
| 52 | |
| 53 | absl::discrete_distribution<TypeParam> before({1.0, 2.0, 1.0}); |
| 54 | |
| 55 | // Validate that the probabilities sum to 1.0. We picked values which |
| 56 | // can be represented exactly to avoid floating-point roundoff error. |
| 57 | double s = 0; |
| 58 | for (const auto& x : before.probabilities()) { |
| 59 | s += x; |
| 60 | } |
| 61 | EXPECT_EQ(s, 1.0); |
| 62 | EXPECT_THAT(before.probabilities(), testing::ElementsAre(0.25, 0.5, 0.25)); |
| 63 | |
| 64 | // Validate the same data via an initializer list. |
| 65 | { |
| 66 | std::vector<double> data({1.0, 2.0, 1.0}); |
| 67 | |
| 68 | absl::discrete_distribution<TypeParam> via_param{ |
| 69 | param_type(std::begin(data), std::end(data))}; |
| 70 | |
| 71 | EXPECT_EQ(via_param, before); |
| 72 | } |
| 73 | |
| 74 | std::stringstream ss; |
| 75 | ss << before; |
| 76 | absl::discrete_distribution<TypeParam> after; |
| 77 | |
| 78 | EXPECT_NE(before, after); |
| 79 | |
| 80 | ss >> after; |
| 81 | |
| 82 | EXPECT_EQ(before, after); |
| 83 | } |
| 84 | |
| 85 | TYPED_TEST(DiscreteDistributionTypeTest, Constructor) { |
| 86 | auto fn = [](double x) { return x; }; |
| 87 | { |
| 88 | absl::discrete_distribution<int> unary(0, 1.0, 9.0, fn); |
| 89 | EXPECT_THAT(unary.probabilities(), testing::ElementsAre(1.0)); |
| 90 | } |
| 91 | |
| 92 | { |
| 93 | absl::discrete_distribution<int> unary(2, 1.0, 9.0, fn); |
| 94 | // => fn(1.0 + 0 * 4 + 2) => 3 |
| 95 | // => fn(1.0 + 1 * 4 + 2) => 7 |
| 96 | EXPECT_THAT(unary.probabilities(), testing::ElementsAre(0.3, 0.7)); |
| 97 | } |
| 98 | } |
| 99 | |
| 100 | TEST(DiscreteDistributionTest, InitDiscreteDistribution) { |
| 101 | using testing::Pair; |
| 102 | |
| 103 | { |
| 104 | std::vector<double> p({1.0, 2.0, 3.0}); |
| 105 | std::vector<std::pair<double, size_t>> q = |
| 106 | absl::random_internal::InitDiscreteDistribution(&p); |
| 107 | |
| 108 | EXPECT_THAT(p, testing::ElementsAre(1 / 6.0, 2 / 6.0, 3 / 6.0)); |
| 109 | |
| 110 | // Each bucket is p=1/3, so bucket 0 will send half it's traffic |
| 111 | // to bucket 2, while the rest will retain all of their traffic. |
| 112 | EXPECT_THAT(q, testing::ElementsAre(Pair(0.5, 2), // |
| 113 | Pair(1.0, 1), // |
| 114 | Pair(1.0, 2))); |
| 115 | } |
| 116 | |
| 117 | { |
| 118 | std::vector<double> p({1.0, 2.0, 3.0, 5.0, 2.0}); |
| 119 | |
| 120 | std::vector<std::pair<double, size_t>> q = |
| 121 | absl::random_internal::InitDiscreteDistribution(&p); |
| 122 | |
| 123 | EXPECT_THAT(p, testing::ElementsAre(1 / 13.0, 2 / 13.0, 3 / 13.0, 5 / 13.0, |
| 124 | 2 / 13.0)); |
| 125 | |
| 126 | // A more complex bucketing solution: Each bucket has p=0.2 |
| 127 | // So buckets 0, 1, 4 will send their alternate traffic elsewhere, which |
| 128 | // happens to be bucket 3. |
| 129 | // However, summing up that alternate traffic gives bucket 3 too much |
| 130 | // traffic, so it will send some traffic to bucket 2. |
| 131 | constexpr double b0 = 1.0 / 13.0 / 0.2; |
| 132 | constexpr double b1 = 2.0 / 13.0 / 0.2; |
| 133 | constexpr double b3 = (5.0 / 13.0 / 0.2) - ((1 - b0) + (1 - b1) + (1 - b1)); |
| 134 | |
| 135 | EXPECT_THAT(q, testing::ElementsAre(Pair(b0, 3), // |
| 136 | Pair(b1, 3), // |
| 137 | Pair(1.0, 2), // |
| 138 | Pair(b3, 2), // |
| 139 | Pair(b1, 3))); |
| 140 | } |
| 141 | } |
| 142 | |
| 143 | TEST(DiscreteDistributionTest, ChiSquaredTest50) { |
| 144 | using absl::random_internal::kChiSquared; |
| 145 | |
| 146 | constexpr size_t kTrials = 10000; |
| 147 | constexpr int kBuckets = 50; // inclusive, so actally +1 |
| 148 | |
| 149 | // 1-in-100000 threshold, but remember, there are about 8 tests |
| 150 | // in this file. And the test could fail for other reasons. |
| 151 | // Empirically validated with --runs_per_test=10000. |
| 152 | const int kThreshold = |
| 153 | absl::random_internal::ChiSquareValue(kBuckets, 0.99999); |
| 154 | |
| 155 | std::vector<double> weights(kBuckets, 0); |
| 156 | std::iota(std::begin(weights), std::end(weights), 1); |
| 157 | absl::discrete_distribution<int> dist(std::begin(weights), std::end(weights)); |
| 158 | |
| 159 | absl::InsecureBitGen rng; |
| 160 | |
| 161 | std::vector<int32_t> counts(kBuckets, 0); |
| 162 | for (size_t i = 0; i < kTrials; i++) { |
| 163 | auto x = dist(rng); |
| 164 | counts[x]++; |
| 165 | } |
| 166 | |
| 167 | // Scale weights. |
| 168 | double sum = 0; |
| 169 | for (double x : weights) { |
| 170 | sum += x; |
| 171 | } |
| 172 | for (double& x : weights) { |
| 173 | x = kTrials * (x / sum); |
| 174 | } |
| 175 | |
| 176 | double chi_square = |
| 177 | absl::random_internal::ChiSquare(std::begin(counts), std::end(counts), |
| 178 | std::begin(weights), std::end(weights)); |
| 179 | |
| 180 | if (chi_square > kThreshold) { |
| 181 | double p_value = |
| 182 | absl::random_internal::ChiSquarePValue(chi_square, kBuckets); |
| 183 | |
| 184 | // Chi-squared test failed. Output does not appear to be uniform. |
| 185 | std::string msg; |
| 186 | for (size_t i = 0; i < counts.size(); i++) { |
| 187 | absl::StrAppend(&msg, i, ": ", counts[i], " vs ", weights[i], "\n"); |
| 188 | } |
| 189 | absl::StrAppend(&msg, kChiSquared, " p-value ", p_value, "\n"); |
| 190 | absl::StrAppend(&msg, "High ", kChiSquared, " value: ", chi_square, " > ", |
| 191 | kThreshold); |
| 192 | ABSL_RAW_LOG(INFO, "%s", msg.c_str()); |
| 193 | FAIL() << msg; |
| 194 | } |
| 195 | } |
| 196 | |
| 197 | TEST(DiscreteDistributionTest, StabilityTest) { |
| 198 | // absl::discrete_distribution stabilitiy relies on |
| 199 | // absl::uniform_int_distribution and absl::bernoulli_distribution. |
| 200 | absl::random_internal::sequence_urbg urbg( |
| 201 | {0x0003eb76f6f7f755ull, 0xFFCEA50FDB2F953Bull, 0xC332DDEFBE6C5AA5ull, |
| 202 | 0x6558218568AB9702ull, 0x2AEF7DAD5B6E2F84ull, 0x1521B62829076170ull, |
| 203 | 0xECDD4775619F1510ull, 0x13CCA830EB61BD96ull, 0x0334FE1EAA0363CFull, |
| 204 | 0xB5735C904C70A239ull, 0xD59E9E0BCBAADE14ull, 0xEECC86BC60622CA7ull}); |
| 205 | |
| 206 | std::vector<int> output(6); |
| 207 | |
| 208 | { |
| 209 | absl::discrete_distribution<int32_t> dist({1.0, 2.0, 3.0, 5.0, 2.0}); |
| 210 | EXPECT_EQ(0, dist.min()); |
| 211 | EXPECT_EQ(4, dist.max()); |
| 212 | for (auto& v : output) { |
| 213 | v = dist(urbg); |
| 214 | } |
| 215 | EXPECT_EQ(12, urbg.invocations()); |
| 216 | } |
| 217 | |
| 218 | // With 12 calls to urbg, each call into discrete_distribution consumes |
| 219 | // precisely 2 values: one for the uniform call, and a second for the |
| 220 | // bernoulli. |
| 221 | // |
| 222 | // Given the alt mapping: 0=>3, 1=>3, 2=>2, 3=>2, 4=>3, we can |
| 223 | // |
| 224 | // uniform: 443210143131 |
| 225 | // bernoulli: b0 000011100101 |
| 226 | // bernoulli: b1 001111101101 |
| 227 | // bernoulli: b2 111111111111 |
| 228 | // bernoulli: b3 001111101111 |
| 229 | // bernoulli: b4 001111101101 |
| 230 | // ... |
| 231 | EXPECT_THAT(output, testing::ElementsAre(3, 3, 1, 3, 3, 3)); |
| 232 | |
| 233 | { |
| 234 | urbg.reset(); |
| 235 | absl::discrete_distribution<int64_t> dist({1.0, 2.0, 3.0, 5.0, 2.0}); |
| 236 | EXPECT_EQ(0, dist.min()); |
| 237 | EXPECT_EQ(4, dist.max()); |
| 238 | for (auto& v : output) { |
| 239 | v = dist(urbg); |
| 240 | } |
| 241 | EXPECT_EQ(12, urbg.invocations()); |
| 242 | } |
| 243 | EXPECT_THAT(output, testing::ElementsAre(3, 3, 0, 3, 0, 4)); |
| 244 | } |
| 245 | |
| 246 | } // namespace |