add --target-duration option
[benchmarks.git] / src / harness.cpp
1 #include "harness.h"
2 #include "json.h"
3 #include <atomic>
4 #include <chrono>
5 #include <cmath>
6 #include <condition_variable>
7 #include <functional>
8 #include <iostream>
9 #include <memory>
10 #include <mutex>
11 #include <ostream>
12 #include <shared_mutex>
13 #include <thread>
14 #include <variant>
15 #include <vector>
16
17 #ifdef NDEBUG // assert needs to work even in release mode
18 #undef NDEBUG
19 #endif
20 #include <cassert>
21
22 using std::chrono::steady_clock;
23
24 class BenchHarnessBase::ThreadCache final
25 {
26 private:
27 std::vector<std::thread> threads;
28 std::shared_mutex state_lock;
29 std::unique_lock<std::shared_mutex> locked_state;
30 std::condition_variable_any cond_var;
31 struct UnlockGuard final
32 {
33 std::shared_mutex &state_lock;
34 UnlockGuard(std::shared_mutex &state_lock) : state_lock(state_lock)
35 {
36 state_lock.unlock();
37 }
38 ~UnlockGuard()
39 {
40 state_lock.lock();
41 }
42 };
43 struct Task final
44 {
45 std::function<void()> fn;
46 };
47 struct ThreadState final
48 {
49 std::unique_ptr<Task> task;
50 std::mutex mutex;
51 };
52 std::vector<std::shared_ptr<ThreadState>> states;
53 bool shutting_down = false;
54 std::atomic_size_t tasks_left_to_drain = 0;
55 void add_thread()
56 {
57 auto thread_state = std::make_shared<ThreadState>();
58 states.push_back(thread_state);
59 threads.push_back(std::thread([this, thread_state]() {
60 auto shared_lock = std::shared_lock(state_lock);
61 while (true)
62 {
63 auto lock = std::unique_lock(thread_state->mutex);
64 auto task = std::move(thread_state->task);
65 lock.unlock();
66 if (task)
67 {
68 task->fn();
69 task.reset();
70 tasks_left_to_drain--;
71 cond_var.notify_all();
72 continue;
73 }
74
75 if (this->shutting_down)
76 return;
77
78 cond_var.wait(shared_lock);
79 }
80 }));
81 }
82
83 public:
84 ThreadCache()
85 {
86 locked_state = std::unique_lock(state_lock);
87 }
88 ThreadCache(const ThreadCache &) = delete;
89 ThreadCache &operator=(const ThreadCache &) = delete;
90 ~ThreadCache()
91 {
92 shutting_down = true;
93 cond_var.notify_all();
94 locked_state.unlock();
95 for (auto &thread : threads)
96 {
97 thread.join();
98 }
99 }
100 static std::shared_ptr<ThreadCache> get()
101 {
102 // weak so it's destroyed before returning from main()
103 static std::weak_ptr<ThreadCache> static_thread_cache;
104
105 std::shared_ptr<ThreadCache> thread_cache = static_thread_cache.lock();
106 if (!thread_cache)
107 {
108 thread_cache = std::make_shared<ThreadCache>();
109 static_thread_cache = thread_cache;
110 }
111 return thread_cache;
112 }
113 static std::shared_ptr<ThreadCache> get(BenchHarnessBase &bhb,
114 std::uint32_t thread_count)
115 {
116 std::shared_ptr<ThreadCache> thread_cache = get();
117 bhb.thread_cache = thread_cache;
118 while (thread_cache->threads.size() < thread_count)
119 thread_cache->add_thread();
120 return thread_cache;
121 }
122 void drain()
123 {
124 while (tasks_left_to_drain > 0)
125 {
126 // unlocks state_lock, allowing all threads to proceed
127 // simultaneously
128 cond_var.wait(locked_state);
129 }
130 }
131 template <typename Fn> void schedule_on(std::uint32_t thread_num, Fn fn)
132 {
133 auto lock = std::unique_lock(states[thread_num]->mutex);
134 assert(!states[thread_num]->task);
135 tasks_left_to_drain++;
136 states[thread_num]->task = std::make_unique<Task>(Task{.fn = fn});
137 cond_var.notify_all();
138 }
139 };
140
/// Stream adaptor that prints a duration, held in seconds, with a unit chosen
/// from its magnitude.
///
/// The largest unit whose tenth is still exceeded by the absolute value is
/// used (sec, ms, us, ns, ps). Non-finite values and values too small for
/// "ps" (including zero) are printed unscaled in seconds.
struct WriteDuration final
{
    std::chrono::duration<double> dur;
    friend std::ostream &operator<<(std::ostream &os,
                                    const WriteDuration &wdur)
    {
        struct Unit final
        {
            double threshold; // magnitude must be strictly above this
            double scale;     // multiplier from seconds to this unit
            const char *suffix;
        };
        static constexpr Unit units[] = {
            {0.1, 1.0, " sec"},     {0.1e-3, 1e3, " ms"},
            {0.1e-6, 1e6, " us"},   {0.1e-9, 1e9, " ns"},
            {0.1e-12, 1e12, " ps"},
        };

        const double seconds = wdur.dur.count();
        if (std::isfinite(seconds))
        {
            const double magnitude = std::fabs(seconds);
            for (const Unit &unit : units)
            {
                if (magnitude > unit.threshold)
                {
                    return os << seconds * unit.scale << unit.suffix;
                }
            }
        }
        // NaN, infinities and values at or below 0.1 ps end up here.
        return os << seconds << " sec";
    }
};
175
176 struct BenchmarkResultInner final
177 {
178 struct Result final
179 {
180 std::chrono::duration<double> total_dur;
181 std::chrono::duration<double> iter_dur;
182 operator JsonValue() const
183 {
184 return JsonValue::Object{
185 {"total_dur", total_dur.count()},
186 {"iter_dur", iter_dur.count()},
187 };
188 }
189 };
190 std::string name;
191 JsonValue config_json;
192 std::uint64_t iteration_count;
193 Result average;
194 std::vector<Result> threads;
195 };
196
197 struct BenchmarkResultImpl final : public BenchmarkResult
198 {
199 BenchmarkResultInner inner;
200 BenchmarkResultImpl(BenchmarkResultInner inner) : inner(std::move(inner))
201 {
202 }
203 virtual void print() const override
204 {
205 std::cout << inner.name << ":\n";
206 if (inner.threads.size() > 1)
207 {
208 for (std::size_t i = 0; i < inner.threads.size(); i++)
209 {
210 std::cout << "Thread #" << i << " took "
211 << WriteDuration{inner.threads[i].total_dur}
212 << " for " << inner.iteration_count
213 << " iterations -- "
214 << WriteDuration{inner.threads[i].iter_dur}
215 << "/iter.\n";
216 }
217 }
218 std::cout << "Average elapsed time: "
219 << WriteDuration{inner.average.total_dur} << " for "
220 << inner.iteration_count << " iterations -- "
221 << WriteDuration{inner.average.iter_dur} << "/iter.\n"
222 << std::endl;
223 }
224 virtual operator JsonValue() const override
225 {
226 return JsonValue::Object{
227 {"name", inner.name},
228 {"config", inner.config_json},
229 {"iteration_count", inner.iteration_count},
230 {"average", inner.average},
231 {"threads", inner.threads},
232 };
233 }
234 };
235
236 std::shared_ptr<BenchmarkResult> BenchHarnessBase::base_run(
237 const Config &config, const std::string &name,
238 void (*fn)(BenchHarnessBase *bench_harness_base,
239 std::uint64_t iteration_count, std::uint32_t thread_num))
240 {
241
242 std::uint32_t thread_count =
243 config.thread_count.value_or(std::thread::hardware_concurrency());
244 bool no_threads =
245 thread_count == 0 || (thread_count == 1 && !config.thread_count);
246 if (no_threads)
247 {
248 thread_count = 1;
249 }
250
251 std::vector<steady_clock::duration> elapsed(thread_count);
252 auto run_base = [&](std::uint64_t iteration_count,
253 std::uint32_t thread_num) {
254 auto start_time = steady_clock::now();
255 fn(this, iteration_count, thread_num);
256 auto end_time = steady_clock::now();
257 elapsed[thread_num] = end_time - start_time;
258 };
259 auto run = [&](std::uint64_t iteration_count) {
260 if (no_threads)
261 {
262 return run_base(iteration_count, 0);
263 }
264 auto thread_cache = ThreadCache::get(*this, thread_count);
265 for (std::uint32_t thread_num = 0; thread_num < thread_count;
266 thread_num++)
267 {
268 thread_cache->schedule_on(
269 thread_num, [&run_base, iteration_count, thread_num]() {
270 run_base(iteration_count, thread_num);
271 });
272 }
273 thread_cache->drain();
274 };
275 std::uint64_t iteration_count = 1;
276 if (config.iteration_count)
277 {
278 iteration_count = *config.iteration_count;
279 assert(iteration_count > 0);
280 run(iteration_count);
281 }
282 else
283 {
284 while (true)
285 {
286 run(iteration_count);
287 steady_clock::duration total_elapsed{};
288 for (auto i : elapsed)
289 {
290 total_elapsed += i;
291 }
292 std::chrono::duration<double> target_average_elapsed =
293 std::chrono::milliseconds(500);
294 if (config.target_duration)
295 {
296 target_average_elapsed =
297 std::chrono::duration<double>(*config.target_duration);
298 }
299 if (total_elapsed > thread_count * target_average_elapsed ||
300 iteration_count >= (1ULL << 63))
301 {
302 break;
303 }
304 iteration_count <<= 1;
305 }
306 }
307 steady_clock::duration total_elapsed{};
308 BenchmarkResultInner retval = {
309 .name = name,
310 .config_json = config,
311 .iteration_count = iteration_count,
312 .average = {},
313 .threads = {},
314 };
315 for (std::uint32_t thread_num = 0; thread_num < thread_count; thread_num++)
316 {
317 total_elapsed += elapsed[thread_num];
318 auto dur = std::chrono::duration<double>(elapsed[thread_num]);
319 retval.threads.push_back({
320 .total_dur = dur,
321 .iter_dur = dur / iteration_count,
322 });
323 }
324 auto total = std::chrono::duration<double>(total_elapsed);
325 retval.average = {
326 .total_dur = total / thread_count,
327 .iter_dur = total / thread_count / iteration_count,
328 };
329 return std::make_shared<BenchmarkResultImpl>(retval);
330 }
331
332 std::shared_ptr<void> BenchHarnessBase::get_thread_cache()
333 {
334 return ThreadCache::get();
335 }