benchmarking c++11 atomics works
[benchmarks.git] / src / harness.cpp
#include "harness.h"
#include <atomic>
#include <chrono>
#include <cmath>
#include <condition_variable>
#include <cstdint>
#include <functional>
#include <iostream>
#include <memory>
#include <mutex>
#include <ostream>
#include <shared_mutex>
#include <thread>
#include <variant>
#include <vector>
15 #ifdef NDEBUG // assert needs to work even in release mode
16 #undef NDEBUG
17 #endif
18 #include <cassert>
19
20 using std::chrono::steady_clock;
21
22 class BenchHarnessBase::ThreadCache final
23 {
24 private:
25 std::vector<std::thread> threads;
26 std::shared_mutex state_lock;
27 std::unique_lock<std::shared_mutex> locked_state;
28 std::condition_variable_any cond_var;
29 struct UnlockGuard final
30 {
31 std::shared_mutex &state_lock;
32 UnlockGuard(std::shared_mutex &state_lock) : state_lock(state_lock)
33 {
34 state_lock.unlock();
35 }
36 ~UnlockGuard()
37 {
38 state_lock.lock();
39 }
40 };
41 struct Task final
42 {
43 std::function<void()> fn;
44 };
45 struct ThreadState final
46 {
47 std::unique_ptr<Task> task;
48 std::mutex mutex;
49 };
50 std::vector<std::shared_ptr<ThreadState>> states;
51 bool shutting_down = false;
52 std::atomic_size_t tasks_left_to_drain = 0;
53 void add_thread()
54 {
55 auto thread_state = std::make_shared<ThreadState>();
56 states.push_back(thread_state);
57 threads.push_back(std::thread([this, thread_state]() {
58 auto shared_lock = std::shared_lock(state_lock);
59 while (true)
60 {
61 auto lock = std::unique_lock(thread_state->mutex);
62 auto task = std::move(thread_state->task);
63 lock.unlock();
64 if (task)
65 {
66 task->fn();
67 task.reset();
68 tasks_left_to_drain--;
69 cond_var.notify_all();
70 continue;
71 }
72
73 if (this->shutting_down)
74 return;
75
76 cond_var.wait(shared_lock);
77 }
78 }));
79 }
80
81 public:
82 ThreadCache()
83 {
84 locked_state = std::unique_lock(state_lock);
85 }
86 ThreadCache(const ThreadCache &) = delete;
87 ThreadCache &operator=(const ThreadCache &) = delete;
88 ~ThreadCache()
89 {
90 shutting_down = true;
91 cond_var.notify_all();
92 locked_state.unlock();
93 for (auto &thread : threads)
94 {
95 thread.join();
96 }
97 }
98 static std::shared_ptr<ThreadCache> get()
99 {
100 // weak so it's destroyed before returning from main()
101 static std::weak_ptr<ThreadCache> static_thread_cache;
102
103 std::shared_ptr<ThreadCache> thread_cache = static_thread_cache.lock();
104 if (!thread_cache)
105 {
106 thread_cache = std::make_shared<ThreadCache>();
107 static_thread_cache = thread_cache;
108 }
109 return thread_cache;
110 }
111 static std::shared_ptr<ThreadCache> get(BenchHarnessBase &bhb,
112 std::uint32_t thread_count)
113 {
114 std::shared_ptr<ThreadCache> thread_cache = get();
115 bhb.thread_cache = thread_cache;
116 while (thread_cache->threads.size() < thread_count)
117 thread_cache->add_thread();
118 return thread_cache;
119 }
120 void drain()
121 {
122 while (tasks_left_to_drain > 0)
123 {
124 // unlocks state_lock, allowing all threads to proceed
125 // simultaneously
126 cond_var.wait(locked_state);
127 }
128 }
129 template <typename Fn> void schedule_on(std::uint32_t thread_num, Fn fn)
130 {
131 auto lock = std::unique_lock(states[thread_num]->mutex);
132 assert(!states[thread_num]->task);
133 tasks_left_to_drain++;
134 states[thread_num]->task = std::make_unique<Task>(Task{.fn = fn});
135 cond_var.notify_all();
136 }
137 };
138
// Stream-printable wrapper that renders a duration in a human-friendly
// unit (sec, ms, us, ns, or ps).
struct WriteDuration final
{
    std::chrono::duration<double> dur;
    /// Picks the largest unit whose threshold the magnitude exceeds and
    /// prints the value scaled into that unit.  Non-finite values and
    /// values below every threshold fall back to raw seconds.
    friend std::ostream &operator<<(std::ostream &os,
                                    const WriteDuration &wdur)
    {
        const double seconds = wdur.dur.count();
        const double magnitude = std::fabs(seconds);
        struct Unit
        {
            double threshold;
            double scale;
            const char *suffix;
        };
        // Ordered largest-unit first; the first match wins.
        static constexpr Unit units[] = {
            {0.1, 1.0, " sec"},   {0.1e-3, 1e3, " ms"},
            {0.1e-6, 1e6, " us"}, {0.1e-9, 1e9, " ns"},
            {0.1e-12, 1e12, " ps"},
        };
        if (std::isfinite(seconds))
        {
            for (const Unit &unit : units)
            {
                if (magnitude > unit.threshold)
                {
                    os << seconds * unit.scale << unit.suffix;
                    return os;
                }
            }
        }
        // Non-finite, or smaller than 0.1 ps: print unscaled seconds.
        os << seconds << " sec";
        return os;
    }
};
173
// Runs `fn(this, iteration_count, thread_num)` once per thread, timing
// each thread's call, then prints per-thread timings (when multithreaded)
// and the average time per iteration.  If the config does not fix an
// iteration count, it is auto-calibrated by doubling until the summed
// elapsed time across threads exceeds thread_count * 500 ms.
void BenchHarnessBase::base_run(
    Config config,
    void (*fn)(BenchHarnessBase *bench_harness_base,
               std::uint64_t iteration_count, std::uint32_t thread_num))
{

    std::uint32_t thread_count =
        config.thread_count.value_or(std::thread::hardware_concurrency());
    // Run inline (no thread pool) when concurrency is unknown
    // (hardware_concurrency() == 0) or when it reports a single core and
    // the caller did not explicitly ask for a thread count.
    bool no_threads =
        thread_count == 0 || (thread_count == 1 && !config.thread_count);
    if (no_threads)
    {
        thread_count = 1;
    }

    // Per-thread wall-clock time for the most recent run().
    std::vector<steady_clock::duration> elapsed(thread_count);
    // Times one thread's slice of the work and records it.
    auto run_base = [&](std::uint64_t iteration_count,
                        std::uint32_t thread_num) {
        auto start_time = steady_clock::now();
        fn(this, iteration_count, thread_num);
        auto end_time = steady_clock::now();
        elapsed[thread_num] = end_time - start_time;
    };
    // Executes one full run: inline, or fanned out over the thread cache.
    // The by-reference captures stay valid because drain() blocks until
    // every scheduled task has finished.
    auto run = [&](std::uint64_t iteration_count) {
        if (no_threads)
        {
            return run_base(iteration_count, 0);
        }
        auto thread_cache = ThreadCache::get(*this, thread_count);
        for (std::uint32_t thread_num = 0; thread_num < thread_count;
             thread_num++)
        {
            thread_cache->schedule_on(
                thread_num, [&run_base, iteration_count, thread_num]() {
                    run_base(iteration_count, thread_num);
                });
        }
        thread_cache->drain();
    };
    std::uint64_t iteration_count = 1;
    if (config.iteration_count)
    {
        // Caller fixed the iteration count: single run.
        iteration_count = *config.iteration_count;
        run(iteration_count);
    }
    else
    {
        // Auto-calibrate: double the iteration count until the total
        // elapsed time across all threads is long enough to measure.
        while (true)
        {
            run(iteration_count);
            steady_clock::duration total_elapsed{};
            for (auto i : elapsed)
            {
                total_elapsed += i;
            }
            auto target_average_elapsed = std::chrono::milliseconds(500);
            if (total_elapsed > thread_count * target_average_elapsed)
            {
                break;
            }
            iteration_count <<= 1;
        }
    }
    // Report: per-thread lines (only when multithreaded), then the
    // average across threads.  Only the final run's timings are used.
    steady_clock::duration total_elapsed{};
    for (std::uint32_t thread_num = 0; thread_num < thread_count; thread_num++)
    {
        total_elapsed += elapsed[thread_num];
        if (thread_count > 1)
        {
            auto dur = std::chrono::duration<double>(elapsed[thread_num]);
            std::cout << "Thread #" << thread_num << " took "
                      << WriteDuration{dur} << " for " << iteration_count
                      << " iterations -- "
                      << WriteDuration{dur / iteration_count} << "/iter.\n";
        }
    }
    auto total = std::chrono::duration<double>(total_elapsed);
    std::cout << "Average elapsed time: "
              << WriteDuration{total / thread_count} << " for "
              << iteration_count << " iterations -- "
              << WriteDuration{total / thread_count / iteration_count}
              << "/iter.\n"
              << std::endl;
}
258
259 std::shared_ptr<void> BenchHarnessBase::get_thread_cache()
260 {
261 return ThreadCache::get();
262 }