6 #include <condition_variable>
12 #include <shared_mutex>
17 #ifdef NDEBUG // assert needs to work even in release mode
22 using std::chrono::steady_clock
;
// Cache of worker threads for BenchHarnessBase: holds the threads, a
// shared_mutex + condition variable used to coordinate them, and one task
// slot per thread.
// NOTE(review): this listing carries the original file line numbers and skips
// some lines (braces, some member-function headers and statements). The
// comments below describe only the lines that are visible.
24 class BenchHarnessBase::ThreadCache final
// Worker threads; a new std::thread is appended next to each ThreadState
// created further down.
27 std::vector
<std::thread
> threads
;
// Taken in shared mode by each worker lambda and in exclusive mode through
// locked_state.
28 std::shared_mutex state_lock
;
// Exclusive lock on state_lock kept as a member; it is assigned further down
// and is the lock handed to cond_var.wait() in the tasks_left_to_drain loop.
29 std::unique_lock
<std::shared_mutex
> locked_state
;
// condition_variable_any so it can wait on both std::shared_lock (workers)
// and std::unique_lock<std::shared_mutex> (locked_state).
30 std::condition_variable_any cond_var
;
// Helper that stores a reference to a std::shared_mutex.
// NOTE(review): the name suggests a scope guard that unlocks/relocks, but the
// constructor body and destructor are not visible here -- confirm.
31 struct UnlockGuard final
33 std::shared_mutex
&state_lock
;
34 UnlockGuard(std::shared_mutex
&state_lock
) : state_lock(state_lock
)
// Callable to run for a scheduled task; filled in via Task{.fn = fn} in
// schedule_on. NOTE(review): the enclosing struct header (presumably Task) is
// not visible in this listing.
45 std::function
<void()> fn
;
// Per-thread state shared between the cache and one worker.
// The code below also locks a `mutex` member whose declaration is not visible.
47 struct ThreadState final
// Pending task for this thread; empty (null) when nothing is scheduled.
49 std::unique_ptr
<Task
> task
;
// One ThreadState per worker, indexed by thread number in schedule_on.
52 std::vector
<std::shared_ptr
<ThreadState
>> states
;
// Read by the worker lambda; starts out false.
53 bool shutting_down
= false;
// Outstanding-task counter: incremented in schedule_on, decremented by the
// worker lambda.
54 std::atomic_size_t tasks_left_to_drain
= 0;
// Create the state for a new worker and register it. The thread lambda
// captures the shared_ptr by value, so the state stays alive for the thread.
// NOTE(review): presumably the body of add_thread(), which is called below;
// its header is not visible.
57 auto thread_state
= std::make_shared
<ThreadState
>();
58 states
.push_back(thread_state
);
59 threads
.push_back(std::thread([this, thread_state
]() {
// Worker side: take state_lock in shared mode.
60 auto shared_lock
= std::shared_lock(state_lock
);
// Under this thread's own mutex, move the pending task out of the slot
// (leaving thread_state->task null).
63 auto lock
= std::unique_lock(thread_state
->mutex
);
64 auto task
= std::move(thread_state
->task
);
// Decrement the outstanding-task counter and wake everything waiting on
// cond_var (which includes the tasks_left_to_drain loop below).
70 tasks_left_to_drain
--;
71 cond_var
.notify_all();
// NOTE(review): presumably leaves the worker loop when shutting down; the
// branch body is not visible.
75 if (this->shutting_down
)
// Sleep until notified; wait() releases the shared lock while blocked.
78 cond_var
.wait(shared_lock
);
// Acquire state_lock exclusively and keep the lock in the locked_state member.
86 locked_state
= std::unique_lock(state_lock
);
// Not copyable.
88 ThreadCache(const ThreadCache
&) = delete;
89 ThreadCache
&operator=(const ThreadCache
&) = delete;
// Wake all waiters, release the exclusive lock, then iterate over every
// worker thread. NOTE(review): presumably the destructor joining the threads;
// the header and loop body are not visible.
93 cond_var
.notify_all();
94 locked_state
.unlock();
95 for (auto &thread
: threads
)
// Returns the shared ThreadCache instance: locks the function-local weak_ptr,
// and (on lines partly not visible here) creates a new cache and stores it in
// the weak_ptr.
100 static std::shared_ptr
<ThreadCache
> get()
102 // weak so it's destroyed before returning from main()
103 static std::weak_ptr
<ThreadCache
> static_thread_cache
;
105 std::shared_ptr
<ThreadCache
> thread_cache
= static_thread_cache
.lock();
// NOTE(review): the condition guarding this creation is not visible;
// presumably it runs only when lock() returned null.
108 thread_cache
= std::make_shared
<ThreadCache
>();
109 static_thread_cache
= thread_cache
;
// Overload: fetches the shared cache, stores it in bhb.thread_cache, and
// adds threads until the cache has at least thread_count of them.
113 static std::shared_ptr
<ThreadCache
> get(BenchHarnessBase
&bhb
,
114 std::uint32_t thread_count
)
116 std::shared_ptr
<ThreadCache
> thread_cache
= get();
117 bhb
.thread_cache
= thread_cache
;
118 while (thread_cache
->threads
.size() < thread_count
)
119 thread_cache
->add_thread();
// Block until the outstanding-task counter reaches zero.
// NOTE(review): presumably the body of drain(); its header is not visible.
124 while (tasks_left_to_drain
> 0)
126 // unlocks state_lock, allowing all threads to proceed
128 cond_var
.wait(locked_state
);
// Schedules fn on worker number thread_num. Under that thread's mutex it
// asserts the slot is empty, bumps the outstanding-task counter, stores the
// task, and then notifies cond_var. states[thread_num] is indexed with
// operator[], so thread_num is not range-checked here.
131 template <typename Fn
> void schedule_on(std::uint32_t thread_num
, Fn fn
)
133 auto lock
= std::unique_lock(states
[thread_num
]->mutex
);
134 assert(!states
[thread_num
]->task
);
135 tasks_left_to_drain
++;
136 states
[thread_num
]->task
= std::make_unique
<Task
>(Task
{.fn
= fn
});
137 cond_var
.notify_all();
// Wrapper that lets a std::chrono::duration<double> (seconds) be streamed
// with a unit chosen by magnitude.
// NOTE(review): this listing skips some source lines; the body of the first
// branch and whatever follows the last `else if` are not visible.
141 struct WriteDuration final
// Duration to print; duration<double> with the default period counts seconds.
143 std::chrono::duration
<double> dur
;
// Writes wdur to os, scaling the value to the unit selected below.
144 friend std::ostream
&operator<<(std::ostream
&os
,
145 const WriteDuration
&wdur
)
// Work on the raw second count from here on.
147 double dur
= wdur
.dur
.count();
// Non-finite values (NaN/inf) and magnitudes above 0.1 s take this branch.
148 if (!std::isfinite(dur
) || std::fabs(dur
) > 0.1)
// Above 0.1 ms: print in milliseconds.
152 else if (std::fabs(dur
) > 0.1e-3)
154 os
<< dur
* 1e3
<< " ms";
// Above 0.1 us: print in microseconds.
156 else if (std::fabs(dur
) > 0.1e-6)
158 os
<< dur
* 1e6
<< " us";
// Above 0.1 ns: print in nanoseconds.
160 else if (std::fabs(dur
) > 0.1e-9)
162 os
<< dur
* 1e9
<< " ns";
// Above 0.1 ps: print in picoseconds.
164 else if (std::fabs(dur
) > 0.1e-12)
166 os
<< dur
* 1e12
<< " ps";
// Data carried by a benchmark result.
// NOTE(review): this listing skips some source lines. total_dur, iter_dur and
// the JsonValue conversion below presumably belong to a nested `Result` type
// (the element type of `threads`) whose header is not visible; other code in
// this file also reads `name` and `average` members not shown here.
176 struct BenchmarkResultInner final
// Durations stored as seconds in double precision.
180 std::chrono::duration
<double> total_dur
;
181 std::chrono::duration
<double> iter_dur
;
// Converts to a JSON object with keys "total_dur" and "iter_dur", each
// holding the raw second count of the matching member.
182 operator JsonValue() const
184 return JsonValue::Object
{
185 {"total_dur", total_dur
.count()},
186 {"iter_dur", iter_dur
.count()},
// JSON form of the configuration.
191 JsonValue config_json
;
// Iteration count associated with this result.
192 std::uint64_t iteration_count
;
// One Result per thread.
194 std::vector
<Result
> threads
;
// Concrete BenchmarkResult that wraps a BenchmarkResultInner and implements
// printing and JSON conversion on top of it.
// NOTE(review): this listing skips some source lines, so parts of the output
// chain in print() are not visible.
197 struct BenchmarkResultImpl final
: public BenchmarkResult
199 BenchmarkResultInner inner
;
// Takes the data by value and moves it into the member.
200 BenchmarkResultImpl(BenchmarkResultInner inner
) : inner(std::move(inner
))
// Writes a report to std::cout: the name, then one line per thread when
// there is more than one thread, then the average.
203 virtual void print() const override
205 std::cout
<< inner
.name
<< ":\n";
// Per-thread lines are only printed for multi-threaded results.
206 if (inner
.threads
.size() > 1)
208 for (std::size_t i
= 0; i
< inner
.threads
.size(); i
++)
210 std::cout
<< "Thread #" << i
<< " took "
211 << WriteDuration
{inner
.threads
[i
].total_dur
}
212 << " for " << inner
.iteration_count
214 << WriteDuration
{inner
.threads
[i
].iter_dur
}
// Summary line built from inner.average.
218 std::cout
<< "Average elapsed time: "
219 << WriteDuration
{inner
.average
.total_dur
} << " for "
220 << inner
.iteration_count
<< " iterations -- "
221 << WriteDuration
{inner
.average
.iter_dur
} << "/iter.\n"
// Converts to a JSON object with keys "name", "config", "iteration_count",
// "average" and "threads", taken from the matching members of inner.
224 virtual operator JsonValue() const override
226 return JsonValue::Object
{
227 {"name", inner
.name
},
228 {"config", inner
.config_json
},
229 {"iteration_count", inner
.iteration_count
},
230 {"average", inner
.average
},
231 {"threads", inner
.threads
},
// Runs the benchmark function fn, timing each thread with steady_clock, and
// returns the collected timings wrapped in a BenchmarkResultImpl.
//   config - supplies optional thread_count and iteration_count values
//   name   - benchmark name
//   fn     - called as fn(this, iteration_count, thread_num)
// NOTE(review): this listing skips some source lines (loop headers, branch
// bodies, some initializers); comments describe only the visible lines.
236 std::shared_ptr
<BenchmarkResult
> BenchHarnessBase::base_run(
237 const Config
&config
, const std::string
&name
,
238 void (*fn
)(BenchHarnessBase
*bench_harness_base
,
239 std::uint64_t iteration_count
, std::uint32_t thread_num
))
// Use the configured thread count, falling back to hardware_concurrency().
242 std::uint32_t thread_count
=
243 config
.thread_count
.value_or(std::thread::hardware_concurrency());
// True when the count is 0, or is 1 and did not come from the config.
// NOTE(review): the variable receiving this value is not visible here.
245 thread_count
== 0 || (thread_count
== 1 && !config
.thread_count
);
// Elapsed time per thread, written by run_base.
251 std::vector
<steady_clock::duration
> elapsed(thread_count
);
// Times one call of fn for the given thread and records it in elapsed.
252 auto run_base
= [&](std::uint64_t iteration_count
,
253 std::uint32_t thread_num
) {
254 auto start_time
= steady_clock::now();
255 fn(this, iteration_count
, thread_num
);
256 auto end_time
= steady_clock::now();
257 elapsed
[thread_num
] = end_time
- start_time
;
// Runs one measurement round with the given iteration count.
259 auto run
= [&](std::uint64_t iteration_count
) {
// Direct path: run on the calling thread as thread 0.
// NOTE(review): the condition selecting this path is not visible.
262 return run_base(iteration_count
, 0);
// Threaded path: get a cache with enough workers, schedule one task per
// thread, then call drain(). The tasks capture run_base by reference, so they
// must finish before this scope is left.
264 auto thread_cache
= ThreadCache::get(*this, thread_count
);
265 for (std::uint32_t thread_num
= 0; thread_num
< thread_count
;
268 thread_cache
->schedule_on(
269 thread_num
, [&run_base
, iteration_count
, thread_num
]() {
270 run_base(iteration_count
, thread_num
);
273 thread_cache
->drain();
275 std::uint64_t iteration_count
= 1;
// A configured iteration count is used as-is (must be non-zero) for one run.
276 if (config
.iteration_count
)
278 iteration_count
= *config
.iteration_count
;
279 assert(iteration_count
> 0);
280 run(iteration_count
);
// Otherwise run, then look at the per-thread times to decide whether to
// grow the iteration count. NOTE(review): the enclosing loop header and the
// body of the range-for below are not visible.
286 run(iteration_count
);
287 steady_clock::duration total_elapsed
{};
288 for (auto i
: elapsed
)
// Target: an average of 500 ms per thread. The second test keeps the
// doubling below from shifting the top bit out of a 64-bit count.
292 auto target_average_elapsed
= std::chrono::milliseconds(500);
293 if (total_elapsed
> thread_count
* target_average_elapsed
||
294 iteration_count
>= (1ULL << 63))
// Double the iteration count for the next round.
298 iteration_count
<<= 1;
// Build the result from the last round's timings.
301 steady_clock::duration total_elapsed
{};
302 BenchmarkResultInner retval
= {
304 .config_json
= config
,
305 .iteration_count
= iteration_count
,
// Per-thread entries; also accumulate the sum for the average below.
309 for (std::uint32_t thread_num
= 0; thread_num
< thread_count
; thread_num
++)
311 total_elapsed
+= elapsed
[thread_num
];
312 auto dur
= std::chrono::duration
<double>(elapsed
[thread_num
]);
313 retval
.threads
.push_back({
315 .iter_dur
= dur
/ iteration_count
,
// Average over threads: total / thread_count, and that divided by the
// iteration count for the per-iteration figure.
318 auto total
= std::chrono::duration
<double>(total_elapsed
);
320 .total_dur
= total
/ thread_count
,
321 .iter_dur
= total
/ thread_count
/ iteration_count
,
323 return std::make_shared
<BenchmarkResultImpl
>(retval
);
// Returns the result of ThreadCache::get() converted to std::shared_ptr<void>,
// so the return type does not name ThreadCache.
326 std::shared_ptr
<void> BenchHarnessBase::get_thread_cache()
328 return ThreadCache::get();