| 1 | /* -*- mode: c++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */ |
| 2 | |
| 3 | /* |
| 4 | Copyright (C) 2006, 2008, 2010, 2018, 2023 Klaus Spanderen |
| 5 | |
| 6 | This file is part of QuantLib, a free-software/open-source library |
| 7 | for financial quantitative analysts and developers - http://quantlib.org/ |
| 8 | |
| 9 | QuantLib is free software: you can redistribute it and/or modify it |
| 10 | under the terms of the QuantLib license. You should have received a |
| 11 | copy of the license along with this program; if not, please email |
| 12 | <quantlib-dev@lists.sf.net>. The license is also available online at |
| 13 | <http://quantlib.org/license.shtml>. |
| 14 | |
| 15 | This program is distributed in the hope that it will be useful, but WITHOUT |
| 16 | ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS |
| 17 | FOR A PARTICULAR PURPOSE. See the license for more details. |
| 18 | */ |
| 19 | |
| 20 | |
| 21 | /* |
| 22 | QuantLib Benchmark Suite |
| 23 | |
| 24 | Measures the performance of a preselected set of numerically intensive |
| 25 | test cases. The overall QuantLib Benchmark Index is given by the average |
| 26 | performance in mflops. This benchmark supports multiprocessing, e.g. |
| 27 | |
| 28 | Single process benchmark: |
| 29 | ./quantlib-benchmark |
| 30 | |
| 31 | Benchmark with 16 processes: |
| 32 | ./quantlib-benchmark --mp=16 |
| 33 | |
| 34 | Benchmark with one process per core |
| 35 | ./quantlib-benchmark --mp |
| 36 | |
| 37 | The number of floating point operations of a given test case was measured |
| 38 | using the perfex library, http://user.it.uu.se/~mikpe/linux/perfctr |
| 39 | and PAPI, http://icl.cs.utk.edu/papi |
| 40 | |
| 41 | Example results: 1. i7 7820X@3.6GHz :24192.2 mflops |
| 42 | 2. i7 4702HQ@2.2GHz : 6524.9 mflops |
| 43 | 3. i7 870@2.93GHz : 4759.2 mflops |
| 44 | 4. Core2 Q9300@2.5Ghz : 2272.6 mflops |
| 45 | 5. Core2 Q6600@2.4Ghz : 1984.0 mflops |
| 46 | 6. i3 540@3.1Ghz : 1755.3 mflops |
| 47 | 7. Raspberry Pi4@1.5GHz : 1704.2 mflops |
| 48 | 8. Core2 Dual@2.0Ghz : 835.9 mflops |
| 49 | 9. Athlon 64 X2 4400+ : 824.2 mflops |
| 50 | 10. Cortex-A57@2.0GHz : 821.7 mflops |
| 51 | 11. Core2 Dual@2.0Ghz : 754.1 mflops |
| 52 | 12. Pentium4 Dual@2.8Ghz : 423.8 mflops |
| 53 | 13. Raspberry Pi3@1.2GHz : 309.2 mflops |
| 54 | 14. Pentium4@3.0Ghz : 266.3 mflops |
| 55 | 15. PentiumIII@1.1Ghz : 146.2 mflops |
| 56 | 16. Alpha 2xEV68@833Mhz : 184.6 mflops |
| 57 | 17. Wii PowerPC 750@729MHz : 46.1 mflops |
| 58 | 18. Raspberry Pi ARM@700Mhz: 28.3 mflops |
| 59 | 19. MIPS R5000@150MHz : 12.6 mflops |
| 60 | 20. RISC-V on FPGA@25Mhz : 2.4 mflops |
| 61 | 21. Strong ARM@206Mhz : 1.4 mflops |
| 62 | 22. SPARC v7@25MHz : 0.78mflops |
| 63 | |
| 64 | Remarks: OS: Linux, static libs |
| 65 | 1. g++-6.3.0 -O3 -ffast-math -march=core-avx2 |
| 66 | Remark: 16 processes |
| 67 | 2. g++-4.8.1 -O3 -ffast-math -march=core-avx2 |
| 68 | Remark: eight processes |
| 69 | 3. gcc-4.6.3, -O3 -ffast-math -mfpmath=sse,387 -march=corei7 |
| 70 | Remark: eight processes |
| 71 | 4. icc-11.0, -gcc-version=420 -fast -fp-model fast=2 -ipo-jobs2 |
| 72 | Remark: four processes |
| 73 | 5. icc-11.0, -gcc-version=420 -fast -fp-model fast=2 -ipo-jobs2 |
| 74 | Remark: four processes |
| 75 | 6. gcc-4.4.5, -O3 -ffast-math -mfpmath=sse,387 -msse4.2 -march=core2 |
| 76 | Remark: four processes |
| 77 | 7. gcc-8.3.0, -O3 -ffast-math -mcpu=cortx-a8 -mfpu=neon-fp-armv8 |
| 78 | Remark: four processes |
| 79 | 8. icc-11.0, -gcc-version=420 -fast -fp-model fast=2 -ipo-jobs2 |
| 80 | Remark: two processes |
| 81 | 9. icc-11.0, -gcc-version=420 -xSSSE3 -O3 -ipo -no-prec-div -static |
| 82 | -fp-model fast=2 -ipo-jobs2, Remark: two processes |
| 83 | 10. clang++-6.0.1 -O2, Remark: four processes |
| 84 | 11. gcc-4.2.1, -O3 -ffast-math -mfpmath=sse,387 -msse3 -funroll-all-loops |
| 85 | Remark: two processes |
| 86 | 12. gcc-4.0.1, -O3 -march=pentium4 -ffast-math |
| 87 | -mfpmath=sse,387 -msse2 -funroll-all-loops, Remark: two processes |
| 88 | 13. gcc-4.9.2 -O2, Remark: four processes |
| 89 | 14. gcc-4.0.1, -O3 -march=pentium4 -ffast-math |
| 90 | -mfpmath=sse,387 -msse2 -funroll-all-loops |
| 91 | 15. gcc-4.1.1, -O3 -march=pentium3 -ffast-math |
| 92 | -mfpmath=sse,387 -msse -funroll-all-loops |
| 93 | 16. gcc-3.3.5, -O3 -mcpu=e67 -funroll-all-loops, Remark: two processes |
| 94 | 17. gcc-4.9.2, -O2 -g on a Nintendo Wii |
| 95 | 18. gcc-4.6.3, -O3 |
| 96 | 19. gcc-4-7-4, -O2 on a SGI Indy |
| 97 | 20. gcc-9.2, -O2 on RISC-V softcore on an Artix7 100T FPGA |
| 98 | 21. gcc-3.4.3, -O2 -g on a Zaurus PDA |
| 99 | 22. gcc-7.5.0, -O2 on a Sun SPARCstation IPC, FPU: Weitek 3170 |
| 100 | |
| 101 | This benchmark is derived from quantlibtestsuite.cpp. Please see the |
| 102 | copyrights therein. |
| 103 | */ |
| 104 | |
| 105 | #include <ql/types.hpp> |
| 106 | #include <ql/version.hpp> |
| 107 | |
| 108 | #ifdef QL_ENABLE_PARALLEL_UNIT_TEST_RUNNER |
| 109 | #include <boost/process.hpp> |
| 110 | #include <boost/interprocess/ipc/message_queue.hpp> |
| 111 | #endif |
| 112 | |
| 113 | #define BOOST_TEST_NO_MAIN 1 |
| 114 | #include <boost/test/included/unit_test.hpp> |
| 115 | |
| 116 | #include <boost/algorithm/string.hpp> |
| 117 | #include <boost/numeric/conversion/cast.hpp> |
| 118 | |
#include <algorithm>
#include <chrono>
#include <exception>
#include <iomanip>
#include <iostream>
#include <string>
#include <thread>
#include <tuple>
#include <utility>
#include <vector>
| 126 | |
| 127 | /* PAPI code |
| 128 | #include <stdio.h> |
| 129 | #include <papi.h> |
| 130 | */ |
| 131 | |
| 132 | /* Use BOOST_MSVC instead of _MSC_VER since some other vendors (Metrowerks, |
| 133 | for example) also #define _MSC_VER |
| 134 | */ |
| 135 | #if !defined(BOOST_ALL_NO_LIB) && defined(BOOST_MSVC) |
| 136 | # include <ql/auto_link.hpp> |
| 137 | #endif |
| 138 | |
| 139 | #include "utilities.hpp" |
| 140 | #include "americanoption.hpp" |
| 141 | #include "asianoptions.hpp" |
| 142 | #include "barrieroption.hpp" |
| 143 | #include "basketoption.hpp" |
| 144 | #include "batesmodel.hpp" |
| 145 | #include "convertiblebonds.hpp" |
| 146 | #include "digitaloption.hpp" |
| 147 | #include "dividendoption.hpp" |
| 148 | #include "europeanoption.hpp" |
| 149 | #include "fdheston.hpp" |
| 150 | #include "hestonmodel.hpp" |
| 151 | #include "interpolations.hpp" |
| 152 | #include "jumpdiffusion.hpp" |
| 153 | #include "marketmodel_smm.hpp" |
| 154 | #include "marketmodel_cms.hpp" |
| 155 | #include "lowdiscrepancysequences.hpp" |
| 156 | #include "quantooption.hpp" |
| 157 | #include "riskstats.hpp" |
| 158 | #include "shortratemodels.hpp" |
| 159 | |
| 160 | |
| 161 | namespace { |
| 162 | |
/*! Descriptor of a single benchmark entry: a label, the routine that
    runs the test case and the amount of floating point work that the
    routine performs.
*/
class Benchmark {
  public:
    typedef void (*fct_ptr)();

    /*! \param name   label identifying the test case
        \param f      parameterless routine executing the test case
        \param mflop  total number of mega floating point operations
                      of the test case (not per second!)
    */
    Benchmark(std::string name, fct_ptr f, double mflop)
    : f_(f), name_(std::move(name)), mflop_(mflop) {}

    //! routine executing the test case
    fct_ptr getTestCase() const {
        return f_;
    }
    //! total number of mega floating point operations of the test case
    double getMflop() const {
        return mflop_;
    }
    //! label of the test case
    std::string getName() const {
        return name_;
    }
    //! exchanges every member with the corresponding member of \a other
    void swap(Benchmark& other) noexcept {
        std::swap(f_, other.f_);
        std::swap(name_, other.name_);
        std::swap(mflop_, other.mflop_);
    }
  private:
    fct_ptr f_;
    std::string name_;
    double mflop_; // total number of mega floating
                   // point operations (not per sec!)
};
| 189 | |
| 190 | std::vector<Benchmark> bm = { |
| 191 | Benchmark("AmericanOption::FdAmericanGreeks" , &AmericanOptionTest::testFdAmericanGreeks, 518.31), |
| 192 | Benchmark("AsianOption::MCArithmeticAveragePrice" , &AsianOptionTest::testMCDiscreteArithmeticAveragePrice, 5186.13), |
| 193 | Benchmark("BarrierOption::BabsiriValues" , &BarrierOptionTest::testBabsiriValues, 880.8), |
| 194 | Benchmark("BasketOption::EuroTwoValues" , &BasketOptionTest::testEuroTwoValues, 340.04), |
| 195 | Benchmark("BasketOption::TavellaValues" , &BasketOptionTest::testTavellaValues, 933.80), |
| 196 | Benchmark("BasketOption::OddSamples" , &BasketOptionTest::testOddSamples, 642.46), |
| 197 | Benchmark("BatesModel::DAXCalibration" , &BatesModelTest::testDAXCalibration, 1993.35), |
| 198 | Benchmark("ConvertibleBondTest::testBond" , &ConvertibleBondTest::testBond, 159.85), |
| 199 | Benchmark("DigitalOption::MCCashAtHit" , &DigitalOptionTest::testMCCashAtHit, 995.87), |
| 200 | Benchmark("DividendOption::FdEuropeanGreeks" , &DividendOptionTest::testFdEuropeanGreeks, 949.52), |
| 201 | Benchmark("DividendOption::FdAmericanGreeks" , &DividendOptionTest::testFdAmericanGreeks, 1113.74), |
| 202 | Benchmark("EuropeanOption::FdMcEngines" , &EuropeanOptionTest::testMcEngines, 1988.63), |
| 203 | Benchmark("EuropeanOption::ImpliedVol" , &EuropeanOptionTest::testImpliedVol, 131.51), |
| 204 | Benchmark("EuropeanOption::FdEngines" , &EuropeanOptionTest::testFdEngines, 148.43), |
| 205 | Benchmark("FdHestonTest::testFdmHestonAmerican" , &FdHestonTest::testFdmHestonAmerican, 234.21), |
| 206 | Benchmark("HestonModel::DAXCalibration" , &HestonModelTest::testDAXCalibration, 555.19), |
| 207 | Benchmark("InterpolationTest::testSabrInterpolation" , &InterpolationTest::testSabrInterpolation, 2266.06), |
| 208 | Benchmark("JumpDiffusion::Greeks" , &JumpDiffusionTest::testGreeks, 433.77), |
| 209 | Benchmark("MarketModelCmsTest::testCmSwapsSwaptions" , &MarketModelCmsTest::testMultiStepCmSwapsAndSwaptions, 11497.73), |
| 210 | Benchmark("MarketModelSmmTest::testMultiSmmSwaptions" , &MarketModelSmmTest::testMultiStepCoterminalSwapsAndSwaptions, 11244.95), |
| 211 | Benchmark("QuantoOption::ForwardGreeks" , &QuantoOptionTest::testForwardGreeks, 90.98), |
| 212 | Benchmark("RandomNumber::MersenneTwisterDescrepancy" , &LowDiscrepancyTest::testMersenneTwisterDiscrepancy, 951.98), |
| 213 | Benchmark("RiskStatistics::Results" , &RiskStatisticsTest::testResults, 300.28), |
| 214 | Benchmark("ShortRateModel::Swaps" , &ShortRateModelTest::testSwaps, 454.73) |
| 215 | }; |
| 216 | |
| 217 | /* PAPI code |
| 218 | float real_time, proc_time, mflops; |
| 219 | long_long lflop, flop=0; |
| 220 | */ |
| 221 | |
| 222 | class TimedBenchmark { |
| 223 | public: |
| 224 | typedef void (*fct_ptr)(); |
| 225 | explicit TimedBenchmark(fct_ptr f) : f_(f) {} |
| 226 | |
| 227 | void startMeasurement() const { |
| 228 | /* PAPI code |
| 229 | lflop = flop; |
| 230 | PAPI_flops(&real_time, &proc_time, &flop, &mflops); |
| 231 | */ |
| 232 | } |
| 233 | |
| 234 | void stopMeasurement() const { |
| 235 | /* PAPI code |
| 236 | PAPI_flops(&real_time, &proc_time, &flop, &mflops); |
| 237 | printf("Real_time: %f Proc_time: %f Total mflop: %f\n", |
| 238 | real_time, proc_time, (flop-lflop)/1e6); |
| 239 | */ |
| 240 | } |
| 241 | |
| 242 | double operator()() const { |
| 243 | startMeasurement(); |
| 244 | auto startTime = std::chrono::steady_clock::now(); |
| 245 | BOOST_CHECK(true); // to prevent no-assertion warning |
| 246 | f_(); |
| 247 | auto stopTime = std::chrono::steady_clock::now(); |
| 248 | stopMeasurement(); |
| 249 | return std::chrono::duration_cast<std::chrono::microseconds>( |
| 250 | d: stopTime - startTime).count() * 1e-6; |
| 251 | } |
| 252 | private: |
| 253 | fct_ptr f_; |
| 254 | }; |
| 255 | |
| 256 | void printResults( |
| 257 | unsigned nProc, |
| 258 | std::vector<std::pair<Benchmark, double> >& runTimes) { |
| 259 | |
| 260 | const std::string = "Benchmark Suite QuantLib " QL_VERSION; |
| 261 | |
| 262 | std::cout << std::endl << std::string(58,'-') << std::endl; |
| 263 | std::cout << header << std::endl; |
| 264 | std::cout << std::string(58,'-') << std::endl << std::endl; |
| 265 | |
| 266 | std::sort(first: runTimes.begin(), last: runTimes.end(), |
| 267 | comp: [](const auto& a, const auto& b) { |
| 268 | return a.first.getName() < b.first.getName(); |
| 269 | } |
| 270 | ); |
| 271 | |
| 272 | std::vector<std::tuple<Benchmark, int, double> > aggTimes; |
| 273 | for (const auto& iter: runTimes) { |
| 274 | if (aggTimes.empty() |
| 275 | || std::get<0>(t&: aggTimes.back()).getName() |
| 276 | != iter.first.getName()) { |
| 277 | aggTimes.emplace_back(args: iter.first, args: 1, args: iter.second); |
| 278 | } |
| 279 | else { |
| 280 | ++std::get<1>(t&: aggTimes.back()); |
| 281 | std::get<2>(t&: aggTimes.back()) += iter.second; |
| 282 | } |
| 283 | } |
| 284 | |
| 285 | double sum=0; |
| 286 | for (const auto& iterT: aggTimes) { |
| 287 | const double mflopsPerSec |
| 288 | = std::get<0>(t: iterT).getMflop() / std::get<2>(t: iterT) |
| 289 | * nProc * std::get<1>(t: iterT); |
| 290 | |
| 291 | std::cout << std::get<0>(t: iterT).getName() |
| 292 | << std::string(42-std::get<0>(t: iterT).getName().length(),' ') |
| 293 | << ":" << std::fixed << std::setw(8) << std::setprecision(1) |
| 294 | << mflopsPerSec |
| 295 | << " mflops" << std::endl; |
| 296 | |
| 297 | sum+=mflopsPerSec; |
| 298 | } |
| 299 | std::cout << std::string(58,'-') << std::endl |
| 300 | << "QuantLib Benchmark Index :" |
| 301 | << std::fixed << std::setw(8) << std::setprecision(1) |
| 302 | << sum/aggTimes.size() |
| 303 | << " mflops" << std::endl; |
| 304 | } |
| 305 | #ifdef QL_ENABLE_PARALLEL_UNIT_TEST_RUNNER |
| 306 | int worker(const char* exe, const std::vector<std::string>& args) { |
| 307 | return boost::process::system(exe, boost::process::args=args); |
| 308 | } |
| 309 | #endif |
| 310 | } |
| 311 | |
| 312 | int main(int argc, char* argv[] ) { |
| 313 | const std::string clientModeStr = "--client_mode=true" ; |
| 314 | bool clientMode = false; |
| 315 | |
| 316 | unsigned nProc = 1; |
| 317 | std::vector<std::pair<Benchmark, double> > runTimes; |
| 318 | |
| 319 | for (int i=1; i<argc; ++i) { |
| 320 | std::string arg = argv[i]; |
| 321 | std::vector<std::string> tok; |
| 322 | boost::split(Result&: tok, Input&: arg, Pred: boost::is_any_of(Set: "=" )); |
| 323 | |
| 324 | if (tok[0] == "--mp" ) { |
| 325 | nProc = (tok.size() == 2) |
| 326 | ? boost::numeric_cast<unsigned>(arg: std::stoul(str: tok[1])) |
| 327 | : std::thread::hardware_concurrency(); |
| 328 | } |
| 329 | else if (arg == "--help" || arg == "-?" ) { |
| 330 | std::cout |
| 331 | << "'quantlib-benchmark' is QuantLib " QL_VERSION " CPU performance benchmark" |
| 332 | << std::endl << std::endl |
| 333 | << "Usage: ./quantlib-benchmark [OPTION]..." |
| 334 | << std::endl << std::endl |
| 335 | << "with the following options:" |
| 336 | << std::endl |
| 337 | #ifdef QL_ENABLE_PARALLEL_UNIT_TEST_RUNNER |
| 338 | << "--mp[=PROCESSES] \t parallel execution with PROCESSES processes" |
| 339 | << std::endl |
| 340 | #endif |
| 341 | << "-?, --help \t\t display this help and exit" |
| 342 | << std::endl; |
| 343 | return 0; |
| 344 | } |
| 345 | else if (arg == clientModeStr) { |
| 346 | clientMode = true; |
| 347 | } |
| 348 | else { |
| 349 | std::cout << "quantlib-benchmark: unrecognized option '" << arg << "'." |
| 350 | << std::endl |
| 351 | << "Try 'quantlib-benchmark --help' for more information." |
| 352 | << std::endl; |
| 353 | return 0; |
| 354 | } |
| 355 | } |
| 356 | |
| 357 | if (nProc == 1 && !clientMode) { |
| 358 | std::for_each(first: bm.begin(), last: bm.end(), |
| 359 | f: [&runTimes](const Benchmark& iter) { |
| 360 | runTimes.emplace_back( |
| 361 | args: iter, args: TimedBenchmark(iter.getTestCase())()); |
| 362 | }); |
| 363 | printResults(nProc, runTimes); |
| 364 | } |
| 365 | else { |
| 366 | #ifdef QL_ENABLE_PARALLEL_UNIT_TEST_RUNNER |
| 367 | using namespace boost::interprocess; |
| 368 | |
| 369 | typedef std::pair<unsigned, double> result_type; |
| 370 | |
| 371 | message_queue::size_type recvd_size; |
| 372 | unsigned priority, terminateId=-1; |
| 373 | |
| 374 | const char* const testUnitIdQueueName = "test_unit_queue" ; |
| 375 | const char* const testResultQueueName = "test_result_queue" ; |
| 376 | |
| 377 | if (!clientMode) { |
| 378 | message_queue::remove(testUnitIdQueueName); |
| 379 | message_queue::remove(testResultQueueName); |
| 380 | struct queue_remove { |
| 381 | explicit queue_remove(const char* name) : name_(name) { } |
| 382 | ~queue_remove() { message_queue::remove(name_); } |
| 383 | |
| 384 | private: |
| 385 | const char* const name_; |
| 386 | } remover1(testUnitIdQueueName),remover2(testResultQueueName); |
| 387 | |
| 388 | message_queue mq( |
| 389 | open_or_create, testUnitIdQueueName, |
| 390 | nProc*bm.size(), sizeof(unsigned) |
| 391 | ); |
| 392 | message_queue rq( |
| 393 | open_or_create, testResultQueueName, 16, sizeof(result_type)); |
| 394 | |
| 395 | const std::vector<std::string> workerArgs(1, clientModeStr); |
| 396 | std::vector<std::thread> threadGroup; |
| 397 | for (unsigned i = 0; i < nProc; ++i) { |
| 398 | threadGroup.emplace_back([&]() { worker(argv[0], workerArgs); }); |
| 399 | } |
| 400 | |
| 401 | for (unsigned i=0; i < nProc; ++i) |
| 402 | for (unsigned j=0; j < bm.size(); ++j) |
| 403 | mq.send(&j, sizeof(unsigned), 0); |
| 404 | |
| 405 | result_type r; |
| 406 | for (unsigned i = 0; i < nProc*bm.size(); ++i) { |
| 407 | rq.receive(&r, sizeof(result_type), recvd_size, priority); |
| 408 | runTimes.push_back(std::make_pair(bm[r.first], r.second)); |
| 409 | } |
| 410 | for (unsigned i=0; i < nProc; ++i) { |
| 411 | mq.send(&terminateId, sizeof(unsigned), 0); |
| 412 | } |
| 413 | for (auto& thread: threadGroup) { |
| 414 | thread.join(); |
| 415 | } |
| 416 | printResults(nProc, runTimes); |
| 417 | } |
| 418 | else { |
| 419 | message_queue mq(open_only, testUnitIdQueueName); |
| 420 | message_queue rq(open_only, testResultQueueName); |
| 421 | |
| 422 | unsigned id=0; |
| 423 | mq.receive(&id, sizeof(unsigned), recvd_size, priority); |
| 424 | |
| 425 | while (id != terminateId) { |
| 426 | result_type a(id, TimedBenchmark(bm[id].getTestCase())()); |
| 427 | rq.send(&a, sizeof(result_type), 0); |
| 428 | |
| 429 | mq.receive(&id, sizeof(unsigned), recvd_size, priority); |
| 430 | } |
| 431 | } |
| 432 | #else |
| 433 | std::cout << "Please compile QuantLib with option 'QL_ENABLE_PARALLEL_UNIT_TEST_RUNNER'" |
| 434 | " to run the benchmarks in parallel" << std::endl; |
| 435 | #endif |
| 436 | } |
| 437 | |
| 438 | return 0; |
| 439 | } |
| 440 | |