<!--
                                                                                   .                                                                                                                    
                             @%#####@#                                        @@########@                                                                                                               
                          &###########%@                                    @%%@###########(                                                                                                            
                       @##############%%%@,                               @%%%%@#############,                                                                                                          
                    @########&########%%%%%%@                            %%%%%%%##############@                                                                                                         
                  @##################@%%%%%%%%@                        %%%%%%%%%@###############/                                                                                                       
                @###################@%%%%%%%%%%%@                     %%%%%%%%%%%################@                                                                                                      
              @####################@%%%%%%%%%%%%%%*                 @%%%%%%%%%%%%@################@                                                                                                     
            (%###################@ &%%%%%%%%%%%%%%%@               @%%%%%%%%%%%%%%@################/                                                                                                    
           @###################@# .  @%%%%%%%%%%%%%%%@            @%%%%%%%%%%%%%%%  ################                                                                                                    
         @###################&@        @%%%%%%%%%%%%%%%/         @%%%%%%%%%%%%%&     @##############@                                @*                                                                 
        @##################@@             @%%%%%%%%%%%%%@       @%%%%%%%%%%%%@        @#############@                              @//////@,                                                            
      @##################@                   @%%%%%%%%%%%@     @%%%%%%%%%@              &###########@                           *@    @     #////#                                  @@@@                
    *%###############%@                           @@%%%%@@/  %@%%%%%%@*                  @##########@                         ///////////*     ////                              @////////*             
   @#############&@                             @@.  *     ....,**.   @,                   @########@                      .(////////////((..@@*                                @/////////@             
  @##########@@                             @.    @&    ,..   ,   ......   (                ########@                 @////////////////((@                                     &      ////@             
.%#######@&                              @      .      &             .. ..    @               @#####@                @////////////////(@                                      *       *///@             
#####@#                               /%      .                      #.   .     @.             @####@                @//////////.,///(                                       @ *      @/#(@             
##@                                 @       /   @               #     .          @               @##@                 @            *(/                                      @         /((((             
                                   /           @                ,,                  @                                  @            @                                                 /(@((@            
                                 @           *,@.           * @@(@,@    @           .@                                  @       *//(@                                      @          (@(((@            
                                %           @,,,           .  .,,  ,@   %         .   @                                  */////////(@                                     @///////   ((((..%            
                                      .   .@,,,,, .    .   .  %,     @..,@.      ..  &                                   @/////////(@                                    @/////////(((.....,(           
                               @     .   .(,,,   ,.@   .  .. %,       #./,.........@ . @                                 .          @                                   @      ////....../(((@          
                                    ......,,,  ./  @@  .. .& ,  .///.  .%,#......... .@                                   @         %                                  @         /@...,(((#             
                              @ .  ......@(@,//////*,@....(,( ,,&/////@@@@@...........@ @                                 @    /////(.                               #           &/((((@                
                              @ * .......@@, @/////@ . @../     #//////,@@@..../.../@.@.@                                  /////////(@                              @//////////                         
                              @% .......@,@  @  ...@     .,     @  .. & ,@&........@*.@./                                  @/////////(*                           */////////////,                       
                              @  .....@.#,     &#@               #/.#@    .....   @@*....                                  @/        .#                          @      ///////@                        
                              @  @..../...            &                  @..@....****@...@                                            .@                                     ///                        
                             .    @    / @                          @@  , **/,..@**/@(/@@@&                                     .//////(                      @               @                         
                                  @ @   .@#                           @@   @  @    #/@       @                              @//////////(@                    @/////////      .                          
                                     %,     @@                    ,     ,  @    @ @/@         .,                            @///////////(,                 @///////////////  *                          
                                  @             ,#       .     ,             @   *#/          .@                            %            &                @        /////////@                           
                                  @   ,*         .,@         @                  @#,          ..@                                         .@             .              ////@                            
                                 @       @*        ..@&,&@@,,/              @..@@,          ...@                                      .//((            @                  @                             
                                 #       @         ..##%,,,,,,#         @##@@..@@@,         ..*&////&@@ @#///////#//////////@@///////////((@          /,//////           @,                             
                                       (          ,#####@,(,,,           @(#@...@          .,.,@///////////%@&///////////////*///@@@@/////*@          #//////////////   #                               
                                 (    @          @%*######,,,,,           #..(@.,         ...,,.@@////////////////#////%(////&((@(/@     (/////. #@( @*///////////////(*,                               
                                @    @          @,,###.#####*,,,            ...%,        ....,/.#.@#..   ####@////////@/////(////&//       &//////     /@.       ////((,                                
                              @/////@          @@,*..%.@###/&##@#           ..*&/////////(/%,,@..(* ...#   &#@    //////&(/////////(/        @//////     ///@      ./#                                  
                             @ (///           .@ @@.......@%%%...@           @@//////////////##@*** #....@   #@     (/////@@@(//////(%        (//////      ///    ..@                                   
                            @//@@@           ,.@  @...,@..@..%&%@@@           @/(////(//(/  (##@*** @%.@...@  @@      %////&///(##(///#        @/////      ////*...&                                    
                            @@  @           /..@   ,..*.,.&..@%%%%@..          @/(////(//////#@@*@/.....%  %,  @%     .%//////////@#(/@      ..@/////,     //(((..,                                     
                            @ %            @..*@    ..........%@@%%.@           /..........@###...... ..@@...@  @    ..(////@///////#@@........//////  ....(((((.,                                      
                             @            @@...@     ........@%@%%%/.@            ......   ..@,.........%,../(   %.....,////@/(/////@#,,,,,,,,@((((((....((((((.@                                       
                           @               #...@         @.@.@%&%%%&..@             @..    ......&*#@@@.@.*./. ..&..,,,@####////@////%,,,,,,,@(((((...(((((@%                                           
                          @              @ /..,%            @%%%%%%%...@            .@.   ..@*...#**..@##.& (*@,,@,,,,(####/////(####&,,,,,@(((((.#@.                                                   
                         @              @  %...%            @%%%&%%%....,            .@@ ..@*&%%*/,.@.@@*.@ .@,,,%,,,###%##//(#######@@@@@                                                              
                         (            .@   @...@            &%%@@%%%@...&            ...(....@ /**&.&*,.&..@,*/,*@##@#######@@@@@#                                                                      
                        /            .@    @.,.%           @%%%*@%%%% .//@           ......*.@@(*****/@.*&@&**@@@.                                                                                      
                     ...@         .*      #@.#        %@@#%%%%@ %%%%%@    *         ..@..@*@@@@@@@ .*@ /.*@*@,**/@                                                                                      
                       &      ...@......                         &%######   &,  ........@    ( /@..#@*%.@***@**%                                                                                        
                                                                               .@.......*         @***&*****@                                                                                           
                                                                                                                                                                                                        
-->
<!doctype html><html lang=en><head><meta charset=utf-8><meta name=viewport content="width=device-width,initial-scale=1,shrink-to-fit=no"><meta name=generator content="Hugo 0.134.2"><meta name=description content="belijzajac's personal blog, where he shows off some of his current, past, and future projects as well as shares personal experience of developing projects involving modern C++ and Linux!"><link rel=icon href=https://belijzajac.dev/shimakaze.gif><link rel=stylesheet href=https://belijzajac.dev/css/bootstrap.min.css><link rel=stylesheet href=https://belijzajac.dev/css/font-awesome.min.css><title>Outperforming Rayon with OpenMP | belijzajac.dev</title><style>.author{margin-top:1rem;margin-bottom:1rem;text-align:center}.author-name{margin-top:1rem;font-size:1.7em;margin-bottom:2px}.author-bio{margin:0 auto;opacity:.9;max-width:393px;line-height:1.688}.post-item{display:flex;padding-top:3px;padding-bottom:0;justify-content:space-between}.post-item-title{text-align:left;font-size:1.25rem}@media(max-width:600px){.post-item-title{font-size:1.2rem}.post-item-meta{display:none}}.post-item-vertical-space{display:block;margin-top:8px}.follow-me{position:sticky;margin-right:-380px;width:320px;font-size:.9em;float:right;padding-top:15px;padding-left:15px;font-family:-apple-system,BlinkMacSystemFont,segoe ui,Roboto,helvetica neue,Arial,sans-serif,apple color emoji,segoe ui emoji,segoe ui symbol;background:#e0dedd;font-size:1rem;box-shadow:1px 1px 2px rgba(0,0,0,.125);word-wrap:break-word;box-sizing:border-box}.table-of-contents{position:sticky;top:2em;margin-right:-380px;width:320px;font-size:1rem;float:right;padding-top:15px;padding-left:15px;font-family:-apple-system,BlinkMacSystemFont,segoe ui,Roboto,helvetica neue,Arial,sans-serif,apple color emoji,segoe ui emoji,segoe ui symbol;background:#e0dedd;box-shadow:1px 1px 2px rgba(0,0,0,.125);word-wrap:break-word;box-sizing:border-box}.table-of-contents ul{list-style:none;line-height:2;padding-left:15px}.table-of-contents 
ul ul{margin-bottom:0}.table-of-contents ul li ul{line-height:1.66}.table-of-contents a{color:var(--text-light)}.table-of-contents a:hover{color:var(--text-darkish)}.table-of-contents li.active>a{color:#454545;font-weight:850}.table-of-contents a{transition:all 100ms ease-in-out}@media screen and (max-width:1650px){.table-of-contents{display:none}.follow-me{display:none}}.article-description{font-weight:400;font-size:1.2rem;background:#eaeaea;border-left:3px solid #ccc;padding:.5em 10px}.grid-container{display:grid;grid-template-columns:auto auto auto auto;justify-content:start}.grid-item{font-size:20px;text-align:center}highlight{padding:.2em;background-color:#ffe28e}.navbar{margin-bottom:1em}article{text-align:justify;padding-bottom:1em}img{max-width:100%}body{background:#ece9e6;background:-webkit-linear-gradient(to top,#FFFFFF,#ECE9E6);background:linear-gradient(to right,#FFFFFF,#ECE9E6)}body{color:#212529}a{color:#00254f}a:hover,a:focus{color:#000914}.container{max-width:910px}pre{display:block;padding:9.5px;word-wrap:break-word;background-color:#ead6d4}pre code{padding:0;font-size:inherit;color:inherit;background-color:transparent;border:none;border-radius:0}code{padding:2px 4px;color:inherit;background-color:#ead6d4;border:1px solid #333;border-radius:4px;font-size:.9em}blockquote,.blockquote{padding:10px 20px;margin:0 0 20px;font-size:1em;border-left:5px solid #6c757d}.footer{text-align:center;color:#7d7d7d}.footer a{color:inherit}.separator{border:0;border-top:3px dashed #000}.separator-dotted{border:0;border-top:3px dotted #b1b1b1}h1{font-size:2.2rem}h1,h2,h3,h4,h5{text-align:left}</style></head><body><nav class="navbar navbar-expand-md navbar-dark" style=background-color:#222121><div class=container-fluid><button class="navbar-toggler navbar-toggler-right border-0 p-0" type=button data-toggle=collapse data-target=#navbar20>
<span class=navbar-toggler-icon></span><p class="navbar-brand text-white mb-0">&nbsp;belijzajac.dev</p></button><div class="collapse navbar-collapse" id=navbar20><ul class="navbar-nav mr-auto"><li class=nav-item><a class=nav-link href=/>Home</a></li></ul><p class="d-none d-md-block lead mb-0 text-white"><b>&emsp;&emsp;&emsp;&emsp;&emsp;&emsp;&emsp;&nbsp;belijzajac.dev</b></p>
<!-- Social/contact links are icon-only: aria-label supplies each link's accessible name and aria-hidden keeps the decorative Font Awesome glyph out of the accessibility tree. -->
<ul class="navbar-nav ml-auto">
<li class="nav-item mx-1"><a class=nav-link href=https://github.com/belijzajac target=_blank rel=noopener aria-label=GitHub><i class="fa fa-github fa-fw fa-lg" aria-hidden=true></i></a></li>
<li class="nav-item mx-1"><a class=nav-link href=https://twitter.com/belijzajac target=_blank rel=noopener aria-label=Twitter><i class="fa fa-twitter fa-fw fa-lg" aria-hidden=true></i></a></li>
<li class="nav-item mx-1"><a class=nav-link href=mailto:blog@belijzajac.dev aria-label=Email><i class="fa fa-envelope-o fa-fw fa-lg" aria-hidden=true></i></a></li>
<li class="nav-item mx-1"><a class=nav-link href=/index.xml target=_blank rel=noopener aria-label="RSS feed"><i class="fa fa-rss fa-fw fa-lg" aria-hidden=true></i></a></li>
</ul></div></div></nav>
<!-- Article header: title, one-line description, then the publication date (machine-readable via <time>) followed by the tag list. -->
<div class=container><article><h1>Outperforming Rayon with OpenMP</h1><h3 class=article-description>Replacing Rayon with OpenMP for additional gains</h3><p><small class=text-secondary><time datetime=2021-11-16>Nov 16, 2021</time>
</small><small><code>rust</code></small>
<small><code>c</code></small>
<small><code>perf</code></small>
<small><code>openmp</code></small>
<small><code>kzg-proofs</code></small>
<small><code>bls12-381</code></small></p>
<!-- Sidebar table of contents. Its <nav> is labelled so assistive technology can tell it apart from the site navbar. -->
<div class=table-of-contents><h5><b>CONTENTS</b></h5><nav id=TableOfContents aria-label="Table of contents"><ul>
<li><a href=#introduction>Introduction</a></li>
<li><a href=#choosing-the-right-tool-for-the-job>Choosing the right tool for the job</a></li>
<li><a href=#searching-for-bottlenecks>Searching for bottlenecks</a></li>
<li><a href=#parallelizing-fft_g1>Parallelizing fft_g1</a></li>
<li><a href=#local-c-kzg-benchmark>Local c-kzg benchmark</a></li>
<li><a href=#github-actions-ci-benchmarks>GitHub Actions CI benchmarks</a><ul><li><a href=#benchmarking-blst-from-scratch>Benchmarking blst-from-scratch</a></li><li><a href=#benchmarking-ckzg>Benchmarking ckzg</a></li></ul></li>
<li><a href=#summary>Summary</a></li>
</ul></nav></div>
<!-- Sidebar social links. They are icon-only, so aria-label provides the accessible name and the glyphs are hidden from assistive technology. -->
<div class=follow-me><h5><b>FOLLOW ME</b></h5><div class=grid-container>
<div class=grid-item><a class=nav-link href=https://github.com/belijzajac target=_blank rel=noopener aria-label=GitHub><i class="fa fa-github fa-fw fa-lg" aria-hidden=true></i></a></div>
<div class=grid-item><a class=nav-link href=https://twitter.com/belijzajac target=_blank rel=noopener aria-label=Twitter><i class="fa fa-twitter fa-fw fa-lg" aria-hidden=true></i></a></div>
<div class=grid-item><a class=nav-link href=mailto:blog@belijzajac.dev aria-label=Email><i class="fa fa-envelope-o fa-fw fa-lg" aria-hidden=true></i></a></div>
<div class=grid-item><a class=nav-link href=/index.xml target=_blank rel=noopener aria-label="RSS feed"><i class="fa fa-rss fa-fw fa-lg" aria-hidden=true></i></a></div>
</div></div>
<script>
// Positions the "follow me" box 40px below the table of contents.
// .follow-me is position:sticky but gets no `top` from the stylesheet, so its
// `top` is derived here from the TOC's current viewport rectangle.
const tableOfContents=document.querySelector(".table-of-contents"),followMe=document.querySelector(".follow-me");
function updateFollowMePosition(){
  // querySelector returns null when nothing matches; skip the update instead
  // of throwing on every scroll/resize event if either box is missing.
  if(!tableOfContents||!followMe)return;
  const tocRect=tableOfContents.getBoundingClientRect();
  followMe.style.top=tocRect.bottom+40+"px";
}
// Set the offset once up front, then keep it in sync with scrolling and resizing.
updateFollowMePosition();
window.addEventListener("scroll",updateFollowMePosition);
window.addEventListener("resize",updateFollowMePosition);
</script>
<p><img src=/post-images/rip-craberino.jpg alt=rip-craberino></p><h2 id=introduction>Introduction</h2><p>For the Blockchain Technologies course, students were paired into groups and assigned to produce the fastest Rust library 
implementing the KZG10 cryptographic scheme. Two teams used the <highlight><a href=https://github.com/supranational/blst>blst</a></highlight> backend, which is implemented in assembly and has direct bindings for Rust and C. The first team, <highlight><a href=https://github.com/grandinetech/rust-kzg/tree/main/blst>blst-from-scratch</a></highlight>, used the Rust bindings provided by the blst library to produce an interface closer to <highlight><a href=https://github.com/benjaminion/c-kzg>c-kzg</a></highlight>. The second team, which I was part of, worked on the <highlight><a href=https://github.com/grandinetech/rust-kzg/tree/main/ckzg>ckzg</a></highlight> library in C. We were responsible for producing an implementation that could integrate into Rust via the C bindings provided by my team.</p><h2 id=choosing-the-right-tool-for-the-job>Choosing the right tool for the job</h2><p>It&rsquo;s a no-brainer for Rust programmers to choose <code>Rayon</code> when it comes to writing parallel code, as there aren&rsquo;t many other viable and easy-to-use options available. While Rust does offer alternatives like <code>std::thread</code>, which provides access to native OS threads, the manual creation and management of threads can be cumbersome.</p><p>When I was working on my C code, I had to decide on the best approach to parallelize it. My options included:</p><ul><li><code>pthread</code>: A POSIX standard for thread creation and management.</li><li>A popular third-party threadpool library from GitHub.</li><li><code>OpenMP</code>: Parallel programming library for C and C++ without manual thread management.</li></ul><p>I chose OpenMP because, during experimentation, I discovered it yielded the best results and was relatively straightforward to use. However, I encountered a challenge in integrating it with Rust to ensure compatibility across multiple platforms, starting with Linux and possibly macOS. 
Eventually, I came up with the following Bash script to automate the entire process of building and packaging shared libraries. Fortunately, OpenMP was integrated into Rust by either:</p><ul><li>exporting the <code>RUSTFLAGS</code> environment variable pointing to the correct <code>libomp</code> LLVM runtime</li></ul><div class=highlight><pre tabindex=0 style=color:#ebdbb2;background-color:#282828;-moz-tab-size:4;-o-tab-size:4;tab-size:4><code class=language-bash data-lang=bash><span style=display:flex><span style="white-space:pre;-webkit-user-select:none;user-select:none;margin-right:.4em;padding:0 .4em;color:#756d59"> 1</span><span><span style=color:#928374;font-style:italic># Linux</span>
</span></span><span style=display:flex><span style="white-space:pre;-webkit-user-select:none;user-select:none;margin-right:.4em;padding:0 .4em;color:#756d59"> 2</span><span>apt install libomp-dev
</span></span><span style=display:flex><span style="white-space:pre;-webkit-user-select:none;user-select:none;margin-right:.4em;padding:0 .4em;color:#756d59"> 3</span><span><span style=color:#fabd2f>export</span> LIBOMP_PATH<span style=color:#fe8019>=</span><span style=color:#fe8019>$(</span>find /usr/lib/llvm* -name libiomp5.so | head -n 1<span style=color:#fe8019>)</span>
</span></span><span style=display:flex><span style="white-space:pre;-webkit-user-select:none;user-select:none;margin-right:.4em;padding:0 .4em;color:#756d59"> 4</span><span>
</span></span><span style=display:flex><span style="white-space:pre;-webkit-user-select:none;user-select:none;margin-right:.4em;padding:0 .4em;color:#756d59"> 5</span><span><span style=color:#928374;font-style:italic># MacOS</span>
</span></span><span style=display:flex><span style="white-space:pre;-webkit-user-select:none;user-select:none;margin-right:.4em;padding:0 .4em;color:#756d59"> 6</span><span>brew install libomp
</span></span><span style=display:flex><span style="white-space:pre;-webkit-user-select:none;user-select:none;margin-right:.4em;padding:0 .4em;color:#756d59"> 7</span><span>ln -s /usr/local/opt/libomp/lib/libomp.dylib /usr/local/lib
</span></span><span style=display:flex><span style="white-space:pre;-webkit-user-select:none;user-select:none;margin-right:.4em;padding:0 .4em;color:#756d59"> 8</span><span>ln -s /usr/local/opt/libomp/include/omp.h /usr/local/include
</span></span><span style=display:flex><span style="white-space:pre;-webkit-user-select:none;user-select:none;margin-right:.4em;padding:0 .4em;color:#756d59"> 9</span><span><span style=color:#fabd2f>export</span> LIBOMP_PATH<span style=color:#fe8019>=</span>/usr/local/lib/libomp.dylib
</span></span><span style=display:flex><span style="white-space:pre;-webkit-user-select:none;user-select:none;margin-right:.4em;padding:0 .4em;color:#756d59">10</span><span>
</span></span><span style=display:flex><span style="white-space:pre;-webkit-user-select:none;user-select:none;margin-right:.4em;padding:0 .4em;color:#756d59">11</span><span><span style=color:#928374;font-style:italic># And finally</span>
</span></span><span style=display:flex><span style="white-space:pre;-webkit-user-select:none;user-select:none;margin-right:.4em;padding:0 .4em;color:#756d59">12</span><span><span style=color:#fabd2f>export</span> RUSTFLAGS<span style=color:#fe8019>=</span><span style=color:#b8bb26>&#34;-C link-arg=</span>$LIBOMP_PATH<span style=color:#b8bb26>&#34;</span>
</span></span></code></pre></div><ul><li>or creating a <code>.cargo/config.toml</code> file inside the project directory and mentioning it there</li></ul><div class=highlight><pre tabindex=0 style=color:#ebdbb2;background-color:#282828;-moz-tab-size:4;-o-tab-size:4;tab-size:4><code class=language-text data-lang=text><span style=display:flex><span style="white-space:pre;-webkit-user-select:none;user-select:none;margin-right:.4em;padding:0 .4em;color:#756d59">1</span><span>[build]
</span></span><span style=display:flex><span style="white-space:pre;-webkit-user-select:none;user-select:none;margin-right:.4em;padding:0 .4em;color:#756d59">2</span><span>rustflags = [
</span></span><span style=display:flex><span style="white-space:pre;-webkit-user-select:none;user-select:none;margin-right:.4em;padding:0 .4em;color:#756d59">3</span><span>  &#34;-C&#34;, &#34;link-arg=LIBOMP_PATH&#34;
</span></span><span style=display:flex><span style="white-space:pre;-webkit-user-select:none;user-select:none;margin-right:.4em;padding:0 .4em;color:#756d59">4</span><span>]
</span></span></code></pre></div><p>Well, that was simple.</p><h2 id=searching-for-bottlenecks>Searching for bottlenecks</h2><p>In order to optimize a program&rsquo;s performance, CPU profiling tools like <code>Perf</code> play a crucial role by providing detailed insights into where computational resources are being used. One powerful visualization tool generated by these profilers is the flamegraph, which offers a clear representation of a program&rsquo;s CPU usage over time.</p><p><img src=/post-images/flame-graphu.svg alt=flamegraph-of-fft-g1></p><p>The flamegraph displayed above illustrates the CPU time distribution of the c-kzg library&rsquo;s <code>fft_g1</code> benchmark. Upon analysis, it became evident that a significant portion of the execution time was spent in assembly code, highlighting potential areas for optimization. Further investigation on <highlight><a href=https://github.com/protolambda/go-kzg>go-kzg</a></highlight> revealed that the <code>fft_g1</code> benchmark was indeed a performance bottleneck and stood out as a prime candidate for parallelization. By parallelizing this specific operation, we can improve the overall performance of the library.</p><h2 id=parallelizing-fft_g1>Parallelizing fft_g1</h2><p>The <code>fft_g1</code> function calls the <code>fft_g1_fast</code> function, which applies the <em>divide-and-conquer</em> principle to divide a large problem into smaller subproblems, recursively solving each of them. 
The general procedure here is to distribute the work (the recursive <code>fft_g1_fast</code> calls) among worker threads.</p><p>The blst-from-scratch team implemented it as follows:</p><div class=highlight><pre tabindex=0 style=color:#ebdbb2;background-color:#282828;-moz-tab-size:4;-o-tab-size:4;tab-size:4;display:grid><code class=language-rust data-lang=rust><span style=display:flex><span style="white-space:pre;-webkit-user-select:none;user-select:none;margin-right:.4em;padding:0 .4em;color:#756d59"> 1</span><span><span style=color:#fe8019>let</span> (lo, hi) <span style=color:#fe8019>=</span> ret.split_at_mut(half);
</span></span><span style=display:flex;background-color:#3d3d3d><span style="white-space:pre;-webkit-user-select:none;user-select:none;margin-right:.4em;padding:0 .4em;color:#756d59"> 2</span><span>rayon::join(
</span></span><span style=display:flex;background-color:#3d3d3d><span style="white-space:pre;-webkit-user-select:none;user-select:none;margin-right:.4em;padding:0 .4em;color:#756d59"> 3</span><span>  <span style=color:#fe8019>||</span> fft_g1_fast(lo, data, stride <span style=color:#fe8019>*</span> <span style=color:#d3869b>2</span>, roots, roots_stride <span style=color:#fe8019>*</span> <span style=color:#d3869b>2</span>),
</span></span><span style=display:flex;background-color:#3d3d3d><span style="white-space:pre;-webkit-user-select:none;user-select:none;margin-right:.4em;padding:0 .4em;color:#756d59"> 4</span><span>  <span style=color:#fe8019>||</span> fft_g1_fast(hi, <span style=color:#fe8019>&amp;</span>data[stride<span style=color:#fe8019>..</span>], stride <span style=color:#fe8019>*</span> <span style=color:#d3869b>2</span>, roots, roots_stride <span style=color:#fe8019>*</span> <span style=color:#d3869b>2</span>)
</span></span><span style=display:flex;background-color:#3d3d3d><span style="white-space:pre;-webkit-user-select:none;user-select:none;margin-right:.4em;padding:0 .4em;color:#756d59"> 5</span><span>);
</span></span><span style=display:flex><span style="white-space:pre;-webkit-user-select:none;user-select:none;margin-right:.4em;padding:0 .4em;color:#756d59"> 6</span><span>
</span></span><span style=display:flex><span style="white-space:pre;-webkit-user-select:none;user-select:none;margin-right:.4em;padding:0 .4em;color:#756d59"> 7</span><span><span style=color:#fe8019>for</span> i <span style=color:#fe8019>in</span> <span style=color:#d3869b>0</span><span style=color:#fe8019>..</span>half {
</span></span><span style=display:flex><span style="white-space:pre;-webkit-user-select:none;user-select:none;margin-right:.4em;padding:0 .4em;color:#756d59"> 8</span><span>  <span style=color:#fe8019>let</span> y_times_root <span style=color:#fe8019>=</span> ret[i <span style=color:#fe8019>+</span> half].mul(<span style=color:#fe8019>&amp;</span>roots[i <span style=color:#fe8019>*</span> roots_stride]);
</span></span><span style=display:flex><span style="white-space:pre;-webkit-user-select:none;user-select:none;margin-right:.4em;padding:0 .4em;color:#756d59"> 9</span><span>  ret[i <span style=color:#fe8019>+</span> half] <span style=color:#fe8019>=</span> ret[i].sub(<span style=color:#fe8019>&amp;</span>y_times_root);
</span></span><span style=display:flex><span style="white-space:pre;-webkit-user-select:none;user-select:none;margin-right:.4em;padding:0 .4em;color:#756d59">10</span><span>  ret[i] <span style=color:#fe8019>=</span> ret[i].add_or_dbl(<span style=color:#fe8019>&amp;</span>y_times_root);
</span></span><span style=display:flex><span style="white-space:pre;-webkit-user-select:none;user-select:none;margin-right:.4em;padding:0 .4em;color:#756d59">11</span><span>}
</span></span></code></pre></div><p>As a side note, calling <code>rayon::join</code> is conceptually similar to spawning two threads, one executing each of the two closures; in practice, Rayon schedules the closures on its work-stealing thread pool and only runs them in parallel when an idle worker is available.</p><p>The C equivalent, on the other hand, was as follows:</p><div class=highlight><pre tabindex=0 style=color:#ebdbb2;background-color:#282828;-moz-tab-size:4;-o-tab-size:4;tab-size:4;display:grid><code class=language-c data-lang=c><span style=display:flex;background-color:#3d3d3d><span style="white-space:pre;-webkit-user-select:none;user-select:none;margin-right:.4em;padding:0 .4em;color:#756d59"> 1</span><span><span style=color:#8ec07c>#pragma omp parallel sections
</span></span></span><span style=display:flex><span style="white-space:pre;-webkit-user-select:none;user-select:none;margin-right:.4em;padding:0 .4em;color:#756d59"> 2</span><span><span style=color:#8ec07c></span>{
</span></span><span style=display:flex;background-color:#3d3d3d><span style="white-space:pre;-webkit-user-select:none;user-select:none;margin-right:.4em;padding:0 .4em;color:#756d59"> 3</span><span>  <span style=color:#8ec07c>#pragma omp section
</span></span></span><span style=display:flex><span style="white-space:pre;-webkit-user-select:none;user-select:none;margin-right:.4em;padding:0 .4em;color:#756d59"> 4</span><span><span style=color:#8ec07c></span>  {
</span></span><span style=display:flex><span style="white-space:pre;-webkit-user-select:none;user-select:none;margin-right:.4em;padding:0 .4em;color:#756d59"> 5</span><span>    <span style=color:#fabd2f>fft_g1_fast</span>(out, in, stride <span style=color:#fe8019>*</span> <span style=color:#d3869b>2</span>, roots, roots_stride <span style=color:#fe8019>*</span> <span style=color:#d3869b>2</span>, half);
</span></span><span style=display:flex><span style="white-space:pre;-webkit-user-select:none;user-select:none;margin-right:.4em;padding:0 .4em;color:#756d59"> 6</span><span>  }
</span></span><span style=display:flex;background-color:#3d3d3d><span style="white-space:pre;-webkit-user-select:none;user-select:none;margin-right:.4em;padding:0 .4em;color:#756d59"> 7</span><span>  <span style=color:#8ec07c>#pragma omp section
</span></span></span><span style=display:flex><span style="white-space:pre;-webkit-user-select:none;user-select:none;margin-right:.4em;padding:0 .4em;color:#756d59"> 8</span><span><span style=color:#8ec07c></span>  {
</span></span><span style=display:flex><span style="white-space:pre;-webkit-user-select:none;user-select:none;margin-right:.4em;padding:0 .4em;color:#756d59"> 9</span><span>    <span style=color:#fabd2f>fft_g1_fast</span>(out <span style=color:#fe8019>+</span> half, in <span style=color:#fe8019>+</span> stride, stride <span style=color:#fe8019>*</span> <span style=color:#d3869b>2</span>, roots, roots_stride <span style=color:#fe8019>*</span> <span style=color:#d3869b>2</span>, half);
</span></span><span style=display:flex><span style="white-space:pre;-webkit-user-select:none;user-select:none;margin-right:.4em;padding:0 .4em;color:#756d59">10</span><span>  }
</span></span><span style=display:flex><span style="white-space:pre;-webkit-user-select:none;user-select:none;margin-right:.4em;padding:0 .4em;color:#756d59">11</span><span>}
</span></span><span style=display:flex;background-color:#3d3d3d><span style="white-space:pre;-webkit-user-select:none;user-select:none;margin-right:.4em;padding:0 .4em;color:#756d59">12</span><span><span style=color:#8ec07c>#pragma omp parallel
</span></span></span><span style=display:flex;background-color:#3d3d3d><span style="white-space:pre;-webkit-user-select:none;user-select:none;margin-right:.4em;padding:0 .4em;color:#756d59">13</span><span><span style=color:#8ec07c>#pragma omp for
</span></span></span><span style=display:flex><span style="white-space:pre;-webkit-user-select:none;user-select:none;margin-right:.4em;padding:0 .4em;color:#756d59">14</span><span><span style=color:#8ec07c></span><span style=color:#fe8019>for</span> (<span style=color:#fabd2f>uint64_t</span> i <span style=color:#fe8019>=</span> <span style=color:#d3869b>0</span>; i <span style=color:#fe8019>&lt;</span> half; i<span style=color:#fe8019>++</span>) {
</span></span><span style=display:flex><span style="white-space:pre;-webkit-user-select:none;user-select:none;margin-right:.4em;padding:0 .4em;color:#756d59">15</span><span>  <span style=color:#fabd2f>g1_t</span> y_times_root;
</span></span><span style=display:flex><span style="white-space:pre;-webkit-user-select:none;user-select:none;margin-right:.4em;padding:0 .4em;color:#756d59">16</span><span>  <span style=color:#fabd2f>g1_mul</span>(<span style=color:#fe8019>&amp;</span>y_times_root, <span style=color:#fe8019>&amp;</span>out[i <span style=color:#fe8019>+</span> half], <span style=color:#fe8019>&amp;</span>roots[i <span style=color:#fe8019>*</span> roots_stride]);
</span></span><span style=display:flex><span style="white-space:pre;-webkit-user-select:none;user-select:none;margin-right:.4em;padding:0 .4em;color:#756d59">17</span><span>  <span style=color:#fabd2f>g1_sub</span>(<span style=color:#fe8019>&amp;</span>out[i <span style=color:#fe8019>+</span> half], <span style=color:#fe8019>&amp;</span>out[i], <span style=color:#fe8019>&amp;</span>y_times_root);
</span></span><span style=display:flex><span style="white-space:pre;-webkit-user-select:none;user-select:none;margin-right:.4em;padding:0 .4em;color:#756d59">18</span><span>  <span style=color:#fabd2f>g1_add_or_dbl</span>(<span style=color:#fe8019>&amp;</span>out[i], <span style=color:#fe8019>&amp;</span>out[i], <span style=color:#fe8019>&amp;</span>y_times_root);
</span></span><span style=display:flex><span style="white-space:pre;-webkit-user-select:none;user-select:none;margin-right:.4em;padding:0 .4em;color:#756d59">19</span><span>}
</span></span></code></pre></div><p>In addition to parallel sections, I also used OpenMP&rsquo;s parallel for-loop, because I noticed it yielded <strong>5% better performance</strong> on my personal machine. Considering the <code>ubuntu-latest</code> runner in GitHub Actions CI had only two available cores, the halves of the problem were split between two threads, each of which ran the for-loop to do arithmetic operations on polynomial <code>G1</code> points.</p><p>In the above code snippets, <code>fft_g1</code> calls <code>fft_g1_fast</code>, which, up to scale 16, should call itself recursively at most <code>1 &lt;&lt; 15</code> times, with each such call being distributed among the 2 threads. Since we&rsquo;re computing <code>fft_g1</code> up to scale 8, there should be <code>(1 &lt;&lt; 7) + 1</code> tasks (not to be confused with OpenMP&rsquo;s <code>task</code> pragma directive!) for <code>fft_g1_fast</code>, or <code>129</code> such tasks, that will be run in parallel!</p><h2 id=local-c-kzg-benchmark>Local c-kzg benchmark</h2><p>Running on my personal computer with an i5-7300HQ (4 threads overclocked at 3.50GHz), all mitigations turned off, and a custom Liquorix kernel, I was able to achieve the following results:</p><table><tr><th>Original c-kzg library</th><th>Parallelized c-kzg library</th></tr><tr><td><div class=highlight><pre tabindex=0 style=color:#ebdbb2;background-color:#282828;-moz-tab-size:4;-o-tab-size:4;tab-size:4;display:grid><code class=language-bash data-lang=bash><span style=display:flex><span style="white-space:pre;-webkit-user-select:none;user-select:none;margin-right:.4em;padding:0 .4em;color:#756d59"> 1</span><span>$ ./fft_g1_bench
</span></span><span style=display:flex><span style="white-space:pre;-webkit-user-select:none;user-select:none;margin-right:.4em;padding:0 .4em;color:#756d59"> 2</span><span>*** Benchmarking FFT_g1, <span style=color:#d3869b>1</span> second per test.       
</span></span><span style=display:flex><span style="white-space:pre;-webkit-user-select:none;user-select:none;margin-right:.4em;padding:0 .4em;color:#756d59"> 3</span><span>fft_g1/scale_4 <span style=color:#d3869b>1729769</span> ns/op
</span></span><span style=display:flex><span style="white-space:pre;-webkit-user-select:none;user-select:none;margin-right:.4em;padding:0 .4em;color:#756d59"> 4</span><span>fft_g1/scale_5 <span style=color:#d3869b>4935085</span> ns/op
</span></span><span style=display:flex><span style="white-space:pre;-webkit-user-select:none;user-select:none;margin-right:.4em;padding:0 .4em;color:#756d59"> 5</span><span>fft_g1/scale_6 <span style=color:#d3869b>12897731</span> ns/op
</span></span><span style=display:flex><span style="white-space:pre;-webkit-user-select:none;user-select:none;margin-right:.4em;padding:0 .4em;color:#756d59"> 6</span><span>fft_g1/scale_7 <span style=color:#d3869b>32022026</span> ns/op
</span></span><span style=display:flex><span style="white-space:pre;-webkit-user-select:none;user-select:none;margin-right:.4em;padding:0 .4em;color:#756d59"> 7</span><span>fft_g1/scale_8 <span style=color:#d3869b>76552852</span> ns/op
</span></span><span style=display:flex><span style="white-space:pre;-webkit-user-select:none;user-select:none;margin-right:.4em;padding:0 .4em;color:#756d59"> 8</span><span>fft_g1/scale_9 <span style=color:#d3869b>184970057</span> ns/op
</span></span><span style=display:flex><span style="white-space:pre;-webkit-user-select:none;user-select:none;margin-right:.4em;padding:0 .4em;color:#756d59"> 9</span><span>fft_g1/scale_10 <span style=color:#d3869b>418273808</span> ns/op
</span></span><span style=display:flex><span style="white-space:pre;-webkit-user-select:none;user-select:none;margin-right:.4em;padding:0 .4em;color:#756d59">10</span><span>fft_g1/scale_11 <span style=color:#d3869b>919499032</span> ns/op
</span></span><span style=display:flex><span style="white-space:pre;-webkit-user-select:none;user-select:none;margin-right:.4em;padding:0 .4em;color:#756d59">11</span><span>fft_g1/scale_12 <span style=color:#d3869b>2025633037</span> ns/op
</span></span><span style=display:flex><span style="white-space:pre;-webkit-user-select:none;user-select:none;margin-right:.4em;padding:0 .4em;color:#756d59">12</span><span>fft_g1/scale_13 <span style=color:#d3869b>4479830518</span> ns/op
</span></span><span style=display:flex><span style="white-space:pre;-webkit-user-select:none;user-select:none;margin-right:.4em;padding:0 .4em;color:#756d59">13</span><span>fft_g1/scale_14 <span style=color:#d3869b>9754557496</span> ns/op
</span></span><span style=display:flex;background-color:#3d3d3d><span style="white-space:pre;-webkit-user-select:none;user-select:none;margin-right:.4em;padding:0 .4em;color:#756d59">14</span><span>fft_g1/scale_15 <span style=color:#d3869b>21125613058</span> ns/op
</span></span></code></pre></div></td><td><div class=highlight><pre tabindex=0 style=color:#ebdbb2;background-color:#282828;-moz-tab-size:4;-o-tab-size:4;tab-size:4;display:grid><code class=language-bash data-lang=bash><span style=display:flex><span style="white-space:pre;-webkit-user-select:none;user-select:none;margin-right:.4em;padding:0 .4em;color:#756d59"> 1</span><span>$ OMP_NUM_THREADS<span style=color:#fe8019>=</span><span style=color:#d3869b>4</span> ./fft_g1_bench
</span></span><span style=display:flex><span style="white-space:pre;-webkit-user-select:none;user-select:none;margin-right:.4em;padding:0 .4em;color:#756d59"> 2</span><span>*** Benchmarking FFT_g1, <span style=color:#d3869b>1</span> second per test.       
</span></span><span style=display:flex><span style="white-space:pre;-webkit-user-select:none;user-select:none;margin-right:.4em;padding:0 .4em;color:#756d59"> 3</span><span>fft_g1/scale_4 <span style=color:#d3869b>839454</span> ns/op
</span></span><span style=display:flex><span style="white-space:pre;-webkit-user-select:none;user-select:none;margin-right:.4em;padding:0 .4em;color:#756d59"> 4</span><span>fft_g1/scale_5 <span style=color:#d3869b>2378457</span> ns/op
</span></span><span style=display:flex><span style="white-space:pre;-webkit-user-select:none;user-select:none;margin-right:.4em;padding:0 .4em;color:#756d59"> 5</span><span>fft_g1/scale_6 <span style=color:#d3869b>6404191</span> ns/op
</span></span><span style=display:flex><span style="white-space:pre;-webkit-user-select:none;user-select:none;margin-right:.4em;padding:0 .4em;color:#756d59"> 6</span><span>fft_g1/scale_7 <span style=color:#d3869b>16325966</span> ns/op
</span></span><span style=display:flex><span style="white-space:pre;-webkit-user-select:none;user-select:none;margin-right:.4em;padding:0 .4em;color:#756d59"> 7</span><span>fft_g1/scale_8 <span style=color:#d3869b>38141754</span> ns/op
</span></span><span style=display:flex><span style="white-space:pre;-webkit-user-select:none;user-select:none;margin-right:.4em;padding:0 .4em;color:#756d59"> 8</span><span>fft_g1/scale_9 <span style=color:#d3869b>90948810</span> ns/op
</span></span><span style=display:flex><span style="white-space:pre;-webkit-user-select:none;user-select:none;margin-right:.4em;padding:0 .4em;color:#756d59"> 9</span><span>fft_g1/scale_10 <span style=color:#d3869b>204757690</span> ns/op
</span></span><span style=display:flex><span style="white-space:pre;-webkit-user-select:none;user-select:none;margin-right:.4em;padding:0 .4em;color:#756d59">10</span><span>fft_g1/scale_11 <span style=color:#d3869b>457509973</span> ns/op
</span></span><span style=display:flex><span style="white-space:pre;-webkit-user-select:none;user-select:none;margin-right:.4em;padding:0 .4em;color:#756d59">11</span><span>fft_g1/scale_12 <span style=color:#d3869b>1006089135</span> ns/op
</span></span><span style=display:flex><span style="white-space:pre;-webkit-user-select:none;user-select:none;margin-right:.4em;padding:0 .4em;color:#756d59">12</span><span>fft_g1/scale_13 <span style=color:#d3869b>2240095284</span> ns/op
</span></span><span style=display:flex><span style="white-space:pre;-webkit-user-select:none;user-select:none;margin-right:.4em;padding:0 .4em;color:#756d59">13</span><span>fft_g1/scale_14 <span style=color:#d3869b>4879448286</span> ns/op
</span></span><span style=display:flex;background-color:#3d3d3d><span style="white-space:pre;-webkit-user-select:none;user-select:none;margin-right:.4em;padding:0 .4em;color:#756d59">14</span><span>fft_g1/scale_15 <span style=color:#d3869b>10650876381</span> ns/op
</span></span></code></pre></div></td></tr></table><p>That&rsquo;s <strong>twice as fast</strong> with as little effort as putting in a few pragmas!</p><h2 id=github-actions-ci-benchmarks>GitHub Actions CI benchmarks</h2><p>The <code>fft_g1</code> benchmark was limited to scale 7 because the overall run time for the job would exceed the 6-hour limit if I were to benchmark it up to scale 16: Criterion runs each iteration a couple of hundred times to produce more accurate results, and since jobs submitted to GitHub Actions are limited to 360 minutes, the overrun used to automatically cancel other running CI jobs.</p><h3 id=benchmarking-blst-from-scratch>Benchmarking blst-from-scratch</h3><p><img src=/post-images/from-scratch-github-actions.png alt=from-scratch-github-actions></p><p>From the above screenshot we can see that the parallelized version of the library finished <code>1m 28s</code> faster than its sequential version; below are the results of the sequential <code>fft_g1</code> algorithm:</p><div class=highlight><pre tabindex=0 style=color:#ebdbb2;background-color:#282828;-moz-tab-size:4;-o-tab-size:4;tab-size:4;display:grid><code class=language-text data-lang=text><span style=display:flex><span style="white-space:pre;-webkit-user-select:none;user-select:none;margin-right:.4em;padding:0 .4em;color:#756d59">1</span><span>Benchmarking bench_fft_g1 scale: &#39;7&#39;
</span></span><span style=display:flex><span style="white-space:pre;-webkit-user-select:none;user-select:none;margin-right:.4em;padding:0 .4em;color:#756d59">2</span><span>Benchmarking bench_fft_g1 scale: &#39;7&#39;: Warming up for 3.0000 s
</span></span><span style=display:flex><span style="white-space:pre;-webkit-user-select:none;user-select:none;margin-right:.4em;padding:0 .4em;color:#756d59">3</span><span>Benchmarking bench_fft_g1 scale: &#39;7&#39;: Collecting 100 samples in estimated 6.6364 s (200 iterations)
</span></span><span style=display:flex><span style="white-space:pre;-webkit-user-select:none;user-select:none;margin-right:.4em;padding:0 .4em;color:#756d59">4</span><span>Benchmarking bench_fft_g1 scale: &#39;7&#39;: Analyzing
</span></span><span style=display:flex;background-color:#3d3d3d><span style="white-space:pre;-webkit-user-select:none;user-select:none;margin-right:.4em;padding:0 .4em;color:#756d59">5</span><span>bench_fft_g1 scale: &#39;7&#39; time:   [33.423 ms 33.785 ms 34.150 ms]
</span></span></code></pre></div><p>of which the average run time for scale 7 was cut down by <code>38.926%</code> by its parallel counterpart:</p><div class=highlight><pre tabindex=0 style=color:#ebdbb2;background-color:#282828;-moz-tab-size:4;-o-tab-size:4;tab-size:4;display:grid><code class=language-text data-lang=text><span style=display:flex><span style="white-space:pre;-webkit-user-select:none;user-select:none;margin-right:.4em;padding:0 .4em;color:#756d59">1</span><span>Benchmarking bench_fft_g1 scale: &#39;7&#39;
</span></span><span style=display:flex><span style="white-space:pre;-webkit-user-select:none;user-select:none;margin-right:.4em;padding:0 .4em;color:#756d59">2</span><span>Benchmarking bench_fft_g1 scale: &#39;7&#39;: Warming up for 3.0000 s
</span></span><span style=display:flex><span style="white-space:pre;-webkit-user-select:none;user-select:none;margin-right:.4em;padding:0 .4em;color:#756d59">3</span><span>Benchmarking bench_fft_g1 scale: &#39;7&#39;: Collecting 100 samples in estimated 6.3282 s (300 iterations)
</span></span><span style=display:flex><span style="white-space:pre;-webkit-user-select:none;user-select:none;margin-right:.4em;padding:0 .4em;color:#756d59">4</span><span>Benchmarking bench_fft_g1 scale: &#39;7&#39;: Analyzing
</span></span><span style=display:flex;background-color:#3d3d3d><span style="white-space:pre;-webkit-user-select:none;user-select:none;margin-right:.4em;padding:0 .4em;color:#756d59">5</span><span>bench_fft_g1 scale: &#39;7&#39; time:   [20.432 ms 20.634 ms 20.843 ms]
</span></span><span style=display:flex;background-color:#3d3d3d><span style="white-space:pre;-webkit-user-select:none;user-select:none;margin-right:.4em;padding:0 .4em;color:#756d59">6</span><span>                        change: [-39.822% -38.926% -38.001%] (p = 0.00 &lt; 0.05)
</span></span><span style=display:flex><span style="white-space:pre;-webkit-user-select:none;user-select:none;margin-right:.4em;padding:0 .4em;color:#756d59">7</span><span>                        Performance has improved.
</span></span></code></pre></div><h3 id=benchmarking-ckzg>Benchmarking ckzg</h3><p><img src=/post-images/ckzg-github-actions.png alt=ckzg-github-actions></p><p>The sequential version of the ckzg library ran <code>2m 7s</code> faster than the same version of blst-from-scratch because it had other benchmarks that performed faster, though the parallelized version ran <code>1m 2s</code> faster than its sequential version. Below are the results of the sequential <code>fft_g1</code> algorithm:</p><div class=highlight><pre tabindex=0 style=color:#ebdbb2;background-color:#282828;-moz-tab-size:4;-o-tab-size:4;tab-size:4;display:grid><code class=language-text data-lang=text><span style=display:flex><span style="white-space:pre;-webkit-user-select:none;user-select:none;margin-right:.4em;padding:0 .4em;color:#756d59">1</span><span>Benchmarking bench_fft_g1 scale: &#39;7&#39;
</span></span><span style=display:flex><span style="white-space:pre;-webkit-user-select:none;user-select:none;margin-right:.4em;padding:0 .4em;color:#756d59">2</span><span>Benchmarking bench_fft_g1 scale: &#39;7&#39;: Warming up for 3.0000 s
</span></span><span style=display:flex><span style="white-space:pre;-webkit-user-select:none;user-select:none;margin-right:.4em;padding:0 .4em;color:#756d59">3</span><span>Benchmarking bench_fft_g1 scale: &#39;7&#39;: Collecting 100 samples in estimated 6.8313 s (200 iterations)
</span></span><span style=display:flex><span style="white-space:pre;-webkit-user-select:none;user-select:none;margin-right:.4em;padding:0 .4em;color:#756d59">4</span><span>Benchmarking bench_fft_g1 scale: &#39;7&#39;: Analyzing
</span></span><span style=display:flex;background-color:#3d3d3d><span style="white-space:pre;-webkit-user-select:none;user-select:none;margin-right:.4em;padding:0 .4em;color:#756d59">5</span><span>bench_fft_g1 scale: &#39;7&#39; time:   [32.194 ms 32.471 ms 32.760 ms]
</span></span></code></pre></div><p>Yet the parallel version of the <code>fft_g1</code> algorithm performed much faster than it did for blst-from-scratch, even though both unparallelized versions for both teams performed evenly:</p><div class=highlight><pre tabindex=0 style=color:#ebdbb2;background-color:#282828;-moz-tab-size:4;-o-tab-size:4;tab-size:4;display:grid><code class=language-text data-lang=text><span style=display:flex><span style="white-space:pre;-webkit-user-select:none;user-select:none;margin-right:.4em;padding:0 .4em;color:#756d59">1</span><span>Benchmarking bench_fft_g1 scale: &#39;7&#39;
</span></span><span style=display:flex><span style="white-space:pre;-webkit-user-select:none;user-select:none;margin-right:.4em;padding:0 .4em;color:#756d59">2</span><span>Benchmarking bench_fft_g1 scale: &#39;7&#39;: Warming up for 3.0000 s
</span></span><span style=display:flex><span style="white-space:pre;-webkit-user-select:none;user-select:none;margin-right:.4em;padding:0 .4em;color:#756d59">3</span><span>Benchmarking bench_fft_g1 scale: &#39;7&#39;: Collecting 100 samples in estimated 5.0701 s (300 iterations)
</span></span><span style=display:flex><span style="white-space:pre;-webkit-user-select:none;user-select:none;margin-right:.4em;padding:0 .4em;color:#756d59">4</span><span>Benchmarking bench_fft_g1 scale: &#39;7&#39;: Analyzing
</span></span><span style=display:flex;background-color:#3d3d3d><span style="white-space:pre;-webkit-user-select:none;user-select:none;margin-right:.4em;padding:0 .4em;color:#756d59">5</span><span>bench_fft_g1 scale: &#39;7&#39; time:   [16.854 ms 17.107 ms 17.439 ms]
</span></span><span style=display:flex;background-color:#3d3d3d><span style="white-space:pre;-webkit-user-select:none;user-select:none;margin-right:.4em;padding:0 .4em;color:#756d59">6</span><span>                        change: [-48.216% -47.318% -46.306%] (p = 0.00 &lt; 0.05)
</span></span><span style=display:flex><span style="white-space:pre;-webkit-user-select:none;user-select:none;margin-right:.4em;padding:0 .4em;color:#756d59">7</span><span>                        Performance has improved.
</span></span></code></pre></div><h2 id=summary>Summary</h2><ul><li>OpenMP lets you quickly prototype what is possible to parallelize with the help of CPU profiling tools like Perf</li><li>Criterion is actually a really nice benchmarking tool to measure performance, especially when integrated into CI</li></ul><div class=separator></div><br><div class=media><div class=media-left><img src=https://belijzajac.dev/author.jpg alt=selfie class="mr-3 mt-3 rounded-circle" style="border:2px solid #d8d8d8;border-radius:50%"></div><div class=media-body><h4 class=media-heading>The author behind this post</h4>Hi there! My online handle is belijzajac, which translates to "white hare" in Russian. I'm a software developer with a strong passion for C++, Rust, Linux, and compilers. With a solid foundation in these technologies, I'm always looking for new ways to challenge myself and grow as a developer. In my free time, you can find me tinkering with new technologies and keeping up to date with the latest industry trends. 
Thank you for visiting my blog!</div></div><br><div class=separator></div><br><script src=https://giscus.app/client.js data-repo=belijzajac/belijzajac.github.io data-repo-id=R_kgDOGjWVjg data-category=Announcements data-category-id=DIC_kwDOGjWVjs4Cco4S data-mapping=pathname data-strict=0 data-reactions-enabled=1 data-emit-metadata=0 data-input-position=top data-theme=light data-lang=en data-loading=lazy crossorigin=anonymous async></script><noscript>Please enable JavaScript to view the comments powered by giscus</noscript><script src=https://belijzajac.dev/js/toc.js></script></article><script type=application/javascript>
// Footer uptime counter: once per second, writes the time elapsed since
// countDownDate into the element with id "demo" as "[Ny ]Nd Nh Nm NNs ".
// countDownDate and x are left as page-level globals, exactly as before.
countDownDate=new Date("2021-12-23");
var x=setInterval(function(){
  // Unit sizes in milliseconds; a year is counted as a flat 365 days.
  var msPerSecond=1e3,
      msPerMinute=60*msPerSecond,
      msPerHour=60*msPerMinute,
      msPerDay=24*msPerHour,
      msPerYear=365*msPerDay;

  var elapsed=Date.now()-countDownDate.getTime();

  var years=Math.floor(elapsed/msPerYear);
  // Days left over once whole years (if any) have been taken out.
  var days=Math.floor(elapsed/msPerDay-(years>0?365*years:0));
  var hours=Math.floor(elapsed%msPerDay/msPerHour);
  var minutes=Math.floor(elapsed%msPerHour/msPerMinute);
  var seconds=Math.floor(elapsed%msPerMinute/msPerSecond);

  // The year part is shown only after the first full year; seconds are
  // zero-padded to two digits.
  var label=years>0?years+"y ":"";
  label+=days+"d "+hours+"h "+minutes+"m ";
  label+=(seconds>9?seconds:"0"+seconds)+"s ";

  document.getElementById("demo").innerHTML=label;
},1e3)
</script><footer class=footer><span>Website is running for <tag id=demo></tag><br>Copyright © 2024
<a href=https://github.com/belijzajac target=_blank rel=noopener>belijzajac</a>
| <a href=https://creativecommons.org/licenses/by-nc-sa/4.0/ target=_blank rel=noopener>CC BY-NC-SA 4.0</a></span></footer><br></div><script src=https://belijzajac.dev/js/jquery-3.3.1.slim.min.js></script><script src=https://belijzajac.dev/js/bootstrap.bundle.min.js></script><script>/* Google Analytics bootstrap. If window.ga is not already defined, define it as a stub that appends each call's arguments to the ga.q array; record the current timestamp in ga.l; then issue a "create" command for property UA-217331412-1 and a "send pageview" command. NOTE(review): the queued commands are presumably consumed by analytics.js, which the next script tag loads asynchronously; confirm against the Google Analytics documentation. */window.ga=window.ga||function(){(ga.q=ga.q||[]).push(arguments)},ga.l=+new Date,ga("create","UA-217331412-1","auto"),ga("send","pageview")</script><script async src=https://www.google-analytics.com/analytics.js></script></body></html>