recent.bib
@inproceedings{delval2025Noise,
title = {{Noise injection for performance bottleneck analysis}},
author = {Delval, Aurélien and de Oliveira Castro, Pablo and Jalby, William and Renault, Etienne},
note = {(to appear)},
booktitle = {Euro-Par 2025: 31st International Conference on Parallel and Distributed Computing},
publisher = {Springer},
year = {2025},
month = aug,
abstract = {Bottleneck evaluation is a crucial part of performance tuning of HPC applications, as it directly influences the search for optimizations and the selection of the best hardware for a given code. In this paper, we introduce a new model-agnostic, instruction-accurate framework for bottleneck analysis based on performance noise injection. This method provides a precise analysis that complements existing techniques, particularly in quantifying unused resource slack. Specifically, we classify programs based on whether they are limited by computation, data access bandwidth, or latency by injecting additional noise instructions that target specific bottleneck sources. Our approach is built on the LLVM compiler toolchain, ensuring easy portability across different architectures and microarchitectures, which constitutes an improvement over many state-of-the-art tools. We validate our framework on a range of hardware benchmarks and kernels, including a detailed study of a sparse-matrix–vector product (SPMXV) kernel, where we successfully detect distinct performance regimes. These insights further inform hardware selection, as demonstrated by our comparative evaluation between HBM and DDR memory systems.},
url = {https://easychair.org/smart-program/EURO-PAR2025/2025-08-28.html#session:97092}
}
@unpublished{jam2025MLKAPS,
title = {{MLKAPS: Machine Learning and Adaptive Sampling for HPC Kernel Auto-tuning}},
author = {Jam, Mathys and Petit, Eric and de Oliveira Castro, Pablo and Defour, David and Henry, Greg and Jalby, William},
note = {working paper or preprint},
year = {2025},
month = jan,
pdf = {https://arxiv.org/pdf/2501.05811},
abstract = {Many High-Performance Computing (HPC) libraries rely on decision trees to select the best kernel hyperparameters at runtime, depending on the input and environment. However, finding optimized configurations for each input and environment is challenging and requires significant manual effort and computational resources. This paper presents MLKAPS, a tool that automates this task using machine learning and adaptive sampling techniques. MLKAPS generates decision trees that tune HPC kernels' design parameters to achieve efficient performance for any user input. MLKAPS scales to large input and design spaces, outperforming similar state-of-the-art auto-tuning tools in tuning time and mean speedup. We demonstrate the benefits of MLKAPS on the highly optimized Intel MKL dgetrf LU kernel and show that MLKAPS finds blindspots in the manual tuning of HPC experts. It improves over 85\% of the inputs with a geomean speedup of x1.30. On the Intel MKL dgeqrf QR kernel, MLKAPS improves performance on 85\% of the inputs with a geomean speedup of x1.18.}
}
@inproceedings{chen2024enabling,
title = {{Enabling Mixed-Precision with the Help of Tools: A Nekbone Case Study}},
author = {Chen, Yanxiang and de Oliveira Castro, Pablo and Bientinesi, Paolo and Iakymchuk, Roman},
year = {2025},
booktitle = {Parallel Processing and Applied Mathematics},
publisher = {Springer Nature},
pages = {34--50},
doi = {10.1007/978-3-031-85697-6_3},
isbn = {978-3-031-85697-6},
abstract = {Mixed-precision computing has the potential to significantly reduce the cost of exascale computations, but determining when and how to implement it in programs can be challenging. In this article, we consider Nekbone, a mini-application for the Computational Fluid Dynamics (CFD) solver Nek5000, as a case study, and propose a methodology for enabling mixed-precision with the help of computer arithmetic tools and the roofline model. We evaluate the derived mixed-precision program by combining metrics in three dimensions: accuracy, time-to-solution, and energy-to-solution. Notably, the introduction of mixed-precision in Nekbone reduces time-to-solution by 40.7\% and energy-to-solution by 47\% on 128 MPI ranks without sacrificing accuracy.}
}
@unpublished{deoliveiracastro2024error,
title = {{Error Analysis of sum-product algorithms under stochastic rounding}},
author = {de Oliveira Castro, Pablo and El Arar, El-Mehdi and Petit, Eric and Sohier, Devan},
note = {working paper or preprint},
year = {2024},
month = nov,
keywords = {Stochastic rounding ; Martingales ; Rounding error analysis ; Floating-point arithmetic ; Computation DAG ; Karatsuba multiplication},
pdf = {https://hal.science/hal-04787542v1/file/main.pdf},
abstract = {The quality of numerical computations can be measured through their forward error, for which finding good error bounds is challenging in general. For several algorithms and using stochastic rounding (SR), probabilistic analysis has been shown to be an effective alternative for obtaining tight error bounds. This analysis considers the distribution of errors and evaluates the algorithm's performance on average. Using martingales and the Azuma-Hoeffding inequality, it provides error bounds that are valid with a certain probability and in O(√n u) instead of deterministic worst-case bounds in O(nu), where n is the number of operations and u is the unit roundoff. In this paper, we present a general method that automatically constructs a martingale for any computation scheme with multi-linear errors based on additions, subtractions, and multiplications. We apply this generalization to algorithms previously studied with SR, such as pairwise summation and the Horner algorithm, and prove equivalent results. We also analyze a previously unstudied algorithm, Karatsuba polynomial multiplication, which illustrates that the method can handle reused intermediate computations.}
}
@inproceedings{delval2024verificarloCI,
author = {Delval, Aurélien and Coppens, François and Petit, Eric
and Iakymchuk, Roman and de Oliveira Castro, Pablo},
title = {{Verificarlo CI: Continuous Integration for Numerical Optimization and Debugging}},
volume = {69},
address = {Jülich},
publisher = {Forschungszentrum Jülich GmbH Zentralbibliothek, Verlag},
reportid = {FZJ-2025-02465},
series = {Schriften des Forschungszentrums Jülich IAS Series},
pages = {104--107},
year = {2025},
booktitle = {Proceedings of the 35th Parallel CFD
International Conference 2024},
abstract = {Floating-point accuracy is an important concern when
            developing numerical simulations or other compute-intensive
            codes. Tracking the introduction of numerical regressions is
            often delayed until they provoke unexpected bugs for the
            end user. In this paper, we introduce Verificarlo CI, a
            continuous integration workflow for the numerical
            optimization and debugging of a code over the course of its
            development. We demonstrate the applicability of Verificarlo
            CI on two test-case applications.},
month = sep,
date = {2024-09-02},
doi = {10.34734/FZJ-2025-02465},
url = {https://juser.fz-juelich.de/record/1041833},
pdf = {https://juser.fz-juelich.de/record/1041833/files/106.pdf}
}
@article{elarar2024bounds,
title = {{Bounds on non-linear errors for variance computation with stochastic rounding}},
author = {El Arar, El-Mehdi and Sohier, Devan and de Oliveira Castro, Pablo and Petit, Eric},
journal = {SIAM Journal on Scientific Computing},
volume = {46},
number = {5},
pages = {B579--B599},
year = {2024},
pdf = {https://hal.science/hal-04056057/file/main.pdf},
doi = {10.1137/23M1563001},
abstract = {The main objective of this work is to investigate nonlinear errors and pairwise summation using stochastic rounding (SR) in variance computation algorithms. We estimate the forward error of computations under SR through two methods: the first is based on a bound of the variance and the Bienaymé–Chebyshev inequality, while the second is based on martingales and the Azuma–Hoeffding inequality. The study shows that for pairwise summation, using SR results in a probabilistic bound of the forward error proportional to \(\sqrt{\log (n)}u\) rather than the deterministic bound in \(O(\log (n)u)\) when using the default rounding mode. We examine two algorithms that compute the variance, one called “textbook” and the other “two-pass,” which both exhibit nonlinear errors. Using the two methods mentioned above, we show that the forward errors of these algorithms have probabilistic bounds under SR in \(O(\sqrt{n}u)\) instead of \(nu\) for the deterministic bounds. We show that this advantage holds using pairwise summation for both textbook and two-pass, with probabilistic bounds of the forward error proportional to \(\sqrt{\log (n)}u\).}
}