test/yosupo/convolution.test.cpp

View this file on GitHub
Last update: 2024-07-11 22:21:57+09:00
Problem: https://judge.yosupo.jp/problem/convolution_mod

Code

#define PROBLEM "https://judge.yosupo.jp/problem/convolution_mod"
#include "include/mtl/convolution.hpp"
#include <bits/stdc++.h>

using namespace std;
using ll = long long;

int main() {
  cin.tie(nullptr); ios::sync_with_stdio(false);

  int N,M; cin>>N>>M;
  vector<Modular998244353> A(N), B(M);
  for (auto& a : A) cin>>a;
  for (auto& b : B) cin>>b;
  auto C = convolution(A, B);
  C.resize(N+M-1);
  for (int i = 0; i < N+M-1; i++)
    cout << C[i] << " ";
  cout << endl;

  return 0;
}

#line 1 "test/yosupo/convolution.test.cpp"
#define PROBLEM "https://judge.yosupo.jp/problem/convolution_mod"
#line 2 "include/mtl/fft.hpp"
#include <vector>
#include <complex>

class Fft {
 public:
  using complex_t = std::complex<double>;
  using value_type = complex_t;
 private:
  size_t n_;

 public:
  explicit Fft(size_t size) {
    n_ = 1;
    while (n_ < size)
      n_ *= 2;
  };

  size_t n() const { return n_; }

  void fft_inline(std::vector<complex_t>& f) const {
    _fft(f);
  }

  std::vector<complex_t> fft(const std::vector<complex_t>& f) const {
    auto ff = f;
    ff.resize(n_, 0);
    _fft(ff);
    return ff;
  }

  void ifft_inline(std::vector<complex_t>& f) const {
    _ifft(f);
  }

  std::vector<complex_t> ifft(const std::vector<complex_t>& f) const {
    auto ff = f;
    ff.resize(n_, 0);
    _ifft(ff);
    return ff;
  }

 private:
  template <bool Forward>
  void _fft_impl(std::vector<complex_t>& f) const {
    // iterative bit reversal
    for (size_t i = 0, j = 1; j < n_-1; j++) {
      for (size_t k = n_ >> 1; k > (i ^= k); k >>= 1);
      if (i < j) std::swap(f[i], f[j]);
    }
    // Cooley-Tukey FFT
    for (size_t m = 1; m < n_; m *= 2) {
      double theta0 = Forward ? M_PI/m : -M_PI/m;
      for (size_t chunk = 0; chunk < n_; chunk += 2*m) {
        for (size_t i = 0; i < m; i++) {
          complex_t w = {cos(theta0*i), -sin(theta0*i)};
          auto p = chunk + i;
          auto a = f[p + 0];
          auto b = f[p + m] * w;
          f[p + 0] = a + b;
          f[p + m] = a - b;
        }
      }
    }
  }

  void _fft(std::vector<complex_t>& f) const {
    _fft_impl<true>(f);
  }
  void _ifft(std::vector<complex_t>& f) const {
    _fft_impl<false>(f);
    for (auto& x : f) {
      x /= n_;
    }
  }

};

#line 2 "include/mtl/bit_manip.hpp"
#include <cstdint>
#include <cassert>
#if __cplusplus >= 202002L
#ifndef MTL_CPP20
#define MTL_CPP20
#endif
#include <bit>
#endif

namespace bm {

/// Count 1s for each 8 bits
inline constexpr uint64_t popcnt_e8(uint64_t x) {
  x = (x & 0x5555555555555555) + ((x>>1) & 0x5555555555555555);
  x = (x & 0x3333333333333333) + ((x>>2) & 0x3333333333333333);
  x = (x & 0x0F0F0F0F0F0F0F0F) + ((x>>4) & 0x0F0F0F0F0F0F0F0F);
  return x;
}
/// Count 1s
inline constexpr unsigned popcnt(uint64_t x) {
#ifdef MTL_CPP20
  return std::popcount(x);
#else
  return (popcnt_e8(x) * 0x0101010101010101) >> 56;
#endif
}
/// Alias to mtl::popcnt(x)
constexpr unsigned popcount(uint64_t x) {
  return popcnt(x);
}
/// Count trailing 0s. s.t. *11011000 -> 3
inline constexpr unsigned ctz(uint64_t x) {
#ifdef MTL_CPP20
  return std::countr_zero(x);
#else
  return popcnt((x & (-x)) - 1);
#endif
}
/// Alias to mtl::ctz(x)
constexpr unsigned countr_zero(uint64_t x) {
  return ctz(x);
}
/// Count trailing 1s. s.t. *11011011 -> 2
inline constexpr unsigned cto(uint64_t x) {
#ifdef MTL_CPP20
  return std::countr_one(x);
#else
  return ctz(~x);
#endif
}
/// Alias to mtl::cto(x)
constexpr unsigned countr_one(uint64_t x) {
  return cto(x);
}
inline constexpr unsigned ctz8(uint8_t x) {
  return x == 0 ? 8 : popcnt_e8((x & (-x)) - 1);
}
/// [00..0](8bit) -> 0, [**..*](not only 0) -> 1
inline constexpr uint8_t summary(uint64_t x) {
  constexpr uint64_t hmask = 0x8080808080808080ull;
  constexpr uint64_t lmask = 0x7F7F7F7F7F7F7F7Full;
  auto a = x & hmask;
  auto b = x & lmask;
  b = hmask - b;
  b = ~b;
  auto c = (a | b) & hmask;
  c *= 0x0002040810204081ull;
  return uint8_t(c >> 56);
}
/// Extract target area of bits
inline constexpr uint64_t bextr(uint64_t x, unsigned start, unsigned len) {
  uint64_t mask = len < 64 ? (1ull<<len)-1 : 0xFFFFFFFFFFFFFFFFull;
  return (x >> start) & mask;
}
/// 00101101 -> 00111111 -count_1s-> 6
inline constexpr unsigned log2p1(uint8_t x) {
  if (x & 0x80)
    return 8;
  uint64_t p = uint64_t(x) * 0x0101010101010101ull;
  p -= 0x8040201008040201ull;
  p = ~p & 0x8080808080808080ull;
  p = (p >> 7) * 0x0101010101010101ull;
  p >>= 56;
  return p;
}
/// 00101100 -mask_mssb-> 00100000 -to_index-> 5
inline constexpr unsigned mssb8(uint8_t x) {
  assert(x != 0);
  return log2p1(x) - 1;
}
/// 00101100 -mask_lssb-> 00000100 -to_index-> 2
inline constexpr unsigned lssb8(uint8_t x) {
  assert(x != 0);
  return popcnt_e8((x & -x) - 1);
}
/// Count leading 0s. 00001011... -> 4
inline constexpr unsigned clz(uint64_t x) {
#ifdef MTL_CPP20
  return std::countl_zero(x);
#else
  if (x == 0)
    return 64;
  auto i = mssb8(summary(x));
  auto j = mssb8(bextr(x, 8 * i, 8));
  return 63 - (8 * i + j);
#endif
}
/// Alias to mtl::clz(x)
constexpr unsigned countl_zero(uint64_t x) {
  return clz(x);
}
/// Count leading 1s. 11110100... -> 4
inline constexpr unsigned clo(uint64_t x) {
#ifdef MTL_CPP20
  return std::countl_one(x);
#else
  return clz(~x);
#endif
}
/// Alias to mtl::clo(x)
constexpr unsigned countl_one(uint64_t x) {
  return clo(x);
}

inline constexpr unsigned clz8(uint8_t x) {
  return x == 0 ? 8 : 7 - mssb8(x);
}
inline constexpr uint64_t bit_reverse(uint64_t x) {
  x = ((x & 0x00000000FFFFFFFF) << 32) | ((x & 0xFFFFFFFF00000000) >> 32);
  x = ((x & 0x0000FFFF0000FFFF) << 16) | ((x & 0xFFFF0000FFFF0000) >> 16);
  x = ((x & 0x00FF00FF00FF00FF) << 8) | ((x & 0xFF00FF00FF00FF00) >> 8);
  x = ((x & 0x0F0F0F0F0F0F0F0F) << 4) | ((x & 0xF0F0F0F0F0F0F0F0) >> 4);
  x = ((x & 0x3333333333333333) << 2) | ((x & 0xCCCCCCCCCCCCCCCC) >> 2);
  x = ((x & 0x5555555555555555) << 1) | ((x & 0xAAAAAAAAAAAAAAAA) >> 1);
  return x;
}

/// Check if x is power of 2. 00100000 -> true, 00100001 -> false
constexpr bool has_single_bit(uint64_t x) noexcept {
#ifdef MTL_CPP20
  return std::has_single_bit(x);
#else
  return x != 0 && (x & (x - 1)) == 0;
#endif
}

/// Bit width needs to represent x. 00110110 -> 6
constexpr int bit_width(uint64_t x) noexcept {
#ifdef MTL_CPP20
  return std::bit_width(x);
#else
  return 64 - clz(x);
#endif
}

/// Ceil power of 2. 00110110 -> 01000000
constexpr uint64_t bit_ceil(uint64_t x) {
#ifdef MTL_CPP20
  return std::bit_ceil(x);
#else
  if (x == 0) return 1;
  return 1ull << bit_width(x - 1);
#endif
}

/// Floor power of 2. 00110110 -> 00100000
constexpr uint64_t bit_floor(uint64_t x) {
#ifdef MTL_CPP20
  return std::bit_floor(x);
#else
  if (x == 0) return 0;
  return 1ull << (bit_width(x) - 1);
#endif
}

} // namespace bm
#line 3 "include/mtl/modular.hpp"
#include <iostream>
#line 5 "include/mtl/modular.hpp"

template <int MOD>
class Modular {
 private:
  unsigned int val_;

 public:
  static constexpr unsigned int mod() { return MOD; }
  template<class T>
  static constexpr unsigned int safe_mod(T v) {
    auto x = (long long)(v%(long long)mod());
    if (x < 0) x += mod();
    return (unsigned int) x;
  }

  constexpr Modular() : val_(0) {}
  template<class T,
      std::enable_if_t<
          std::is_integral<T>::value && std::is_unsigned<T>::value
      > * = nullptr>
  constexpr Modular(T v) : val_(v%mod()) {}
  template<class T,
      std::enable_if_t<
          std::is_integral<T>::value && !std::is_unsigned<T>::value
      > * = nullptr>
  constexpr Modular(T v) : val_(safe_mod(v)) {}

  constexpr unsigned int val() const { return val_; }
  constexpr Modular& operator+=(Modular x) {
    val_ += x.val();
    if (val_ >= mod()) val_ -= mod();
    return *this;
  }
  constexpr Modular operator-() const { return {mod() - val_}; }
  constexpr Modular& operator-=(Modular x) {
    val_ += mod() - x.val();
    if (val_ >= mod()) val_ -= mod();
    return *this;
  }
  constexpr Modular& operator*=(Modular x) {
    auto v = (long long) val_ * x.val();
    if (v >= mod()) v %= mod();
    val_ = v;
    return *this;
  }
  constexpr Modular pow(long long p) const {
    assert(p >= 0);
    Modular t = 1;
    Modular u = *this;
    while (p) {
      if (p & 1)
        t *= u;
      u *= u;
      p >>= 1;
    }
    return t;
  }
  friend constexpr Modular pow(Modular x, long long p) {
    return x.pow(p);
  }
  constexpr Modular inv() const { return pow(mod()-2); }
  constexpr Modular& operator/=(Modular x) { return *this *= x.inv(); }
  constexpr Modular operator+(Modular x) const { return Modular(*this) += x; }
  constexpr Modular operator-(Modular x) const { return Modular(*this) -= x; }
  constexpr Modular operator*(Modular x) const { return Modular(*this) *= x; }
  constexpr Modular operator/(Modular x) const { return Modular(*this) /= x; }
  constexpr Modular& operator++() { return *this += 1; }
  constexpr Modular operator++(int) { Modular c = *this; ++(*this); return c; }
  constexpr Modular& operator--() { return *this -= 1; }
  constexpr Modular operator--(int) { Modular c = *this; --(*this); return c; }

  constexpr bool operator==(Modular x) const { return val() == x.val(); }
  constexpr bool operator!=(Modular x) const { return val() != x.val(); }

  constexpr bool is_square() const {
    return pow((mod()-1)/2) == 1;
  }
  /**
   * Return x s.t. x * x = a mod p
   * reference: https://zenn.dev/peria/articles/c6afc72b6b003c
  */
  constexpr Modular sqrt() const {
    if (!is_square()) 
      throw std::runtime_error("not square");
    auto mod_eight = mod() % 8;
    if (mod_eight == 3 || mod_eight == 7) {
      return pow((mod()+1)/4);
    } else if (mod_eight == 5) {
      auto x = pow((mod()+3)/8);
      if (x * x != *this)
        x *= Modular(2).pow((mod()-1)/4);
      return x;
    } else {
      Modular d = 2;
      while (d.is_square())
        d += 1;
      auto t = mod()-1;
      int s = bm::ctz(t);
      t >>= s;
      auto a = pow(t);
      auto D = d.pow(t);
      int m = 0;
      Modular dt = 1;
      Modular du = D;
      for (int i = 0; i < s; i++) {
        if ((a*dt).pow(1u<<(s-1-i)) == -1) {
          m |= 1u << i;
          dt *= du;
        }
        du *= du;
      }
      return pow((t+1)/2) * D.pow(m/2);
    }
  }

  friend std::ostream& operator<<(std::ostream& os, const Modular& x) {
    return os << x.val();
  }
  friend std::istream& operator>>(std::istream& is, Modular& x) {
    return is >> x.val_;
  }

};

using Modular998244353 = Modular<998244353>;
using Modular1000000007 = Modular<(int)1e9+7>;

template<int Id=0>
class DynamicModular {
 private:
  static unsigned int mod_;
  unsigned int val_;

 public:
  static unsigned int mod() { return mod_; }
  static void set_mod(unsigned int m) { mod_ = m; }
  template<class T>
  static constexpr unsigned int safe_mod(T v) {
    auto x = (long long)(v%(long long)mod());
    if (x < 0) x += mod();
    return (unsigned int) x;
  }

  constexpr DynamicModular() : val_(0) {}
  template<class T,
      std::enable_if_t<
          std::is_integral<T>::value && std::is_unsigned<T>::value
      > * = nullptr>
  constexpr DynamicModular(T v) : val_(v%mod()) {}
  template<class T,
      std::enable_if_t<
          std::is_integral<T>::value && !std::is_unsigned<T>::value
      > * = nullptr>
  constexpr DynamicModular(T v) : val_(safe_mod(v)) {}

  constexpr unsigned int val() const { return val_; }
  constexpr DynamicModular& operator+=(DynamicModular x) {
    val_ += x.val();
    if (val_ >= mod()) val_ -= mod();
    return *this;
  }
  constexpr DynamicModular operator-() const { return {mod() - val_}; }
  constexpr DynamicModular& operator-=(DynamicModular x) {
    val_ += mod() - x.val();
    if (val_ >= mod()) val_ -= mod();
    return *this;
  }
  constexpr DynamicModular& operator*=(DynamicModular x) {
    auto v = (long long) val_ * x.val();
    if (v >= mod()) v %= mod();
    val_ = v;
    return *this;
  }
  constexpr DynamicModular pow(long long p) const {
    assert(p >= 0);
    DynamicModular t = 1;
    DynamicModular u = *this;
    while (p) {
      if (p & 1)
        t *= u;
      u *= u;
      p >>= 1;
    }
    return t;
  }
  friend constexpr DynamicModular pow(DynamicModular x, long long p) {
    return x.pow(p);
  }
  // TODO: implement when mod is not prime
  constexpr DynamicModular inv() const { return pow(mod()-2); }
  constexpr DynamicModular& operator/=(DynamicModular x) { return *this *= x.inv(); }
  constexpr DynamicModular operator+(DynamicModular x) const { return DynamicModular(*this) += x; }
  constexpr DynamicModular operator-(DynamicModular x) const { return DynamicModular(*this) -= x; }
  constexpr DynamicModular operator*(DynamicModular x) const { return DynamicModular(*this) *= x; }
  constexpr DynamicModular operator/(DynamicModular x) const { return DynamicModular(*this) /= x; }
  constexpr DynamicModular& operator++() { return *this += 1; }
  constexpr DynamicModular operator++(int) { DynamicModular c = *this; ++(*this); return c; }
  constexpr DynamicModular& operator--() { return *this -= 1; }
  constexpr DynamicModular operator--(int) { DynamicModular c = *this; --(*this); return c; }

  constexpr bool operator==(DynamicModular x) const { return val() == x.val(); }
  constexpr bool operator!=(DynamicModular x) const { return val() != x.val(); }

  constexpr bool is_square() const {
    return val() == 0 or pow((mod()-1)/2) == 1;
  }
  /**
   * Return x s.t. x * x = a mod p
   * reference: https://zenn.dev/peria/articles/c6afc72b6b003c
  */
  constexpr DynamicModular sqrt() const {
    // assert mod is prime
    if (!is_square()) 
      throw std::runtime_error("not square");
    if (val() < 2)
      return val();
    auto mod_eight = mod() % 8;
    if (mod_eight == 3 || mod_eight == 7) {
      return pow((mod()+1)/4);
    } else if (mod_eight == 5) {
      auto x = pow((mod()+3)/8);
      if (x * x != *this)
        x *= DynamicModular(2).pow((mod()-1)/4);
      return x;
    } else {
      DynamicModular d = 2;
      while (d.is_square())
        ++d;
      auto t = mod()-1;
      int s = bm::ctz(t);
      t >>= s;
      auto a = pow(t);
      auto D = d.pow(t);
      int m = 0;
      DynamicModular dt = 1;
      DynamicModular du = D;
      for (int i = 0; i < s; i++) {
        if ((a*dt).pow(1u<<(s-1-i)) == -1) {
          m |= 1u << i;
          dt *= du;
        }
        du *= du;
      }
      return pow((t+1)/2) * D.pow(m/2);
    }
  }

  friend std::ostream& operator<<(std::ostream& os, const DynamicModular& x) {
    return os << x.val();
  }
  friend std::istream& operator>>(std::istream& is, DynamicModular& x) {
    return is >> x.val_;
  }

};
template<int Id>
unsigned int DynamicModular<Id>::mod_;

#line 264 "include/mtl/modular.hpp"

template<class ModInt>
struct ModularUtil {
  static constexpr int mod = ModInt::mod();
  static struct inv_table {
    std::vector<ModInt> tb{0,1};
    inv_table() : tb({0,1}) {}
  } inv_;
  void set_inv(int n) {
    int m = inv_.tb.size();
    if (m > n) return;
    inv_.tb.resize(n+1);
    for (int i = m; i < n+1; i++)
      inv_.tb[i] = -inv_.tb[mod % i] * (mod / i);
  }
  ModInt& inv(int i) {
    set_inv(i);
    return inv_.tb[i];
  }
};
template<class ModInt>
typename ModularUtil<ModInt>::inv_table ModularUtil<ModInt>::inv_;

#include <array>

namespace math {

constexpr int mod_pow_constexpr(int x, int p, int m) {
  long long t = 1;
  long long u = x;
  while (p) {
    if (p & 1) {
      t *= u;
      t %= m;
    }
    u *= u;
    u %= m;
    p >>= 1;
  }
  return (int) t;
}

constexpr int primitive_root_constexpr(int m) {
  if (m == 2) return 1;
  if (m == 167772161) return 3;
  if (m == 469762049) return 3;
  if (m == 754974721) return 11;
  if (m == 880803841) return 26;
  if (m == 998244353) return 3;

  std::array<int, 20> divs{};
  int cnt = 0;
  int x = m-1;
  if (x % 2 == 0) {
    divs[cnt++] = 2;
    x >>= bm::ctz(x);
  }
  for (int d = 3; d*d <= x; d += 2) {
    if (x % d == 0) {
      divs[cnt++] = d;
      while (x % d == 0)
        x /= d;
    }
  }
  if (x > 1) divs[cnt++] = x;
  for (int g = 2; g < m; g++) {
    bool ok = true;
    for (int i = 0; i < cnt; i++) {
      if (mod_pow_constexpr(g, (m-1) / divs[i], m) == 1) {
        ok = false;
        break;
      }
    }
    if (ok) return g;
  }
  return -1;
}

template<int m>
constexpr int primitive_root = primitive_root_constexpr(m);

}
#line 6 "include/mtl/ntt.hpp"

namespace math {

constexpr int ceil_pow2(unsigned long long x) {
  return x == 0 ? 0 : 64-bm::clz(x-1);
}

}

#ifndef MTL_ARRAY_SET_CONSTEXPR
#if __cplusplus >= 201703L
#define MTL_ARRAY_SET_CONSTEXPR constexpr
#else
#define MTL_ARRAY_SET_CONSTEXPR
#endif
#endif

namespace _ntt {

template<int mod>
struct ntt_info {
  using mint = Modular<mod>;
  static constexpr int primitive_root() {
    return math::primitive_root<mod>;
  }
  static constexpr int log_n() {
    return bm::ctz(mod-1);
  }
  static MTL_ARRAY_SET_CONSTEXPR
  std::array<Modular<mod>, log_n()> coeff(bool forward) {
    mint r = mint(primitive_root()).pow((mod-1) >> log_n());
    std::array<mint, log_n()> coeff{};
    mint iw = forward ? r.inv() : r;
    for (int i = log_n()-1; i >= 0; i--) {
      coeff[i] = iw;
      iw *= iw;
    }
    return coeff;
  }
};

template<class T>
void _iterative_bit_reverse(std::vector<T>& f) {
  int log_n = math::ceil_pow2(f.size());
  int n = 1 << log_n;
  for (int i = 0, j = 1; j < n-1; j++) {
    for (int k = n >> 1; k > (i ^= k); k >>= 1);
    if (i < j) std::swap(f[i], f[j]);
  }
}

template <bool Forward, bool BitReverse, int mod>
void _fft_impl(std::vector<Modular<mod>>& f) {
  using mint = Modular<mod>;
  int log_n = math::ceil_pow2(f.size());
  using info = ntt_info<mod>;
  assert(info::log_n() >= log_n);
  int n = 1 << log_n;
  f.resize(n, 0);
  if constexpr (!Forward and BitReverse)
    _iterative_bit_reverse(f);
  // Cooley-Tukey FFT
  static MTL_ARRAY_SET_CONSTEXPR auto coeff = info::coeff(Forward);
  if constexpr (Forward) {
    for (int log_m = log_n-1; log_m >= 0; log_m -= 2) {
      int m = 1<<log_m;
      mint w0 = coeff[log_m];
      if (log_m == 0) {
        for (int chunk = 0; chunk < n; chunk += 2*m) {
          mint w = 1;
          for (int i = 0; i < m; i++) {
            auto p = chunk + i;
            auto a = f[p + 0];
            auto b = f[p + m];
            f[p + 0] = (a + b);
            f[p + m] = (a - b) * w;
            w *= w0;
          }
        }
      } else { // 4-base
        auto w1 = coeff[log_m-1];
        auto wh = w0.pow(m/2).val();
        for (int chunk = 0; chunk < n; chunk += 2*m) {
          mint w = 1, x = 1;
          for (int i = 0; i < m/2; i++) {
            auto p = chunk + i;
            auto ia = p + 0*m/2;
            auto ib = p + 1*m/2;
            auto ic = p + 2*m/2;
            auto id = p + 3*m/2;
            long long a = f[ia].val(), b = f[ib].val(), c = f[ic].val(), d = f[id].val();
            auto s = a + c, t = b + d, u = (a - c + mod) * w.val() % mod, v = (b - d + mod) * w.val() % mod * wh % mod;
            f[ia] = (s + t);
            f[ib] = (s - t) * x.val();
            f[ic] = (u + v);
            f[id] = (u - v) * x.val();
            w *= w0;
            x *= w1;
          }
        }
      }
    }
  } else {
    int _log_m = 0;
    if (log_n % 2 == 1) {
      for (int chunk = 0; chunk < n; chunk += 2) {
        auto p = chunk;
        auto a = f[p + 0];
        auto b = f[p + 1];
        f[p + 0] = a + b;
        f[p + 1] = a - b;
      }
      _log_m = 1;
    }
    // 4-base
    for (int log_m = _log_m; log_m < log_n; log_m += 2) {
      int m = 1<<(log_m+1);
      mint w0 = coeff[log_m];
      auto w1 = coeff[log_m+1];
      auto wh = w1.pow(m/2).val();
      for (int chunk = 0; chunk < n; chunk += 2*m) {
        mint w = 1, x = 1;
        for (int i = 0; i < m/2; i++) {
          auto p = chunk + i;
          auto ia = p + 0*m/2;
          auto ib = p + 1*m/2;
          auto ic = p + 2*m/2;
          auto id = p + 3*m/2;
          long long a = f[ia].val(), b = (long long)f[ib].val() * w.val() % mod, 
                    c = f[ic].val(), d = (long long)f[id].val() * w.val() % mod;
          auto s = a + b, t = a - b + mod, u = (c + d) * x.val() % mod, v = (c - d + mod) * x.val() % mod * wh % mod;
          f[ia] = s + u;
          f[ib] = t + v;
          f[ic] = s - u; 
          f[id] = t - v;
          w *= w0;
          x *= w1;
        }
      }
    }
  }
  if constexpr (Forward and BitReverse)
    _iterative_bit_reverse(f);
  if constexpr (!Forward) {
    mint inv = mint(n).inv();
    for (int i = 0; i < n; i++) f[i] *= inv;
  }
}
template <int mod>
void _fft(std::vector<Modular<mod>>& f) { 
  _fft_impl<true, true>(f); 
}
template <int mod>
void _ifft(std::vector<Modular<mod>>& f) { 
  _fft_impl<false, true>(f); 
}
template <int mod>
void _convolution_fft(std::vector<Modular<mod>>& f) { 
  _fft_impl<true, false>(f); 
}
template <int mod>
void _convolution_ifft(std::vector<Modular<mod>>& f) { 
  _fft_impl<false, false>(f); 
}

} // namespace _ntt

template<class mint>
void ntt_inline(std::vector<mint>& f) {
  _ntt::_fft(f);
}
template<class mint>
std::vector<mint> ntt(std::vector<mint> f) {
  _ntt::_fft(f);
  return f;
}
template<class mint>
void intt_inline(std::vector<mint>& f) {
  _ntt::_ifft(f);
}
template<class mint>
std::vector<mint> intt(std::vector<mint> f) {
  _ntt::_ifft(f);
  return f;
}

template<class mint>
void ntt_convolution_inline(std::vector<mint>& f) {
  _ntt::_convolution_fft(f);
}
template<class mint>
std::vector<mint> ntt_convolution(std::vector<mint> f) {
  _ntt::_convolution_fft(f);
  return f;
}
template<class mint>
void intt_convolution_inline(std::vector<mint>& f) {
  _ntt::_convolution_ifft(f);
}
template<class mint>
std::vector<mint> intt_convolution(std::vector<mint> f) {
  _ntt::_convolution_ifft(f);
  return f;
}
#line 4 "include/mtl/convolution.hpp"

const int CONVOLUTION_NAIVE_THRESHOLD = 60;

template<class T>
std::vector<T> convolution_naive(const std::vector<T>& f, const std::vector<T>& g) {
  auto n = f.size(), m = g.size();
  if (n == 0 or m == 0)
    return {};
  std::vector<T> h(n+m-1);
  if (n < m) {
    for (size_t j = 0; j < m; j++)
      for (size_t i = 0; i < n; i++)
        h[i+j] += f[i] * g[j];
  } else {
    for (size_t i = 0; i < n; i++)
      for (size_t j = 0; j < m; j++)
        h[i+j] += f[i] * g[j];
  }
  return h;
}

template<class T>
std::vector<T> convolution_fft(std::vector<T> f, std::vector<T> g) {
  auto n = f.size(), m = g.size();
  size_t len = n + m - 1;
  size_t l = 1 << math::ceil_pow2(len);
  if (n+m-3 < l/2) {
    auto fb = f.back(), gb = g.back();
    f.pop_back(); g.pop_back();
    std::vector<T> h(n+m-1, 0);
    h[n+m-2] = fb * gb;
    for (size_t i = 0; i < n; i++)
      h[i + m-1] += f[i] * gb;
    for (size_t i = 0; i < m; i++)
      h[i + n-1] += g[i] * fb;
    auto fg = convolution(std::move(f), std::move(g));
    for (size_t i = 0; i < n+m-3; i++)
      h[i] += fg[i];
    return h;
  }
  Fft fft(len);
  f.resize(fft.n(), 0);
  g.resize(fft.n(), 0);
  auto same = f == g;
  fft.fft_inline(f);
  if (same) {
    for (size_t i = 0; i < fft.n(); i++)
      f[i] *= f[i];
  } else {
    fft.fft_inline(g);
    for (size_t i = 0; i < fft.n(); i++)
      f[i] *= g[i];
  }
  fft.ifft_inline(f);
  std::vector<T> res(len);
  for (size_t i = 0; i < len; i++)
    res[i] = f[i].real();
  return res;
}

template<int Mod>
std::vector<Modular<Mod>> convolution_ntt(std::vector<Modular<Mod>> f, std::vector<Modular<Mod>> g) {
  auto n = f.size(), m = g.size();
  size_t l = 1 << math::ceil_pow2(n + m - 1);
  if (n+m-3 < l/2) {
    auto fb = f.back(), gb = g.back();
    f.pop_back(); g.pop_back();
    std::vector<Modular<Mod>> h(n+m-1, 0);
    h[n+m-2] = fb * gb;
    for (size_t i = 0; i < n-1; i++)
      h[i + m-1] += f[i] * gb;
    for (size_t i = 0; i < m-1; i++)
      h[i + n-1] += g[i] * fb;
    auto fg = convolution(std::move(f), std::move(g));
    for (size_t i = 0; i < fg.size(); i++)
      h[i] += fg[i];
    return h;
  }
  f.resize(l, 0);
  g.resize(l, 0);
  bool same = f == g;
  ntt_convolution_inline(f);
  if (same) {
    for (size_t i = 0; i < l; i++) 
      f[i] *= f[i];
  } else {
    ntt_convolution_inline(g);
    for (size_t i = 0; i < l; i++) 
      f[i] *= g[i];
  }
  intt_convolution_inline(f);
  return f;
}

template<int Mod>
std::vector<Modular<Mod>> convolution_fft(std::vector<Modular<Mod>> f, std::vector<Modular<Mod>> g) {
  return convolution_ntt(std::move(f), std::move(g));
}

std::vector<int> convolution_fft(const std::vector<int>& a, const std::vector<int>& b) {
  auto n = a.size(), m = b.size();
  std::vector<Modular998244353> f(n), g(m);
  for (size_t i = 0; i < n; i++)
    f[i] = a[i];
  for (size_t i = 0; i < m; i++)
    g[i] = b[i];
  auto h = convolution_fft(std::move(f), std::move(g));
  std::vector<int> res(n+m-1);
  for (size_t i = 0; i < n+m-1; i++)
    res[i] = h[i].val();
  return res;
}

template<class T>
std::vector<T> convolution(const std::vector<T>& f, const std::vector<T>& g) {
  auto n = f.size(), m = g.size();
  if (n == 0 or m == 0)
    return {};
  if (std::min(n, m) <= CONVOLUTION_NAIVE_THRESHOLD)
    return convolution_naive(f, g);
  return convolution_fft(f, g);
}

template<class T>
std::vector<T> convolution(std::vector<T>&& f, std::vector<T>&& g) {
  auto n = f.size(), m = g.size();
  if (n == 0 or m == 0)
    return {};
  if (std::min(n, m) <= CONVOLUTION_NAIVE_THRESHOLD)
    return convolution_naive(f, g);
  return convolution_fft(std::move(f), std::move(g));
}

template<class mint>
std::vector<mint> convolution_garner(const std::vector<mint>& f, 
                                     const std::vector<mint>& g) {
  auto n = f.size();
  auto m = g.size();
  if (n == 0 or m == 0)
    return {};
  constexpr long long nttprimes[] = {754974721, 167772161, 469762049};
  using mint0 = Modular<754974721>;
  using mint1 = Modular<167772161>;
  using mint2 = Modular<469762049>;
  std::vector<mint0> a0(n), b0(m);
  std::vector<mint1> a1(n), b1(m);
  std::vector<mint2> a2(n), b2(m);
  for(size_t i = 0; i < n; i++) a0[i] = f[i].val(), a1[i] = f[i].val(), a2[i] = f[i].val();
  for(size_t i = 0; i < m; i++) b0[i] = g[i].val(), b1[i] = g[i].val(), b2[i] = g[i].val();
  auto c0 = convolution_ntt(a0, b0);
  auto c1 = convolution_ntt(a1, b1);
  auto c2 = convolution_ntt(a2, b2);
  constexpr long long m01 = (long long) nttprimes[0] * nttprimes[1];
  constexpr long long m0_inv_m1 = mint1(nttprimes[0]).inv().val();
  constexpr long long m01_inv_m2 = mint2(m01).inv().val();
  const int mod = mint::mod();
  auto garner = [&](mint0 x0, mint1 x1, mint2 x2) -> mint {
    int r0 = x0.val(), r1 = x1.val(), r2 = x2.val();
    int v1 = (m0_inv_m1 * (r1 + nttprimes[1] - r0)) % nttprimes[1];
    auto v2 = (mint2(r2) - r0 - mint2(nttprimes[0]) * v1) * mint2(m01_inv_m2);
    return mint(r0 + 1LL * nttprimes[0] * v1 + m01 % mod * v2.val());
  };
  std::vector<mint> c(c0.size());
  for(size_t i = 0; i < c.size(); i++) c[i] = garner(c0[i], c1[i], c2[i]);
  return c;
}

template<int Mod>
std::vector<Modular<Mod>> convolution_large(const std::vector<Modular<Mod>>& f, 
                                            const std::vector<Modular<Mod>>& g) {
  auto n = f.size();
  auto m = g.size();
  if (n == 0 or m == 0)
    return {};
  constexpr auto h = bm::ctz(Mod-1);
  if (n+m-1 < 1u<<h)
    return convolution(f ,g);
  size_t l = h-1;
  int cs = ((n-1)>>l)+1;
  int ds = ((m-1)>>l)+1;
  using result_type = std::vector<Modular<Mod>>;
  std::vector<result_type> c(cs, result_type(2<<l));
  std::vector<result_type> d(ds, result_type(2<<l));
  size_t mask = (1<<l)-1;
  for (size_t i = 0; i < n; i++)
    c[i>>l][i&mask] = f[i];
  for (size_t i = 0; i < m; i++)
    d[i>>l][i&mask] = g[i];
  for (auto& v: c) ntt_convolution_inline(v);
  for (auto& v: d) ntt_convolution_inline(v);
  result_type res(n+m-1);
  result_type e(2<<l);
  for (int i = 0; i < cs+ds-1; i++) {
    e.assign(2<<l, 0);
    for (int j = std::max(i-ds+1, 0); j < std::min(cs, i+1); j++) {
      int k = i-j;
      assert(k >= 0 and k < ds);
      for (int x = 0; x < 2<<l; x++)
        e[x] += c[j][x] * d[k][x];
    }
    intt_convolution_inline(e);
    auto len = std::min((size_t)2<<l, n+m-1-(i<<l));
    for (size_t x = 0; x < len; x++)
      res[(i<<l) + x] += e[x];
  }
  return res;
}

size_t complexity_of_convolution(size_t fsize, size_t gsize) {
  int lg = math::ceil_pow2(fsize + gsize - 1);
  return lg << lg;
}
#line 3 "test/yosupo/convolution.test.cpp"
#include <bits/stdc++.h>

using namespace std;
using ll = long long;

int main() {
  cin.tie(nullptr); ios::sync_with_stdio(false);

  int N,M; cin>>N>>M;
  vector<Modular998244353> A(N), B(M);
  for (auto& a : A) cin>>a;
  for (auto& b : B) cin>>b;
  auto C = convolution(A, B);
  C.resize(N+M-1);
  for (int i = 0; i < N+M-1; i++)
    cout << C[i] << " ";
  cout << endl;

  return 0;
}

Test cases

Env	Name	Status	Elapsed	Memory
g++	example_00	AC	6 ms	3 MB
g++	example_01	AC	5 ms	3 MB
g++	fft_killer_00	AC	227 ms	18 MB
g++	fft_killer_01	AC	232 ms	18 MB
g++	max_ans_zero_00	AC	233 ms	18 MB
g++	max_random_00	AC	227 ms	18 MB
g++	max_random_01	AC	216 ms	18 MB
g++	medium_00	AC	9 ms	4 MB
g++	medium_01	AC	7 ms	4 MB
g++	medium_02	AC	8 ms	4 MB
g++	medium_all_zero_00	AC	7 ms	4 MB
g++	medium_pre_suf_zero_00	AC	6 ms	4 MB
g++	medium_pre_suf_zero_01	AC	6 ms	4 MB
g++	medium_pre_suf_zero_02	AC	6 ms	4 MB
g++	medium_pre_suf_zero_03	AC	6 ms	4 MB
g++	medium_pre_suf_zero_04	AC	6 ms	4 MB
g++	random_00	AC	200 ms	16 MB
g++	random_01	AC	212 ms	16 MB
g++	random_02	AC	98 ms	11 MB
g++	signed_overflow_00	AC	6 ms	3 MB
g++	small_00	AC	5 ms	3 MB
g++	small_01	AC	5 ms	3 MB
g++	small_02	AC	5 ms	3 MB
g++	small_03	AC	5 ms	3 MB
g++	small_04	AC	6 ms	3 MB
g++	small_05	AC	6 ms	3 MB
g++	small_06	AC	5 ms	3 MB
g++	small_07	AC	5 ms	3 MB
g++	small_08	AC	5 ms	3 MB
g++	small_09	AC	5 ms	3 MB
g++	small_10	AC	6 ms	3 MB
g++	small_11	AC	5 ms	3 MB
g++	small_12	AC	5 ms	3 MB
g++	small_13	AC	6 ms	3 MB
g++	small_14	AC	5 ms	3 MB
g++	small_15	AC	5 ms	3 MB
g++	small_and_large_00	AC	158 ms	15 MB
g++	small_and_large_01	AC	160 ms	15 MB
g++	small_and_large_02	AC	159 ms	13 MB
g++	small_and_large_03	AC	160 ms	13 MB
g++	unsigned_overflow_00	AC	6 ms	3 MB
clang++	example_00	AC	6 ms	3 MB
clang++	example_01	AC	5 ms	3 MB
clang++	fft_killer_00	AC	219 ms	18 MB
clang++	fft_killer_01	AC	216 ms	18 MB
clang++	max_ans_zero_00	AC	225 ms	18 MB
clang++	max_random_00	AC	229 ms	18 MB
clang++	max_random_01	AC	220 ms	18 MB
clang++	medium_00	AC	9 ms	4 MB
clang++	medium_01	AC	7 ms	4 MB
clang++	medium_02	AC	8 ms	4 MB
clang++	medium_all_zero_00	AC	7 ms	4 MB
clang++	medium_pre_suf_zero_00	AC	6 ms	4 MB
clang++	medium_pre_suf_zero_01	AC	6 ms	4 MB
clang++	medium_pre_suf_zero_02	AC	6 ms	4 MB
clang++	medium_pre_suf_zero_03	AC	6 ms	4 MB
clang++	medium_pre_suf_zero_04	AC	6 ms	4 MB
clang++	random_00	AC	192 ms	16 MB
clang++	random_01	AC	209 ms	17 MB
clang++	random_02	AC	100 ms	11 MB
clang++	signed_overflow_00	AC	6 ms	3 MB
clang++	small_00	AC	5 ms	3 MB
clang++	small_01	AC	5 ms	3 MB
clang++	small_02	AC	5 ms	3 MB
clang++	small_03	AC	5 ms	3 MB
clang++	small_04	AC	5 ms	3 MB
clang++	small_05	AC	5 ms	3 MB
clang++	small_06	AC	5 ms	3 MB
clang++	small_07	AC	6 ms	3 MB
clang++	small_08	AC	5 ms	3 MB
clang++	small_09	AC	5 ms	3 MB
clang++	small_10	AC	5 ms	3 MB
clang++	small_11	AC	5 ms	3 MB
clang++	small_12	AC	5 ms	3 MB
clang++	small_13	AC	5 ms	3 MB
clang++	small_14	AC	5 ms	3 MB
clang++	small_15	AC	5 ms	3 MB
clang++	small_and_large_00	AC	169 ms	16 MB
clang++	small_and_large_01	AC	163 ms	16 MB
clang++	small_and_large_02	AC	162 ms	13 MB
clang++	small_and_large_03	AC	165 ms	13 MB
clang++	unsigned_overflow_00	AC	6 ms	3 MB