diff --git a/include/ml/math/cg_solve.hpp b/include/ml/math/cg_solve.hpp new file mode 100644 index 0000000..50006a1 --- /dev/null +++ b/include/ml/math/cg_solve.hpp @@ -0,0 +1,107 @@ +#ifndef INCLUDE_ML_MATH_CG_SOLVE_HPP +#define INCLUDE_ML_MATH_CG_SOLVE_HPP + +#include "ml/math/mat_mul.hpp" +#include "ml/math/mat_ops.hpp" +#include "ml/math/vec_ops.hpp" + +namespace ml +{ + +namespace detail +{ + +class ml_cg_update_x_r; +class ml_cg_update_p; + +template +void update_x_r(queue& q, vector_t& v1, vector_t& v2, T alpha) { + q.submit([&](handler& cgh) { + auto v1_acc = v1.template get_access_1d(cgh); + auto v2_acc = v2.template get_access_1d(cgh); + cgh.parallel_for>(v1.get_nd_range(), [=](nd_item<1> item) { + auto row = item.get_global(0); + v1_acc(row) += alpha * v2_acc(row); + }); + }); +} + +template +void update_p(queue& q, vector_t& p, vector_t& r, T factor) { + q.submit([&](handler& cgh) { + auto p_acc = p.template get_access_1d(cgh); + auto r_acc = r.template get_access_1d(cgh); + cgh.parallel_for>(p.get_nd_range(), [=](nd_item<1> item) { + auto row = item.get_global(0); + p_acc(row) = r_acc(row) + factor * p_acc(row); + }); + }); +} + +} // detail + +/** + * @brief Solve the system Ax = b where A is a symmetric SPD matrix of size nxn. + * + * Uses the Conjugate Gradient method. + * + * @tparam T + * @param q + * @param[in] a of size nxn + * @param[in] b of size n + * @param[in,out] x of size n, it is used as an initial guess, if you have none, it must be set to 0 + * @param r temporary buffer must be at least of size n + * @param p temporary buffer must be at least of size n + * @param Ap temporary buffer must be at least of size n + */ +template +void cg_solve(queue& q, matrix_t& a, vector_t& b, vector_t& x, + vector_t& r, vector_t& p, vector_t& Ap, T epsilon = 1E-4) { + auto n = a.data_range[0]; + a.assert_square(); + assert_less_or_eq(n, x.data_range[0]); + assert_less_or_eq(n, r.data_range[0]); + assert_less_or_eq(n, p.data_range[0]); + assert_less_or_eq(n, Ap.data_range[0]); + + sycl_copy(q, b, r); + sycl_copy(q, b, p); + + T rs_old = sycl_inner_product(q, r); + T rs_new = 0; + + for (SYCLIndexT i = 0; i < n; ++i) { + mat_mul(q, A, p, Ap); + alpha = rs_old / sycl_inner_product(q, p, Ap); + detail::update_x_r(q, x, p, alpha); + detail::update_x_r(q, r, Ap, -alpha); + rs_new = sycl_inner_product(q, r); + if (std::sqrt(rs_new) < epsilon) + break; + detail::update_p(q, p, r, rs_new / rs_old); + rs_old = rs_new; + } +} + +/** + * @brief Solve the system Ax = b and create any necessary temporary buffers. + * + * @see cg_solve(queue&, matrix_t&, vector_t&, vector_t&, vector_t&, vector_t&, vector_t&) + * @tparam T + * @param q + * @param[in] a + * @param[in] b + * @param[out] x + */ +template +void cg_solve(queue& q, matrix_t& a, vector_t& b, vector_t& x, T epsilon = 1E-4) { + auto n = a.data_range[0]; + vector_t r(b.data_range, b.kernel_range); + vector_t p(b.data_range, b.kernel_range); + vector_t Ap(b.data_range, b.kernel_range); + cg_solve(q, a, b, x, r, p, Ap, epsilon); +} + +} // ml + +#endif //INCLUDE_ML_MATH_CG_SOLVE_HPP diff --git a/include/ml/math/eig.hpp b/include/ml/math/eig.hpp new file mode 100644 index 0000000..200c93d --- /dev/null +++ b/include/ml/math/eig.hpp @@ -0,0 +1,153 @@ +#ifndef INCLUDE_ML_MATH_EIG_HPP +#define INCLUDE_ML_MATH_EIG_HPP + +#include +#include +#include +#include + +#include "ml/math/mat_inv.hpp" +#include "ml/math/mat_mul.hpp" +#include "ml/utils/copy.hpp" + +namespace ml +{ + +namespace detail { + +/** + * @brief Rayleigh quotient iteration + * + * v(i+1) = (A - Lambda(i)*I)^-1 * v(i) / ||(A - Lambda(i)*I)^-1 * v(i)|| + * Lambda(i) = v(i)' * A * v(i) / v(i)' * v(i) + * + * + * @tparam T + * @param q + * @param matrix + * @param act_eig_vec + * @param eig_val + * @param vec_buffer + * @param mat_buffer + * @param c_buffer + * @param block_buffer + * @param actual_vec_range + * @param vec_range_pow2 + */ +template +void rayleigh_vec_iteration(queue& q, matrix_t& matrix, vector_t& act_eig_vec, T eig_val, + vector_t& vec_buffer, matrix_t& mat_buffer, + matrix_t& c_buffer, matrix_t& block_buffer, + const nd_range<1>& actual_vec_range, const nd_range<1>& vec_range_pow2) { + auto data_dim = actual_vec_range.get_global_range(0); + mat_inv(q, matrix, mat_buffer, c_buffer, block_buffer, data_dim, -eig_val); //TODO inv_solve + mat_mul_vec(q, mat_buffer, act_eig_vec, vec_buffer, data_dim, data_dim); + T norm = sycl_norm(q, vec_buffer); + + // Copy normalized vec_buffer in act_eig_vec + return q.submit([&](handler& cgh) { + auto tmp_acc = vec_buffer.template get_access(cgh); + auto vec_acc = act_eig_vec.template get_access(cgh); + cgh.parallel_for>(vec_range_pow2, [=](nd_item<1> item) { + auto row = item.get_global(0); + vec_acc[row] = tmp_acc[row] / norm; + }); + }); +} + +// Assumes matrix is symmetric +// A_(i+1) = A_i - Lambda(i) * v(i) * v(i)' +// TODO: try to use the symmetry for optimization +template +handler_event deflate(queue& q, matrix_t& matrix, vector_t& act_eig_vec, T eig_val, + const nd_range<2>& matrix_range) { + TIME(eig_deflate); + return q.submit([&](handler& cgh) { + auto vec_acc = act_eig_vec.template get_access(cgh); + auto matrix_acc = matrix.template get_access(cgh); + cgh.parallel_for>(matrix_range, [=](nd_item<2> item) { + auto row = item.get_global(0); + auto col = item.get_global(1); + matrix_acc[row][col] -= eig_val * vec_acc[row] * vec_acc[col]; + }); + }); +} + +} // detail + +// Compute enough eigenpairs (eigenvalue with its eigenvector) to satisfies keep_percent. +// Assumes matrix is symmetric and its size is a power of 2. Note that data is overwritten during this call. +// +// data_dim is the number of row of the matrix if it were not rounded by the next power of 2. +// +// nb_eigenpairs is the number of eigenpairs to compute (which is also the number of rows of the output matrix). +// +// epsilon is used to stop the Rayleigh iteration whenever the difference between the previous and actual +// eigenvector is smaller than that. +// +// max_rayleigh_iter is used to stop the rayleigh iteration if it does not converge anymore (precision issue?) +template +matrix_t eig(queue& q, matrix_t& matrix, SYCLIndexT data_dim, SYCLIndexT nb_eigenpairs = 0, + double epsilon = 1e-3, unsigned max_rayleigh_iter = 15) { + matrix.assert_square(); + const auto matrix_global_range = matrix.get_range(); + const auto data_dim_pow2 = matrix_global_range[0]; + assert((data_dim_pow2 & (data_dim_pow2 - 1)) == 0 && "Matrix size must be a power of 2"); + assert(nb_eigenpairs <= data_dim && "Cannot ask for more eigenvectors than there are variables"); + + if (nb_eigenpairs == 0) + nb_eigenpairs = data_dim; + matrix_t eig_vecs(range<2>(nb_eigenpairs, data_dim_pow2)); + + const auto matrix_range = get_optimal_nd_range(matrix_global_range); + const auto index_order = ::detail::compute_index_order(q, matrix, data_dim); + + vector_t act_eig_vec{range<1>(data_dim_pow2)}; // Allocated to be a power of 2 + vector_t vec_buffer{range<1>(data_dim_pow2)}; // Used as a temporary vector buffer + matrix_t mat_buffer{matrix.get_range()}; // Used as a temporary matrix buffer + matrix_t c_buffer(range<2>(data_dim, 2 * data_dim)); // Temporary buffer for matrix inversion + matrix_t block_buffer(range<2>(data_dim, data_dim + 1)); // Temporary buffer for matrix inversion + const auto actual_vec_range = get_optimal_nd_range(range<1>(data_dim)); + const auto vec_range_pow2 = get_optimal_nd_range(act_eig_vec.get_range()); + sycl_memset(q, vec_buffer, vec_range_pow2); + sycl_memset(q, mat_buffer, matrix_range); + T eig_val_diff; + + // Iterate to compute enough eigenpairs with Hotteling's deflation. + // A_(i+1) = A_i - Lambda(i) * v(i) * v(i)' + // Assuming v is normalized. + // This deflation allow A_(i+1) to have the same eigenpairs than A_i excluding the pair (Lambda(i), v(i)). + T eig_val; + T eig_vals_sum = 0; + IndexT act_rayleigh_iter; + + std::cout.precision(8); + for (IndexT i = 0; i < nb_eigenpairs; ++i) { + ::detail::write_initial_guess(q, matrix, act_eig_vec, index_order[i], vec_range_pow2); + + // Rayleigh quotient iteration: + // Lambda(i) = v(i)' * A * v(i) + // v(i+1) = (A - Lambda(i)*I)^-1 * v(i) / ||(A - Lambda(i)*I)^-1 * v(i)|| + // Note: no need to divide lambda by the norm of v if the guess is already normalized + ::detail::rayleigh_vec_iteration(q, matrix, act_eig_vec, eig_val, + vec_buffer, mat_buffer, c_buffer, block_buffer, + actual_vec_range, vec_range_pow2); + + std::cout << "#" << i; + std::cout << "\t nb_rayleigh_iter=" << act_rayleigh_iter; + std::cout << "\t eigenvalue diff=" << std::fixed << eig_val_diff; + std::cout << "\t eigenvalue=" << eig_val << " / " << eig_vals_sum << std::endl; + + copy_vec_to_row(q, eig_vecs, act_eig_vec, vec_range_pow2, i); + eig_vals_sum += eig_val; + + if (i < nb_eigenpairs - 1) + ::detail::deflate(q, matrix, act_eig_vec, eig_val, matrix_range); + } + + return eig_vecs; +} + +} // ml + +#endif //INCLUDE_ML_MATH_EIG_HPP diff --git a/include/ml/math/svd.hpp b/include/ml/math/svd.hpp index fc3e76d..e9b253e 100644 --- a/include/ml/math/svd.hpp +++ b/include/ml/math/svd.hpp @@ -1,11 +1,6 @@ #ifndef INCLUDE_ML_MATH_SVD_HPP #define INCLUDE_ML_MATH_SVD_HPP -#include -#include -#include -#include - #include "ml/math/vec_ops.hpp" #include "ml/math/mat_mul.hpp"