//==============================================================
// Copyright Intel Corporation
//
// SPDX-License-Identifier: MIT
// =============================================================
#include <stdexcept>
#include <string>
#include <sycl/ext/intel/fpga_extensions.hpp>
#include <sycl/sycl.hpp>

#include "exception_handler.hpp"

using namespace sycl;
using namespace std;

// Forward declare the kernel names in the global scope.
// This FPGA best practice reduces name mangling in the optimization reports.
class UnOptKernel;  // Names the single_task submitted by Unoptimized().
class OptKernel;    // Names the single_task submitted by Optimized().

// Largest supported problem dimension. The arrays declared inside both
// kernels are sized kMaxN * kMaxN and kMaxN, and main() uses this value as
// both the default and the upper bound of the runtime size n.
constexpr size_t kMaxN = 150;


// Baseline kernel: sums all N * N elements of vec_a (read as a row-major
// N x N matrix) plus the first N elements of vec_b into a single scalar.
//
// Parameters:
//   q      - queue the kernel is submitted to.
//   vec_a  - input; the kernel reads elements [0, N * N).
//   vec_b  - input; the kernel reads elements [0, N).
//   result - host variable backing the one-element output buffer.
//   N      - runtime dimension. Must not exceed kMaxN: the arrays inside the
//            kernel are sized from kMaxN and nothing here checks N.
//
// Returns the event of the submitted command group.
event Unoptimized(queue &q, const vector<double> &vec_a,
                  const vector<double> &vec_b, double &result, size_t N) {
  buffer b_a(vec_a);
  buffer b_b(vec_b);
  // One-element buffer over the caller's variable. Per SYCL buffer semantics
  // the value is copied back to `result` when b_result is destroyed at the end
  // of this function.
  buffer b_result(&result, range(1));

  auto e = q.submit([&](handler &h) {
    accessor a(b_a, h, read_only);
    accessor b(b_b, h, read_only);
    // no_init: the previous contents of `result` are never read by the kernel.
    accessor result(b_result, h, write_only, no_init);

    h.single_task<UnOptKernel>([=]() {
      double sum = 0;
      double local_a[kMaxN * kMaxN], local_b[kMaxN];

      // Copy to local memory for speed
      for (size_t i = 0; i < N * N; i++) local_a[i] = a[i];
      for (size_t i = 0; i < N; i++) local_b[i] = b[i];

      // The same accumulator `sum` is updated in the inner loop and again in
      // the outer loop body, so each outer iteration depends on the value left
      // by the previous iteration's inner loop.
      // NOTE(review): presumably this is the loop-carried dependency that
      // Optimized() removes - confirm against the optimization report.
      for (size_t i = 0; i < N; i++) {
        for (size_t j = 0; j < N; j++) {
          sum += local_a[i * N + j];
        }
        sum += local_b[i];
      }
      result[0] = sum;
    });
  });
  return e;
}

// Restructured kernel computing the same total as Unoptimized(): the sum of
// the first N * N elements of vec_a (read as a row-major N x N matrix) plus
// the first N elements of vec_b.
//
// Each row is accumulated into its own variable (sum_2), declared inside the
// outer loop, and only then folded into the running total `sum`. The inner
// loop therefore never touches the accumulator that is carried across outer
// iterations.
//
// The additions happen in a different order than in Unoptimized(), so for
// arbitrary floating-point inputs the two results may differ in rounding.
//
// Parameters:
//   q      - queue the kernel is submitted to.
//   vec_a  - input; the kernel reads elements [0, N * N).
//   vec_b  - input; the kernel reads elements [0, N).
//   result - host variable backing the one-element output buffer.
//   N      - runtime dimension. Must not exceed kMaxN: the arrays inside the
//            kernel are sized from kMaxN and nothing here checks N.
//
// Returns the event of the submitted command group.
event Optimized(queue &q, const vector<double> &vec_a,
                const vector<double> &vec_b, double &result, size_t N) {
  buffer b_a(vec_a);
  buffer b_b(vec_b);
  // One-element buffer over the caller's variable. Per SYCL buffer semantics
  // the value is copied back to `result` when b_result is destroyed at the end
  // of this function.
  buffer b_result(&result, range(1));

  auto e = q.submit([&](handler &h) {
    accessor a(b_a, h, read_only);
    accessor b(b_b, h, read_only);
    // no_init: the previous contents of `result` are never read by the kernel.
    accessor result(b_result, h, write_only, no_init);

    // NOTE(review): [[intel::kernel_args_restrict]] presumably asserts that the
    // kernel's accessor arguments do not alias one another - confirm against
    // the Intel FPGA kernel attribute documentation.
    h.single_task<OptKernel>([=]() [[intel::kernel_args_restrict]] {
      double sum = 0;
      double local_a[kMaxN * kMaxN], local_b[kMaxN];

      // Copy to local memory
      for (size_t i = 0; i < N * N; i++) local_a[i] = a[i];
      for (size_t i = 0; i < N; i++) local_b[i] = b[i];

      for (size_t i = 0; i < N; i++) {
        // Step 1: Definition
        // A fresh per-row accumulator, reset on every outer iteration.
        double sum_2 = 0;

        // Step 2: Accumulation of array A values for one outer loop iteration
        for (size_t j = 0; j < N; j++) {
          sum_2 += local_a[i * N + j];
        }

        // Step 3: Addition of array B value for an outer loop iteration
        // `sum` is only touched here, outside the inner loop.
        sum += sum_2;
        sum += local_b[i];
      }

      result[0] = sum;
    });
  });
  return e;
}

// Prints the execution time of the command associated with `e`, labelled with
// `kind`.
//
// Parameters:
//   e    - event returned by a submission to a queue created with
//          property::queue::enable_profiling.
//   q    - unused; kept so existing callers compile unchanged.
//   kind - human-readable label printed before the timing line.
//
// The time is reported in seconds when FPGA_SIMULATOR is defined and in
// milliseconds otherwise.
void PrintTime(const event &e, queue &q, const char *kind) {
  // The profiling queries return 64-bit nanosecond timestamps. Subtract them
  // as integers and only then convert to double: converting each absolute
  // timestamp to double first can round away low-order nanoseconds when the
  // timestamps are large.
  const auto start_ns =
      e.get_profiling_info<info::event_profiling::command_start>();
  const auto end_ns =
      e.get_profiling_info<info::event_profiling::command_end>();
  const double elapsed_ns = static_cast<double>(end_ns - start_ns);

  cout << "Run: " << kind << ":\n";
#if defined(FPGA_SIMULATOR)
  const double kernel_time = elapsed_ns * 1e-9;
  cout << "kernel time : " << kernel_time << " s\n";
#else
  const double kernel_time = elapsed_ns * 1e-6;
  cout << "kernel time : " << kernel_time << " ms\n";
#endif
}

// Entry point: runs the Unoptimized and Optimized kernels on the selected FPGA
// target, prints their kernel times, and checks both results against a value
// computed on the host.
//
// Usage: <executable> [data size]
//   data size - optional problem dimension n; clamped to [100, kMaxN].
//               Defaults to kMaxN.
//
// Returns 0 and prints "PASSED" when both kernels match the host result.
// Returns 1 (printing "FAILED") on a mismatch, on -h/--help, or when the data
// size argument cannot be parsed as an integer.
int main(int argc, char *argv[]) {
  // Smallest problem size that is run; smaller requests are raised to it.
  constexpr size_t kMinN = 100;
  size_t n = kMaxN;

  if (argc > 1) {
    string option(argv[1]);
    if (option == "-h" || option == "--help") {
      cout << "Usage: <executable> <data size>\n\nFAILED\n";
      return 1;
    }
    try {
      // Parse as a signed value so that a negative request is clamped to the
      // minimum below instead of wrapping around to a huge size_t.
      const int requested = stoi(option);
      n = requested < 0 ? size_t{0} : static_cast<size_t>(requested);
    } catch (const std::logic_error &) {
      // stoi reports bad input via std::invalid_argument or std::out_of_range,
      // both derived from std::logic_error. Fail with a usage message rather
      // than letting the exception abort the program.
      std::cerr << "Invalid data size: " << option << '\n';
      cout << "Usage: <executable> <data size>\n\nFAILED\n";
      return 1;
    }
  }
  // Cap the value of n.
  n = std::max(std::min(n, kMaxN), kMinN);
  cout << "Number of elements: " << n << '\n';

  vector<double> vec_a(n * n);
  vector<double> vec_b(n);

  double answer = 0;

  // initialize data and compute golden result
  // All values are small integers, so the exact comparison against the device
  // results below does not depend on the order of the additions.
  for (size_t i = 0; i < n; i++) {
    for (size_t j = 0; j < n; j++) {
      vec_a[i * n + j] = i + j;
      answer += i + j;
    }
    vec_b[i] = i;
    answer += i;
  }

  // Initialize queue with device selector and enabling profiling
  // Create queue, get platform and device
#if FPGA_SIMULATOR
  auto selector = sycl::ext::intel::fpga_simulator_selector_v;
#elif FPGA_HARDWARE
  auto selector = sycl::ext::intel::fpga_selector_v;
#else  // #if FPGA_EMULATOR
  auto selector = sycl::ext::intel::fpga_emulator_selector_v;
#endif

#ifndef FPGA_HARDWARE
  cout << "\nEmulator and simulator outputs do not demonstrate true "
          "hardware performance. The design may need to run on actual "
          "hardware to observe the performance benefit of the optimization "
          "exemplified in this tutorial.\n\n";
#endif

  // Initialised to -1 so that a kernel that never writes its result is
  // reported as a mismatch by the check below.
  double unopt_sum = -1, opt_sum = -1;

  try {
    // Create a profiling queue
    queue q(selector, fpga_tools::exception_handler,
            property::queue::enable_profiling{});

    auto device = q.get_device();

    std::cout << "Running on device: "
              << device.get_info<sycl::info::device::name>().c_str()
              << std::endl;

    // compute result on device
    PrintTime(Unoptimized(q, vec_a, vec_b, unopt_sum, n), q, "Unoptimized");
    PrintTime(Optimized(q, vec_a, vec_b, opt_sum, n), q, "Optimized");

    // q's destructor invokes q's exception handler on any device exceptions.
  } catch (sycl::exception const &e) {
    // Catches exceptions in the host code
    std::cerr << "Caught a SYCL host exception:\n" << e.what() << "\n";

    // Most likely the runtime couldn't find FPGA hardware!
    if (e.code().value() == CL_DEVICE_NOT_FOUND) {
      std::cerr << "If you are targeting an FPGA, please ensure that your "
                   "system has a correctly configured FPGA board.\n";
      std::cerr << "Run sys_check in the oneAPI root directory to verify.\n";
      std::cerr << "If you are targeting the FPGA emulator, compile with "
                   "-DFPGA_EMULATOR.\n";
    }
    std::terminate();
  }

  // Check the results
  bool failed = false;
  if (unopt_sum != answer) {
    cout << "Unoptimized: expected: " << answer << ", result: " << unopt_sum
         << '\n';
    failed = true;
  }
  if (opt_sum != answer) {
    cout << "Optimized: expected: " << answer << ", result: " << opt_sum
         << '\n';
    failed = true;
  }

  if (failed) {
    cout << "FAILED\n";
    return 1;
  }
  cout << "PASSED\n";
  return 0;
}
