/*------------------------------------------------------------------------------------------------*
 * Copyright (C) by the DBCSR developers group - All rights reserved                              *
 * This file is part of the DBCSR library.                                                        *
 *                                                                                                *
 * For information on the license, see the LICENSE file.                                          *
 * For further information please visit https://dbcsr.cp2k.org                                    *
 * SPDX-License-Identifier: GPL-2.0+                                                              *
 *------------------------------------------------------------------------------------------------*/
#if defined(__OPENCL)
#include "opencl_libsmm.h"
/* Header opencl_kernels.h is generated by the build system using acc_opencl.sh */
#include "opencl_kernels.h"
#include "../../acc_bench.h"
#include <libxsmm_sync.h>
#include <assert.h>
#if defined(_OPENMP)
# include <omp.h>
#endif

/* Registry wrappers: LIBXSMM releases at or past the version/patch level checked below
 * take an additional key_hash argument (passed as NULL here); older releases are called
 * with the shorter argument list.
 */
#if LIBXSMM_VERSION3(1, 16, 1) <= LIBXSMM_VERSION3(LIBXSMM_VERSION_MAJOR, \
    LIBXSMM_VERSION_MINOR, LIBXSMM_VERSION_UPDATE) && 808 <= LIBXSMM_VERSION_PATCH
# define OPENCL_LIBSMM_REGISTER(KEY, KEY_SIZE, VALUE_SIZE, VALUE_INIT) \
    libxsmm_xregister(KEY, KEY_SIZE, VALUE_SIZE, VALUE_INIT, NULL/*key_hash*/)
# define OPENCL_LIBSMM_DISPATCH(KEY, KEY_SIZE) \
    libxsmm_xdispatch(KEY, KEY_SIZE, NULL/*key_hash*/)
#else
# define OPENCL_LIBSMM_REGISTER(KEY, KEY_SIZE, VALUE_SIZE, VALUE_INIT) \
    libxsmm_xregister(KEY, KEY_SIZE, VALUE_SIZE, VALUE_INIT)
# define OPENCL_LIBSMM_DISPATCH(KEY, KEY_SIZE) \
    libxsmm_xdispatch(KEY, KEY_SIZE)
#endif

/* Maps FUNC through LIBXSMM_USEOMP when compiled with OpenMP, or to FUNC itself otherwise. */
#if defined(_OPENMP)
# define OPENCL_LIBSMM_USEOMP(FUNC) LIBXSMM_USEOMP(FUNC)
#else
# define OPENCL_LIBSMM_USEOMP(FUNC) (FUNC)
#endif

/* Debug switches derived from OPENCL_LIBSMM_DEBUG (unless already set explicitly):
 * - OPENCL_LIBSMM_DEBUG_TRANS: only if OPENCL_LIBSMM_DEBUG is greater than one or negative,
 * - OPENCL_LIBSMM_DEBUG_SMM: whenever OPENCL_LIBSMM_DEBUG is defined,
 * - OPENCL_LIBSMM_DEBUG_EXIT: whenever OPENCL_LIBSMM_DEBUG is defined (the trailing "&& 1"
 *   acts as a manual compile-time switch).
 */
#if !defined(OPENCL_LIBSMM_DEBUG_TRANS) && defined(OPENCL_LIBSMM_DEBUG) \
  && (1 < OPENCL_LIBSMM_DEBUG || 0 > OPENCL_LIBSMM_DEBUG)
# define OPENCL_LIBSMM_DEBUG_TRANS
#endif
#if !defined(OPENCL_LIBSMM_DEBUG_SMM) && defined(OPENCL_LIBSMM_DEBUG)
# define OPENCL_LIBSMM_DEBUG_SMM
#endif
#if !defined(OPENCL_LIBSMM_DEBUG_EXIT) && defined(OPENCL_LIBSMM_DEBUG) && 1
# define OPENCL_LIBSMM_DEBUG_EXIT
#endif
/* Name fragments of the kernel functions: the transpose fragment is used to compose the
 * kernel name in libsmm_acc_transpose, and libsmm_acc_finalize searches the function name
 * of a registered kernel for these fragments to tell transpose- and SMM-kernels apart.
 */
#if !defined(OPENCL_LIBSMM_KERNELNAME_TRANS)
# define OPENCL_LIBSMM_KERNELNAME_TRANS "trans"
#endif
#if !defined(OPENCL_LIBSMM_KERNELNAME_SMM)
# define OPENCL_LIBSMM_KERNELNAME_SMM "smm"
#endif
/* Number of locks per kernel kind (sizes of the lock arrays defined further below);
 * libsmm_acc_transpose asserts that OPENCL_LIBSMM_NLOCKS_TRANS is a power of two.
 */
#if !defined(OPENCL_LIBSMM_NLOCKS_TRANS)
# define OPENCL_LIBSMM_NLOCKS_TRANS 16
#endif
#if !defined(OPENCL_LIBSMM_NLOCKS_SMM)
# define OPENCL_LIBSMM_NLOCKS_SMM 16
#endif
/* NOTE(review): OPENCL_LIBSMM_VLEN and OPENCL_LIBSMM_VMIN are not referenced in this part
 * of the file; presumably they are consumed further below - confirm before changing.
 */
#if !defined(OPENCL_LIBSMM_VLEN)
# define OPENCL_LIBSMM_VLEN 32
#endif
#if !defined(OPENCL_LIBSMM_VMIN)
# define OPENCL_LIBSMM_VMIN 8
#endif

/* approximate arithmetic intensity for SMMs like C += Ai * Bi (beta=1):
 * 2*M*N*K operations divided by TYPESIZE * (M*K + K*N), i.e., the traffic
 * of C is not part of the denominator. The result is of type double.
 */
#define OPENCL_LIBSMM_AI(M, N, K, TYPESIZE) ( \
  (2.0 * (M) * (N) * (K)) / ((TYPESIZE) * (K) * ((M) + (N))))

/* size in Byte of an element of the given type-id: 8 for dbcsr_type_real_8,
 * 4 for dbcsr_type_real_4, and zero for any other type-id.
 */
#define OPENCL_LIBSMM_TYPESIZE(TYPEID) ( \
  dbcsr_type_real_8 == (TYPEID) \
    ? 8 : (dbcsr_type_real_4 == (TYPEID) \
    ? 4 : 0/*unknown*/))

/* In-place selection sort (ascending order) of the first SIZE elements of IARR:
 * for every position i, the smallest element of IARR[i+1..SIZE-1] is located (index k),
 * and exchanged with IARR[i] by the means of LIBXSMM_ISWAP if it is smaller than IARR[i].
 * The macro expands to a brace-enclosed block (not an expression), and evaluates
 * both of the arguments multiple times (arguments should be free of side-effects).
 */
#define OPENCL_LIBSMM_ISORT(IARR, SIZE) { int opencl_libsmm_isort_i_ = 0; \
  for (; opencl_libsmm_isort_i_ < ((int)(SIZE) - 1); ++opencl_libsmm_isort_i_) { \
    int opencl_libsmm_isort_j_ = opencl_libsmm_isort_i_ + 2; \
    int opencl_libsmm_isort_k_ = opencl_libsmm_isort_i_ + 1; \
    for (; opencl_libsmm_isort_j_ < ((int)(SIZE)); ++opencl_libsmm_isort_j_) { \
      if ((IARR)[opencl_libsmm_isort_j_] < (IARR)[opencl_libsmm_isort_k_]) { \
        opencl_libsmm_isort_k_ = opencl_libsmm_isort_j_; \
      } \
    } \
    if ((IARR)[opencl_libsmm_isort_k_] < (IARR)[opencl_libsmm_isort_i_]) { \
      LIBXSMM_ISWAP((IARR)[opencl_libsmm_isort_i_], (IARR)[opencl_libsmm_isort_k_]); \
    } \
  } \
}


#if defined(__cplusplus)
extern "C" {
#endif

/* maintain GFLOPS/AI ratios for performance estimates and suitability:
 * "shst"/"dhst" are measured on the host by libsmm_acc_init (single/double-precision),
 * "sacc"/"dacc" are derived from the tuned parameters (geometric mean of the ratios);
 * all four values are reset to zero by libsmm_acc_finalize.
 */
double opencl_libsmm_shst, opencl_libsmm_dhst, opencl_libsmm_sacc, opencl_libsmm_dacc;
/* collect device name/id persistent/global buffer such that pointer remains valid
 * (registry keys refer to these names by pointer)
 */
char opencl_libsmm_devices[ACC_OPENCL_DEVICES_MAXCOUNT][ACC_OPENCL_BUFFERSIZE];
/* calling clSetKernelArg must be consistent across host-threads
 * (a lock is selected per kernel by hashing the kernel handle)
 */
volatile int opencl_libsmm_lock_trans[OPENCL_LIBSMM_NLOCKS_TRANS];
volatile int opencl_libsmm_lock_smm[OPENCL_LIBSMM_NLOCKS_SMM];
/* track initialization status of LIBSMM (counter: incremented by libsmm_acc_init,
 * decremented by libsmm_acc_finalize)
 */
int opencl_libsmm_initialized;


/**
 * Checks whether constant memory is suitable for the given device.
 * Returns EXIT_SUCCESS only if compiled with OPENCL_LIBSMM_CMEM, both device queries
 * succeeded, and the maximum size of a constant buffer is not smaller than the maximum
 * size of a memory allocation. Otherwise EXIT_FAILURE (or the code of the failed query).
 */
int opencl_libsmm_use_cmem(cl_device_id device)
{
#if !defined(OPENCL_LIBSMM_CMEM)
  /* support for constant memory is disabled at compile-time */
  ACC_OPENCL_UNUSED(device);
  return EXIT_FAILURE;
#else
  cl_ulong max_alloc_size = 1, max_const_size = 0;
  int result = EXIT_SUCCESS;
  ACC_OPENCL_CHECK(clGetDeviceInfo(device, CL_DEVICE_MAX_MEM_ALLOC_SIZE,
    sizeof(cl_ulong), &max_alloc_size, NULL), "retrieve maximum size of memory allocation", result);
  ACC_OPENCL_CHECK(clGetDeviceInfo(device, CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE,
    sizeof(cl_ulong), &max_const_size, NULL), "retrieve maximum size of constant buffer", result);
  if (EXIT_SUCCESS == result && max_const_size < max_alloc_size) {
    result = EXIT_FAILURE; /* constant buffer cannot hold the largest allocation */
  }
  return result;
#endif
}


#if defined(_DEBUG) && defined(OPENCL_LIBSMM_DEBUG) && (0 != OPENCL_LIBSMM_DEBUG)
/**
 * Debug helper: prints m rows of n values each (element (i,j) is read from mat[i*n+j]).
 * The first row is prefixed with the label (NULL is treated like an empty label), and
 * every following row is indented by the width of the label. Values are printed with
 * two fractional digits; an unknown type prints "? " per element.
 */
void opencl_libsmm_print_matrix(FILE* ostream, const char* label,
  libsmm_acc_data_t type, const void* mat, int m, int n)
{
  const char *const prefix = (NULL != label ? label : "");
  const int indent = (int)strlen(prefix);
  int i, j;
  for (i = 0; i < m; ++i) {
    if (0 < i) {
      /* pad an empty string (not " "): a single blank would be emitted
       * even for a zero-width label and misalign the rows
       */
      fprintf(ostream, "%*s", indent, "");
    }
    else {
      fprintf(ostream, "%s", prefix);
    }
    for (j = 0; j < n; ++j) {
      switch (type) {
        case dbcsr_type_real_8: fprintf(ostream, "%.2f ", ((const double*)mat)[i*n+j]); break;
        case dbcsr_type_real_4: fprintf(ostream, "%.2f ", ((const float*) mat)[i*n+j]); break;
        default: fprintf(ostream, "? ");
      }
    }
    fprintf(ostream, "\n");
  }
}
#endif


/**
 * Parses one record of tuned parameters from parambuf (tokenized in-place, i.e., the
 * buffer is modified). Fields are separated by OPENCL_LIBSMM_PARAMS_DELIMS and expected
 * in the following order:
 *   [device,] type, m, n, k, gflops, bs, bm, bn, bk, ws, wg, lu, nz, al, tb, tc, ap, aa, ab, ac
 * The device field is only expected (and stored into *device) if device is not NULL.
 * If perfest is not NULL, the GFLOPS/AI ratio of the record is accumulated (per precision).
 * Returns EXIT_SUCCESS if all expected fields were parsed (and, with perfest given,
 * the type is known), otherwise EXIT_FAILURE. Additional fields are ignored.
 */
int opencl_libsmm_read_params(char* parambuf,
  opencl_libsmm_smmkey_t* key, opencl_libsmm_smm_t* value,
  opencl_libsmm_perfest_t* perfest, char* const* device)
{
  /* remember the terminator of the original string (before strtok inserts terminators) */
  char *const end = parambuf + strlen(parambuf);
  char* s = strtok(parambuf, OPENCL_LIBSMM_PARAMS_DELIMS);
  const int max_consumed = 20 + (NULL == device ? 0 : 1);
  int result = EXIT_SUCCESS, consumed = 0, i;
  int t = (NULL == device ? 1 : 0); /* skip field-0 if no device is requested */
  double gflops = 0;
  assert(NULL != key && NULL != value);
  while (NULL != s) {
    char *const token_end = s + strlen(s);
    switch (t) {
      case 0: if (1 == sscanf(s, "%[^" OPENCL_LIBSMM_PARAMS_DELIMS "]", *device)) {
        ++consumed;
      } break;
      case 1: if (1 == sscanf(s, "%i", &i)) {
        key->type = (libsmm_acc_data_t)i; ++consumed;
      } break;
      case 2: if (1 == sscanf(s, "%i", &i)) {
        key->m = i; ++consumed;
      } break;
      case 3: if (1 == sscanf(s, "%i", &i)) {
        key->n = i; ++consumed;
      } break;
      case 4: if (1 == sscanf(s, "%i", &i)) {
        key->k = i; ++consumed;
      } break;
      case 5: if (1 == sscanf(s, "%lf", &gflops)) {
        assert(0 <= gflops); ++consumed;
      } break;
      case 6: if (1 == sscanf(s, "%i", &i)) {
        value->bs = i; ++consumed;
      } break;
      case 7: if (1 == sscanf(s, "%i", &i)) {
        value->bm = i; ++consumed;
      } break;
      case 8: if (1 == sscanf(s, "%i", &i)) {
        value->bn = i; ++consumed;
      } break;
      case 9: if (1 == sscanf(s, "%i", &i)) {
        value->bk = i; ++consumed;
      } break;
      case 10: if (1 == sscanf(s, "%i", &i)) {
        value->ws = i; ++consumed;
      } break;
      case 11: if (1 == sscanf(s, "%i", &i)) {
        value->wg = i; ++consumed;
      } break;
      case 12: if (1 == sscanf(s, "%i", &i)) {
        value->lu = i; ++consumed;
      } break;
      case 13: if (1 == sscanf(s, "%i", &i)) {
        value->nz = i; ++consumed;
      } break;
      case 14: if (1 == sscanf(s, "%i", &i)) {
        value->al = i; ++consumed;
      } break;
      case 15: if (1 == sscanf(s, "%i", &i)) {
        value->tb = i; ++consumed;
      } break;
      case 16: if (1 == sscanf(s, "%i", &i)) {
        value->tc = i; ++consumed;
      } break;
      case 17: if (1 == sscanf(s, "%i", &i)) {
        value->ap = i; ++consumed;
      } break;
      case 18: if (1 == sscanf(s, "%i", &i)) {
        value->aa = i; ++consumed;
      } break;
      case 19: if (1 == sscanf(s, "%i", &i)) {
        value->ab = i; ++consumed;
      } break;
      case 20: if (1 == sscanf(s, "%i", &i)) {
        value->ac = i; ++consumed;
      } break;
      default: break; /* surplus fields are ignored */
    }
    /* Continue right behind the current token (strtok is restarted explicitly rather than
     * relying on its internal state). The last token ends at the original terminator:
     * stepping past it would read beyond the string, hence tokenizing stops there.
     */
    s = (token_end < end ? strtok(token_end + 1, OPENCL_LIBSMM_PARAMS_DELIMS) : NULL);
    ++t;
  }
  if (max_consumed == consumed) { /* all fields present (implies gflops was parsed) */
    if (NULL != perfest) {
      switch (key->type) {
        case dbcsr_type_real_8: {
          const double ratio = gflops / OPENCL_LIBSMM_AI(key->m, key->n, key->k, sizeof(double));
          libxsmm_kahan_sum(log(ratio), &perfest->gf_ai_dratio_sumlog, &perfest->gf_ai_dratio_kahan);
          if (perfest->gf_ai_dratio_max < ratio) perfest->gf_ai_dratio_max = ratio;
          ++perfest->dcount;
        } break;
        case dbcsr_type_real_4: {
          const double ratio = gflops / OPENCL_LIBSMM_AI(key->m, key->n, key->k, sizeof(float));
          libxsmm_kahan_sum(log(ratio), &perfest->gf_ai_sratio_sumlog, &perfest->gf_ai_sratio_kahan);
          if (perfest->gf_ai_sratio_max < ratio) perfest->gf_ai_sratio_max = ratio;
          ++perfest->scount;
        } break;
        default: result = EXIT_FAILURE;
      }
    }
  }
  else {
    result = EXIT_FAILURE;
  }
  return result;
}


/**
 * Determines the device associated with the stream, and selects the name under which
 * tuned parameters are looked up (*config). The name of the device is compared against
 * the names collected in opencl_libsmm_devices; without a match, the first collected
 * name is used (unless OPENCL_LIBSMM_DEVMATCH is defined), or OPENCL_LIBSMM_PARAMS_DEVICE
 * (if defined), or *config remains NULL. Returns the status of the device queries.
 */
int opencl_libsmm_device(void* stream, cl_device_id* device, const char** config)
{
  int result = c_dbcsr_acc_opencl_device(stream, device);
  assert(NULL != config);
  if (EXIT_SUCCESS == result) {
    char devname[ACC_OPENCL_BUFFERSIZE];
    result = clGetDeviceInfo(*device, CL_DEVICE_NAME,
      ACC_OPENCL_BUFFERSIZE, devname, NULL);
    *config = NULL;
    if (CL_SUCCESS == result) {
      int nempty = 0, idx = 0;
      /* scan the collected names until a match is found or a second empty slot is seen */
      while (idx < ACC_OPENCL_DEVICES_MAXCOUNT && NULL == *config) {
        const char *const known = opencl_libsmm_devices[idx];
        if ('\0' != *known) {
          if (0 == strncmp(devname, known, ACC_OPENCL_BUFFERSIZE)) *config = known;
        }
        else if (2 <= ++nempty) {
          break;
        }
        ++idx;
      }
      if (NULL == *config) { /* no matching device */
#if !defined(OPENCL_LIBSMM_DEVMATCH)
        /* fall back to the first collected device (if any) */
        if ('\0' != *opencl_libsmm_devices[0]) {
          *config = opencl_libsmm_devices[0];
        }
#endif
#if defined(OPENCL_LIBSMM_PARAMS_DEVICE)
        /* fall back to the device of the built-in parameters */
        if (NULL == *config) {
          *config = OPENCL_LIBSMM_PARAMS_DEVICE;
        }
#endif
      }
    }
  }
  return result;
}


/**
 * Initializes LIBSMM (counted; only the first call performs the work):
 * - initializes the ACC interface (unless built with __DBCSR_ACC) and LIBXSMM,
 * - registers tuned SMM-parameters taken from the file named by the environment variable
 *   OPENCL_LIBSMM_SMM_PARAMS, or from the value of that variable itself (if it is not
 *   a readable file), or from the built-in parameters OPENCL_LIBSMM_PARAMS_SMM if nothing
 *   was loaded so far; a value starting with '0' disables loading parameters,
 * - if compiled with OPENCL_LIBSMM_SUITABLE (and not disabled by the environment variable
 *   of the same name), runs a host-side benchmark to fill the performance estimates.
 * Returns EXIT_SUCCESS or an error code (e.g., if called by a non-master thread inside
 * of a parallel region).
 */
int libsmm_acc_init(void)
{
#if defined(_OPENMP)
  /* initialization/finalization is not meant to be thread-safe */
  int result = ((0 == omp_in_parallel()
# if /*WORKAROUND*/defined(__DBCSR_ACC)
    || 0/*master*/ == omp_get_thread_num()
# endif
    ) ? EXIT_SUCCESS : EXIT_FAILURE);
#else
  int result = EXIT_SUCCESS;
#endif
  /* multiple calls to libsmm_acc_init are not considered as an error */
  if (1 == LIBXSMM_ATOMIC_ADD_FETCH(&opencl_libsmm_initialized, 1, LIBXSMM_ATOMIC_RELAXED)) {
#if !defined(__DBCSR_ACC)
    /* DBCSR shall call c_dbcsr_acc_init as well as libsmm_acc_init (since both interfaces are used).
     * Also, libsmm_acc_init may privately call c_dbcsr_acc_init (as it depends on the ACC interface).
     * The implementation of c_dbcsr_acc_init should hence be safe against "over initialization".
     * However, DBCSR only calls c_dbcsr_acc_init (and expects an implicit libsmm_acc_init).
     */
    if (EXIT_SUCCESS == result) {
      result = c_dbcsr_acc_init();
    }
#endif
    libxsmm_init();
    if (EXIT_SUCCESS == result) {
#if defined(OPENCL_LIBSMM_SUITABLE)
      const char *const env_suitable = getenv("OPENCL_LIBSMM_SUITABLE");
#endif
      /* the environment string must not be modified (it is copied before tokenizing) */
      const char *const env_params = getenv("OPENCL_LIBSMM_SMM_PARAMS");
      opencl_libsmm_perfest_t perfest;
      memset(&perfest, 0, sizeof(perfest));
      if (NULL == env_params || '0' != *env_params) {
        char buffer[ACC_OPENCL_BUFFERSIZE];
        opencl_libsmm_smm_t config;
        opencl_libsmm_smmkey_t key;
        /* zeroing config (tuned parameters are setup below) */
        memset(&config, 0, sizeof(config));
        /* potentially heterogeneous key-data */
        LIBXSMM_MEMZERO127(&key);
        assert(NULL == key.device);
        if (NULL != env_params && '\0' != *env_params) { /* filename */
          FILE *const file = fopen(env_params, "r");
          if (NULL != file) {
            /* consume first line, check for device entry, and skip CSV header line */
            if (NULL != fgets(buffer, ACC_OPENCL_BUFFERSIZE, file)) {
              char* device = (NULL != c_dbcsr_acc_opencl_stristr(buffer, "device")
                ? opencl_libsmm_devices[0] : NULL);
              int ndevices = 0, i;
              while (NULL != fgets(buffer, ACC_OPENCL_BUFFERSIZE, file)) {
                if (EXIT_SUCCESS == opencl_libsmm_read_params(buffer, &key, &config, &perfest,
                  NULL != device ? &device : NULL))
                {
                  if (NULL != device) {
                    /* the name was read into the next free slot: check if it is already known */
                    for (i = 0; i < ndevices; ++i) {
                      if (0 == strncmp(device, opencl_libsmm_devices[i], ACC_OPENCL_BUFFERSIZE)) break;
                    }
                    key.device = ('\0' != *opencl_libsmm_devices[i] ? opencl_libsmm_devices[i] : NULL);
                    /* new device: keep the slot and advance to the next slot (wraps around) */
                    if (i == ndevices) device = opencl_libsmm_devices[++ndevices%ACC_OPENCL_DEVICES_MAXCOUNT];
                  }
                  if (NULL == OPENCL_LIBSMM_REGISTER(&key, sizeof(key), sizeof(config), &config)) {
                    result = EXIT_FAILURE; break;
                  }
                }
                else {
                  if (0 != c_dbcsr_acc_opencl_config.verbosity) {
                    fprintf(stderr, "WARNING LIBSMM: failed to load tuned parameters!\n");
                  }
                  break; /* invalid entry, or no device column */
                }
              }
            }
            else result = EXIT_FAILURE; /* invalid header */
            fclose(file);
          }
          else { /* try reading OPENCL_LIBSMM_SMM_PARAMS-value as kernel parameters */
            const size_t length = strlen(env_params);
            int parsed = EXIT_FAILURE;
            if (length < sizeof(buffer)) {
              /* tokenize a private copy rather than the string owned by the environment */
              memcpy(buffer, env_params, length + 1/*terminator*/);
              parsed = opencl_libsmm_read_params(buffer, &key, &config, &perfest, NULL/*device*/);
            }
            if (EXIT_SUCCESS == parsed) {
              if (NULL == OPENCL_LIBSMM_REGISTER(&key, sizeof(key), sizeof(config), &config)) result = EXIT_FAILURE;
            }
            else if (0 != c_dbcsr_acc_opencl_config.verbosity) { /* soft-error */
              fprintf(stderr, "WARNING LIBSMM: failed to open parameter file!\n");
            }
          }
        }
#if defined(OPENCL_LIBSMM_PARAMS_SMM)
        if (EXIT_SUCCESS == result && 0 == perfest.scount && 0 == perfest.dcount) {
          const char* line = OPENCL_LIBSMM_PARAMS_SMM, *next;
# if defined(OPENCL_LIBSMM_PARAMS_DEVICE)
          key.device = OPENCL_LIBSMM_PARAMS_DEVICE;
# endif
          do { /* only lines terminated by a newline are considered */
            next = strchr(line, '\n');
            if (NULL != next) {
              const size_t len = (size_t)(next - line);
              if (len < sizeof(buffer)) {
                memcpy(buffer, line, len); buffer[len] = '\0';
                if (EXIT_SUCCESS == opencl_libsmm_read_params(buffer, &key, &config, &perfest, NULL)) {
                  if (NULL == OPENCL_LIBSMM_REGISTER(&key, sizeof(key), sizeof(config), &config)) {
                    result = EXIT_FAILURE; break;
                  }
                }
                else {
                  if (0 != c_dbcsr_acc_opencl_config.verbosity) {
                    fprintf(stderr, "WARNING LIBSMM: failed to load tuned parameters!\n");
                  }
                  break;
                }
              }
              /* always advance (a line exceeding the buffer is skipped rather than
               * inspected over and over again, which would never terminate)
               */
              line = next + 1;
            }
          } while (NULL != next);
        }
#endif
      }
#if defined(OPENCL_LIBSMM_SUITABLE)
      if (EXIT_SUCCESS == result && (NULL == env_suitable || '0' != *env_suitable)) {
        const int stack_size = 30000, nrepeat = 100;
        const int nc = MAX(stack_size / 16, 1), na = 10 * nc, nb = 10 * nc;
        const int m = 8, n = 8, k = 8, mn = m * n, mk = m * k, kn = k * n;
        /* single scratch allocation: stack followed by A-, B-, and C-matrices (each aligned) */
        const size_t scratch_size = /*stack*/stack_size * 3 * sizeof(int)
          + (/*a*/na * mk + /*b*/nb * kn + /*c*/nc * mn) * /*max.typesize*/sizeof(double)
          + 3 * (LIBXSMM_ALIGNMENT - 1)/*alignments*/;
        void *const scratch = libxsmm_aligned_scratch(scratch_size, LIBXSMM_ALIGNMENT);
        int *const s = (int*)scratch, i;
        libxsmm_timer_tickint start;
        const char notrans = 'N';
        if (0 != perfest.scount) {
          if (NULL != scratch) { /* measure single-precision performance on the host */
            float *const a = (float*)LIBXSMM_UP2((uintptr_t)s + sizeof(int) * stack_size * 3, LIBXSMM_ALIGNMENT);
            float *const b = (float*)LIBXSMM_UP2((uintptr_t)a + sizeof(float) * na * mk, LIBXSMM_ALIGNMENT);
            float *const c = (float*)LIBXSMM_UP2((uintptr_t)b + sizeof(float) * nb * kn, LIBXSMM_ALIGNMENT);
            const float alpha = 1, beta = 1;
            init_stack(s, stack_size, mn, mk, kn, nc, na, nb);
# if defined(_OPENMP)
#           pragma omp parallel
# endif
            {
# if defined(_OPENMP)
#             pragma omp for
# endif
              for (i = 0; i < na; ++i) INIT_MAT(float, i + 42, &a[i*mk], m, k, 1.0 / (nc * na));
# if defined(_OPENMP)
#             pragma omp for
# endif
              for (i = 0; i < nb; ++i) INIT_MAT(float, i + 24, &b[i*kn], k, n, 1.0 / (nc * nb));
            }
            memset(c, 0, sizeof(float) * nc * mn);
            start = libxsmm_timer_tick();
            for (i = 0; i < nrepeat; ++i) {
              OPENCL_LIBSMM_USEOMP(libxsmm_gemm_batch)(
                LIBXSMM_GEMM_PRECISION_F32, LIBXSMM_GEMM_PRECISION_F32,
                &notrans, &notrans, m, n, k, &alpha, a, &m/*lda*/, b, &k/*ldb*/,
                &beta, c, &m/*ldc*/, 1/*index_base*/, sizeof(int) * 3,
                s + 0, s + 1, s + 2, stack_size);
            }
            /* GFLOPS/s divided by the arithmetic intensity of the benchmarked kernel */
            opencl_libsmm_shst = 1E-9 * ((size_t)2 * m * n * k * stack_size * nrepeat) / (
                libxsmm_timer_duration(start, libxsmm_timer_tick())
              * OPENCL_LIBSMM_AI(m, n, k, sizeof(float)));
          }
          /* geometric mean of the GFLOPS/AI ratios of the tuned kernels */
          opencl_libsmm_sacc = (/*sqrt(perfest.gf_ai_sratio_max **/
            exp(perfest.gf_ai_sratio_sumlog / perfest.scount));
        }
        if (0 != perfest.dcount) {
          if (NULL != scratch) { /* measure double-precision performance on the host */
            double *const a = (double*)LIBXSMM_UP2((uintptr_t)s + sizeof(int) * stack_size * 3, LIBXSMM_ALIGNMENT);
            double *const b = (double*)LIBXSMM_UP2((uintptr_t)a + sizeof(double) * na * mk, LIBXSMM_ALIGNMENT);
            double *const c = (double*)LIBXSMM_UP2((uintptr_t)b + sizeof(double) * nb * kn, LIBXSMM_ALIGNMENT);
            const double alpha = 1, beta = 1;
            init_stack(s, stack_size, mn, mk, kn, nc, na, nb);
# if defined(_OPENMP)
#           pragma omp parallel
# endif
            {
# if defined(_OPENMP)
#             pragma omp for
# endif
              for (i = 0; i < na; ++i) INIT_MAT(double, i + 42, &a[i*mk], m, k, 1.0 / (nc * na));
# if defined(_OPENMP)
#             pragma omp for
# endif
              for (i = 0; i < nb; ++i) INIT_MAT(double, i + 24, &b[i*kn], k, n, 1.0 / (nc * nb));
            }
            memset(c, 0, sizeof(double) * nc * mn);
            start = libxsmm_timer_tick();
            for (i = 0; i < nrepeat; ++i) {
              OPENCL_LIBSMM_USEOMP(libxsmm_gemm_batch)(
                LIBXSMM_GEMM_PRECISION_F64, LIBXSMM_GEMM_PRECISION_F64,
                &notrans, &notrans, m, n, k, &alpha, a, &m/*lda*/, b, &k/*ldb*/,
                &beta, c, &m/*ldc*/, 1/*index_base*/, sizeof(int) * 3,
                s + 0, s + 1, s + 2, stack_size);
            }
            /* GFLOPS/s divided by the arithmetic intensity of the benchmarked kernel */
            opencl_libsmm_dhst = 1E-9 * ((size_t)2 * m * n * k * stack_size * nrepeat) / (
                libxsmm_timer_duration(start, libxsmm_timer_tick())
              * OPENCL_LIBSMM_AI(m, n, k, sizeof(double)));
          }
          /* geometric mean of the GFLOPS/AI ratios of the tuned kernels */
          opencl_libsmm_dacc = (/*sqrt(perfest.gf_ai_dratio_max **/
            exp(perfest.gf_ai_dratio_sumlog / perfest.dcount));
        }
        libxsmm_free(scratch);
      }
#endif
    }
  }
  ACC_OPENCL_RETURN(result);
}


/**
 * Finalizes LIBSMM (counted; only the call balancing the first libsmm_acc_init performs
 * the work, and only if the LIBXSMM version provides registry enumeration):
 * - walks all user-entries of LIBXSMM's registry and releases the OpenCL kernels,
 * - with verbosity level 3, prints per-kernel statistics (median batchsize, and the
 *   geometric mean of the bandwidth or the GFLOPS/s) for kernels which were executed,
 * - resets the performance estimates, finalizes the ACC interface (unless built with
 *   __DBCSR_ACC), and finalizes LIBXSMM.
 * Returns EXIT_SUCCESS or an error code.
 */
int libsmm_acc_finalize(void)
{
  /* Routine libsmm_acc_init is called in master thread inside of parallel region.
   * However, libsmm_acc_finalize is indirectly called (c_dbcsr_acc_finalize)
   * inside of a parallel region (not just the master thread).
   */
#if defined(_OPENMP)
  /* initialization/finalization is not meant to be thread-safe */
  int result = ((0 == omp_in_parallel()
# if /*WORKAROUND*/defined(__DBCSR_ACC)
    || 0/*master*/ == omp_get_thread_num()
# endif
    ) ? EXIT_SUCCESS : EXIT_FAILURE);
#else
  int result = EXIT_SUCCESS;
#endif
#if LIBXSMM_VERSION3(1, 16, 1) <= LIBXSMM_VERSION3(LIBXSMM_VERSION_MAJOR, \
    LIBXSMM_VERSION_MINOR, LIBXSMM_VERSION_UPDATE) && 1159 <= LIBXSMM_VERSION_PATCH
  /* multiple calls to libsmm_acc_finalize are not considered as an error */
  if (0 == LIBXSMM_ATOMIC_SUB_FETCH(&opencl_libsmm_initialized, 1, LIBXSMM_ATOMIC_RELAXED)) {
    const void *regkey = NULL, *regentry = libxsmm_get_registry_begin(LIBXSMM_KERNEL_KIND_USER, &regkey);
    for (; NULL != regentry; regentry = libxsmm_get_registry_next(regentry, &regkey)) {
      /* opencl_libsmm_trans_t/opencl_libsmm_smm_t carry cl_kernel as 1st data member */
      const cl_kernel kernel = *(const cl_kernel*)regentry;
      if (NULL != kernel) {
        if (3 == c_dbcsr_acc_opencl_config.verbosity) {
          char fname[ACC_OPENCL_KERNELNAME_MAXSIZE];
          /* pass the actual capacity of fname (not the size of some other buffer) */
          ACC_OPENCL_CHECK(clGetKernelInfo(kernel, CL_KERNEL_FUNCTION_NAME,
            sizeof(fname), fname, NULL), "retrieve function name", result);
          if (EXIT_SUCCESS == result) {
            if (NULL != strstr(fname, OPENCL_LIBSMM_KERNELNAME_TRANS)) { /* trans-kernel */
# if !defined(OPENCL_LIBSMM_DEBUG_TRANS)
              const opencl_libsmm_transkey_t *const desc = (const opencl_libsmm_transkey_t*)regkey;
              opencl_libsmm_trans_t *const entry = (opencl_libsmm_trans_t*)regentry;
              if (0 < entry->nexec) {
                /* median of the recorded batchsizes (sorted in-place) */
                const int size = (int)MIN(sizeof(entry->size) / sizeof(*entry->size), entry->nexec);
                int batchsize; OPENCL_LIBSMM_ISORT(entry->size, size); batchsize = entry->size[size>>1];
                if (0 == (1 & size)) batchsize = (batchsize + entry->size[(size>>1)-1]) >> 1;
                fprintf(stderr, "INFO ACC/OpenCL: %ix%i %sTRANS-kernel ss=%i geo=%.1f GB/s\n", desc->m, desc->n,
                  dbcsr_type_real_8 == desc->type ? "D" : (dbcsr_type_real_4 == desc->type ? "S" : ""),
                  batchsize, exp(entry->membw_sumlog / entry->nexec));
                entry->nexec = 0; /* reset */
              }
# endif
            }
            else if (NULL != strstr(fname, OPENCL_LIBSMM_KERNELNAME_SMM)) { /* SMM-kernel */
# if !defined(OPENCL_LIBSMM_DEBUG_SMM)
              const opencl_libsmm_smmkey_t *const desc = (const opencl_libsmm_smmkey_t*)regkey;
              opencl_libsmm_smm_t *const entry = (opencl_libsmm_smm_t*)regentry;
              if (0 < entry->nexec) {
                const double geo = exp(entry->gflops_sumlog / entry->nexec);
                /* median of the recorded batchsizes (sorted in-place) */
                const int size = (int)MIN(sizeof(entry->size) / sizeof(*entry->size), entry->nexec);
                int batchsize; OPENCL_LIBSMM_ISORT(entry->size, size); batchsize = entry->size[size>>1];
                if (0 == (1 & size)) batchsize = (batchsize + entry->size[(size>>1)-1]) >> 1;
                fprintf(stderr, "INFO ACC/OpenCL: %ix%ix%i", desc->m, desc->n, desc->k);
                switch (desc->type) {
                  case dbcsr_type_real_8: {
                    const double est = OPENCL_LIBSMM_AI(desc->m, desc->n, desc->k, sizeof(double)) * opencl_libsmm_dacc;
                    fprintf(stderr, " DSMM-kernel ss=%i geo=%.1f", batchsize, geo);
                    if (0 < est) fprintf(stderr, " est=%.1f", est);
                  } break;
                  case dbcsr_type_real_4: {
                    const double est = OPENCL_LIBSMM_AI(desc->m, desc->n, desc->k, sizeof(float)) * opencl_libsmm_sacc;
                    fprintf(stderr, " SSMM-kernel ss=%i geo=%.1f", batchsize, geo);
                    if (0 < est) fprintf(stderr, " est=%.1f", est);
                  } break;
                  default: result = EXIT_FAILURE;
                }
                fprintf(stderr, " GFLOPS/s\n");
                entry->nexec = 0; /* reset */
              }
# endif
            }
          }
        }
        ACC_OPENCL_CHECK(clReleaseKernel(kernel), "release kernel", result);
      }
    }
    opencl_libsmm_shst = opencl_libsmm_dhst = opencl_libsmm_sacc = opencl_libsmm_dacc = 0;
# if !defined(__DBCSR_ACC)
    /* DBCSR shall call c_dbcsr_acc_init as well as libsmm_acc_init (since both interfaces are used).
     * Also, libsmm_acc_init may privately call c_dbcsr_acc_init (as it depends on the ACC interface).
     * The implementation of c_dbcsr_acc_init should hence be safe against "over initialization".
     * However, DBCSR only calls c_dbcsr_acc_init (and expects an implicit libsmm_acc_init).
     */
    if (EXIT_SUCCESS == result) {
      result = c_dbcsr_acc_finalize();
    }
# endif
    libxsmm_finalize();
  }
#endif
  /* c_dbcsr_acc_finalize is not called since it can be used independently */
  return result;
}


/** Reports thread-safety: non-zero if compiled with OpenMP, zero otherwise. */
c_dbcsr_acc_bool_t libsmm_acc_is_thread_safe(void)
{
  /* match DBCSR's threading level */
#if !defined(_OPENMP)
  return 0;
#else
  return 1;
#endif
}


int libsmm_acc_transpose(const int* dev_trs_stack, int offset, int stack_size,
  void* dev_data, libsmm_acc_data_t datatype, int m, int n, int max_kernel_dim, void* stream)
{
  int result = EXIT_SUCCESS;
#if defined(OPENCL_LIBSMM_SOURCE_TRANSPOSE)
  const int mn = m * n;
  assert((NULL != dev_trs_stack && NULL != dev_data && 0 <= offset && 0 <= stack_size) || 0 == stack_size);
  if ((
# if defined(OPENCL_LIBSMM_F64)
      dbcsr_type_real_8 == datatype
# else
      0
# endif
      ||
# if defined(OPENCL_LIBSMM_F32)
      dbcsr_type_real_4 == datatype
# else
      0
# endif
    ) &&
    0 < stack_size && 1 < mn && m <= max_kernel_dim && n <= max_kernel_dim)
  {
    opencl_libsmm_trans_t* config;
    opencl_libsmm_transkey_t key;
# if !defined(OPENCL_LIBSMM_DEBUG_TRANS)
    double duration;
    const libxsmm_timer_tickint start = libxsmm_timer_tick();
# endif
    LIBXSMM_MEMZERO127(&key); /* potentially heterogeneous key-data */
    key.type = datatype; key.m = m; key.n = n; /* initialize key */
    config = (opencl_libsmm_trans_t*)OPENCL_LIBSMM_DISPATCH(&key, sizeof(key));
    if (NULL == config) {
      char build_options[ACC_OPENCL_BUFFERSIZE], build_params[ACC_OPENCL_BUFFERSIZE];
      char fname[ACC_OPENCL_KERNELNAME_MAXSIZE];
      int nchar = ACC_OPENCL_SNPRINTF(fname, sizeof(fname),
        /* kernel name are meant to be unambiguous (BLAS-typeprefix and kernelsize) */
        "x" OPENCL_LIBSMM_KERNELNAME_TRANS "%ix%i", m, n);
# if defined(__DBCSR_ACC)
      int routine_handle;
      c_dbcsr_timeset(LIBSMM_ACC_TRANSPOSE_ROUTINE_NAME_STRPTR,
        LIBSMM_ACC_TRANSPOSE_ROUTINE_NAME_LENPTR, &routine_handle);
# endif
      if (0 < nchar && (int)sizeof(fname) > nchar) {
        cl_device_id active_device;
        result = c_dbcsr_acc_opencl_device(stream, &active_device);
        if (EXIT_SUCCESS == result) {
          const char *const param_format = "-DGLOBAL=%s -DINPLACE=%i -DFN=%s -DSM=%i -DSN=%i -DSWG=%i -DT=%s";
          const char *const cmem = (EXIT_SUCCESS != opencl_libsmm_use_cmem(active_device) ? "global" : "constant");
          const char *const env_options = getenv("OPENCL_LIBSMM_TRANS_BUILDOPTS"), *tname = "";
          const char *const env_inplace = getenv("OPENCL_LIBSMM_TRANS_INPLACE");
          const char *const env_bm = (NULL    == getenv("OPENCL_LIBSMM_TRANS_BLOCK_M")
            ? getenv("OPENCL_LIBSMM_TRANS_BM") : getenv("OPENCL_LIBSMM_TRANS_BLOCK_M"));
          const int inplace = ((m == n) && ((NULL == env_inplace || '\0' == *env_inplace)
# if defined(OPENCL_LIBSMM_TRANS_INPLACE)
            ? 1 : ('0' != *env_inplace)));
# else
            ? 0 : ('0' != *env_inplace)));
# endif
          const int blockm = ((NULL == env_bm || '\0' == *env_bm || '0' == *env_bm)
            ? m/*TODO*/ : atoi(env_bm));
          const int bm = LIBXSMM_CLMP(blockm, 1, m);
          int wgsize = 0, wgsize_max;
          result = c_dbcsr_acc_opencl_wgsize(active_device,
            NULL/*kernel*/, &wgsize_max, NULL/*prefmult*/);
          if (EXIT_SUCCESS == result) {
            switch (datatype) {
              case dbcsr_type_real_8: {
                tname = "char8"; /* double */
                fname[0] = 'd';
              } break;
              case dbcsr_type_real_4: {
                tname = "float";
                fname[0] = 's';
              } break;
              default: assert('\0' == *tname);
            }
            wgsize = MIN((m == bm || 0 == (m % bm)) ? bm : m, wgsize_max);
            nchar = ACC_OPENCL_SNPRINTF(build_options, sizeof(build_options),
              "%s" /* can finally be an empty string hence "<=" (nchar) */,
              (NULL == env_options || '\0' == *env_options) ? "" : env_options);
            if (0 <=/*<*/ nchar && (int)sizeof(build_options) > nchar) {
              nchar = ACC_OPENCL_SNPRINTF(build_params, sizeof(build_params),
                param_format, cmem, inplace, fname, m, n, wgsize, tname);
            }
          }
          if ('\0' != *tname && 0 < nchar && (int)sizeof(build_params) > nchar) {
            opencl_libsmm_trans_t new_config;
            memset(&new_config, 0, sizeof(new_config));
            result = c_dbcsr_acc_opencl_kernel(OPENCL_LIBSMM_SOURCE_TRANSPOSE, fname,
              build_params, build_options, NULL/*try*/, NULL/*try_ok*/,
              NULL/*extnames*/, 0/*num_exts*/, &new_config.kernel);
            if (EXIT_SUCCESS == result) {
              result = c_dbcsr_acc_opencl_wgsize(active_device,
                new_config.kernel, &wgsize_max, NULL/*prefmult*/);
              if (EXIT_SUCCESS == result) {
                assert(0 < wgsize_max);
                if (wgsize_max < wgsize) {
                  wgsize = wgsize_max;
                  nchar = ACC_OPENCL_SNPRINTF(build_params, sizeof(build_params),
                    param_format, cmem, inplace, fname, m, n, wgsize, tname);
                  if (0 < nchar && (int)sizeof(build_params) > nchar) {
                    result = c_dbcsr_acc_opencl_kernel(OPENCL_LIBSMM_SOURCE_TRANSPOSE, fname,
                      build_params, build_options, NULL/*try*/, NULL/*try_ok*/,
                      NULL/*extnames*/, 0/*num_exts*/, &new_config.kernel);
                  }
                  else result = EXIT_FAILURE;
                }
                if (EXIT_SUCCESS == result) {
                  new_config.wgsize = (size_t)wgsize;
                  config = (opencl_libsmm_trans_t*)OPENCL_LIBSMM_REGISTER(&key, sizeof(key),
                    sizeof(new_config), &new_config);
# if !defined(OPENCL_LIBSMM_DEBUG_TRANS)
                  if (2 <= c_dbcsr_acc_opencl_config.verbosity || 0 > c_dbcsr_acc_opencl_config.verbosity) {
                    duration = libxsmm_timer_duration(start, libxsmm_timer_tick());
                    fprintf(stderr, "INFO ACC/OpenCL: %ix%i %sTRANS-kernel gen=%.1f ms\n", m, n,
                      dbcsr_type_real_8 == datatype ? "D" : (dbcsr_type_real_4 == datatype ? "S" : ""),
                      1000.0 * duration);
                  }
# endif
                }
              }
            }
          }
          else if (EXIT_SUCCESS == result) {
            result = EXIT_FAILURE;
          }
        }
      }
      else {
        result = EXIT_FAILURE;
      }
# if defined(__DBCSR_ACC)
      c_dbcsr_timestop(&routine_handle);
# endif
    }
    assert((NULL != config && NULL != config->kernel && 0 < config->wgsize) || EXIT_SUCCESS != result);
    if (EXIT_SUCCESS == result) {
      cl_event event, *const perf_event = ((0 <= c_dbcsr_acc_opencl_config.verbosity
        && 3 > c_dbcsr_acc_opencl_config.verbosity) ? NULL : &event);
      const size_t work_size = config->wgsize * stack_size;
      const int typesize = OPENCL_LIBSMM_TYPESIZE(datatype);
# if defined(OPENCL_LIBSMM_DEBUG_TRANS)
      const int offset_stack_size = offset + stack_size;
      char *imat = NULL, *omat = NULL, *gold = NULL;
      void *scratch = NULL;
      int *stack = NULL;
      size_t data_size;
      if (CL_SUCCESS == clGetMemObjectInfo(*ACC_OPENCL_MEM(dev_data),
        CL_MEM_SIZE, sizeof(size_t), &data_size, NULL))
      {
        const size_t scratch_size = (sizeof(int) * offset_stack_size)/*stack*/
          + data_size/*imat*/ + data_size/*omat*/ + (mn * typesize)/*gold*/
          + 3 * (LIBXSMM_ALIGNMENT - 1)/*alignments*/;
        scratch = libxsmm_aligned_scratch(scratch_size, LIBXSMM_ALIGNMENT);
        if (NULL != scratch) {
          stack = (int*)scratch;
          imat = (char*)LIBXSMM_UP2((uintptr_t)stack + sizeof(int) * offset_stack_size, LIBXSMM_ALIGNMENT);
          omat = (char*)LIBXSMM_UP2((uintptr_t)imat + data_size, LIBXSMM_ALIGNMENT);
          gold = (char*)LIBXSMM_UP2((uintptr_t)omat + data_size, LIBXSMM_ALIGNMENT);
          ACC_OPENCL_CHECK(c_dbcsr_acc_memcpy_d2h(dev_trs_stack, stack, sizeof(int) * offset_stack_size, stream),
            "transfer debug stack", result);
          ACC_OPENCL_CHECK(c_dbcsr_acc_memcpy_d2h(dev_data, imat, data_size, stream),
            "transfer debug input", result);
        }
        else result = EXIT_FAILURE;
      }
      else result = EXIT_FAILURE;
# endif
      assert(!(OPENCL_LIBSMM_NLOCKS_TRANS & (OPENCL_LIBSMM_NLOCKS_TRANS - 1))); /* POT */
      { /* OpenCL is thread-safe except for clSetKernelArg and launching such shared kernel */
        const unsigned int hash = libxsmm_hash(&config->kernel, sizeof(cl_kernel), 25071975/*seed*/);
        volatile int *const lock = opencl_libsmm_lock_trans + LIBXSMM_MOD2(hash, OPENCL_LIBSMM_NLOCKS_TRANS);
        LIBXSMM_ATOMIC_ACQUIRE(lock, LIBXSMM_SYNC_NPAUSE, LIBXSMM_ATOMIC_RELAXED);
        ACC_OPENCL_CHECK(clSetKernelArg(config->kernel, 0, sizeof(cl_mem), ACC_OPENCL_MEM(dev_trs_stack)),
          "set batch-list argument of transpose kernel", result);
        ACC_OPENCL_CHECK(clSetKernelArg(config->kernel, 1, sizeof(int), &offset),
          "set offset argument of transpose kernel", result);
        ACC_OPENCL_CHECK(clSetKernelArg(config->kernel, 2, sizeof(cl_mem), ACC_OPENCL_MEM(dev_data)),
          "set matrix-data argument of transpose kernel", result);
        ACC_OPENCL_CHECK(clEnqueueNDRangeKernel(*ACC_OPENCL_STREAM(stream),
          config->kernel, 1/*work_dim*/, NULL, &work_size, &config->wgsize, 0, NULL, perf_event),
          "launch transpose kernel", result);
        /* eventually update performance counters inside of locked region */
# if !defined(OPENCL_LIBSMM_DEBUG_TRANS)
        if (3 <= c_dbcsr_acc_opencl_config.verbosity || 0 > c_dbcsr_acc_opencl_config.verbosity) {
          if (NULL != perf_event) {
            cl_ulong begin = 0, end = 0;
            clWaitForEvents(1, perf_event);
            ACC_OPENCL_CHECK(clGetEventProfilingInfo(*perf_event, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &begin, NULL),
              "query kernel start time", result);
            ACC_OPENCL_CHECK(clGetEventProfilingInfo(*perf_event, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &end, NULL),
              "query kernel end time", result);
            duration = LIBXSMM_DELTA(begin, end); /* Nanoseconds */
          }
          else duration = libxsmm_timer_duration(start, libxsmm_timer_tick()) * 1E9; /* Nanoseconds */
          if (EXIT_SUCCESS == result) {
            const double membw = (1E-9 * (1ULL << 30) * stack_size * (typesize * m * n)) / duration;
#   if LIBXSMM_VERSION3(1, 16, 1) <= LIBXSMM_VERSION3(LIBXSMM_VERSION_MAJOR, \
       LIBXSMM_VERSION_MINOR, LIBXSMM_VERSION_UPDATE) && 1159 <= LIBXSMM_VERSION_PATCH
            const size_t size = sizeof(config->size) / sizeof(*config->size); assert(2 <= size);
            libxsmm_kahan_sum(log(membw), &config->membw_sumlog, &config->membw_comp);
            if (size <= config->nexec) {
              const int s1 = size - 1, i = (int)((config->nexec++) % s1);
              config->size[i] = stack_size;
              if ((i + 1) == s1) { /* fill config->size with median */
                const int s2 = size >> 1; OPENCL_LIBSMM_ISORT(config->size, size); config->size[s1] = config->size[s2];
                if (0 == (1 & s1)) config->size[s1] = (config->size[s1] + config->size[s2-1]) >> 1;
              }
            }
            else config->size[config->nexec++] = stack_size;
#   endif
            if (4 <= c_dbcsr_acc_opencl_config.verbosity || 0 > c_dbcsr_acc_opencl_config.verbosity) {
              fprintf(stderr, "INFO ACC/OpenCL: %ix%i %sTRANS-kernel ss=%i cur=%.1f GB/s\n", m, n,
                dbcsr_type_real_8 == datatype ? "D" : (dbcsr_type_real_4 == datatype ? "S" : ""),
                stack_size, membw);
            }
          }
# endif
        }
        LIBXSMM_ATOMIC_RELEASE(lock, LIBXSMM_ATOMIC_RELAXED);
      }
# if defined(OPENCL_LIBSMM_DEBUG_TRANS)
      ACC_OPENCL_CHECK(c_dbcsr_acc_memcpy_d2h(dev_data, omat, data_size, stream),
        "transfer debug test", result);
# endif
# if defined(OPENCL_LIBSMM_DEBUG_TRANS)
      ACC_OPENCL_CHECK(c_dbcsr_acc_stream_sync(stream), "sync stream", result);
# endif
# if defined(OPENCL_LIBSMM_DEBUG_TRANS)
      if (EXIT_SUCCESS == result) {
        int i, j;
        if (0 != c_dbcsr_acc_opencl_config.verbosity) {
          fprintf(stderr, "libsmm_acc_transpose("
            "offset=%i, size=%i, type=%s, m=%i, n=%i, max=%i, stream=%p)", offset, stack_size,
            dbcsr_type_real_8 == datatype ? "f64" : (dbcsr_type_real_4 == datatype ? "f32" : "unknown"),
            m, n, max_kernel_dim, stream);
        }
        for (i = offset; i < offset_stack_size; ++i) {
          const size_t index = stack[i];
          const char *const orig = imat + index * typesize;
          const char *const test = omat + index * typesize;
          assert((index * typesize) < data_size);
          memcpy(gold, orig, mn * typesize);
          libxsmm_itrans(gold, typesize, m, n, m, n);
          if (0 != memcmp(gold, test, mn * typesize)) {
            if (0 == c_dbcsr_acc_opencl_config.verbosity) {
              fprintf(stderr, "libsmm_acc_transpose("
                "offset=%i, size=%i, type=%s, m=%i, n=%i, max=%i, stream=%p)", offset, stack_size,
                dbcsr_type_real_8 == datatype ? "f64" : (dbcsr_type_real_4 == datatype ? "f32" : "unknown"),
                m, n, max_kernel_dim, stream);
            }
            fprintf(stderr, " => ERROR\n");
#   if defined(_DEBUG)
            opencl_libsmm_print_matrix(stderr, "orig = ", datatype, orig, m, n);
            opencl_libsmm_print_matrix(stderr, "gold = ", datatype, gold, n, m);
            opencl_libsmm_print_matrix(stderr, "test = ", datatype, test, n, m);
            fprintf(stderr, "\n");
#   endif
#   if defined(OPENCL_LIBSMM_DEBUG_EXIT)
            exit(EXIT_FAILURE);
#   else
            result = EXIT_FAILURE; break;
#   endif
          }
          for (j = offset; j < i; ++j) {
            const size_t duplicate = stack[j];
            if (index == duplicate) {
              fprintf(stderr, " => ERROR\n");
#   if defined(OPENCL_LIBSMM_DEBUG_EXIT)
              exit(EXIT_FAILURE);
#   else
              i = offset_stack_size;
              result = EXIT_FAILURE;
              break;
#   endif
            }
          }
        }
        if (0 != c_dbcsr_acc_opencl_config.verbosity && EXIT_SUCCESS == result) {
          fprintf(stderr, " => OK\n");
        }
      }
      libxsmm_free(scratch);
# endif
    }
  }
#else
  result = EXIT_FAILURE;
#endif
  ACC_OPENCL_RETURN(result);
}


/*
 * Decide whether a stack of small matrix multiplications is suitable for
 * this backend. Returns non-zero (1) if suitable and zero otherwise.
 *
 * A stack qualifies only if all of the following hold:
 *  - m_max, n_max, k_max, and stack_size are positive,
 *  - m_max and n_max do not exceed max_kernel_dim (k_max may exceed it),
 *  - def_mnk is non-zero (homogeneous stack),
 *  - datatype is one of the types enabled at compile-time
 *    (OPENCL_LIBSMM_F64 and/or OPENCL_LIBSMM_F32).
 *
 * If OPENCL_LIBSMM_SUITABLE is defined, two estimates (hst, acc) are derived
 * from OPENCL_LIBSMM_AI scaled by the per-type opencl_libsmm_?hst/?acc values
 * (presumably host and accelerator throughput; they are printed as GFLOPS/s).
 * The stack is then accepted only if either estimate is zero or hst < acc.
 * A rejection is reported on stderr for verbosity >= 2 or negative verbosity.
 */
c_dbcsr_acc_bool_t libsmm_acc_process_suitable(
  c_dbcsr_acc_bool_t def_mnk, libsmm_acc_data_t datatype,
  int stack_size, int m_max, int n_max, int k_max,
  int max_kernel_dim)
{
  /* type-independent precondition: allow k_max to exceed max_kernel_dim,
   * TODO: BLAS for large kernels (m,n) */
  const int fits = (0 < m_max && 0 < n_max && 0 < k_max && 0 < stack_size
    && m_max <= max_kernel_dim && n_max <= max_kernel_dim
    && 0 != def_mnk/*homogeneous*/);
  int result = 0;
#if defined(OPENCL_LIBSMM_SUITABLE)
  double hst = 0, acc = 0; /* remain zero unless the estimates are evaluated */
#endif
  if (0 != fits) {
    switch (datatype) {
#if defined(OPENCL_LIBSMM_F64)
      case dbcsr_type_real_8: {
# if defined(OPENCL_LIBSMM_SUITABLE)
        const double intensity = OPENCL_LIBSMM_AI(m_max, n_max, k_max, sizeof(double));
        hst = intensity * opencl_libsmm_dhst;
        acc = intensity * opencl_libsmm_dacc;
        /* accept if an estimate is unavailable (zero) or hst is below acc */
        result = ((0 == hst || 0 == acc || hst < acc) ? 1 : 0);
# else
        result = 1;
# endif
      } break;
#endif
#if defined(OPENCL_LIBSMM_F32)
      case dbcsr_type_real_4: {
# if defined(OPENCL_LIBSMM_SUITABLE)
        const double intensity = OPENCL_LIBSMM_AI(m_max, n_max, k_max, sizeof(float));
        hst = intensity * opencl_libsmm_shst;
        acc = intensity * opencl_libsmm_sacc;
        /* accept if an estimate is unavailable (zero) or hst is below acc */
        result = ((0 == hst || 0 == acc || hst < acc) ? 1 : 0);
# else
        result = 1;
# endif
      } break;
#endif
      default: assert(0 == result); /* unsupported datatype: not suitable */
    }
  }
#if defined(OPENCL_LIBSMM_SUITABLE)
  if (0 == result) { /* eventually report the rejection */
    if (2 <= c_dbcsr_acc_opencl_config.verbosity || 0 > c_dbcsr_acc_opencl_config.verbosity) {
      const char *const typeprefix = (dbcsr_type_real_8 == datatype ? "D"
        : (dbcsr_type_real_4 == datatype ? "S" : ""));
      fprintf(stderr, "INFO ACC/OpenCL: %ix%ix%i %sSMM-kernel ss=%i",
        m_max, n_max, k_max, typeprefix, stack_size);
      /* estimates are only shown if both were actually determined */
      if (0 < hst && 0 < acc) {
        fprintf(stderr, " hst=%.1f acc=%.1f GFLOPS/s", hst, acc);
      }
      fprintf(stderr, " not suitable%s", 0 == def_mnk ? " (inhomogeneous)\n" : "\n");
    }
  }
#endif
  return result;
}


int libsmm_acc_process(const int* host_param_stack, const int* dev_param_stack, int stack_size,
  int nparams, libsmm_acc_data_t datatype, const void* dev_a_data, const void* dev_b_data, void* dev_c_data,
  int m_max, int n_max, int k_max, int max_kernel_dim, c_dbcsr_acc_bool_t def_mnk, void* stream, void* c_stream)
{
  int result = EXIT_SUCCESS;
#if defined(OPENCL_LIBSMM_SOURCE_MULTIPLY)
  ACC_OPENCL_UNUSED(c_stream); /* TODO */
  assert(0 == stack_size || (NULL != host_param_stack && NULL != dev_param_stack
    && NULL != dev_a_data && NULL != dev_b_data && NULL != dev_c_data));
  assert(0 < nparams && 0 < max_kernel_dim && NULL != stream);
  assert(0 <= stack_size && 0 <= m_max && 0 <= n_max && 0 <= k_max);
  if (0 != libsmm_acc_process_suitable(def_mnk, datatype, stack_size, m_max, n_max, k_max, max_kernel_dim)) {
    cl_device_id active_device;
    opencl_libsmm_smmkey_t key;
# if !defined(OPENCL_LIBSMM_DEBUG_SMM)
    double duration;
    const libxsmm_timer_tickint start = libxsmm_timer_tick();
# endif
    LIBXSMM_MEMZERO127(&key); /* potentially heterogeneous key-data */
    result = opencl_libsmm_device(stream, &active_device, &key.device);
    if (EXIT_SUCCESS == result) {
      opencl_libsmm_smm_t* config;
      assert(NULL != active_device);
      key.type = datatype; key.m = m_max; key.n = n_max; key.k = k_max; /* initialize key */
      config = (opencl_libsmm_smm_t*)OPENCL_LIBSMM_DISPATCH(&key, sizeof(key));
      if (NULL == config || NULL == config->kernel) {
        char build_options[ACC_OPENCL_BUFFERSIZE], build_params[ACC_OPENCL_BUFFERSIZE];
        char fname[ACC_OPENCL_KERNELNAME_MAXSIZE];
        int cl_level_major, nchar = ACC_OPENCL_SNPRINTF(fname, sizeof(fname),
          /* kernel name are meant to be unambiguous (BLAS-typeprefix and kernelsize) */
          "x" OPENCL_LIBSMM_KERNELNAME_SMM "%ix%ix%i", m_max, n_max, k_max);
        const char* extensions[] = { NULL, NULL };
        cl_device_type device_type;
# if defined(__DBCSR_ACC)
        int routine_handle;
        c_dbcsr_timeset(LIBSMM_ACC_PROCESS_ROUTINE_NAME_STRPTR,
          LIBSMM_ACC_PROCESS_ROUTINE_NAME_LENPTR, &routine_handle);
# endif
        if (0 < nchar && (int)sizeof(fname) > nchar
          && EXIT_SUCCESS == c_dbcsr_acc_opencl_device_level(active_device,
            &cl_level_major, NULL/*level_minor*/, NULL/*cl_std*/, &device_type))
        {
          const char *tname = NULL, *atomic_type = "";
          int std_c11 = 0;
          switch (datatype) {
            case dbcsr_type_real_8: {
              extensions[0] = "cl_khr_fp64 cl_khr_int64_base_atomics cl_khr_int64_extended_atomics";
              tname = "double";
              fname[0] = 'd';
              if (2 <= cl_level_major
                && EXIT_SUCCESS == c_dbcsr_acc_opencl_device_ext(active_device, extensions, 1))
              {
                atomic_type = "-DTA=long -DTM=atomic_long";
                std_c11 = 1;
              }
              else {
                extensions[0] = "cl_khr_fp64 cl_khr_int64_base_atomics";
                if (EXIT_SUCCESS == c_dbcsr_acc_opencl_device_ext(active_device, extensions, 1)) {
                  atomic_type = "-DTA=long";
                }
                else { /* fallback */
                  extensions[0] = "cl_khr_fp64 cl_khr_global_int32_base_atomics cl_khr_global_int32_extended_atomics";
                  if (2 <= cl_level_major
                    && EXIT_SUCCESS == c_dbcsr_acc_opencl_device_ext(active_device, extensions, 1))
                  {
                    atomic_type = "-DATOMIC32_ADD64 -DTA=int -DTM=atomic_int";
                    std_c11 = 1;
                  }
                  else {
                    extensions[0] = "cl_khr_fp64 cl_khr_global_int32_base_atomics";
                    if (EXIT_SUCCESS == c_dbcsr_acc_opencl_device_ext(active_device, extensions, 1)) {
                      atomic_type = "-DATOMIC32_ADD64 -DTA=int";
                    }
                    else tname = NULL;
                  }
                }
              }
            } break;
            case dbcsr_type_real_4: {
              extensions[0] = "cl_khr_global_int32_base_atomics cl_khr_global_int32_extended_atomics";
              if (2 <= cl_level_major
                && EXIT_SUCCESS == c_dbcsr_acc_opencl_device_ext(active_device, extensions, 1))
              {
                extensions[1] = "cl_khr_int64_base_atomics cl_khr_int64_extended_atomics";
                atomic_type = "-DTA=int -DTM=atomic_int";
                std_c11 = 1;
                tname = "float";
                fname[0] = 's';
              }
              else {
                extensions[0] = "cl_khr_global_int32_base_atomics";
                if (EXIT_SUCCESS == c_dbcsr_acc_opencl_device_ext(active_device, extensions, 1)) {
                  extensions[1] = "cl_khr_int64_base_atomics";
                  atomic_type = "-DTA=int";
                  tname = "float";
                  fname[0] = 's';
                }
              }
            } break;
            default: assert(NULL == tname);
          }
          if (NULL != tname) {
            static int cl_try_ok = EXIT_SUCCESS;
            const char *const env_wg = getenv("OPENCL_LIBSMM_SMM_WG"), *const env_lu = getenv("OPENCL_LIBSMM_SMM_LU");
            const char *const env_nz = getenv("OPENCL_LIBSMM_SMM_NZ"), *const env_al = getenv("OPENCL_LIBSMM_SMM_AL");
            const char *const env_tb = getenv("OPENCL_LIBSMM_SMM_TB"), *const env_tc = getenv("OPENCL_LIBSMM_SMM_TC");
            const char *const env_ap = getenv("OPENCL_LIBSMM_SMM_AP"), *const env_aa = getenv("OPENCL_LIBSMM_SMM_AA");
            const char *const env_ab = getenv("OPENCL_LIBSMM_SMM_AB"), *const env_ac = getenv("OPENCL_LIBSMM_SMM_AC");
            const int cl_nonv = (EXIT_SUCCESS != c_dbcsr_acc_opencl_device_vendor(active_device, "nvidia"));
            const char *const cl_try = ((EXIT_SUCCESS == cl_try_ok && 0 != c_dbcsr_acc_opencl_config.intel_id)
              ? "-cl-intel-disable-a64WA" : "");
            const int wg = LIBXSMM_CLMP((NULL == env_wg || '\0' == *env_wg)
              ? (NULL == config ? /*default*/0 : config->wg) : atoi(env_wg), 0, 2);
            const int lu = LIBXSMM_CLMP((NULL == env_lu || '\0' == *env_lu)
              ? (NULL == config ? /*default*/0 : config->lu) : atoi(env_lu), -1, 2);
            const int nz = LIBXSMM_CLMP((NULL == env_nz || '\0' == *env_nz)
              ? (NULL == config ? /*default*/0 : config->nz) : atoi(env_nz), 0, 1);
            const int al = LIBXSMM_CLMP((NULL == env_al || '\0' == *env_al)
              ? (NULL == config ? /*default*/0 : config->al) : atoi(env_al), 0, 1);
            const int tb = LIBXSMM_CLMP((NULL == env_tb || '\0' == *env_tb)
              ? (NULL == config ? /*default*/0 : config->tb) : atoi(env_tb), 0, 1);
            const int tc = LIBXSMM_CLMP((NULL == env_tc || '\0' == *env_tc)
              ? (NULL == config ? /*default*/1 : config->tc) : atoi(env_tc), 0, 1);
            const int ap = LIBXSMM_CLMP((NULL == env_ap || '\0' == *env_ap)
              ? (NULL == config ? /*default*/1 : config->ap) : atoi(env_ap), 0, 1);
            const int aa = LIBXSMM_CLMP((NULL == env_aa || '\0' == *env_aa)
              ? (NULL == config ? /*default*/((k_max % 16) ? 1 : 2) : config->aa) : atoi(env_aa), 0, 3);
            const int ab = LIBXSMM_CLMP((NULL == env_ab || '\0' == *env_ab)
              ? (NULL == config ? /*default*/3 : config->ab) : atoi(env_ab), 0, 3);
            const int ac = LIBXSMM_CLMP((NULL == env_ac || '\0' == *env_ac)
              ? (NULL == config ? /*default*/0 : config->ac) : atoi(env_ac), 0, 2);
            int wgsize_max, wgsize_prf, wgsize, bs, bm, bn, bk, ws, nbm, nbn;
            result = c_dbcsr_acc_opencl_wgsize(active_device,
              NULL/*device-specific*/, &wgsize_max, &wgsize_prf);
            assert(EXIT_SUCCESS != result || 0 < wgsize_prf);
            if (EXIT_SUCCESS == result) {
              const char *const env_bs = getenv("OPENCL_LIBSMM_SMM_BS");
              const char *const env_bm = getenv("OPENCL_LIBSMM_SMM_BM");
              const char *const env_bn = getenv("OPENCL_LIBSMM_SMM_BN");
              const char *const env_bk = getenv("OPENCL_LIBSMM_SMM_BK");
              const char *const env_ws = getenv("OPENCL_LIBSMM_SMM_WS");
              const int batchsize = ((NULL == env_bs || '\0' == *env_bs || '0' == *env_bs)
                ? (NULL == config ? 0 : config->bs) : atoi(env_bs));
              const int blockm = ((NULL == env_bm || '\0' == *env_bm || '0' == *env_bm)
                ? (NULL == config ? 0 : config->bm) : atoi(env_bm));
              const int blockn = ((NULL == env_bn || '\0' == *env_bn || '0' == *env_bn)
                ? (NULL == config ? 0 : config->bn) : atoi(env_bn));
              const int blockk = ((NULL == env_bk || '\0' == *env_bk || '0' == *env_bk)
                ? (NULL == config ? 0 : config->bk) : atoi(env_bk));
              const int wgmin = ((NULL == env_ws || '\0' == *env_ws || '0' == *env_ws)
                ? (NULL == config ? 0 : config->ws) : atoi(env_ws));
              /* default: decompose C-matrix into column-vectors (Mx1) */
              bm = MIN(0 < blockm ? blockm : /*default*/m_max, m_max);
              bn = MIN(0 < blockn ? blockn : /*default*/    1, n_max);
              bk = MIN(0 < blockk ? blockk : /*default*/m_max, m_max);
              ws = MIN(0 < wgmin ? wgmin : /*default*/n_max, n_max * m_max);
              bs = (0 < batchsize ? batchsize : /*default*/8);
              nbm = (m_max + bm - 1) / bm;
              nbn = (n_max + bn - 1) / bn;
              wgsize = MAX(nbm * nbn, ws);
# if  LIBXSMM_VERSION3(1, 16, 1) <= LIBXSMM_VERSION3(LIBXSMM_VERSION_MAJOR, \
      LIBXSMM_VERSION_MINOR, LIBXSMM_VERSION_UPDATE) && 1598 <= LIBXSMM_VERSION_PATCH
              if (1 <= wg) {
                const unsigned int limit = MAX(wgsize_prf, OPENCL_LIBSMM_VLEN);
                wgsize_prf = (int)libxsmm_remainder(wgsize, OPENCL_LIBSMM_VMIN,
                  &limit, NULL/*remainder*/);
              }
              else
# endif
              wgsize_prf = wgsize;
              if (2 <= wg) wgsize_prf = LIBXSMM_UP2POT(wgsize_prf);
              if (wgsize_prf < (2 * wgsize)) wgsize = wgsize_prf; /* limit */
              assert(1 <= bs && 0 < wgsize && 0 < wgsize_max && 0 < wgsize_prf);
              /* ensure minimum requested WG-size */
              while ((nbm * nbn) < ws && (nbm < m_max || nbn < n_max)) {
                if (nbn < n_max) {
                  ++nbn; bn = (n_max + nbn - 1) / nbn;
                }
                else if (nbm < m_max) {
                  ++nbm; bm = (m_max + nbm - 1) / nbm;
                }
                wgsize = (2 > wg ? (nbm * nbn) : ((int)LIBXSMM_UP2POT(nbm * nbn)));
              }
              /* limit WG-size to maximum WG-size */
              while (wgsize_max < wgsize && (bm < m_max || bn < n_max)) {
                if (bn < n_max) {
                  ++bn; nbn = (n_max + bn - 1) / bn;
                }
                else if (bm < m_max) {
                  ++bm; nbm = (m_max + bm - 1) / bm;
                }
                wgsize = (2 > wg ? (nbm * nbn) : ((int)LIBXSMM_UP2POT(nbm * nbn)));
              }
              if (wgsize <= wgsize_max) { /* SMMs can be potentially handled by device */
                const char *const cmem = (EXIT_SUCCESS != opencl_libsmm_use_cmem(active_device) ? "global" : "constant");
# if !defined(NDBGDEV)
                const char *const cl_debug = ((0 != c_dbcsr_acc_opencl_config.intel_id
                  && CL_DEVICE_TYPE_CPU != device_type) ? "-gline-tables-only" : "");
# else
                const char *const cl_debug = "";
# endif
                const char *const env_options = getenv("OPENCL_LIBSMM_SMM_BUILDOPTS");
                const char *const env_barrier = getenv("OPENCL_LIBSMM_SMM_BARRIER");
                const char *const env_atomics = getenv("OPENCL_LIBSMM_SMM_ATOMICS");
                const char *const env_nrepeat = getenv("SMM_NREPEAT");
                const char *barrier_expr = NULL, *atomic_ops = "";
                const char *atomic_exp = NULL, *atomic_expr2 = "";
                if (NULL == env_barrier || '0' != *env_barrier) {
                  barrier_expr = ((0 != std_c11 && (0 == c_dbcsr_acc_opencl_config.intel_id
                      || (CL_DEVICE_TYPE_CPU != device_type)))
                    ? "-D\"BARRIER(A)=work_group_barrier(A, memory_scope_work_group)\""
                    : "-D\"BARRIER(A)=barrier(A)\"");
                }
                else barrier_expr = ""; /* no barrier */
                assert(NULL != barrier_expr);
                if (NULL == env_atomics || '0' != *env_atomics) {
                  if (NULL == env_atomics || '\0' == *env_atomics) { /* no request made */
                    cl_bitfield fp_atomics;
                    assert(dbcsr_type_real_8 == datatype || dbcsr_type_real_4 == datatype);
                    if (CL_SUCCESS == clGetDeviceInfo(active_device,
                      (cl_device_info)(dbcsr_type_real_8 == datatype ? 0x4232 : 0x4231),
                      sizeof(cl_bitfield), &fp_atomics, NULL) && 0 != (/*add*/(1 << 1) & fp_atomics))
                    {
                      extensions[1] = "cl_ext_float_atomics";
                      atomic_exp = (dbcsr_type_real_8 == datatype
                        ? "atomic_fetch_add_explicit((global volatile atomic_double*)A, B, "
                          "memory_order_relaxed, memory_scope_work_group)"
                        : "atomic_fetch_add_explicit((global volatile atomic_float*)A, B, "
                          "memory_order_relaxed, memory_scope_work_group)");
                    }
                    else if (0 != c_dbcsr_acc_opencl_config.intel_id && 0x4905 != c_dbcsr_acc_opencl_config.intel_id
                      && 0 == c_dbcsr_acc_opencl_config.unified)
                    {
                      if (dbcsr_type_real_4 == datatype) {
                        extensions[1] = "cl_intel_global_float_atomics";
                        atomic_ops = "-Dcl_intel_global_float_atomics";
                        atomic_exp = (0 != std_c11
                          ? "atomic_fetch_add_explicit((global volatile atomic_float*)A, B, "
                            "memory_order_relaxed, memory_scope_work_group)"
                          : "atomic_add(A, B)");
                      }
                      else {
                        atomic_exp = "atomic_add_global_cmpxchg(A, B)";
                        atomic_ops = "-DCMPXCHG=atom_cmpxchg";
                      }
                    }
                    else if (cl_nonv) {
                      if (NULL != extensions[1] && 1 < bs && 1 == bn && bm >= m_max && 0 == al
                        && (0 == (m_max & 1) || (0 == c_dbcsr_acc_opencl_config.intel_id /*&& cl_nonv*/)) /* TODO */
                        && EXIT_SUCCESS == c_dbcsr_acc_opencl_device_ext(active_device, extensions + 1, 1))
                      {
                        assert(dbcsr_type_real_4 == datatype);
                        atomic_expr2 = "-D\"ATOMIC_ADD2_GLOBAL(A,B)=atomic_add_global_cmpxchg2(A, B)\"";
                      }
                      atomic_exp = "atomic_add_global_cmpxchg(A, B)";
                      atomic_ops = (dbcsr_type_real_4 == datatype
                        ? "-DCMPXCHG=atomic_cmpxchg"
                        : "-DCMPXCHG=atom_cmpxchg");
                    }
                    else {
                      assert(NULL != atomic_ops && '\0' == *atomic_ops);
                      atomic_exp = "atomic_add_global_xchg(A, B)";
                    }
                  }
                  else if (NULL != c_dbcsr_acc_opencl_stristr(env_atomics, "cmpxchg")) {
                    if (NULL != extensions[1] && 1 < bs && 1 == bn && bm >= m_max && 0 == al
                      && (0 == (m_max & 1) || (0 == c_dbcsr_acc_opencl_config.intel_id && cl_nonv)) /* TODO */
                      && '2' == env_atomics[strlen(env_atomics)-1]
                      && EXIT_SUCCESS == c_dbcsr_acc_opencl_device_ext(active_device, extensions + 1, 1))
                    {
                      assert(dbcsr_type_real_4 == datatype);
                      atomic_expr2 = "-D\"ATOMIC_ADD2_GLOBAL(A,B)=atomic_add_global_cmpxchg2(A, B)\"";
                    }
                    atomic_exp = "atomic_add_global_cmpxchg(A, B)";
                    atomic_ops = (dbcsr_type_real_4 == datatype
                      ? "-DCMPXCHG=atomic_cmpxchg"
                      : "-DCMPXCHG=atom_cmpxchg");
                  }
                  else {
                    atomic_exp = "atomic_add_global_xchg(A, B)";
                    atomic_ops = (dbcsr_type_real_4 == datatype
                      ? "-DXCHG=atomic_xchg"
                      : "-DXCHG=atom_xchg");
                  }
                }
                else { /* unsynchronized */
                  assert(NULL != env_atomics);
                  atomic_exp = "*(A) += (B)"; /* non-atomic update */
                }
                assert(NULL != atomic_exp);
                /* compose build parameters and flags */
                nchar = ACC_OPENCL_SNPRINTF(build_params, sizeof(build_params),
                  "-DMAD=fma -DINTEL=%i -DGLOBAL=%s -DSWG=%i -DFN=%s -DREPEAT=%i "
                  "-DSM=%i -DSN=%i -DSK=%i -DBS=%i -DBM=%i -DBN=%i -DBK=%i -DT=%s -DTN=%i "
                  "%s %s %s %s %s %s %s %s %s %s %s -D\"ATOMIC_ADD_GLOBAL(A,B)=%s\" %s %s",
                  c_dbcsr_acc_opencl_config.intel_id, cmem, wgsize, fname,
                  NULL == env_nrepeat ? 1 : atoi(env_nrepeat),
                  m_max, n_max, k_max, bs, bm, bn, bk, tname, datatype,
                  0 == lu ? "-D\"UNROLL_SM=UNROLL_FORCE(SM)\"" : (1 == lu ? "-D\"UNROLL_SM=UNROLL_FORCE(1)\""
                                                               : (0  < lu  ? "-D\"UNROLL(N)=UNROLL_FORCE(N)\""
                                                               : "")),
                  0 == nz ? "" : "-DATOMIC_INC_NZ", 0 == al ? "" : "-DAL",
                  0 == tb ? "" : "-DTRACK_B", 0 != tc ? "-DTRACK_C" : "", 0 == ap ? "" : "-DSLM_P",
                  0 == aa ? "" : (1 == aa ? "-DSLM_A=1" : (2 == aa ? "-DSLM_A=2" : "-DREG_A")),
                  0 == ab ? "" : (1 == ab ? "-DSLM_B=1" : (2 == ab ? "-DSLM_B=2" : "-DREG_B")),
                  0 == ac ? "" : (1 == ac ? "-DSLM_C=1" : (2 == ac ? "-DSLM_C=2" : "-DREG_C")),
                  atomic_type, atomic_ops, atomic_exp, atomic_expr2, barrier_expr);
                if (0 < nchar && (int)sizeof(build_params) > nchar) {
                  nchar = ACC_OPENCL_SNPRINTF(build_options, sizeof(build_options),
                    "%s %s -cl-fast-relaxed-math -cl-denorms-are-zero",
                    (NULL == env_options || '\0' == *env_options) ? "" : env_options, cl_debug);
                  if (0 >= nchar || (int)sizeof(build_options) <= nchar) result = EXIT_FAILURE;
                }
                else result = EXIT_FAILURE;
              }
              else {
                result = EXIT_FAILURE;
                ACC_OPENCL_ERROR("matrix-size causes too large WG-size", result);
              }
            }
            if (EXIT_SUCCESS == result) {
              const char *const env_kernel = getenv("OPENCL_LIBSMM_SMM_KERNEL");
              opencl_libsmm_smm_t new_config;
              memset(&new_config, 0, sizeof(new_config));
              if (NULL != env_kernel) {
                FILE *const src_kernel = fopen(env_kernel, "r");
                if (NULL != src_kernel) {
                  const long int size = (EXIT_SUCCESS == fseek(
                    src_kernel, 0/*offset*/, SEEK_END) ? ftell(src_kernel) : 0);
                  char *const src = (char*)(EXIT_SUCCESS == fseek(
                    src_kernel, 0/*offset*/, SEEK_SET) ? malloc(size + 1/*terminator*/) : NULL);
                  if (NULL != src) {
                    if ((size_t)size == fread(src, 1/*sizeof(char)*/, size/*count*/, src_kernel)) {
                      src[size] = '\0';
                      result = c_dbcsr_acc_opencl_kernel(src,
                        fname, build_params, build_options, cl_try, &cl_try_ok,
                        extensions, sizeof(extensions) / sizeof(*extensions),
                        &new_config.kernel);
                    }
                    else free(src);
                  }
                  fclose(src_kernel);
                }
              }
              if (NULL == new_config.kernel) {
                result = c_dbcsr_acc_opencl_kernel(OPENCL_LIBSMM_SOURCE_MULTIPLY,
                  fname, build_params, build_options, cl_try, &cl_try_ok,
                  extensions, sizeof(extensions) / sizeof(*extensions),
                  &new_config.kernel);
              }
              if (EXIT_SUCCESS == result) {
                result = c_dbcsr_acc_opencl_wgsize(active_device,
                  new_config.kernel, &wgsize_max, NULL/*prefmult*/);
                if (EXIT_SUCCESS == result) {
                  assert(0 < wgsize && 0 < wgsize_max);
                  /* check planned WG-size against kernel-specific WG-size */
                  if (wgsize <= wgsize_max) {
                    if (NULL == config) {
                      config = (opencl_libsmm_smm_t*)OPENCL_LIBSMM_REGISTER(
                        &key, sizeof(key), sizeof(new_config), &new_config);
                    }
                    if (NULL != config) {
                      config->wgsize = (size_t)wgsize;  config->bs = bs;
                      config->bm = bm; config->bn = bn; config->bk = bk;
                      config->ws = ws; config->wg = wg; config->lu = lu;
                      config->nz = nz; config->al = al;
                      config->tb = tb; config->tc = tc;
                      config->ap = ap; config->aa = aa;
                      config->ab = ab; config->ac = ac;
                      config->kernel = new_config.kernel;
# if !defined(OPENCL_LIBSMM_DEBUG_SMM)
                      if (2 <= c_dbcsr_acc_opencl_config.verbosity || 0 > c_dbcsr_acc_opencl_config.verbosity) {
                        duration = libxsmm_timer_duration(start, libxsmm_timer_tick());
                        fprintf(stderr, "INFO ACC/OpenCL: %ix%ix%i %sSMM-kernel "
                          "bs=%i bm=%i bn=%i bk=%i ws=%i wg=%i lu=%i nz=%i al=%i tb=%i tc=%i ap=%i aa=%i ab=%i ac=%i gen=%.1f ms\n",
                          m_max, n_max, k_max, dbcsr_type_real_8 == datatype ? "D" : (dbcsr_type_real_4 == datatype ? "S" : ""),
                          bs, bm, bn, bk, ws, wg, lu, nz, al, tb, tc, ap, aa, ab, ac, 1000.0 * duration);
                      }
# endif
                    }
                    else { /* failed to register config */
                      result = EXIT_FAILURE;
                    }
                  }
                  else {
                    result = EXIT_FAILURE;
                    ACC_OPENCL_ERROR("tile-size causes too large WG-size", result);
                  }
                }
              }
            }
          }
          else {
            result = EXIT_FAILURE;
            ACC_OPENCL_ERROR("insufficient device capabilities", result);
          }
        }
        else {
          result = EXIT_FAILURE;
        }
        /* remove configuration from registry to avoid infinitely retrying code generation */
        if (EXIT_SUCCESS != result && NULL != config) {
          libxsmm_xrelease(&key, sizeof(key));
        }
# if defined(__DBCSR_ACC)
        c_dbcsr_timestop(&routine_handle);
# endif
      }
      assert(EXIT_SUCCESS != result || (NULL != config && NULL != config->kernel));
      assert(EXIT_SUCCESS != result || ( 1 <= config->bm && config->bm <= m_max));
      assert(EXIT_SUCCESS != result || ( 1 <= config->bn && config->bn <= n_max));
      assert(EXIT_SUCCESS != result || ( 1 <= config->bk && config->bk <= m_max));
      assert(EXIT_SUCCESS != result || ( 1 <= config->ws && config->ws <= (m_max * n_max)));
      assert(EXIT_SUCCESS != result || ( 1 <= config->bs && 1 <= config->wgsize));
      assert(EXIT_SUCCESS != result || ( 0 <= config->wg && 2 >= config->wg));
      assert(EXIT_SUCCESS != result || (-1 <= config->lu && 2 >= config->lu));
      assert(EXIT_SUCCESS != result || ( 0 <= config->nz && 1 >= config->nz));
      assert(EXIT_SUCCESS != result || ( 0 <= config->al && 1 >= config->al));
      assert(EXIT_SUCCESS != result || ( 0 <= config->tb && 1 >= config->tb));
      assert(EXIT_SUCCESS != result || ( 0 <= config->tc && 1 >= config->tc));
      assert(EXIT_SUCCESS != result || ( 0 <= config->ap && 1 >= config->ap));
      assert(EXIT_SUCCESS != result || ( 0 <= config->aa && 3 >= config->aa));
      assert(EXIT_SUCCESS != result || ( 0 <= config->ab && 3 >= config->ab));
      assert(EXIT_SUCCESS != result || ( 0 <= config->ac && 2 >= config->ac));
      if (EXIT_SUCCESS == result) {
        cl_event event, *const perf_event = ((0 <= c_dbcsr_acc_opencl_config.verbosity
          && 3 > c_dbcsr_acc_opencl_config.verbosity) ? NULL : &event);
        /* adjust overall stacksize according to intra-kernel batchsize */
        const size_t work_size = ((stack_size + config->bs - 1) / config->bs) * config->wgsize;
# if defined(OPENCL_LIBSMM_DEBUG_SMM)
        char *ainp = NULL, *binp = NULL, *test = NULL, *gold = NULL, *btrn = NULL;
        const libxsmm_gemm_precision precision = (dbcsr_type_real_8 == datatype
          ? LIBXSMM_GEMM_PRECISION_F64 : (dbcsr_type_real_4 == datatype ? LIBXSMM_GEMM_PRECISION_F32
          : (libxsmm_gemm_precision)LIBXSMM_DATATYPE_UNSUPPORTED));
        const int typesize = OPENCL_LIBSMM_TYPESIZE(datatype);
        size_t asize, bsize, csize;
        void* scratch = NULL;
        libxsmm_xmmfunction kernel = { NULL };
        if (  CL_SUCCESS == clGetMemObjectInfo(*ACC_OPENCL_MEM(dev_a_data),
                CL_MEM_SIZE, sizeof(size_t), &asize, NULL)
          &&  CL_SUCCESS == clGetMemObjectInfo(*ACC_OPENCL_MEM(dev_b_data),
                CL_MEM_SIZE, sizeof(size_t), &bsize, NULL)
          &&  CL_SUCCESS == clGetMemObjectInfo(*ACC_OPENCL_MEM(dev_c_data),
                CL_MEM_SIZE, sizeof(size_t), &csize, NULL))
        {
          const double alpha = 1, beta = 1;
          libxsmm_descriptor_blob blob;
          libxsmm_gemm_descriptor *const desc = libxsmm_gemm_descriptor_dinit(&blob,
            precision, m_max, n_max, k_max, m_max, k_max, m_max, alpha, beta,
            LIBXSMM_GEMM_FLAG_NONE, LIBXSMM_PREFETCH_NONE);
          scratch = libxsmm_aligned_scratch(asize + bsize + csize + csize + k_max * n_max * typesize
            + 4 * (LIBXSMM_ALIGNMENT - 1)/*alignments*/, LIBXSMM_ALIGNMENT);
          if (NULL != desc && NULL != scratch) {
            ainp = (char*)scratch;
            binp = (char*)LIBXSMM_UP2((uintptr_t)ainp + asize, LIBXSMM_ALIGNMENT);
            test = (char*)LIBXSMM_UP2((uintptr_t)binp + bsize, LIBXSMM_ALIGNMENT);
            gold = (char*)LIBXSMM_UP2((uintptr_t)test + csize, LIBXSMM_ALIGNMENT);
            btrn = (char*)LIBXSMM_UP2((uintptr_t)gold + csize, LIBXSMM_ALIGNMENT);
            ACC_OPENCL_CHECK(c_dbcsr_acc_memcpy_d2h(dev_a_data, ainp, asize, stream),
              "transfer debug a-data", result);
            ACC_OPENCL_CHECK(c_dbcsr_acc_memcpy_d2h(dev_b_data, binp, bsize, stream),
              "transfer debug b-data", result);
            ACC_OPENCL_CHECK(c_dbcsr_acc_memcpy_d2h(dev_c_data, gold, csize, stream),
              "transfer debug c-data", result);
            kernel = libxsmm_xmmdispatch(desc);
            assert(NULL != kernel.xmm);
          }
          else result = EXIT_FAILURE;
        }
        else result = EXIT_FAILURE;
# endif
        assert(!(OPENCL_LIBSMM_NLOCKS_SMM & (OPENCL_LIBSMM_NLOCKS_SMM - 1))); /* POT */
        { /* OpenCL is thread-safe except for clSetKernelArg and launching such shared kernel */
          const unsigned int hash = libxsmm_hash(&config->kernel, sizeof(cl_kernel), 25071975/*seed*/);
          volatile int *const lock = opencl_libsmm_lock_smm + LIBXSMM_MOD2(hash, OPENCL_LIBSMM_NLOCKS_SMM);
          LIBXSMM_ATOMIC_ACQUIRE(lock, LIBXSMM_SYNC_NPAUSE, LIBXSMM_ATOMIC_RELAXED);
          ACC_OPENCL_CHECK(clSetKernelArg(config->kernel, 0, sizeof(cl_mem), ACC_OPENCL_MEM(dev_c_data)),
            "set C-matrix argument of SMM-kernel", result);
          ACC_OPENCL_CHECK(clSetKernelArg(config->kernel, 1, sizeof(cl_mem), ACC_OPENCL_MEM(dev_a_data)),
            "set A-matrix argument of SMM-kernel", result);
          ACC_OPENCL_CHECK(clSetKernelArg(config->kernel, 2, sizeof(cl_mem), ACC_OPENCL_MEM(dev_b_data)),
            "set B-matrix argument of SMM-kernel", result);
          ACC_OPENCL_CHECK(clSetKernelArg(config->kernel, 3, sizeof(cl_mem), ACC_OPENCL_MEM(dev_param_stack)),
            "set batch-list argument of SMM-kernel", result);
          if (1 < config->bs) {
            ACC_OPENCL_CHECK(clSetKernelArg(config->kernel, 4, sizeof(int), &stack_size),
              "set stacksize argument of SMM-kernel", result);
          }
          ACC_OPENCL_CHECK(clEnqueueNDRangeKernel(*ACC_OPENCL_STREAM(stream),
            config->kernel, 1/*work_dim*/, NULL, &work_size, &config->wgsize, 0, NULL, perf_event),
            "launch SMM-kernel", result);
          /* update performance counters (if enabled by verbosity) while still inside of the locked region */
# if !defined(OPENCL_LIBSMM_DEBUG_SMM)
          if (3 <= c_dbcsr_acc_opencl_config.verbosity || 0 > c_dbcsr_acc_opencl_config.verbosity) {
            if (NULL != perf_event) {
              cl_ulong begin = 0, end = 0;
              clWaitForEvents(1, perf_event);
              ACC_OPENCL_CHECK(clGetEventProfilingInfo(*perf_event, CL_PROFILING_COMMAND_START,
                sizeof(cl_ulong), &begin, NULL), "query kernel start time", result);
              ACC_OPENCL_CHECK(clGetEventProfilingInfo(*perf_event, CL_PROFILING_COMMAND_END,
                sizeof(cl_ulong), &end, NULL), "query kernel end time", result);
              duration = LIBXSMM_DELTA(begin, end); /* Nanoseconds */
            }
            else duration = libxsmm_timer_duration(start, libxsmm_timer_tick()) * 1E9; /* Nanoseconds */
            if (EXIT_SUCCESS == result) {
              const double gflops = (2.0 * m_max * n_max * k_max * stack_size) / duration;
#   if LIBXSMM_VERSION3(1, 16, 1) <= LIBXSMM_VERSION3(LIBXSMM_VERSION_MAJOR, \
         LIBXSMM_VERSION_MINOR, LIBXSMM_VERSION_UPDATE) && 1159 <= LIBXSMM_VERSION_PATCH
              const size_t size = sizeof(config->size) / sizeof(*config->size); assert(2 <= size);
              libxsmm_kahan_sum(log(gflops), &config->gflops_sumlog, &config->gflops_comp);
              if (size <= config->nexec) {
                const int s1 = size - 1, i = (int)((config->nexec++) % s1);
                config->size[i] = stack_size;
                if ((i + 1) == s1) { /* fill config->size with median */
                  const int s2 = size >> 1; OPENCL_LIBSMM_ISORT(config->size, size); config->size[s1] = config->size[s2];
                  if (0 == (1 & s1)) config->size[s1] = (config->size[s1] + config->size[s2-1]) >> 1;
                }
              }
              else config->size[config->nexec++] = stack_size;
#   endif
              if (4 <= c_dbcsr_acc_opencl_config.verbosity || 0 > c_dbcsr_acc_opencl_config.verbosity) {
                fprintf(stderr, "INFO ACC/OpenCL: %ix%ix%i", m_max, n_max, k_max);
                switch (datatype) {
                  case dbcsr_type_real_8: {
                    const double est = OPENCL_LIBSMM_AI(m_max, n_max, k_max, sizeof(double)) * opencl_libsmm_dacc;
                    fprintf(stderr, " DSMM-kernel ss=%i cur=%.1f", stack_size, gflops);
                    if (0 < est) fprintf(stderr, " est=%.1f", est);
                  } break;
                  case dbcsr_type_real_4: {
                    const double est = OPENCL_LIBSMM_AI(m_max, n_max, k_max, sizeof(float)) * opencl_libsmm_sacc;
                    fprintf(stderr, " SSMM-kernel ss=%i cur=%.1f", stack_size, gflops);
                    if (0 < est) fprintf(stderr, " est=%.1f", est);
                  } break;
                  default: result = EXIT_FAILURE;
                }
                fprintf(stderr, " GFLOPS/s\n");
              }
            }
          }
# endif
          LIBXSMM_ATOMIC_RELEASE(lock, LIBXSMM_ATOMIC_RELAXED);
        }
# if defined(OPENCL_LIBSMM_DEBUG_SMM)
        ACC_OPENCL_CHECK(c_dbcsr_acc_memcpy_d2h(dev_c_data, test, csize, stream),
          "transfer debug test", result);
# endif
# if defined(OPENCL_LIBSMM_DEBUG_SMM)
        ACC_OPENCL_CHECK(c_dbcsr_acc_stream_sync(stream), "sync stream", result);
# endif
# if defined(OPENCL_LIBSMM_DEBUG_SMM)
        if (EXIT_SUCCESS == result) {
          const char *const env_tol = getenv("OPENCL_LIBSMM_SMM_TOLERANCE");
          const double tolerance = ((NULL == env_tol || '\0' == *env_tol) ? 1E-3 : atof(env_tol));
          const int *const params = host_param_stack + (4 <= nparams ? (nparams - 4) : 0);
          size_t i;
          if (0 != c_dbcsr_acc_opencl_config.verbosity) {
            fprintf(stderr, "libsmm_acc_process(size=%i, type=%s, m=%i, n=%i, k=%i, max=%i, stream=%p)", stack_size,
              dbcsr_type_real_8 == datatype ? "f64" : (dbcsr_type_real_4 == datatype ? "f32" : "unknown"),
              m_max, n_max, k_max, max_kernel_dim, stream);
          }
          for (i = 0; i < ((size_t)stack_size * nparams); i += nparams) {
            const size_t ia = (size_t)(params[i+0] - 1) * typesize;
            const size_t ib = (size_t)(params[i+1] - 1) * typesize;
            const size_t ic = (size_t)(params[i+2] - 1) * typesize;
            assert(ia < asize && ib < bsize && ic < csize);
            libxsmm_otrans(btrn, binp + ib, typesize, n_max, k_max, n_max, k_max);
            kernel.xmm(ainp + ia, btrn, gold + ic);
          }
          /* a result may be validated multiple times if C-indexes are duplicated */
          for (i = 0; i < ((size_t)stack_size * nparams); i += nparams) {
            const size_t ic = (size_t)(params[i+2] - 1) * typesize;
            libxsmm_matdiff_info diff;
            libxsmm_matdiff(&diff, (libxsmm_datatype)precision, m_max, n_max,
              gold + ic, test + ic, &m_max/*ldref*/, &m_max/*ldtst*/);
            if (tolerance < diff.normf_rel) {
              if (0 == c_dbcsr_acc_opencl_config.verbosity) {
                fprintf(stderr, "libsmm_acc_process(size=%i, type=%s, m=%i, n=%i, k=%i, max=%i, stream=%p)", stack_size,
                  dbcsr_type_real_8 == datatype ? "f64" : (dbcsr_type_real_4 == datatype ? "f32" : "unknown"),
                  m_max, n_max, k_max, max_kernel_dim, stream);
              }
#   if  LIBXSMM_VERSION3(1, 16, 1) <= LIBXSMM_VERSION3(LIBXSMM_VERSION_MAJOR, \
          LIBXSMM_VERSION_MINOR, LIBXSMM_VERSION_UPDATE) && 1014 <= LIBXSMM_VERSION_PATCH
              fprintf(stderr, " => ERROR diff=%g (%g != %g)\n", diff.linf_abs, diff.v_ref, diff.v_tst);
#   else
              fprintf(stderr, " => ERROR diff=%g\n", diff.linf_abs);
#   endif
#   if defined(_DEBUG)
              opencl_libsmm_print_matrix(stderr, "gold = ", datatype, gold + ic, m_max, n_max);
              opencl_libsmm_print_matrix(stderr, "test = ", datatype, test + ic, m_max, n_max);
              fprintf(stderr, "\n");
#   endif
#   if defined(OPENCL_LIBSMM_DEBUG_EXIT)
              exit(EXIT_FAILURE);
#   else
              result = EXIT_FAILURE;
              break;
#   endif
            }
          }
          if (0 != c_dbcsr_acc_opencl_config.verbosity && EXIT_SUCCESS == result) {
            fprintf(stderr, " => OK\n");
          }
        }
        libxsmm_free(scratch);
# elif defined(NDEBUG)
        ACC_OPENCL_UNUSED(host_param_stack);
        ACC_OPENCL_UNUSED(nparams);
# endif
      }
    }
  }
  else if (0 < stack_size) { /* inhomogeneous, large kernel, or unsupported datatype */
    return -1; /* TODO: document result code to trigger host-fallback */
  }
#else
  result = EXIT_FAILURE;
#endif
  ACC_OPENCL_RETURN(result);
}

#if defined(__cplusplus)
}
#endif

#endif /*__OPENCL*/
